In [1]:
# Import required libraries
import glob
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette, so the palette is applied after the style sheet.
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# DataFrame rendering: show every column, and up to 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Status lines confirming the setup cell ran to completion.
print(
    "📚 Libraries imported successfully!",
    "🎨 Visualization styles configured!",
    sep="\n",
)


📚 Libraries imported successfully!
🎨 Visualization styles configured!


In [2]:
# Load the review data: use the preprocessed CSV when it exists, otherwise
# fall back to the most recent raw scrape of each bank.


def find_latest_file(paths):
    """Return the path whose file name sorts last among ``paths``.

    The whole file name is compared rather than only the text after the last
    underscore, so a timestamp that itself contains an underscore is ranked
    by all of its parts instead of by its final part alone.

    Assumes every path in ``paths`` shares the same name prefix and that the
    suffix is a fixed-width, lexicographically sortable timestamp -- TODO
    confirm against the scraper's file naming.

    Args:
        paths: Non-empty list of path strings.

    Returns:
        The path with the greatest file name.

    Raises:
        ValueError: If ``paths`` is empty (raised by ``max``).
    """
    return max(paths, key=os.path.basename)


# Result of THIS run only; stays None when nothing could be loaded, so a `df`
# left over from an earlier execution is never summarised as if it were fresh.
loaded_reviews = None

try:
    # Try to load processed data first
    loaded_reviews = pd.read_csv('../data/processed/cleaned_banking_reviews.csv')
    print("✅ Loaded preprocessed data!")

except FileNotFoundError:
    print("❌ Preprocessed data not found. Please run the preprocessing script first:")
    print("python scripts/data_preprocessing.py")

    # Load raw data as fallback: collect every raw scrape per bank
    cbe_files = glob.glob('../data/CBE_reviews_*.csv')
    boa_files = glob.glob('../data/BOA_reviews_*.csv')
    dashen_files = glob.glob('../data/Dashen_Bank_reviews_*.csv')

    # The fallback is only used when all three banks have at least one file.
    if cbe_files and boa_files and dashen_files:
        # Load the latest file of each bank
        cbe_df = pd.read_csv(find_latest_file(cbe_files))
        boa_df = pd.read_csv(find_latest_file(boa_files))
        dashen_df = pd.read_csv(find_latest_file(dashen_files))

        # Combine
        loaded_reviews = pd.concat([cbe_df, boa_df, dashen_df], ignore_index=True)

        # Basic cleaning: shorten known bank names; any name missing from the
        # mapping keeps its raw value instead of silently becoming NaN.
        bank_names = loaded_reviews['bank_name']
        loaded_reviews['bank'] = bank_names.map({
            'Commercial Bank of Ethiopia': 'CBE',
            'Bank of Abyssinia': 'BOA',
            'Dashen Bank': 'Dashen Bank'
        }).fillna(bank_names)
        loaded_reviews = loaded_reviews.rename(columns={'review_text': 'review'})

        print("⚠️ Loaded raw data - consider running preprocessing script for better results")
    else:
        print("❌ No data files found! Please run the scraper first.")

# Display basic info about the dataset (only when this run loaded something)
if loaded_reviews is not None:
    df = loaded_reviews

    print(f"\n📊 Dataset Overview:")
    print(f"   • Total Reviews: {len(df):,}")
    print(f"   • Banks: {df['bank'].nunique()}")
    print(f"   • Date Range: {df['date'].min()} to {df['date'].max()}")
    print(f"   • Columns: {list(df.columns)}")

    # Display first few rows
    print(f"\n🔍 Sample Data:")
    display(df.head())


✅ Loaded preprocessed data!

📊 Dataset Overview:
   • Total Reviews: 1,918
   • Banks: 3
   • Date Range: 2024-02-03 to 2025-06-07
   • Columns: ['review', 'rating', 'date', 'bank', 'source', 'review_length', 'word_count', 'rating_category', 'year', 'month', 'quarter', 'rating_valid']

🔍 Sample Data:


Unnamed: 0,review,rating,date,bank,source,review_length,word_count,rating_category,year,month,quarter,rating_valid
0,really am happy to this app it is Siple to use...,5,2025-06-07,CBE,Google Play,57,12,Positive,2025,6,2,True
1,I liked this app. But the User interface is ve...,2,2025-06-07,CBE,Google Play,81,16,Negative,2025,6,2,True
2,"""Why don t your ATMs support account-to-accoun...",4,2025-06-06,CBE,Google Play,111,16,Positive,2025,6,2,True
3,what is this app problem???,1,2025-06-05,CBE,Google Play,27,5,Negative,2025,6,2,True
4,the app is proactive and a good connections.,5,2025-06-05,CBE,Google Play,44,8,Positive,2025,6,2,True
