In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style sheet first and the seaborn
# palette second (same order as before; both adjust matplotlib settings).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# DataFrame rendering: never truncate columns, show at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Status messages confirming the setup cell ran to completion.
print("üìö Libraries imported successfully!")
print("üé® Visualization styles configured!")


üìö Libraries imported successfully!
üé® Visualization styles configured!


In [2]:
# Load the preprocessed data
def latest_review_file(paths):
    """Return the newest raw export among ``paths``, or ``None`` if empty.

    Every candidate list comes from a single glob pattern, so all paths in
    it share the same prefix and differ only in their trailing stamp.
    Whole paths are compared (rather than only the piece after the last
    '_') so that a two-part stamp such as YYYYMMDD_HHMMSS is ordered by
    date first and time second.

    NOTE(review): assumes the stamp in the file name sorts chronologically
    as text - confirm against the scraper's naming scheme.
    """
    return max(paths, default=None)


try:
    # Try to load processed data first
    df = pd.read_csv('../data/processed/cleaned_banking_reviews.csv')
    print("‚úÖ Loaded preprocessed data!")

except FileNotFoundError:
    print("‚ùå Preprocessed data not found. Please run the preprocessing script first:")
    print("python scripts/data_preprocessing.py")

    # Load raw data as fallback (glob is only needed on this path)
    import glob

    # Find candidate raw exports for each bank
    cbe_files = glob.glob('../data/CBE_reviews_*.csv')
    boa_files = glob.glob('../data/BOA_reviews_*.csv')
    dashen_files = glob.glob('../data/Dashen_Bank_reviews_*.csv')

    # The fallback is only used when all three banks have at least one file
    if cbe_files and boa_files and dashen_files:
        # Load the latest file of each bank
        cbe_df = pd.read_csv(latest_review_file(cbe_files))
        boa_df = pd.read_csv(latest_review_file(boa_files))
        dashen_df = pd.read_csv(latest_review_file(dashen_files))

        # Combine into one frame with a fresh 0..n-1 index
        df = pd.concat([cbe_df, boa_df, dashen_df], ignore_index=True)

        # Basic cleaning: short bank labels and the 'review' column name
        # used by the preprocessed file. Bank names missing from the
        # mapping become NaN in 'bank'.
        df['bank'] = df['bank_name'].map({
            'Commercial Bank of Ethiopia': 'CBE',
            'Bank of Abyssinia': 'BOA',
            'Dashen Bank': 'Dashen Bank'
        })
        df = df.rename(columns={'review_text': 'review'})

        print("‚ö†Ô∏è Loaded raw data - consider running preprocessing script for better results")
    else:
        print("‚ùå No data files found! Please run the scraper first.")

# Display basic info about the dataset (skipped when nothing was loaded)
if 'df' in locals():
    print(f"\nüìä Dataset Overview:")
    print(f"   ‚Ä¢ Total Reviews: {len(df):,}")
    print(f"   ‚Ä¢ Banks: {df['bank'].nunique()}")
    print(f"   ‚Ä¢ Date Range: {df['date'].min()} to {df['date'].max()}")
    print(f"   ‚Ä¢ Columns: {list(df.columns)}")

    # Display first few rows
    print(f"\nüîç Sample Data:")
    display(df.head())


‚úÖ Loaded preprocessed data!

üìä Dataset Overview:
   ‚Ä¢ Total Reviews: 1,918
   ‚Ä¢ Banks: 3
   ‚Ä¢ Date Range: 2024-02-03 to 2025-06-07
   ‚Ä¢ Columns: ['review', 'rating', 'date', 'bank', 'source', 'review_length', 'word_count', 'rating_category', 'year', 'month', 'quarter', 'rating_valid']

üîç Sample Data:


Unnamed: 0,review,rating,date,bank,source,review_length,word_count,rating_category,year,month,quarter,rating_valid
0,really am happy to this app it is Siple to use...,5,2025-06-07,CBE,Google Play,57,12,Positive,2025,6,2,True
1,I liked this app. But the User interface is ve...,2,2025-06-07,CBE,Google Play,81,16,Negative,2025,6,2,True
2,"""Why don t your ATMs support account-to-accoun...",4,2025-06-06,CBE,Google Play,111,16,Positive,2025,6,2,True
3,what is this app problem???,1,2025-06-05,CBE,Google Play,27,5,Negative,2025,6,2,True
4,the app is proactive and a good connections.,5,2025-06-05,CBE,Google Play,44,8,Positive,2025,6,2,True


In [None]:
# Import the analysis pipeline modules
import sys

# Make ../scripts importable. The membership check keeps the cell
# idempotent: re-running it does not add duplicate sys.path entries.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')

try:
    from analysis_pipeline import AnalysisPipeline
    print("‚úÖ Analysis pipeline modules imported successfully!")

    # Display available functionality
    print("\nüîß Available Analysis Components:")
    print("   ‚Ä¢ SentimentAnalyzer: DistilBERT + VADER + TextBlob ensemble")
    print("   ‚Ä¢ ThematicAnalyzer: TF-IDF + spaCy keyword extraction with rule-based clustering")
    print("   ‚Ä¢ AnalysisPipeline: Orchestrates complete sentiment + thematic analysis")

except ImportError as e:
    # The import failed, so AnalysisPipeline is left undefined; print the
    # error together with installation hints instead of raising.
    print(f"‚ùå Failed to import analysis modules: {e}")
    print("Please ensure you have installed all required packages:")
    print("pip install transformers torch vaderSentiment textblob spacy scikit-learn nltk")
    print("python -m spacy download en_core_web_sm")


In [None]:
# Run the complete sentiment and thematic analysis pipeline
if 'df' in locals() and len(df) > 0:
    print("üöÄ Starting Comprehensive Analysis Pipeline...")
    print("This may take several minutes depending on your system and dataset size.")

    # Option 1: Run complete pipeline (recommended)
    try:
        # Initialize pipeline with optimized settings.
        # Construction happens inside the try block so that a missing
        # AnalysisPipeline name (failed import cell) or an error raised
        # while building the pipeline is reported by the handler below
        # instead of aborting the cell with a traceback.
        pipeline = AnalysisPipeline(
            use_gpu=False,  # Set to True if you have CUDA-enabled GPU
            batch_size=16   # Adjust based on your system memory
        )

        print("\nüìä Running complete analysis pipeline...")
        # NOTE(review): `df` is only used for the guard above and is not
        # passed in; presumably the pipeline loads its own data - confirm.
        results = pipeline.run_full_pipeline()

        if results:
            print("\n‚úÖ Analysis completed successfully!")

            # Store results for further analysis
            sentiment_df = results['sentiment_df']
            theme_df = results['theme_df']
            combined_df = results['combined_df']
            pipeline_results = results['pipeline_results']

            print(f"\nüìà Quick Summary:")
            print(f"   ‚Ä¢ Sentiment Analysis: {len(sentiment_df)} reviews processed")
            print(f"   ‚Ä¢ Thematic Analysis: {len(theme_df)} reviews processed")
            print(f"   ‚Ä¢ Combined Results: {len(combined_df)} reviews with full analysis")

        else:
            # A falsy return value is treated as a failed run
            print("‚ùå Analysis pipeline failed. Check logs for details.")

    except Exception as e:
        # Best-effort: report the problem and keep the notebook usable
        print(f"‚ùå Pipeline error: {e}")
        print("You can still run individual components manually (see next cells)")

else:
    print("‚ùå No data available. Please run the data loading cell first.")
