## Sentminet Analysis for Dashn

This notebook is used to perform sentiment analysis on the reviews of DashN

## Setup and Imports

In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys



In [2]:
# Text processing and sentiment analysis
import vaderSentiment
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



# List all available styles


In [3]:
# List all available styles
print("Available styles:", plt.style.available)

# Use a valid style - try 'seaborn-v0_8' or 'seaborn-darkgrid' if available
plt.style.use('seaborn-v0_8')  # or 'seaborn-darkgrid', 'seaborn-whitegrid', etc.
sns.set_palette("viridis")

Available styles: ['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'petroff10', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']


## Load the data

In [6]:
def load_bank_data(bank_name: str = 'dashn') -> Optional[pd.DataFrame]:
    """
    Load and validate preprocessed bank review data with enhanced error reporting.
    """
    try:
        # Normalize bank name and handle common variations
        bank_name = bank_name.lower().strip()
        bank_name = 'dashen' if bank_name == 'dashn' else bank_name
        
        # Use pathlib for cross-platform compatibility
        data_dir = Path('C:/Users/My Device/Desktop/Week-2/data/processed')
        file_path = data_dir / f'cleaned_bank_reviews_{bank_name}.csv'
        
        print(f"üîç Looking for data file at: {file_path}")
        
        if not file_path.exists():
            print(f"‚ùå File not found at: {file_path}")
            available_files = list(data_dir.glob('cleaned_bank_reviews_*.csv'))
            print(f"üìÇ Available files: {[f.name for f in available_files]}")
            return None
        
        # First, read just the first row to check columns
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()
            print(f"üìÑ First line of file: {first_line}")
            
        # Now try to load the full data
        bank_data = pd.read_csv(file_path)
        print("‚úÖ Successfully loaded CSV. Columns found:", bank_data.columns.tolist())
        
        # Convert date column if it exists
        if 'date' in bank_data.columns:
            bank_data['date'] = pd.to_datetime(bank_data['date'])
            print("‚úÖ Converted 'date' column to datetime")
            
        # Rename 'review' to 'review_text' if needed
        if 'review' in bank_data.columns and 'review_text' not in bank_data.columns:
            bank_data = bank_data.rename(columns={'review': 'review_text'})
            print("‚úÖ Renamed 'review' column to 'review_text'")
            
        # Check for required columns
        required_columns = {'date', 'rating', 'review_text'}
        missing = required_columns - set(bank_data.columns)
        if missing:
            raise ValueError(f"Missing required columns: {missing}")
            
        print(f"‚úÖ Successfully processed {len(bank_data)} records")
        return bank_data
        
    except Exception as e:
        print("\n" + "‚ùå" * 20)
        print(f"ERROR DETAILS:\n{str(e)}")
        print("‚ùå" * 20 + "\n")
        import traceback
        traceback.print_exc()
        return None

# Test the function
print("\n" + "="*50)
print("TESTING DATA LOADING...")
print("="*50)
bank_data = load_bank_data('dashn')

if bank_data is not None:
    print("\n" + "="*50)
    print("DATA LOADED SUCCESSFULLY!")
    print("="*50)
    print("\nFirst few rows of data:")
    display(bank_data.head(2))
    print("\nDataFrame info:")
    bank_data.info()
else:
    print("\n" + "="*50)
    print("FAILED TO LOAD DATA")
    print("="*50)


TESTING DATA LOADING...
üîç Looking for data file at: C:\Users\My Device\Desktop\Week-2\data\processed\cleaned_bank_reviews_dashen.csv
üìÑ First line of file: review,rating,date,bank,source
‚úÖ Successfully loaded CSV. Columns found: ['review', 'rating', 'date', 'bank', 'source']
‚úÖ Converted 'date' column to datetime
‚úÖ Renamed 'review' column to 'review_text'
‚úÖ Successfully processed 753 records

DATA LOADED SUCCESSFULLY!

First few rows of data:


Unnamed: 0,review_text,rating,date,bank,source
0,The Dashen Super App is very impressive. It is...,5,2025-11-05,DASHEN,Google Play Store
1,"@Shewangizaw L. As a heavy user of the app , t...",4,2025-10-06,DASHEN,Google Play Store



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   review_text  753 non-null    object        
 1   rating       753 non-null    int64         
 2   date         753 non-null    datetime64[ns]
 3   bank         753 non-null    object        
 4   source       753 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 29.5+ KB


## Initialize Sentiment Analyser

In [7]:
class SentimentAnalyzerDashen:
    """
    Optimized Sentiment Analyzer for Dashen Bank reviews with enhanced error handling and performance.
    Customized thresholds for Dashen Bank review patterns.
    """
    def __init__(self):
        """Initialize with VADER and TextBlob with custom lexicon adjustments."""
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        from textblob import TextBlob
        import logging
        
        # Initialize TextBlob
        self.TextBlob = TextBlob
        
        # Initialize VADER with custom lexicon
        self.analyzer = SentimentIntensityAnalyzer()
        self._update_lexicon()
        
        # Setup logging
        self.logger = self._setup_logger()
    
    def _update_lexicon(self):
        """Update VADER lexicon with banking-specific terms for Dashen Bank."""
        banking_lexicon = {
            'super app': 1.5,
            'dashen': 0.5,  # Slightly positive for brand mention
            'app': 0.3,     # Slightly positive for app mention
            'update': 0.2,  # Neutral to slightly positive for updates
            'error': -1.5,  # Strong negative for technical issues
            'slow': -1.2,   # Negative for performance issues
            'crash': -1.8,  # Very negative
            'hack': -2.0,   # Very negative for security concerns
            'thank': 1.0,   # Positive for gratitude
            'helpful': 1.2, # Positive for helpful features
            'easy': 1.0     # Positive for ease of use
        }
        
        # Update the VADER lexicon
        self.analyzer.lexicon.update(banking_lexicon)
    
    def _setup_logger(self):
        """Configure and return a logger instance."""
        import logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        return logging.getLogger('Dashen_Sentiment_Analyzer')
    
    def analyze_textblob(self, text):
        """Analyze sentiment using TextBlob with enhanced error handling."""
        try:
            # Preprocess text - remove common issues
            text = str(text).strip()
            if not text or text.lower() in ['na', 'n/a', 'none', '']:
                return {'polarity': 0, 'subjectivity': 0, 'sentiment': 'neutral'}
                
            analysis = self.TextBlob(text)
            return {
                'polarity': analysis.sentiment.polarity,
                'subjectivity': analysis.sentiment.subjectivity,
                'sentiment': self._categorize_sentiment(analysis.sentiment.polarity)
            }
        except Exception as e:
            self.logger.error(f"TextBlob analysis failed: {str(e)}")
            return {'polarity': 0, 'subjectivity': 0, 'sentiment': 'error'}
    
    def analyze_vader(self, text):
        """Analyze sentiment using VADER with enhanced error handling."""
        try:
            text = str(text).strip()
            if not text or text.lower() in ['na', 'n/a', 'none', '']:
                return {'vader_compound': 0, 'vader_sentiment': 'neutral'}
                
            vs = self.analyzer.polarity_scores(text)
            return {
                'vader_compound': vs['compound'],
                'vader_positive': vs['pos'],
                'vader_negative': vs['neg'],
                'vader_neutral': vs['neu'],
                'vader_sentiment': self._categorize_vader(vs['compound'])
            }
        except Exception as e:
            self.logger.error(f"VADER analysis failed: {str(e)}")
            return {
                'vader_compound': 0,
                'vader_positive': 0,
                'vader_negative': 0,
                'vader_neutral': 1,
                'vader_sentiment': 'error'
            }
    
    @staticmethod
    def _categorize_sentiment(score):
        """Categorize TextBlob score with Dashen-specific thresholds."""
        thresholds = {
            'positive': 0.15,    # Slightly higher threshold for positive
            'negative': -0.15    # Slightly lower threshold for negative
        }
        if score > thresholds['positive']:
            return 'positive'
        if score < thresholds['negative']:
            return 'negative'
        return 'neutral'
    
    @staticmethod
    def _categorize_vader(score):
        """Categorize VADER score with Dashen-specific thresholds."""
        thresholds = {
            'positive': 0.10,    # More sensitive to positive sentiment
            'negative': -0.10    # More sensitive to negative sentiment
        }
        if score >= thresholds['positive']:
            return 'positive'
        if score <= thresholds['negative']:
            return 'negative'
        return 'neutral'
    
    def analyze_reviews(self, reviews, batch_size=100, text_column='review_text'):
        """
        Analyze a collection of reviews with progress tracking.
        
        Args:
            reviews: DataFrame or list of reviews
            batch_size: Number of reviews to process before logging progress
            text_column: Name of the column containing review text (if reviews is a DataFrame)
            
        Returns:
            pd.DataFrame: Original data with added sentiment analysis columns
        """
        import pandas as pd
        from tqdm.notebook import tqdm
        import numpy as np
        
        # Convert to DataFrame if it's a list
        if isinstance(reviews, list):
            reviews = pd.DataFrame({text_column: reviews})
        
        results = []
        self.logger.info(f"Starting analysis of {len(reviews)} reviews...")
        
        # Initialize progress bar
        tqdm.pandas(desc="Analyzing reviews")
        
        # Process each review
        for idx, row in tqdm(reviews.iterrows(), total=len(reviews)):
            review_text = row[text_column] if text_column in row else row
            tb_result = self.analyze_textblob(review_text)
            vader_result = self.analyze_vader(review_text)
            
            # Combine results
            result = {
                'review_id': idx,
                'review_text': review_text,
                **tb_result,
                **vader_result
            }
            
            # Add any additional columns from the original data
            if isinstance(reviews, pd.DataFrame):
                for col in reviews.columns:
                    if col != text_column:
                        result[col] = row[col]
            
            results.append(result)
        
        self.logger.info("Analysis completed successfully")
        return pd.DataFrame(results)

# Initialize the analyzer with logging
print("Initializing Dashen Bank Sentiment Analyzer...")
analyzer = SentimentAnalyzerDashen()
print("Dashen Bank Sentiment Analyzer is ready!")

# Example usage:
# if bank_data is not None:
#     sentiment_results = analyzer.analyze_reviews(bank_data, text_column='review_text')
#     print("\nSentiment Analysis Results:")
#     display(sentiment_results.head(3))

Initializing Dashen Bank Sentiment Analyzer...
Dashen Bank Sentiment Analyzer is ready!


## Performing Sentiment Analyis for Dashn

In [9]:
# First, let's define the SentimentAnalyzerDashen class
class SentimentAnalyzerDashen:
    """
    Optimized Sentiment Analyzer for Dashen Bank reviews with enhanced error handling and performance.
    Customized for banking-specific terminology and review patterns.
    """
    def __init__(self):
        """Initialize with VADER and TextBlob with custom lexicon adjustments."""
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        from textblob import TextBlob
        import logging
        
        # Initialize TextBlob
        self.TextBlob = TextBlob
        
        # Initialize VADER with custom lexicon
        self.analyzer = SentimentIntensityAnalyzer()
        self._update_lexicon()
        
        # Setup logging
        self.logger = self._setup_logger()
    
    def _update_lexicon(self):
        """Update VADER lexicon with banking-specific terms for Dashen Bank."""
        banking_lexicon = {
            'super app': 1.5,
            'dashen': 0.5,  # Slightly positive for brand mention
            'app': 0.3,     # Slightly positive for app mention
            'update': 0.2,  # Neutral to slightly positive for updates
            'error': -1.5,  # Strong negative for technical issues
            'slow': -1.2,   # Negative for performance issues
            'crash': -1.8,  # Very negative
            'hack': -2.0,   # Very negative for security concerns
            'thank': 1.0,   # Positive for gratitude
            'helpful': 1.2, # Positive for helpful features
            'easy': 1.0     # Positive for ease of use
        }
        
        # Update the VADER lexicon
        self.analyzer.lexicon.update(banking_lexicon)
    
    def _setup_logger(self):
        """Configure and return a logger instance."""
        import logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        return logging.getLogger('Dashen_Sentiment_Analyzer')
    
    def analyze_textblob(self, text):
        """Analyze sentiment using TextBlob with enhanced error handling."""
        try:
            text = str(text).strip()
            if not text or text.lower() in ['na', 'n/a', 'none', '']:
                return {'polarity': 0, 'subjectivity': 0, 'sentiment': 'neutral'}
                
            analysis = self.TextBlob(text)
            return {
                'polarity': analysis.sentiment.polarity,
                'subjectivity': analysis.sentiment.subjectivity,
                'sentiment': self._categorize_sentiment(analysis.sentiment.polarity)
            }
        except Exception as e:
            self.logger.error(f"TextBlob analysis failed: {str(e)}")
            return {'polarity': 0, 'subjectivity': 0, 'sentiment': 'error'}
    
    def analyze_vader(self, text):
        """Analyze sentiment using VADER with enhanced error handling."""
        try:
            text = str(text).strip()
            if not text or text.lower() in ['na', 'n/a', 'none', '']:
                return {'vader_compound': 0, 'vader_sentiment': 'neutral'}
                
            vs = self.analyzer.polarity_scores(text)
            return {
                'vader_compound': vs['compound'],
                'vader_positive': vs['pos'],
                'vader_negative': vs['neg'],
                'vader_neutral': vs['neu'],
                'vader_sentiment': self._categorize_vader(vs['compound'])
            }
        except Exception as e:
            self.logger.error(f"VADER analysis failed: {str(e)}")
            return {
                'vader_compound': 0,
                'vader_positive': 0,
                'vader_negative': 0,
                'vader_neutral': 1,
                'vader_sentiment': 'error'
            }
    
    @staticmethod
    def _categorize_sentiment(score):
        """Categorize TextBlob score with Dashen-specific thresholds."""
        thresholds = {
            'positive': 0.15,    # Slightly higher threshold for positive
            'negative': -0.15    # Slightly lower threshold for negative
        }
        if score > thresholds['positive']:
            return 'positive'
        if score < thresholds['negative']:
            return 'negative'
        return 'neutral'
    
    @staticmethod
    def _categorize_vader(score):
        """Categorize VADER score with Dashen-specific thresholds."""
        thresholds = {
            'positive': 0.10,    # More sensitive to positive sentiment
            'negative': -0.10    # More sensitive to negative sentiment
        }
        if score >= thresholds['positive']:
            return 'positive'
        if score <= thresholds['negative']:
            return 'negative'
        return 'neutral'
    
    def analyze_reviews(self, reviews, text_column='review'):
        """
        Analyze a collection of reviews with progress tracking.
        
        Args:
            reviews: DataFrame or list of reviews
            text_column: Name of the column containing review text (if reviews is a DataFrame)
            
        Returns:
            pd.DataFrame: Analysis results
        """
        import pandas as pd
        from tqdm.notebook import tqdm
        
        # Convert to DataFrame if it's a list
        if isinstance(reviews, list):
            reviews = pd.DataFrame({text_column: reviews})
        
        results = []
        self.logger.info(f"Starting analysis of {len(reviews)} reviews...")
        
        # Process each review
        for idx, row in tqdm(reviews.iterrows(), total=len(reviews), desc="Analyzing reviews"):
            review_text = row[text_column] if text_column in row else row
            tb_result = self.analyze_textblob(review_text)
            vader_result = self.analyze_vader(review_text)
            
            # Combine results
            result = {
                'review_id': idx,
                'review_text': review_text,
                **tb_result,
                **vader_result
            }
            
            # Add any additional columns from the original data
            if isinstance(reviews, pd.DataFrame):
                for col in reviews.columns:
                    if col != text_column:
                        result[col] = row[col]
            
            results.append(result)
        
        self.logger.info("Analysis completed successfully")
        return pd.DataFrame(results)

# Now the analyze_dashen_sentiments function
def analyze_dashen_sentiments(dashen_data, sample_size=None, random_state=42, batch_size=50):
    """
    Perform sentiment analysis on Dashen Bank reviews with progress tracking and error handling.
    Optimized for performance and memory efficiency.
    
    Args:
        dashen_data (pd.DataFrame): DataFrame containing Dashen Bank reviews
        sample_size (int, optional): Number of reviews to analyze. If None, uses all reviews.
        random_state (int): Random seed for reproducibility
        batch_size (int): Number of reviews to process in each batch (for memory efficiency)
        
    Returns:
        pd.DataFrame: Analyzed Dashen Bank reviews with sentiment scores
    """
    import pandas as pd
    import logging
    from tqdm.notebook import tqdm
    import numpy as np
    from datetime import datetime
    
    # Setup logging with Dashen-specific formatting
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - DASHEN - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    logger = logging.getLogger('Dashen_Sentiment_Analysis')
    
    if dashen_data is None or dashen_data.empty:
        logger.error("No Dashen Bank data provided or data is empty")
        return None
    
    # Sample the data if sample_size is provided
    if sample_size and len(dashen_data) > sample_size:
        dashen_data = dashen_data.sample(
            min(sample_size, len(dashen_data)), 
            random_state=random_state
        )
        logger.info(f"Analyzing sample of {len(dashen_data):,} Dashen Bank reviews...")
    else:
        logger.info(f"Analyzing all {len(dashen_data):,} Dashen Bank reviews...")
    
    try:
        # Initialize analyzer
        analyzer = SentimentAnalyzerDashen()
        logger.info("Sentiment analyzer initialized successfully")
        
        # Process in batches for better memory management
        results = analyzer.analyze_reviews(dashen_data, text_column='review')
        
        # Add analysis metadata
        results['analysis_timestamp'] = datetime.now()
        results['bank'] = 'DASHEN'
        
        # Calculate sentiment confidence
        results['sentiment_confidence'] = np.abs(
            results['vader_compound'] * results['polarity']
        )
        
        logger.info(f"‚úÖ Successfully analyzed {len(results):,} Dashen Bank reviews")
        
        # Basic sentiment statistics
        sentiment_dist = results['vader_sentiment'].value_counts(normalize=True) * 100
        logger.info("Sentiment Distribution:\n" + 
                   "\n".join([f"  - {k}: {v:.1f}%" for k, v in sentiment_dist.items()]))
        
        return results
        
    except Exception as e:
        logger.error(f"‚ùå Error analyzing Dashen Bank data: {str(e)}", exc_info=True)
        return None

# Example usage:
if __name__ == "__main__":
    # Load Dashen data
    dashen_data = load_bank_data('dashen')  # or 'dashn' based on your file naming
    
    if dashen_data is not None:
        # Analyze all Dashen reviews
        dashen_analyzed = analyze_dashen_sentiments(
            dashen_data,
            sample_size=None,  # Set to a number to analyze a sample
            batch_size=100     # Adjust based on your system's memory
        )
        
        if dashen_analyzed is not None:
            print("\nAnalysis completed successfully!")
            print("\nSample of analyzed data:")
            display(dashen_analyzed[[
                'review_text', 
                'rating', 
                'sentiment', 
                'vader_sentiment',
                'sentiment_confidence'
            ]].head())
            
            print(f"\nTotal reviews analyzed: {len(dashen_analyzed):,}")
            print("\nSentiment distribution:")
            print(dashen_analyzed['vader_sentiment'].value_counts(normalize=True).mul(100))
    else:
        print("Failed to load Dashen Bank data")

2025-11-30 19:40:53,534 - INFO - Analyzing all 753 Dashen Bank reviews...


2025-11-30 19:40:53,579 - INFO - Sentiment analyzer initialized successfully
2025-11-30 19:40:53,584 - INFO - Starting analysis of 753 reviews...


üîç Looking for data file at: C:\Users\My Device\Desktop\Week-2\data\processed\cleaned_bank_reviews_dashen.csv
üìÑ First line of file: review,rating,date,bank,source
‚úÖ Successfully loaded CSV. Columns found: ['review', 'rating', 'date', 'bank', 'source']
‚úÖ Converted 'date' column to datetime
‚úÖ Renamed 'review' column to 'review_text'
‚úÖ Successfully processed 753 records


Analyzing reviews:   0%|          | 0/753 [00:00<?, ?it/s]

2025-11-30 19:40:58,228 - INFO - Analysis completed successfully
2025-11-30 19:40:58,305 - INFO - ‚úÖ Successfully analyzed 753 Dashen Bank reviews
2025-11-30 19:40:58,310 - INFO - Sentiment Distribution:
  - positive: 96.5%
  - neutral: 3.1%
  - negative: 0.4%



Analysis completed successfully!

Sample of analyzed data:


Unnamed: 0,review_text,rating,sentiment,vader_sentiment,sentiment_confidence
0,The Dashen Super App is very impressive. It is...,5,positive,positive,0.6116
1,"@Shewangizaw L. As a heavy user of the app , t...",4,negative,positive,0.12074
2,This might be the worst banking app I've ever ...,1,negative,neutral,0.0431
3,What do you care about my phone's settings? Wh...,1,positive,positive,0.3902
4,The Dashen Super App is very impressive. It is...,5,positive,positive,0.6116



Total reviews analyzed: 753

Sentiment distribution:
vader_sentiment
positive    96.547145
neutral      3.054449
negative     0.398406
Name: proportion, dtype: float64


## Visualize Sentiment Analysis

In [3]:
def plot_dashen_sentiment_analysis(dashen_analyzed, save_path='../reports/figures'):
    """
    Generate comprehensive visualizations for Dashen Bank sentiment analysis
    without using wordcloud.
    """
    # Import required libraries
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from pathlib import Path
    import re
    from collections import Counter
    from matplotlib.ticker import PercentFormatter
    import matplotlib as mpl
    
    # Set styles
    plt.style.use('seaborn-v0_8-whitegrid')
    sns.set_palette("viridis")
    mpl.rcParams['figure.facecolor'] = 'white'
    mpl.rcParams['axes.facecolor'] = 'white'
    
    # Color scheme
    COLORS = {
        'primary': '#00A651',    # Dashen Green
        'secondary': '#FFD700',  # Gold
        'accent': '#003366',     # Dark Blue
        'sentiment': {
            'positive': '#2ecc71',  # Green
            'neutral': '#f39c12',   # Orange
            'negative': '#e74c3c'   # Red
        }
    }
    
    # Create output directory
    save_path = Path(save_path)
    save_path.mkdir(parents=True, exist_ok=True)
    
    # Input validation
    if dashen_analyzed is None or dashen_analyzed.empty:
        print("‚ùå No data available for visualization")
        return
    
    # Helper function to save plots
    def save_plot(fig, filename, dpi=300):
        try:
            filepath = save_path / filename
            fig.savefig(filepath, dpi=dpi, bbox_inches='tight', facecolor='white')
            print(f"‚úÖ Saved: {filepath}")
            return filepath
        except Exception as e:
            print(f"‚ö†Ô∏è Error saving {filename}: {str(e)}")
            return None
    
    # 1. Create main figure with subplots
    fig = plt.figure(figsize=(22, 24))
    gs = fig.add_gridspec(4, 2, height_ratios=[1, 1, 1, 1.2])
    fig.suptitle('Dashen Bank Sentiment Analysis', 
                fontsize=24, y=1.02, weight='bold', color=COLORS['primary'])
    
    # 2. Plot Sentiment Distribution (TextBlob)
    ax1 = fig.add_subplot(gs[0, 0])
    if 'sentiment' in dashen_analyzed.columns:
        sentiment_counts = dashen_analyzed['sentiment'].value_counts()
        colors = [COLORS['sentiment'].get(x, '#999999') for x in sentiment_counts.index]
        
        wedges, texts, autotexts = ax1.pie(
            sentiment_counts, 
            labels=sentiment_counts.index,
            autopct='%1.1f%%',
            startangle=90,
            wedgeprops=dict(width=0.6, edgecolor='white'),
            colors=colors,
            textprops={'fontsize': 12}
        )
        ax1.set_title('Overall Sentiment Distribution (TextBlob)', 
                     fontsize=16, pad=20, weight='bold', color=COLORS['primary'])
    else:
        ax1.text(0.5, 0.5, 'Sentiment data not available', 
                ha='center', va='center', fontsize=14)
        ax1.axis('off')
    
    # 3. Plot VADER Sentiment Distribution
    ax2 = fig.add_subplot(gs[0, 1])
    if 'vader_sentiment' in dashen_analyzed.columns:
        vader_counts = dashen_analyzed['vader_sentiment'].value_counts()
        colors = [COLORS['sentiment'].get(x, '#999999') for x in vader_counts.index]
        
        wedges, texts, autotexts = ax2.pie(
            vader_counts, 
            labels=vader_counts.index,
            autopct='%1.1f%%',
            startangle=90,
            wedgeprops=dict(width=0.6, edgecolor='white'),
            colors=colors,
            textprops={'fontsize': 12}
        )
        ax2.set_title('Sentiment Distribution (VADER)', 
                     fontsize=16, pad=20, weight='bold', color=COLORS['primary'])
    else:
        ax2.text(0.5, 0.5, 'VADER sentiment data not available', 
                ha='center', va='center', fontsize=14)
        ax2.axis('off')
    
    # 4. Sentiment by Rating (Stacked Bar)
    ax3 = fig.add_subplot(gs[1, :])
    if all(col in dashen_analyzed.columns for col in ['rating', 'sentiment']):
        try:
            sentiment_by_rating = pd.crosstab(
                dashen_analyzed['rating'], 
                dashen_analyzed['sentiment'],
                normalize='index'
            ).sort_index(ascending=False) * 100
            
            sentiment_by_rating.plot(
                kind='barh',
                stacked=True, 
                ax=ax3,
                color=[COLORS['sentiment'].get(x, '#999999') for x in sentiment_by_rating.columns],
                edgecolor='white',
                linewidth=0.5
            )
            
            ax3.set_title('Sentiment Distribution by Rating', 
                         fontsize=18, pad=15, weight='bold', color=COLORS['primary'])
            ax3.set_xlabel('Percentage of Reviews', fontsize=14, labelpad=10)
            ax3.set_ylabel('Rating (1-5)', fontsize=14, labelpad=10)
            ax3.legend(title='Sentiment', bbox_to_anchor=(1.02, 1), 
                      loc='upper left', fontsize=12)
            ax3.grid(axis='x', linestyle='--', alpha=0.3)
            ax3.xaxis.set_major_formatter(PercentFormatter(100.0))
            
            for container in ax3.containers:
                ax3.bar_label(
                    container, 
                    label_type='center', 
                    fmt='%.1f%%',
                    color='white' if container.datavalues[0] > 30 else 'black',
                    fontsize=10,
                    fontweight='bold',
                    padding=2
                )
        except Exception as e:
            print(f"‚ö†Ô∏è Error in rating distribution: {str(e)}")
            ax3.text(0.5, 0.5, 'Could not generate rating distribution', 
                    ha='center', va='center', fontsize=14)
            ax3.axis('off')
    else:
        ax3.text(0.5, 0.5, 'Rating or sentiment data not available', 
                ha='center', va='center', fontsize=14)
        ax3.axis('off')
    
    # 5. Average Sentiment Scores by Rating
    ax4 = fig.add_subplot(gs[2, :])
    if 'rating' in dashen_analyzed.columns:
        try:
            avg_scores = dashen_analyzed.groupby('rating')[
                ['polarity', 'vader_compound']
            ].mean().sort_index(ascending=False).reset_index()
            
            melted_scores = pd.melt(
                avg_scores, 
                id_vars=['rating'], 
                var_name='metric', 
                value_name='score'
            )
            
            sns.barplot(
                data=melted_scores,
                y='rating',
                x='score',
                hue='metric',
                palette=[COLORS['primary'], COLORS['secondary']],
                ax=ax4,
                edgecolor='white',
                linewidth=0.5,
                order=sorted(dashen_analyzed['rating'].unique(), reverse=True)
            )
            
            ax4.set_title('Average Sentiment Scores by Rating', 
                         fontsize=18, pad=15, weight='bold', color=COLORS['primary'])
            ax4.set_xlabel('Average Sentiment Score', fontsize=14, labelpad=10)
            ax4.set_ylabel('Rating (1-5)', fontsize=14, labelpad=10)
            ax4.axvline(0, color='black', linewidth=0.8, linestyle='--')
            ax4.legend(
                title='Metric', 
                labels=['TextBlob Polarity', 'VADER Compound'],
                bbox_to_anchor=(1.02, 1), 
                loc='upper left', 
                fontsize=12
            )
            ax4.grid(axis='x', linestyle='--', alpha=0.3)
            
            for p in ax4.patches:
                width = p.get_width()
                if not np.isnan(width):
                    ax4.annotate(
                        f"{width:.2f}",
                        (width, p.get_y() + p.get_height() / 2.),
                        ha='left' if width >= 0 else 'right', 
                        va='center',
                        xytext=(10 if width >= 0 else -10, 0), 
                        textcoords='offset points',
                        fontsize=10,
                        fontweight='bold'
                    )
        except Exception as e:
            print(f"‚ö†Ô∏è Error in average scores: {str(e)}")
            ax4.text(0.5, 0.5, 'Could not generate average scores', 
                    ha='center', va='center', fontsize=14)
            ax4.axis('off')
    else:
        ax4.text(0.5, 0.5, 'Rating data not available', 
                ha='center', va='center', fontsize=14)
        ax4.axis('off')
    
    # 6. Top Words Analysis (Alternative to Word Cloud)
    ax5 = fig.add_subplot(gs[3, :])
    if 'review_text' in dashen_analyzed.columns:
        try:
            # Combine all reviews and clean the text
            text = ' '.join(str(review) for review in dashen_analyzed['review_text'] if pd.notna(review))
            words = re.findall(r'\b\w+\b', text.lower())
            
            # Remove common stopwords (customize this list as needed)
            stopwords = {
                'the', 'and', 'to', 'of', 'in', 'is', 'it', 'that', 'for', 'with', 
                'was', 'as', 'on', 'at', 'by', 'this', 'are', 'be', 'from', 'or',
                'a', 'an', 'my', 'i', 'me', 'you', 'your', 'we', 'our', 'they', 'their',
                'dashen', 'bank', 'app', 'service', 'banking', 'use', 'one', 'have', 'has',
                'had', 'not', 'but', 'so', 'if', 'very', 'just', 'all', 'am', 'pm', 'get'
            }
            
            # Filter words
            words = [word for word in words if word not in stopwords and len(word) > 2]
            
            # Count word frequencies
            word_counts = Counter(words).most_common(15)  # Top 15 words
            
            if word_counts:
                words, counts = zip(*word_counts)
                y_pos = np.arange(len(words))
                
                ax5.barh(y_pos, counts, color=COLORS['primary'], height=0.8)
                ax5.set_yticks(y_pos)
                ax5.set_yticklabels(words, fontsize=12)
                ax5.invert_yaxis()  # Most frequent on top
                ax5.set_title('Most Frequent Words in Reviews', 
                             fontsize=18, pad=15, weight='bold', 
                             color=COLORS['primary'])
                ax5.set_xlabel('Frequency', fontsize=14)
                ax5.grid(axis='x', linestyle='--', alpha=0.3)
                
                # Add count labels
                for i, v in enumerate(counts):
                    ax5.text(v + 0.5, i, str(v), color='black', va='center')
            else:
                ax5.text(0.5, 0.5, 'No words to display', 
                        ha='center', va='center', fontsize=14)
                ax5.axis('off')
                
        except Exception as e:
            print(f"‚ö†Ô∏è Error in word frequency analysis: {str(e)}")
            ax5.text(0.5, 0.5, 'Could not generate word frequency analysis', 
                    ha='center', va='center', fontsize=14)
            ax5.axis('off')
    else:
        ax5.text(0.5, 0.5, 'Review text not available', 
                ha='center', va='center', fontsize=14)
        ax5.axis('off')
    
    # 7. Save and show main figure
    plt.tight_layout()
    save_plot(fig, 'dashen_sentiment_analysis.png')
    plt.show()
    
    # 8. Additional: Time Series Plot
    if 'date' in dashen_analyzed.columns:
        try:
            plt.figure(figsize=(18, 8))
            
            dashen_analyzed['date'] = pd.to_datetime(dashen_analyzed['date'])
            time_series = dashen_analyzed.set_index('date')
            
            for col, label, marker in [
                ('polarity', 'TextBlob Polarity', 'o'),
                ('vader_compound', 'VADER Compound', 's')
            ]:
                if col in time_series.columns:
                    monthly_avg = time_series[col].resample('M').mean()
                    plt.plot(monthly_avg.index, monthly_avg, 
                            label=label,
                            color=COLORS['primary'] if col == 'polarity' else COLORS['secondary'],
                            linewidth=2,
                            marker=marker,
                            markersize=8)
            
            plt.title('Monthly Average Sentiment Scores Over Time', 
                     fontsize=20, pad=20, weight='bold', color=COLORS['primary'])
            plt.xlabel('Date', fontsize=14, labelpad=10)
            plt.ylabel('Average Sentiment Score', fontsize=14, labelpad=10)
            plt.axhline(0, color='black', linewidth=0.8, linestyle='--')
            plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.3)
            plt.xticks(rotation=45, fontsize=12)
            plt.yticks(fontsize=12)
            plt.tight_layout()
            
            save_plot(plt.gcf(), 'dashen_sentiment_over_time.png', dpi=350)
            plt.show()
            
        except Exception as e:
            print(f"‚ö†Ô∏è Error in time series plot: {str(e)}")
    
    print("‚úÖ All visualizations completed successfully!")

# Example usage:
# plot_dashen_sentiment_analysis(your_dataframe)

In [4]:
def plot_dashen_sentiment_analysis(dashen_analyzed, save_path='../reports/figures'):
    """
    Generate comprehensive visualizations for Dashen Bank sentiment analysis.
    
    Args:
        dashen_analyzed (pd.DataFrame): Analyzed Dashen Bank reviews DataFrame
        save_path (str): Directory to save the visualizations
    """
    import os
    import matplotlib.pyplot as plt
    import seaborn as sns
    from pathlib import Path
    import pandas as pd
    
    # Set style and colors
    plt.style.use('seaborn-v0_8')
    sns.set_palette("viridis")
    primary_color = '#00A651'    # Dashen Green
    secondary_color = '#FFD700'  # Gold accent
    
    # Create output directory if it doesn't exist
    Path(save_path).mkdir(parents=True, exist_ok=True)
    
    if dashen_analyzed is None or dashen_analyzed.empty:
        print("‚ùå No Dashen Bank data available for visualization")
        return
    
    # Create figure with subplots
    fig = plt.figure(figsize=(20, 18))
    gs = fig.add_gridspec(3, 2)
    fig.suptitle('Dashen Bank Sentiment Analysis', 
                fontsize=20, y=1.02, weight='bold', color=primary_color)
    
    # 1. Overall Sentiment Distribution (TextBlob)
    ax1 = fig.add_subplot(gs[0, 0])
    sentiment_counts = dashen_analyzed['sentiment'].value_counts()
    wedges, texts, autotexts = ax1.pie(
        sentiment_counts, 
        labels=sentiment_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        wedgeprops=dict(width=0.6, edgecolor='white'),
        colors=['#2ecc71', '#f39c12', '#e74c3c']  # Green, Orange, Red
    )
    ax1.set_title('Overall Sentiment Distribution (TextBlob)', 
                 fontsize=14, pad=20, weight='bold')
    
    # 2. VADER Sentiment Distribution
    ax2 = fig.add_subplot(gs[0, 1])
    vader_counts = dashen_analyzed['vader_sentiment'].value_counts()
    wedges, texts, autotexts = ax2.pie(
        vader_counts, 
        labels=vader_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        wedgeprops=dict(width=0.6, edgecolor='white'),
        colors=['#2ecc71', '#f39c12', '#e74c3c']  # Green, Orange, Red
    )
    ax2.set_title('Sentiment Distribution (VADER)', 
                 fontsize=14, pad=20, weight='bold')
    
    # 3. Sentiment by Rating (Stacked Bar)
    ax3 = fig.add_subplot(gs[1, :])
    if 'rating' in dashen_analyzed.columns:
        sentiment_by_rating = pd.crosstab(
            dashen_analyzed['rating'], 
            dashen_analyzed['sentiment'],
            normalize='index'
        ) * 100
        
        sentiment_by_rating.plot(
            kind='bar', 
            stacked=True, 
            ax=ax3,
            color=['#e74c3c', '#f39c12', '#2ecc71'],  # Red, Yellow, Green
            edgecolor='white',
            linewidth=0.5
        )
        
        # Customize the plot
        ax3.set_title('Sentiment Distribution by Rating', 
                     fontsize=16, pad=15, weight='bold')
        ax3.set_xlabel('Rating (1-5)', fontsize=12, labelpad=10)
        ax3.set_ylabel('Percentage of Reviews', fontsize=12, labelpad=10)
        ax3.legend(title='Sentiment', bbox_to_anchor=(1.02, 1), 
                  loc='upper left', borderaxespad=0.)
        ax3.grid(axis='y', linestyle='--', alpha=0.3)
        
        # Add percentage labels on each segment
        for container in ax3.containers:
            ax3.bar_label(container, 
                         label_type='center', 
                         fmt='%.1f%%',
                         color='black',
                         fontsize=9,
                         padding=2)
    
    # 4. Average Sentiment Scores by Rating
    ax4 = fig.add_subplot(gs[2, :])
    if 'rating' in dashen_analyzed.columns:
        avg_scores = dashen_analyzed.groupby('rating')[
            ['polarity', 'vader_compound']
        ].mean().reset_index()
        
        # Melt for easier plotting
        melted_scores = pd.melt(
            avg_scores, 
            id_vars=['rating'], 
            var_name='metric', 
            value_name='score'
        )
        
        # Create the bar plot
        sns.barplot(
            data=melted_scores,
            x='rating',
            y='score',
            hue='metric',
            palette=[primary_color, secondary_color],
            ax=ax4,
            edgecolor='white',
            linewidth=0.5
        )
        
        # Customize the plot
        ax4.set_title('Average Sentiment Scores by Rating', 
                     fontsize=16, pad=15, weight='bold')
        ax4.set_xlabel('Rating (1-5)', fontsize=12, labelpad=10)
        ax4.set_ylabel('Average Sentiment Score', fontsize=12, labelpad=10)
        ax4.axhline(0, color='black', linewidth=0.8, linestyle='--')
        ax4.legend(title='Metric', 
                  labels=['TextBlob Polarity', 'VADER Compound'],
                  bbox_to_anchor=(1.02, 1), 
                  loc='upper left', 
                  borderaxespad=0.)
        ax4.grid(axis='y', linestyle='--', alpha=0.3)
        
        # Add value labels on top of bars
        for p in ax4.patches:
            ax4.annotate(
                f"{p.get_height():.2f}",
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', 
                va='center', 
                xytext=(0, 10), 
                textcoords='offset points',
                fontsize=9
            )
    
    # Adjust layout and save
    plt.tight_layout()
    
    # Save the figure
    save_file = os.path.join(save_path, 'dashen_sentiment_analysis.png')
    plt.savefig(save_file, dpi=300, bbox_inches='tight')
    print(f"‚úÖ Visualizations saved to {save_file}")
    
    plt.show()
    
    # Additional Analysis: Sentiment Over Time (if date column exists)
    if 'date' in dashen_analyzed.columns:
        plt.figure(figsize=(15, 6))
        
        try:
            # Convert to datetime and set as index
            dashen_analyzed['date'] = pd.to_datetime(dashen_analyzed['date'])
            time_series = dashen_analyzed.set_index('date')
            
            # Resample by month and calculate mean
            monthly_avg = time_series[['polarity', 'vader_compound']].resample('M').mean()
            
            # Plot the time series
            plt.plot(monthly_avg.index, monthly_avg['polarity'], 
                    label='TextBlob Polarity', color=primary_color, linewidth=2.5)
            plt.plot(monthly_avg.index, monthly_avg['vader_compound'], 
                    label='VADER Compound', color=secondary_color, linewidth=2.5)
            
            # Customize the plot
            plt.title('Monthly Average Sentiment Scores Over Time', 
                     fontsize=16, pad=15, weight='bold')
            plt.xlabel('Date', fontsize=12, labelpad=10)
            plt.ylabel('Average Sentiment Score', fontsize=12, labelpad=10)
            plt.axhline(0, color='black', linewidth=0.8, linestyle='--')
            plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0.)
            plt.grid(True, linestyle='--', alpha=0.3)
            plt.xticks(rotation=45)
            plt.tight_layout()
            
            # Save the time series plot
            time_series_file = os.path.join(save_path, 'dashen_sentiment_over_time.png')
            plt.savefig(time_series_file, dpi=300, bbox_inches='tight')
            print(f"‚úÖ Time series visualization saved to {time_series_file}")
            
            plt.show()
        except Exception as e:
            print(f"‚ö†Ô∏è Could not generate time series plot: {str(e)}")

# Example usage:
if __name__ == "__main__":
    # Assuming you have your analyzed Dashen data
    # dashen_analyzed = analyze_dashen_sentiments(dashen_data)
    
    if 'dashen_analyzed' in locals() and dashen_analyzed is not None:
        plot_dashen_sentiment_analysis(dashen_analyzed)
    else:
        print("No Dashen Bank data available for visualization")

No Dashen Bank data available for visualization


## Analysis of BOA Sentiment by Rating

In [5]:
def plot_dashen_sentiment_by_rating(dashen_analyzed, save_path='../reports/figures'):
    """
    Generate detailed sentiment analysis visualizations by rating for Dashen Bank.
    
    Args:
        dashen_analyzed (pd.DataFrame): Analyzed Dashen Bank reviews DataFrame
        save_path (str): Directory to save the visualizations
    """
    import os
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    import numpy as np
    from matplotlib.ticker import PercentFormatter
    from pathlib import Path
    from textwrap import fill
    
    # Set style and colors
    plt.style.use('seaborn-v0_8')
    sns.set_palette("viridis")
    
    # Dashen Bank color scheme
    COLORS = {
        'primary': '#00A651',    # Dashen Green
        'secondary': '#FFD700',  # Gold
        'accent': '#003366',     # Dark Blue
        'success': '#2ecc71',    # Green
        'warning': '#f39c12',    # Orange
        'danger': '#e74c3c'      # Red
    }
    
    # Create output directory if it doesn't exist
    save_path = Path(save_path)
    save_path.mkdir(parents=True, exist_ok=True)
    
    if dashen_analyzed is None or dashen_analyzed.empty:
        print("‚ùå No Dashen Bank data available for visualization")
        return
    
    # Create figure with subplots
    fig = plt.figure(figsize=(16, 20))
    gs = fig.add_gridspec(3, 1, height_ratios=[1.5, 1, 1])
    fig.suptitle('Dashen Bank - Sentiment Analysis by Rating', 
                fontsize=22, y=1.02, weight='bold', 
                color=COLORS['primary'])
    
    # 1. Sentiment Distribution by Rating (Stacked Bar Chart)
    ax1 = fig.add_subplot(gs[0])
    if 'rating' in dashen_analyzed.columns:
        # Create cross-tabulation
        sentiment_by_rating = pd.crosstab(
            dashen_analyzed['rating'], 
            dashen_analyzed['sentiment'],
            normalize='index'
        ).sort_index(ascending=False) * 100
        
        # Define colors for each sentiment
        sentiment_colors = {
            'positive': COLORS['success'],
            'neutral': COLORS['warning'],
            'negative': COLORS['danger']
        }
        
        # Reorder columns to ensure consistent color mapping
        sentiment_order = ['negative', 'neutral', 'positive']
        sentiment_by_rating = sentiment_by_rating[sentiment_order]
        
        # Plot stacked bars
        sentiment_by_rating.plot(
            kind='barh', 
            stacked=True, 
            ax=ax1,
            color=[sentiment_colors[s] for s in sentiment_order],
            edgecolor='white',
            linewidth=0.8,
            width=0.85
        )
        
        # Customize the plot
        ax1.set_title('Sentiment Distribution by Rating', 
                     fontsize=16, pad=20, weight='bold', 
                     color=COLORS['primary'])
        ax1.set_xlabel('Percentage of Reviews', fontsize=12, labelpad=10)
        ax1.set_ylabel('Rating (1-5)', fontsize=12, labelpad=10)
        ax1.legend(title='Sentiment', bbox_to_anchor=(1.02, 1), 
                  loc='upper left', borderaxespad=0.)
        ax1.grid(axis='x', linestyle='--', alpha=0.3)
        ax1.xaxis.set_major_formatter(PercentFormatter(100))
        
        # Add percentage labels on each segment
        for container in ax1.containers:
            ax1.bar_label(
                container, 
                label_type='center', 
                fmt='%.1f%%',
                color='white' if container.datavalues[0] > 30 else 'black',
                fontsize=9,
                fontweight='bold',
                padding=2
            )
    
    # 2. Average Sentiment Scores by Rating (Line Plot)
    ax2 = fig.add_subplot(gs[1])
    if 'rating' in dashen_analyzed.columns:
        # Calculate average scores
        avg_scores = dashen_analyzed.groupby('rating').agg({
            'polarity': ['mean', 'sem'],
            'vader_compound': ['mean', 'sem']
        }).reset_index()
        
        # Flatten column names
        avg_scores.columns = ['_'.join(col).strip('_') for col in avg_scores.columns.values]
        
        # Create line plot with error bands
        for metric, color, label in [
            ('polarity', COLORS['primary'], 'TextBlob Polarity'),
            ('vader_compound', COLORS['secondary'], 'VADER Compound')
        ]:
            # Plot the main line
            ax2.plot(
                avg_scores['rating'], 
                avg_scores[f'{metric}_mean'],
                color=color,
                marker='o' if metric == 'polarity' else 's',
                markersize=8,
                linewidth=2.5,
                label=label
            )
            
            # Add error bands (standard error of the mean)
            ax2.fill_between(
                avg_scores['rating'],
                avg_scores[f'{metric}_mean'] - avg_scores[f'{metric}_sem'],
                avg_scores[f'{metric}_mean'] + avg_scores[f'{metric}_sem'],
                color=color,
                alpha=0.2
            )
        
        # Customize the plot
        ax2.set_title('Average Sentiment Scores by Rating', 
                     fontsize=16, pad=20, weight='bold', 
                     color=COLORS['primary'])
        ax2.set_xlabel('Rating (1-5)', fontsize=12, labelpad=10)
        ax2.set_ylabel('Average Sentiment Score', fontsize=12, labelpad=10)
        ax2.axhline(0, color='black', linewidth=0.8, linestyle='--')
        ax2.legend(
            bbox_to_anchor=(1.02, 1), 
            loc='upper left', 
            borderaxespad=0.,
            fontsize=11
        )
        ax2.grid(True, linestyle='--', alpha=0.3)
        
        # Add value labels
        for _, row in avg_scores.iterrows():
            for metric, color, offset in [
                ('polarity', COLORS['primary'], 0.03),
                ('vader_compound', COLORS['secondary'], -0.03)
            ]:
                ax2.text(
                    row['rating'], 
                    row[f'{metric}_mean'] + offset, 
                    f"{row[f'{metric}_mean']:.2f}",
                    ha='center',
                    va='bottom' if metric == 'polarity' else 'top',
                    fontsize=10,
                    fontweight='bold',
                    color=color,
                    bbox=dict(
                        facecolor='white',
                        edgecolor=color,
                        alpha=0.8,
                        boxstyle='round,pad=0.2'
                    )
                )
    
    # 3. Sentiment Heatmap by Rating
    ax3 = fig.add_subplot(gs[2])
    if all(col in dashen_analyzed.columns for col in ['rating', 'sentiment']):
        # Create cross-tabulation for heatmap
        heatmap_data = pd.crosstab(
            dashen_analyzed['rating'], 
            dashen_analyzed['sentiment'],
            normalize='index'
        ).sort_index(ascending=False)
        
        # Create heatmap
        sns.heatmap(
            heatmap_data,
            annot=True,
            fmt='.1%',
            cmap='YlGnBu',
            cbar_kws={'label': 'Proportion'},
            ax=ax3
        )
        
        # Customize the plot
        ax3.set_title('Sentiment Heatmap by Rating', 
                     fontsize=16, pad=20, weight='bold', 
                     color=COLORS['primary'])
        ax3.set_xlabel('Sentiment', fontsize=12, labelpad=10)
        ax3.set_ylabel('Rating (1-5)', fontsize=12, labelpad=10)
        ax3.tick_params(axis='both', which='major', labelsize=10)
    
    # Adjust layout
    plt.tight_layout()
    
    # Save the figure
    save_file = save_path / 'dashen_sentiment_by_rating.png'
    plt.savefig(save_file, dpi=300, bbox_inches='tight', facecolor='white')
    print(f"‚úÖ Visualizations saved to {save_file}")
    
    plt.show()
    
    # 4. Create a detailed table of sentiment distribution
    if 'rating' in dashen_analyzed.columns:
        # Calculate statistics
        stats = dashen_analyzed.groupby('rating').agg({
            'sentiment': [
                ('Total', 'count'),
                ('Positive %', lambda x: (x == 'positive').mean() * 100),
                ('Neutral %', lambda x: (x == 'neutral').mean() * 100),
                ('Negative %', lambda x: (x == 'negative').mean() * 100)
            ],
            'polarity': [('Avg. Polarity', 'mean')],
            'vader_compound': [('Avg. VADER', 'mean')]
        }).round(2)
        
        # Flatten multi-index columns
        stats.columns = [' '.join(col).strip() for col in stats.columns.values]
        
        # Display the table with styling
        def color_negative_red(val):
            """
            Takes a scalar and returns a string with
            the css property `'color: red'` for negative
            strings, black otherwise.
            """
            if isinstance(val, (int, float)):
                if val < 0:
                    return 'color: red'
                elif val > 0:
                    return 'color: green'
            return ''

        # Apply styling
        styled_stats = stats.style\
            .background_gradient(subset=['Avg. Polarity', 'Avg. VADER'], 
                               cmap='RdYlGn', 
                               vmin=-1, vmax=1)\
            .format('{:.1f}%', subset=['Positive %', 'Neutral %', 'Negative %'])\
            .format('{:.2f}', subset=['Avg. Polarity', 'Avg. VADER'])
        
        print("\nDetailed Sentiment Analysis by Rating:")
        display(styled_stats)

# Example usage:
if __name__ == "__main__":
    # Assuming you have your analyzed Dashen data
    # dashen_analyzed = analyze_dashen_sentiments(dashen_data)
    
    if 'dashen_analyzed' in locals() and dashen_analyzed is not None:
        plot_dashen_sentiment_by_rating(dashen_analyzed)
    else:
        print("No Dashen Bank data available for visualization")

No Dashen Bank data available for visualization


## Save the Results

In [None]:
def save_boa_analysis(boa_analyzed, output_dir='../reports/boa'):
    """
    Save BOA analysis results to a CSV file.
    
    Args:
        boa_analyzed (pd.DataFrame): Analyzed BOA reviews DataFrame
        output_dir (str): Directory to save the output file
    """
    from pathlib import Path
    from datetime import datetime
    
    if boa_analyzed is None or boa_analyzed.empty:
        print("‚ùå No BOA data to save")
        return None
    
    try:
        # Create output directory if it doesn't exist
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        
        # Create filename with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"sentiment_analysis_boa_{timestamp}.csv"
        filepath = output_path / filename
        
        # Save to CSV
        boa_analyzed.to_csv(filepath, index=False)
        print(f"‚úÖ Saved {len(boa_analyzed)} records to {filepath}")
        return str(filepath)
        
    except Exception as e:
        print(f"‚ùå Error saving BOA analysis: {str(e)}")
        return None

# Example usage:
if __name__ == "__main__":
    # Assuming you have your analyzed BOA data
    # boa_analyzed = analyze_boa_sentiments(boa_data)
    
    if 'boa_analyzed' in locals() and boa_analyzed is not None:
        saved_file = save_boa_analysis(boa_analyzed)
        if saved_file:
            print(f"Analysis saved to: {saved_file}")
    else:
        print("No BOA data available to save")

## General Summary Report

In [None]:
def generate_boa_summary(boa_analyzed, output_file='../reports/boa_sentiment_summary.md'):
    """Generate a markdown summary report for BOA sentiment analysis"""
    from pathlib import Path
    from datetime import datetime
    
    if boa_analyzed is None or boa_analyzed.empty:
        print("‚ùå No BOA data available for report generation")
        return

    # Setup output directory
    report_path = Path(output_file)
    report_path.parent.mkdir(parents=True, exist_ok=True)
    
    # Calculate statistics
    total = len(boa_analyzed)
    tb_sent = boa_analyzed['sentiment'].value_counts()
    vd_sent = boa_analyzed['vader_sentiment'].value_counts()
    rating_dist = boa_analyzed['rating'].value_counts().sort_index() if 'rating' in boa_analyzed else {}
    date_range = f"{boa_analyzed['date'].min()} to {boa_analyzed['date'].max()}" if 'date' in boa_analyzed else "N/A"

    # Generate report
    report = f"""# Bank of America Sentiment Analysis Report

## Overview
This report summarizes the sentiment analysis of {total:,} BOA reviews.

## Key Statistics
- **Total Reviews**: {total:,}
- **Date Range**: {date_range}

## Sentiment Analysis

### TextBlob Analysis
{f"- Positive: {tb_sent.get('positive', 0):,} ({(tb_sent.get('positive', 0)/total):.1%})"}
{f"- Neutral: {tb_sent.get('neutral', 0):,} ({(tb_sent.get('neutral', 0)/total):.1%})"}
{f"- Negative: {tb_sent.get('negative', 0):,} ({(tb_sent.get('negative', 0)/total):.1%})"}

### VADER Analysis
{f"- Positive: {vd_sent.get('positive', 0):,} ({(vd_sent.get('positive', 0)/total):.1%})"}
{f"- Neutral: {vd_sent.get('neutral', 0):,} ({(vd_sent.get('neutral', 0)/total):.1%})"}
{f"- Negative: {vd_sent.get('negative', 0):,} ({(vd_sent.get('negative', 0)/total):.1%})"}

## Rating Distribution
{chr(10).join(f"- {star}‚òÖ: {count:,} ({(count/total):.1%})" for star, count in rating_dist.items())}

## Visualizations
![Sentiment by Rating](../reports/figures/boa_sentiment_by_rating.png)
![Average Scores](../reports/figures/boa_avg_scores_by_rating.png)

---

*Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""

    # Save the report
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    
    print(f"‚úÖ Report generated: {report_path}")

# Example usage:
if __name__ == "__main__":
    if 'boa_analyzed' in locals() and boa_analyzed is not None:
        generate_boa_summary(boa_analyzed)
    else:
        print("No BOA data available for report generation")