In [83]:
## Setup and Imports

import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os

In [3]:
# Create results directory if it doesn't exist
os.makedirs('results', exist_ok=True)

print("Sentiment Analysis Setup Complete!")


Sentiment Analysis Setup Complete!


In [89]:
# Load the preprocessed data
try:
    df = pd.read_csv('cleaned_reviews.csv')
    print(f"Loaded {len(df)} reviews from cleaned data")
except FileNotFoundError:
    print("Cleaned reviews file not found. Loading raw data...")
    df = pd.read_csv('data/raw_reviews.csv')  # Adjust path as needed
    
    # Basic cleaning
    df = df.dropna(subset=['review_text'])
    df['review_text'] = df['review_text'].astype(str)
    if 'rating' in df.columns:
        df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
        df = df.dropna(subset=['rating'])

print("Data shape:", df.shape)
print("Required columns present:", ['review_text' in df.columns])

Loaded 1200 reviews from cleaned data
Data shape: (1200, 6)
Required columns present: [False]


In [86]:
print(df.columns)

Index(['review_id', 'review', 'rating', 'date', 'bank', 'source'], dtype='object')


In [90]:
df = pd.read_csv('cleaned_reviews.csv')
df = df.rename(columns={'review': 'review_text'})

In [91]:
# Initialize VADER analyzer
analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    """Get VADER sentiment scores for a given text"""
    if pd.isna(text) or text == "":
        return {'pos': 0, 'neg': 0, 'neu': 1, 'compound': 0}
    
    scores = analyzer.polarity_scores(str(text))
    return scores

def classify_sentiment(compound_score):
    """Classify sentiment based on VADER compound score"""
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

print("Analyzing sentiment for all reviews...")

Analyzing sentiment for all reviews...


In [92]:
# Apply sentiment analysis
sentiment_results = df['review_text'].apply(get_vader_sentiment)

# Extract individual scores
df['sentiment_positive'] = sentiment_results.apply(lambda x: x['pos'])
df['sentiment_negative'] = sentiment_results.apply(lambda x: x['neg']) 
df['sentiment_neutral'] = sentiment_results.apply(lambda x: x['neu'])
df['sentiment_compound'] = sentiment_results.apply(lambda x: x['compound'])

# Classify overall sentiment
df['sentiment_label'] = df['sentiment_compound'].apply(classify_sentiment)

print("Sentiment analysis complete!")
print(f"Processed {len(df)} reviews")
print(f"Sentiment distribution:\n{df['sentiment_label'].value_counts()}")


Sentiment analysis complete!
Processed 1200 reviews
Sentiment distribution:
sentiment_label
positive    661
negative    332
neutral     207
Name: count, dtype: int64


In [95]:
# Aggregate sentiment by bank
if 'bank' in df.columns:
    print("\nSentiment aggregation by bank:")
    bank_sentiment = df.groupby('bank').agg({
        'sentiment_compound': ['mean', 'count'],
        'sentiment_positive': 'mean',
        'sentiment_negative': 'mean',
        'sentiment_neutral': 'mean'
    }).round(3)
    
    bank_sentiment.columns = ['avg_compound', 'review_count', 'avg_positive', 'avg_negative', 'avg_neutral']
    print(bank_sentiment)
    
    # Sentiment distribution by bank
    bank_sentiment_dist = pd.crosstab(df['bank'], df['sentiment_label'])
    print("\nSentiment count by bank:")
    print(bank_sentiment_dist)
else:
    print("Bank column not found, skipping bank aggregation")


Sentiment aggregation by bank:
                             avg_compound  review_count  avg_positive  \
bank                                                                    
Bank of Abyssinia                  -0.032           400         0.109   
Commercial Bank of Ethiopia         0.163           400         0.134   
Dashen Bank                         0.456           400         0.351   

                             avg_negative  avg_neutral  
bank                                                    
Bank of Abyssinia                   0.108        0.783  
Commercial Bank of Ethiopia         0.068        0.798  
Dashen Bank                         0.024        0.625  

Sentiment count by bank:
sentiment_label              negative  neutral  positive
bank                                                    
Bank of Abyssinia                 175       78       147
Commercial Bank of Ethiopia       132       47       221
Dashen Bank                        25       82       293


In [98]:
# Aggregate sentiment by rating
if 'rating' in df.columns:
    print("\nSentiment aggregation by rating:")
    rating_sentiment = df.groupby('rating').agg({
        'sentiment_compound': ['mean', 'count'],
        'sentiment_positive': 'mean', 
        'sentiment_negative': 'mean',
        'sentiment_neutral': 'mean'
    }).round(3)
    
    rating_sentiment.columns = ['avg_compound', 'review_count', 'avg_positive', 'avg_negative', 'avg_neutral']
    print(rating_sentiment)
       # Special focus on 1-star reviews as mentioned in task
    one_star_reviews = df[df['rating'] == 1]
    if len(one_star_reviews) > 0:
        print(f"\n1-star reviews sentiment summary:")
        print(f"Count: {len(one_star_reviews)}")
        print(f"Mean sentiment: {one_star_reviews['sentiment_compound'].mean():.3f}")
        print(f"Sentiment distribution: {one_star_reviews['sentiment_label'].value_counts().to_dict()}")
else:
    print("Rating column not found, skipping rating aggregation")


Sentiment aggregation by rating:
        avg_compound  review_count  avg_positive  avg_negative  avg_neutral
rating                                                                     
1             -0.196           416         0.063         0.128        0.809
2             -0.012            87         0.089         0.088        0.824
3              0.191           111         0.115         0.063        0.822
4              0.400           114         0.192         0.041        0.767
5              0.531           472         0.359         0.015        0.626

1-star reviews sentiment summary:
Count: 416
Mean sentiment: -0.196
Sentiment distribution: {'negative': 232, 'positive': 104, 'neutral': 80}


In [99]:
## Save Sentiment Results
# Add review_id if not present
if 'review_id' not in df.columns:
    df['review_id'] = range(1, len(df) + 1)

# Prepare sentiment output
sentiment_output = df[[
    'review_id', 
    'review_text', 
    'sentiment_label', 
    'sentiment_compound',
    'sentiment_positive',
    'sentiment_negative', 
    'sentiment_neutral'
]].copy()

In [101]:
# Add bank and rating if available
if 'bank_name' in df.columns:
    sentiment_output['bank_name'] = df['bank_name']
if 'rating' in df.columns:
    sentiment_output['rating'] = df['rating']

# Save sentiment results
import os
os.makedirs('results', exist_ok=True)

# Now save the file
sentiment_output.to_csv('results/sentiment_analysis_results.csv', index=False)
print(f"\nSentiment results saved to 'results/sentiment_analysis_results.csv'")
print(f"Output shape: {sentiment_output.shape}")


Sentiment results saved to 'results/sentiment_analysis_results.csv'
Output shape: (1200, 8)


In [102]:
# Calculate coverage
total_reviews = len(df)
successful_sentiment = len(df[df['sentiment_label'].notna()])
coverage_percentage = (successful_sentiment / total_reviews) * 100

print(f"\nSentiment Analysis Coverage: {successful_sentiment}/{total_reviews} ({coverage_percentage:.1f}%)")
print("✅ KPI Met: Sentiment scores for 90%+ reviews" if coverage_percentage >= 90 else "❌ KPI Not Met: Coverage below 90%")


Sentiment Analysis Coverage: 1200/1200 (100.0%)
✅ KPI Met: Sentiment scores for 90%+ reviews


In [103]:
print("\n" + "="*50)
print("SENTIMENT ANALYSIS SUMMARY")
print("="*50)

print(f"Total reviews processed: {len(df):,}")
print(f"Sentiment analysis coverage: {coverage_percentage:.1f}%")

print(f"\nOverall sentiment distribution:")
for sentiment, count in df['sentiment_label'].value_counts().items():
    percentage = (count / len(df)) * 100
    print(f"  {sentiment.title()}: {count:,} ({percentage:.1f}%)")

print(f"\nAverage sentiment scores:")
print(f"  Compound: {df['sentiment_compound'].mean():.3f}")
print(f"  Positive: {df['sentiment_positive'].mean():.3f}")
print(f"  Negative: {df['sentiment_negative'].mean():.3f}")
print(f"  Neutral: {df['sentiment_neutral'].mean():.3f}")

if 'rating' in df.columns:
    print(f"\nRating-Sentiment correlation: {df['rating'].corr(df['sentiment_compound']):.3f}")

print("\n✅ Sentiment Analysis Complete!")


SENTIMENT ANALYSIS SUMMARY
Total reviews processed: 1,200
Sentiment analysis coverage: 100.0%

Overall sentiment distribution:
  Positive: 661 (55.1%)
  Negative: 332 (27.7%)
  Neutral: 207 (17.2%)

Average sentiment scores:
  Compound: 0.196
  Positive: 0.198
  Negative: 0.067
  Neutral: 0.735

Rating-Sentiment correlation: 0.592

✅ Sentiment Analysis Complete!
