# Exploratory Data Analysis
## Ethiopian Bank Reviews Analysis

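This notebook explores the processed bank reviews dataset (`reviews_with_themes.csv`). It covers a data overview, the sentiment and theme distributions, the rating distribution by bank, a comparison of sentiment labels against star ratings, and a summary of key insights.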


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the processed reviews CSV into `df`, the frame used by the cells below
reviews_csv = '../data/processed/reviews_with_themes.csv'
df = pd.read_csv(reviews_csv)

# Report the row count and the available columns
n_reviews = df.shape[0]
column_names = df.columns.tolist()
print(f"Total reviews: {n_reviews}")
print(f"\nColumns: {column_names}")


Total reviews: 1200

Columns: ['review_text', 'rating', 'date', 'bank', 'source', 'review_id', 'sentiment_label', 'sentiment_score', 'theme']


## Data Overview


In [None]:
# Structural summary of the frame: dimensions, null count per column, dtypes
missing_per_column = df.isna().sum()
column_dtypes = df.dtypes

print(f"Dataset Shape: {df.shape}")
print("\nMissing Values:")
print(missing_per_column)
print("\nData Types:")
print(column_dtypes)


In [None]:
# Preview the top five rows (shown as the cell's rich output)
df.head(n=5)


## Sentiment Distribution


In [None]:
# Count reviews per sentiment label and draw them as a colour-coded bar chart
colors = {'POSITIVE': 'green', 'NEGATIVE': 'red', 'NEUTRAL': 'gray'}
sentiment_counts = df['sentiment_label'].value_counts()
# Labels outside the colour map fall back to blue
bar_colors = [colors.get(label, 'blue') for label in sentiment_counts.index]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(sentiment_counts.index, sentiment_counts.values, color=bar_colors)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

# Share of each label relative to the total number of rows in df
print("Sentiment percentages:")
sentiment_pct = sentiment_counts / len(df) * 100
for label, pct in sentiment_pct.items():
    print(f"{label}: {pct:.1f}%")


## Theme Distribution


In [None]:
# Horizontal bar chart of review counts per theme
theme_counts = df['theme'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(theme_counts.index, theme_counts.values, color='teal')
ax.set_title('Theme Distribution')
ax.set_xlabel('Count')
ax.set_ylabel('Theme')
# value_counts() is sorted descending; flipping the y-axis puts the most
# frequent theme at the top of the chart
ax.invert_yaxis()
fig.tight_layout()
plt.show()


## Rating Distribution by Bank


In [None]:
# Stacked bars: one bar per bank, segmented by rating value
bank_rating = pd.crosstab(df['bank'], df['rating'])

ax = bank_rating.plot.bar(stacked=True, figsize=(12, 6))
ax.set_title('Rating Distribution by Bank')
ax.set_xlabel('Bank')
ax.set_ylabel('Count')
# Anchor the legend just outside the right edge of the axes
ax.legend(title='Rating', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.figure.tight_layout()
plt.show()


## Sentiment vs Rating Analysis


In [None]:
# Grouped bars: for each sentiment label, how many reviews carry each rating
sentiment_rating = pd.crosstab(df['sentiment_label'], df['rating'])

fig, ax = plt.subplots(figsize=(12, 6))
sentiment_rating.plot(kind='bar', ax=ax)
ax.set_title('Sentiment vs Rating')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
# Anchor the legend just outside the right edge of the axes
ax.legend(title='Rating', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


## Key Insights


In [None]:
# Summary statistics
# Prints headline figures for the whole dataset, then a per-bank breakdown
# (mean rating and share of reviews labelled POSITIVE).
print("=" * 70)
print("KEY INSIGHTS")
print("=" * 70)

# Boolean mask built once and reused for the overall and per-bank shares.
# The mean of a boolean mask is the fraction of True values, so no explicit
# division by the row count is needed.
is_positive = df['sentiment_label'].eq('POSITIVE')

# value_counts() ignores missing values, so it is empty when df has no rows or
# every theme is missing; use a placeholder rather than raising IndexError.
theme_frequencies = df['theme'].value_counts()
most_common_theme = theme_frequencies.index[0] if not theme_frequencies.empty else "N/A"

print(f"\n1. Total Reviews: {len(df)}")
print(f"2. Average Rating: {df['rating'].mean():.2f}")
print(f"3. Positive Sentiment: {is_positive.sum()} ({is_positive.mean() * 100:.1f}%)")
print(f"4. Most Common Theme: {most_common_theme}")

print("\n5. Bank Performance:")
# One grouped pass instead of re-filtering the full frame for every bank.
# sort=False keeps banks in order of first appearance; rows whose bank is
# missing are excluded by groupby, so no all-NaN line is printed for them.
bank_summary = (
    df.assign(is_positive=is_positive)
    .groupby('bank', sort=False)
    .agg(avg_rating=('rating', 'mean'), positive_share=('is_positive', 'mean'))
)
for bank, stats in bank_summary.iterrows():
    print(f"   {bank}: Avg Rating {stats['avg_rating']:.2f}, {stats['positive_share'] * 100:.1f}% Positive")
