# Task Results Viewer

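This notebook displays the results of Task 1 (Data Collection & Preprocessing) and Task 2 (Sentiment & Thematic Analysis). It reads the CSV outputs of those tasks from `../data/cleaned/` and `../data/processed/`, so those files must exist before the notebook is run.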


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's "whitegrid" theme and a
# 12x6 default figure size (plotting cells below pass their own figsize).
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


## Task 1: Data Collection & Preprocessing Results


In [None]:
# Load the cleaned review dataset and print a plain-text summary of it:
# total row count, reviews per bank, star-rating distribution, and date range.
df_cleaned = pd.read_csv('../data/cleaned/cleaned_reviews.csv')
total_reviews = len(df_cleaned)

print("=" * 70)
print("TASK 1 RESULTS - DATA COLLECTION & PREPROCESSING")
print("=" * 70)
print(f"\n✅ Total Reviews Collected: {total_reviews}")

print("\n📊 Reviews per Bank:")
bank_counts = df_cleaned['bank'].value_counts()
for bank, count in bank_counts.items():
    print(f"   {bank}: {count} reviews")

print("\n⭐ Rating Distribution:")
# sort_index() lists the star values in ascending order rather than by frequency.
rating_counts = df_cleaned['rating'].value_counts().sort_index()
for rating, count in rating_counts.items():
    # Share is relative to all rows; value_counts() itself skips missing ratings.
    percentage = (count / total_reviews) * 100
    print(f"   {int(rating)} stars: {count} ({percentage:.1f}%)")

# read_csv leaves the 'date' column as strings, and min()/max() on strings
# compares them lexicographically. Parse into a separate Series (df_cleaned is
# left untouched) so the reported range is chronological; unparseable values
# become NaT and are ignored by min()/max().
review_dates = pd.to_datetime(df_cleaned['date'], errors='coerce')
print("\n📅 Date Range:")
if review_dates.notna().any():
    print(f"   From: {review_dates.min().date()}")
    print(f"   To: {review_dates.max().date()}")
else:
    # Nothing could be parsed: fall back to the raw column values.
    print(f"   From: {df_cleaned['date'].min()}")
    print(f"   To: {df_cleaned['date'].max()}")


In [None]:
# Two side-by-side bar charts for the cleaned dataset:
# left = number of reviews per star rating, right = number of reviews per bank.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
rating_ax, bank_ax = axes

# Rating distribution (index sorted so bars run from lowest to highest rating)
rating_counts = df_cleaned['rating'].value_counts().sort_index()
rating_ax.bar(rating_counts.index, rating_counts.values, color='steelblue')
rating_ax.set_xlabel('Rating (Stars)')
rating_ax.set_ylabel('Count')
rating_ax.set_title('Rating Distribution')
# Show a tick for every star value 1-5, even if a rating has no reviews.
rating_ax.set_xticks(range(1, 6))

# Reviews per bank
bank_counts = df_cleaned['bank'].value_counts()
bank_ax.bar(bank_counts.index, bank_counts.values, color='coral')
bank_ax.set_xlabel('Bank')
bank_ax.set_ylabel('Number of Reviews')
bank_ax.set_title('Reviews per Bank')
# Rotate the bank labels on this axes explicitly instead of relying on
# pyplot's implicit "current axes" (what plt.xticks would target).
bank_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Print the first five cleaned reviews with their bank and rating,
# showing at most `preview_len` characters of each review's text.
preview_len = 80

print("📝 Sample Reviews:")
sample = df_cleaned[['review_text', 'rating', 'bank']].head(5)
for row in sample.itertuples(index=False):
    # A missing review would otherwise be rendered as the literal string "nan".
    text = "(no text)" if pd.isna(row.review_text) else str(row.review_text)
    # Only add the ellipsis when the text was actually cut short.
    preview = text if len(text) <= preview_len else text[:preview_len] + "..."
    print(f"\n   [{row.bank}] Rating: {row.rating} stars")
    print(f"   Review: {preview}")


## Task 2: Sentiment Analysis Results


In [None]:
# Load the sentiment-annotated reviews and print a plain-text summary:
# total row count, count/share per sentiment label, and the mean
# sentiment score within each label.
df_sentiment = pd.read_csv('../data/processed/reviews_with_sentiment.csv')
total_analyzed = len(df_sentiment)

print("=" * 70)
print("TASK 2 RESULTS - SENTIMENT ANALYSIS")
print("=" * 70)
print(f"\n✅ Total Reviews Analyzed: {total_analyzed}")

print("\n😊 Sentiment Distribution:")
sent_counts = df_sentiment['sentiment_label'].value_counts()
for label, count in sent_counts.items():
    # Share is relative to all rows in the file.
    percentage = (count / total_analyzed) * 100
    print(f"   {label}: {count} ({percentage:.1f}%)")

print("\n📈 Average Sentiment Scores:")
avg_scores = df_sentiment.groupby('sentiment_label')['sentiment_score'].mean()
for label, score in avg_scores.items():
    print(f"   {label}: {score:.3f}")


In [None]:
# Two side-by-side charts of the sentiment results:
# left = overall count per sentiment label, right = label counts grouped by bank.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
overall_ax, by_bank_ax = axes

# One label -> colour mapping shared by both panels; labels not listed here
# fall back to blue.
colors = {'POSITIVE': 'green', 'NEGATIVE': 'red', 'NEUTRAL': 'gray'}

# Sentiment distribution
sent_counts = df_sentiment['sentiment_label'].value_counts()
overall_ax.bar(sent_counts.index, sent_counts.values,
               color=[colors.get(label, 'blue') for label in sent_counts.index])
overall_ax.set_xlabel('Sentiment')
overall_ax.set_ylabel('Count')
overall_ax.set_title('Sentiment Distribution')

# Sentiment by bank
bank_sent = pd.crosstab(df_sentiment['bank'], df_sentiment['sentiment_label'])
# Derive the colours from the crosstab's actual columns. A fixed positional
# list such as ['red', 'gray', 'green'] is only right when all three labels
# are present; with a label missing (e.g. no NEUTRAL rows) POSITIVE would be
# drawn gray and disagree with the left panel.
bank_sent.plot(kind='bar', ax=by_bank_ax,
               color=[colors.get(label, 'blue') for label in bank_sent.columns])
by_bank_ax.set_xlabel('Bank')
by_bank_ax.set_ylabel('Count')
by_bank_ax.set_title('Sentiment by Bank')
by_bank_ax.legend(title='Sentiment')
# Rotate the bank labels on this axes explicitly instead of relying on
# pyplot's implicit "current axes" (what plt.xticks would target).
by_bank_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


## Task 2: Thematic Analysis Results


In [None]:
# Load the theme-annotated reviews and print the total row count followed by
# the count and share of each theme (most frequent first, as returned by
# value_counts()).
df_themes = pd.read_csv('../data/processed/reviews_with_themes.csv')
total_themed = len(df_themes)

print("=" * 70)
print("TASK 2 RESULTS - THEMATIC ANALYSIS")
print("=" * 70)
print(f"\n✅ Total Reviews: {total_themed}")

print("\n🏷️  Theme Distribution:")
theme_counts = df_themes['theme'].value_counts()
for theme, count in theme_counts.items():
    # Share is relative to all rows in the file.
    percentage = (count / total_themed) * 100
    print(f"   {theme}: {count} ({percentage:.1f}%)")


In [None]:
# Two side-by-side charts of the thematic results:
# left = horizontal bars of review count per theme,
# right = stacked bars of theme counts for each bank.
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
theme_ax, by_bank_ax = axes

# Theme distribution
theme_counts = df_themes['theme'].value_counts()
theme_ax.barh(theme_counts.index, theme_counts.values, color='teal')
theme_ax.set_xlabel('Count')
theme_ax.set_ylabel('Theme')
theme_ax.set_title('Theme Distribution')
# barh draws the first entry at the bottom; invert so the most frequent
# theme (first in value_counts order) appears at the top.
theme_ax.invert_yaxis()

# Themes by bank
bank_theme = pd.crosstab(df_themes['bank'], df_themes['theme'])
bank_theme.plot(kind='bar', ax=by_bank_ax, stacked=True)
by_bank_ax.set_xlabel('Bank')
by_bank_ax.set_ylabel('Count')
by_bank_ax.set_title('Themes by Bank')
# Anchor the legend just outside the right edge of the axes so it does not
# cover the bars.
by_bank_ax.legend(title='Theme', bbox_to_anchor=(1.05, 1), loc='upper left')
# Rotate the bank labels on this axes explicitly instead of relying on
# pyplot's implicit "current axes" (what plt.xticks would target).
by_bank_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Print the first five theme-annotated reviews with bank, rating, sentiment
# label and theme, showing at most `preview_len` characters of each text.
preview_len = 100

print("📝 Sample Reviews with Analysis:")
sample = df_themes[['review_text', 'rating', 'sentiment_label', 'theme', 'bank']].head(5)
for row in sample.itertuples(index=False):
    print(f"\n   [{row.bank}] ⭐{row.rating} | {row.sentiment_label} | Theme: {row.theme}")
    # A missing review would otherwise be rendered as the literal string "nan".
    text = "(no text)" if pd.isna(row.review_text) else str(row.review_text)
    # Only add the ellipsis when the text was actually cut short.
    review_text = text if len(text) <= preview_len else text[:preview_len] + "..."
    print(f"   Review: {review_text}")


## Summary


In [None]:
# Final recap: row counts of the three DataFrames loaded in the cells above
# (df_cleaned, df_sentiment, df_themes). This cell needs those cells to have
# been run first.
print("=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"✅ Task 1: Collected and cleaned {len(df_cleaned)} reviews")
print(f"✅ Task 2: Analyzed sentiment for {len(df_sentiment)} reviews")
print(f"✅ Task 2: Identified themes for {len(df_themes)} reviews")
print("\n🎉 All tasks completed successfully!")
