# Comprehensive Sentiment Analysis

This notebook demonstrates sentiment analysis with VADER: overall score distributions, domain-specific sentiment patterns, sentiment trends over time, and relationships between sentiment and text characteristics, each supported by visualizations.

## Setup and Data Preparation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

from src.analysis import preprocess_df, add_sentiment_scores
from src.preprocess import clean_text

# Plot appearance: matplotlib whitegrid style, then seaborn's "husl" palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Input CSV location. When the file is absent the project's generator is
# invoked for 1000 rows (presumably it writes this same path — confirm in
# generate_data).
DATA = Path('data', 'synthetic_texts.csv')
if not DATA.exists():
    from generate_data import generate
    generate(1000)

# Read -> preprocess -> attach sentiment columns, as one pipeline
df = add_sentiment_scores(preprocess_df(pd.read_csv(DATA)))

# Columns whose name mentions "sentiment" (case-insensitive)
sentiment_columns = [name for name in df.columns if 'sentiment' in name.lower()]

print(f"Dataset shape: {df.shape}")
print(f"Domains: {df['domain'].unique()}")
print(f"Labels: {df['label'].unique()}")
print(f"\nSentiment columns: {sentiment_columns}")

## Sentiment Distribution Analysis

In [None]:
print("Sentiment Distribution Analysis")
print("=" * 35)

# 2x2 grid of histograms: overall score, score by domain, score by label,
# and the compound score.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Overall sentiment distribution
axes[0,0].hist(df['sentiment_score'], bins=30, alpha=0.7, color='skyblue')
axes[0,0].set_title('Overall Sentiment Distribution')
axes[0,0].set_xlabel('Sentiment Score')
axes[0,0].set_ylabel('Frequency')

# Shared bin edges for the overlaid per-group histograms. Passing bins=20 to
# every hist() call would derive separate edges from each subset's own
# min/max, so bars of different groups would have different widths and
# positions and could not be compared against each other.
group_bins = np.histogram_bin_edges(df['sentiment_score'].dropna(), bins=20)

# Sentiment by domain (`domains` is reused by a later cell)
domains = df['domain'].unique()
for domain in domains:
    domain_data = df[df['domain'] == domain]
    axes[0,1].hist(domain_data['sentiment_score'], bins=group_bins, alpha=0.6, label=domain)
axes[0,1].set_title('Sentiment Distribution by Domain')
axes[0,1].set_xlabel('Sentiment Score')
axes[0,1].set_ylabel('Frequency')
axes[0,1].legend()

# Sentiment by label
labels = df['label'].unique()
for label in labels:
    label_data = df[df['label'] == label]
    axes[1,0].hist(label_data['sentiment_score'], bins=group_bins, alpha=0.6, label=label)
axes[1,0].set_title('Sentiment Distribution by Label')
axes[1,0].set_xlabel('Sentiment Score')
axes[1,0].set_ylabel('Frequency')
axes[1,0].legend()

# Sentiment compound distribution
axes[1,1].hist(df['sentiment_compound'], bins=30, alpha=0.7, color='lightcoral')
axes[1,1].set_title('Compound Sentiment Distribution')
axes[1,1].set_xlabel('Compound Sentiment Score')
axes[1,1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Summary statistics
print("\nSentiment Summary Statistics:")
print(df[['sentiment_score', 'sentiment_compound']].describe())

# Sentiment classification counts
print("\nSentiment Classification Counts:")
print(df['sentiment_label'].value_counts())

## Domain-Specific Sentiment Analysis

In [None]:
print("Domain-Specific Sentiment Analysis")
print("=" * 40)

# Per-domain summary of sentiment_score, rounded to three decimals
scores_by_domain = df.groupby('domain')['sentiment_score']
domain_sentiment = scores_by_domain.agg(['mean', 'std', 'min', 'max', 'count']).round(3)

print("Sentiment Statistics by Domain:")
print(domain_sentiment)

# Side-by-side comparison of the per-domain distributions: the same data
# drawn once as a box plot and once as a violin plot.
panels = [
    (sns.boxplot, 'Sentiment Distribution by Domain'),
    (sns.violinplot, 'Sentiment Density by Domain'),
]

plt.figure(figsize=(12, 6))
for position, (draw_panel, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    draw_panel(data=df, x='domain', y='sentiment_score')
    plt.title(panel_title)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Same statistics broken down by (domain, label) pairs
print("\nSentiment by Domain and Label:")
scores_by_domain_label = df.groupby(['domain', 'label'])['sentiment_score']
domain_label_sentiment = scores_by_domain_label.agg(['mean', 'std', 'count']).round(3)
print(domain_label_sentiment)

## Sentiment Trend Analysis

In [None]:
print("Sentiment Trend Analysis")
print("=" * 30)

# Create synthetic time data for trend analysis: one timestamp per row, one
# hour apart, assigned in row order starting 2024-01-01. The timestamps carry
# no real temporal information, so the plots below illustrate the technique
# rather than genuine trends.
# The step is given as a timedelta instead of the 'H' alias, which is
# deprecated in recent pandas releases; a timedelta is accepted by every version.
df['date'] = pd.date_range(start='2024-01-01', periods=len(df), freq=timedelta(hours=1))
df['day'] = df['date'].dt.date
df['hour'] = df['date'].dt.hour

# Daily sentiment trends
daily_sentiment = df.groupby('day')['sentiment_score'].agg(['mean', 'std', 'count'])

plt.figure(figsize=(15, 5))
plt.plot(daily_sentiment.index, daily_sentiment['mean'], marker='o', linewidth=2)
# Shaded band: mean +/- one standard deviation per day
plt.fill_between(daily_sentiment.index,
                 daily_sentiment['mean'] - daily_sentiment['std'],
                 daily_sentiment['mean'] + daily_sentiment['std'],
                 alpha=0.3)
plt.title('Daily Sentiment Trend')
plt.xlabel('Date')
plt.ylabel('Average Sentiment Score')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Hourly sentiment patterns
hourly_sentiment = df.groupby('hour')['sentiment_score'].mean()

plt.figure(figsize=(12, 4))
plt.bar(hourly_sentiment.index, hourly_sentiment.values, alpha=0.7)
plt.title('Hourly Sentiment Pattern')
plt.xlabel('Hour of Day')
plt.ylabel('Average Sentiment Score')
plt.xticks(range(0, 24))
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Sentiment trends by domain. The domain list is computed here rather than
# read from a variable left behind by an earlier cell, so this cell only
# depends on `df`.
plt.figure(figsize=(15, 8))
for domain in df['domain'].unique():
    domain_data = df[df['domain'] == domain]
    daily_domain = domain_data.groupby('day')['sentiment_score'].mean()
    plt.plot(daily_domain.index, daily_domain.values, marker='o', linewidth=2, label=domain)

plt.title('Daily Sentiment Trends by Domain')
plt.xlabel('Date')
plt.ylabel('Average Sentiment Score')
plt.legend()
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Advanced Sentiment Analysis

In [None]:
def _print_extreme_texts(rows, max_chars=100):
    """Print score, a text preview and domain/label for every row in ``rows``.

    Parameters
    ----------
    rows : pandas.DataFrame
        Rows to report; the 'sentiment_score', 'text', 'domain' and 'label'
        columns are read.
    max_chars : int, default 100
        Maximum number of text characters shown. An ellipsis is appended
        only when the text was actually cut.
    """
    for _, row in rows.iterrows():
        # str() keeps a non-string text value (e.g. a missing one) from
        # raising on slicing.
        text = str(row['text'])
        preview = text if len(text) <= max_chars else f"{text[:max_chars]}..."
        print(f"\nScore: {row['sentiment_score']:.3f}")
        print(f"Text: {preview}")
        print(f"Domain: {row['domain']}, Label: {row['label']}")


print("Advanced Sentiment Analysis")
print("=" * 35)

# Sentiment correlation with text length
df['text_length'] = df['text'].str.len()

plt.figure(figsize=(10, 6))
plt.scatter(df['text_length'], df['sentiment_score'], alpha=0.6)
plt.title('Sentiment vs Text Length')
plt.xlabel('Text Length (characters)')
plt.ylabel('Sentiment Score')
plt.grid(True, alpha=0.3)
plt.show()

# Calculate correlation (Series.corr defaults to Pearson)
correlation = df['text_length'].corr(df['sentiment_score'])
print(f"Correlation between text length and sentiment: {correlation:.3f}")

# Sentiment by word count (whitespace-separated tokens)
df['word_count'] = df['text'].str.split().str.len()

# Word counts are grouped into 5 equal-width ranges for the box plot
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x=pd.cut(df['word_count'], bins=5), y='sentiment_score')
plt.title('Sentiment by Word Count Categories')
plt.xlabel('Word Count Range')
plt.ylabel('Sentiment Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Sentiment extremes analysis: five highest- and five lowest-scoring texts
print("\nMost Positive Texts:")
most_positive = df.nlargest(5, 'sentiment_score')
_print_extreme_texts(most_positive)

print("\nMost Negative Texts:")
most_negative = df.nsmallest(5, 'sentiment_score')
_print_extreme_texts(most_negative)

## Sentiment Classification Performance

In [None]:
print("Sentiment Classification Performance")
print("=" * 40)

# Analyze sentiment classification consistency
print("Sentiment Label Distribution:")
print(df['sentiment_label'].value_counts())

# No accuracy is computed in this cell; it only describes how the sentiment
# labels are distributed across domains.

# Share of each sentiment label within every domain (each row sums to 1)
cross_tab = pd.crosstab(df['domain'], df['sentiment_label'], normalize='index')
print("\nSentiment Distribution by Domain:")
print(cross_tab.round(3))

# Visualize sentiment label distribution as a stacked bar chart of raw counts.
# The axes are created once and handed to pandas: calling plt.figure() and then
# DataFrame.plot(figsize=...) would open two figures and leave a blank one in
# the output.
cross_tab_abs = pd.crosstab(df['domain'], df['sentiment_label'])
fig, ax = plt.subplots(figsize=(12, 6))
cross_tab_abs.plot(kind='bar', stacked=True, ax=ax, rot=45)
ax.set_title('Sentiment Label Distribution by Domain')
ax.set_xlabel('Domain')
ax.set_ylabel('Count')
ax.legend(title='Sentiment Label')
plt.tight_layout()
plt.show()

# Sentiment classification accuracy analysis
# (This would require ground truth labels for real data)
print("\nNote: For real-world data, you would compare VADER sentiment labels")
print("with human-annotated labels to calculate classification accuracy.")

## Sentiment Analysis Summary

This comprehensive sentiment analysis demonstrates:

1. **Overall Sentiment Patterns**: Distribution of sentiment scores across the dataset
2. **Domain-Specific Analysis**: How sentiment varies across different domains
3. **Temporal Trends**: Daily and hourly sentiment patterns
4. **Advanced Correlations**: Relationship between sentiment and text characteristics
5. **Extreme Analysis**: Examination of most positive and negative texts
6. **Classification Performance**: Analysis of sentiment label distribution

Key insights:
- VADER provides effective sentiment analysis for short texts
- Different domains show distinct sentiment patterns
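- Temporal trends can reveal important patterns in customer feedback; note that the timestamps in this notebook are synthetic (assigned hourly in row order), so the trend plots illustrate the method rather than real temporal behavior
- The relationship between text length and sentiment is quantified by the correlation coefficient printed above; interpret its sign and magnitude before drawing conclusions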
- Extreme sentiment cases provide valuable insights for business improvement

Next steps could include:
- Integration with custom sentiment dictionaries
- Real-time sentiment monitoring
- Sentiment-based alert systems
- Comparative analysis with other sentiment analysis tools
- Multi-language sentiment analysis