# In [None]:
# Cell 1: Setup and Load Processed Data
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Read merged_data.csv from the processed-data folder into a DataFrame and
# report its shape so the load can be sanity-checked at a glance.
merged_data = pd.read_csv(filepath_or_buffer='../data/processed/merged_data.csv')
print(f"Loaded merged data: {merged_data.shape}")

# Cell 2: Sentiment Overview
# Prints how many rows carry each sentiment label and draws a two-panel figure:
# an overall pie chart and (when a 'date' column exists) a stacked area chart.
print("=== SENTIMENT ANALYSIS OVERVIEW ===")

# Row count per sentiment label
sentiment_summary = merged_data['Classification'].value_counts()
print("Sentiment Distribution:")
print(sentiment_summary)

# Same counts expressed as a percentage of the total number of rows
sentiment_share = sentiment_summary.div(len(merged_data)).mul(100).round(2)
print(f"\nSentiment Distribution (%):")
print(sentiment_share)

# One row, two panels: overall split on the left, evolution over time on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: pie chart of the label counts
ax1.pie(
    sentiment_summary.values,
    labels=sentiment_summary.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax1.set_title('Market Sentiment Distribution')

# Right panel: per-date label counts as a stacked area chart.
# The 'date' column is converted to datetime in place; the right panel stays
# empty when the column is absent.
if 'date' in merged_data.columns:
    merged_data['date'] = pd.to_datetime(merged_data['date'])
    daily_sentiment = (
        merged_data
        .groupby(['date', 'Classification'])
        .size()
        .unstack(fill_value=0)
    )
    daily_sentiment.plot(kind='area', ax=ax2, alpha=0.7)
    ax2.set_title('Sentiment Distribution Over Time')
    ax2.legend(title='Sentiment')

plt.tight_layout()
plt.show()

# Cell 3: Performance by Sentiment
# Aggregates PnL, size, leverage and win rate per sentiment label, then tests
# whether mean closed PnL differs between the 'Fear' and 'Greed' rows.
print("=== PERFORMANCE BY SENTIMENT ===")

# 'leverage' is treated as an optional column by the leverage cell further
# down, so only aggregate it when it is present instead of raising KeyError.
performance_spec = {
    'closedPnL': ['count', 'sum', 'mean', 'std', 'median'],
    'size': ['mean', 'sum'],
}
if 'leverage' in merged_data.columns:
    performance_spec['leverage'] = 'mean'
performance_spec['is_profitable'] = 'mean'

sentiment_performance = merged_data.groupby('Classification').agg(performance_spec).round(4)

print("Performance Metrics by Sentiment:")
print(sentiment_performance)

# Statistical significance test on the non-missing PnL values of each group
fear_pnl = merged_data[merged_data['Classification'] == 'Fear']['closedPnL'].dropna()
greed_pnl = merged_data[merged_data['Classification'] == 'Greed']['closedPnL'].dropna()

if len(fear_pnl) > 0 and len(greed_pnl) > 0:
    # NOTE(review): ttest_ind defaults to the equal-variance (Student) test;
    # consider equal_var=False (Welch) if the two groups differ in spread.
    t_stat, p_value = stats.ttest_ind(fear_pnl, greed_pnl)
    print(f"\n=== STATISTICAL TEST RESULTS ===")
    print(f"T-test for PnL difference between Fear and Greed:")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")
    print(f"Statistically significant (p < 0.05): {p_value < 0.05}")

    # Effect size (Cohen's d) using the pooled sample standard deviation
    pooled_std = np.sqrt(((len(fear_pnl) - 1) * fear_pnl.std()**2 +
                         (len(greed_pnl) - 1) * greed_pnl.std()**2) /
                        (len(fear_pnl) + len(greed_pnl) - 2))
    # The pooled std is NaN when a group has a single observation and zero when
    # all values are identical; the ratio is meaningless in both cases.
    if np.isfinite(pooled_std) and pooled_std > 0:
        cohens_d = (greed_pnl.mean() - fear_pnl.mean()) / pooled_std
        print(f"Cohen's d (effect size): {cohens_d:.4f}")
    else:
        cohens_d = np.nan
        print("Cohen's d (effect size): undefined (pooled standard deviation is zero or unavailable)")

# Cell 4: Detailed PnL Analysis by Sentiment
# Draws a 2x2 grid: PnL box plot, PnL violin plot, win-rate bars and
# average-trade-size bars, each split by sentiment label.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
box_ax, violin_ax = axes[0, 0], axes[0, 1]
winrate_ax, size_ax = axes[1, 0], axes[1, 1]

# Per-sentiment means used by the two bar charts in the bottom row
by_sentiment = merged_data.groupby('Classification')
win_rates = by_sentiment['is_profitable'].mean()
avg_sizes = by_sentiment['size'].mean()

# Top-left: box plot of closed PnL per sentiment
sns.boxplot(data=merged_data, x='Classification', y='closedPnL', ax=box_ax)
box_ax.set_title('PnL Distribution by Sentiment')
box_ax.set_ylabel('Closed PnL')

# Top-right: violin plot of the same data to show the density shape
sns.violinplot(data=merged_data, x='Classification', y='closedPnL', ax=violin_ax)
violin_ax.set_title('PnL Distribution Density by Sentiment')
violin_ax.set_ylabel('Closed PnL')

# Bottom-left: mean of 'is_profitable' per sentiment, annotated above each bar
winrate_ax.bar(win_rates.index, win_rates.values, color=['red', 'green'], alpha=0.7)
winrate_ax.set_title('Win Rate by Sentiment')
winrate_ax.set_ylabel('Win Rate')
for bar_position, rate in enumerate(win_rates.values):
    winrate_ax.text(bar_position, rate + 0.01, f'{rate:.3f}', ha='center')

# Bottom-right: mean trade size per sentiment
size_ax.bar(avg_sizes.index, avg_sizes.values, color=['red', 'green'], alpha=0.7)
size_ax.set_title('Average Trade Size by Sentiment')
size_ax.set_ylabel('Average Size')

plt.tight_layout()
plt.show()

# Cell 5: Leverage Analysis by Sentiment
# Runs only when the data has a 'leverage' column: prints summary statistics
# per sentiment label and plots a box plot next to overlaid histograms.
if 'leverage' in merged_data.columns:
    print("=== LEVERAGE ANALYSIS BY SENTIMENT ===")
    leverage_stats = ['count', 'mean', 'std', 'min', 'max', 'median']
    leverage_analysis = (
        merged_data
        .groupby('Classification')['leverage']
        .agg(leverage_stats)
        .round(4)
    )
    print(leverage_analysis)

    # Left panel: box plot; right panel: one histogram per sentiment label
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    sns.boxplot(data=merged_data, x='Classification', y='leverage', ax=ax1)
    ax1.set_title('Leverage Distribution by Sentiment')

    for sentiment in merged_data['Classification'].unique():
        # Rows without a sentiment label get no histogram
        if pd.isna(sentiment):
            continue
        label_rows = merged_data['Classification'] == sentiment
        subset = merged_data.loc[label_rows, 'leverage'].dropna()
        ax2.hist(subset, alpha=0.7, label=sentiment, bins=30)
    ax2.set_title('Leverage Histogram by Sentiment')
    ax2.set_xlabel('Leverage')
    ax2.legend()

    plt.tight_layout()
    plt.show()

# Cell 6: Time-based Sentiment Analysis
# Runs only when the data has a 'date' column: sums closed PnL per date and
# sentiment label, smooths it, and shows both series in an interactive figure.
if 'date' in merged_data.columns:
    print("=== TIME-BASED SENTIMENT ANALYSIS ===")

    # One row per date, one column per sentiment; missing combinations become 0
    daily_pnl = (
        merged_data
        .groupby(['date', 'Classification'])['closedPnL']
        .sum()
        .unstack(fill_value=0)
    )

    # Mean over a sliding window of 7 rows (i.e. 7 dates present in the data)
    daily_pnl_rolling = daily_pnl.rolling(window=7).mean()

    # Two stacked panels: raw daily sums on top, rolling mean below
    panel_titles = ('Daily PnL by Sentiment', 'Rolling 7-day Average PnL')
    fig = make_subplots(rows=2, cols=1, subplot_titles=panel_titles)

    # For each sentiment, add its daily trace and then its rolling trace
    for sentiment in daily_pnl.columns:
        daily_trace = go.Scatter(
            x=daily_pnl.index,
            y=daily_pnl[sentiment],
            mode='lines',
            name=f'{sentiment} (Daily)',
        )
        rolling_trace = go.Scatter(
            x=daily_pnl_rolling.index,
            y=daily_pnl_rolling[sentiment],
            mode='lines',
            name=f'{sentiment} (7-day avg)',
        )
        fig.add_trace(daily_trace, row=1, col=1)
        fig.add_trace(rolling_trace, row=2, col=1)

    fig.update_layout(height=800, title_text="PnL Time Series by Sentiment")
    fig.show()

# Cell 7: Trader Behavior Analysis by Sentiment
# Aggregates each account's trading per sentiment label and identifies the
# accounts that have trades under BOTH the 'Fear' and the 'Greed' label.
print("=== TRADER BEHAVIOR BY SENTIMENT ===")

# Per (account, sentiment) aggregates. 'leverage' is treated as an optional
# column by the leverage cell above, so only aggregate it when it is present.
behavior_spec = {
    'closedPnL': ['sum', 'mean', 'count'],
    'size': 'mean',
}
if 'leverage' in merged_data.columns:
    behavior_spec['leverage'] = 'mean'
behavior_spec['is_profitable'] = 'mean'

behavior_analysis = merged_data.groupby(['account', 'Classification']).agg(behavior_spec).round(4)

# Number of distinct sentiment labels each account traded under
traders_both_sentiments = behavior_analysis.index.get_level_values(0).value_counts()

# Count traders who trade in both sentiments.
# Requiring "more than one label" would also count e.g. an account seen under
# 'Fear' plus some third label, which contradicts the message printed below,
# so membership is checked against the two labels explicitly.
account_level = behavior_analysis.index.get_level_values(0)
sentiment_level = behavior_analysis.index.get_level_values(1)
fear_traders = account_level[sentiment_level == 'Fear']
greed_traders = account_level[sentiment_level == 'Greed']
traders_both = fear_traders.intersection(greed_traders)

print(f"Traders who trade in both Fear and Greed periods: {len(traders_both)}")
print(f"Total unique traders: {merged_data['account'].nunique()}")

# Cell 8: Comparative Analysis for Multi-Sentiment Traders
# For the accounts listed in traders_both, compares each account's mean closed
# PnL under 'Greed' against its mean under 'Fear' and plots the differences.
if len(traders_both) > 0:
    print("=== COMPARATIVE ANALYSIS: MULTI-SENTIMENT TRADERS ===")

    # Keep only the rows belonging to the selected accounts
    in_selected_accounts = merged_data['account'].isin(traders_both)
    multi_sentiment_data = merged_data[in_selected_accounts]

    # One row per account; columns are (metric, sentiment) pairs
    comparison_spec = {
        'closedPnL': 'mean',
        'is_profitable': 'mean',
        'size': 'mean',
    }
    trader_comparison = (
        multi_sentiment_data
        .groupby(['account', 'Classification'])
        .agg(comparison_spec)
        .unstack()
    )

    # The difference is only defined when both labels appear as columns
    sentiment_labels = trader_comparison.columns.get_level_values(1)
    if {'Fear', 'Greed'}.issubset(sentiment_labels):
        greed_mean_pnl = trader_comparison[('closedPnL', 'Greed')]
        fear_mean_pnl = trader_comparison[('closedPnL', 'Fear')]
        trader_comparison[('pnl_diff', 'Greed_minus_Fear')] = greed_mean_pnl - fear_mean_pnl

        print("Sample of trader performance comparison:")
        print(trader_comparison.head())

        # Accounts lacking a value for either label produce NaN and are dropped
        pnl_diff = trader_comparison[('pnl_diff', 'Greed_minus_Fear')].dropna()

        # Histogram of per-account differences with markers at zero and at the mean
        plt.figure(figsize=(12, 6))
        plt.hist(pnl_diff, bins=30, alpha=0.7, edgecolor='black')
        plt.axvline(0, color='red', linestyle='--', label='No difference')
        plt.axvline(pnl_diff.mean(), color='green', linestyle='--', label=f'Mean: {pnl_diff.mean():.4f}')
        plt.title('Distribution of PnL Difference (Greed - Fear) for Multi-Sentiment Traders')
        plt.xlabel('PnL Difference')
        plt.ylabel('Number of Traders')
        plt.legend()
        plt.show()

        better_in_greed = (pnl_diff > 0).sum()
        better_in_fear = (pnl_diff < 0).sum()
        print(f"Average PnL difference (Greed - Fear): {pnl_diff.mean():.4f}")
        print(f"Traders performing better in Greed: {better_in_greed}")
        print(f"Traders performing better in Fear: {better_in_fear}")

# Cell 9: Risk Analysis by Sentiment
# Reports dispersion and tail metrics of closed PnL (plus size/leverage spread)
# per sentiment label, followed by a mean/std ratio as a Sharpe approximation.
print("=== RISK ANALYSIS BY SENTIMENT ===")

# Calculate risk metrics.
# Series.quantile skips missing values, whereas np.percentile returns NaN for
# the whole group as soon as one PnL value is missing; the other statistics
# here ('std', 'mean') already skip NaN, so the percentiles now match them.
risk_spec = {
    'closedPnL': ['std', lambda x: x.quantile(0.05), lambda x: x.quantile(0.95)],
    'size': 'std',
}
risk_columns = ['PnL_Std', 'PnL_5th_Percentile', 'PnL_95th_Percentile', 'Size_Std']

# 'leverage' is treated as an optional column by the leverage cell above, so
# only include it (and its output column name) when it is present.
if 'leverage' in merged_data.columns:
    risk_spec['leverage'] = 'std'
    risk_columns.append('Leverage_Std')

risk_metrics = merged_data.groupby('Classification').agg(risk_spec).round(4)

# Flatten the two-level (column, statistic) header into readable names
risk_metrics.columns = risk_columns
print("Risk Metrics by Sentiment:")
print(risk_metrics)

# Sharpe ratio approximation (assuming risk-free rate = 0)
returns_by_sentiment = merged_data.groupby('Classification')['closedPnL'].agg(['mean', 'std'])
# A zero standard deviation would yield +/-inf; report NaN (undefined) instead
returns_by_sentiment['sharpe_ratio'] = (
    returns_by_sentiment['mean'] / returns_by_sentiment['std'].replace(0, np.nan)
)
print("\nSharpe Ratio by Sentiment:")
print(returns_by_sentiment['sharpe_ratio'])

# Cell 10: Summary and Insights
# Prints headline Fear-vs-Greed numbers and pickles the collected results.
import pickle
from pathlib import Path

print("=== SENTIMENT ANALYSIS SUMMARY ===")

# Key findings: filter each sentiment's rows once and reuse them
fear_rows = merged_data[merged_data['Classification'] == 'Fear']
greed_rows = merged_data[merged_data['Classification'] == 'Greed']

fear_mean = fear_rows['closedPnL'].mean()
greed_mean = greed_rows['closedPnL'].mean()
fear_winrate = fear_rows['is_profitable'].mean()
greed_winrate = greed_rows['is_profitable'].mean()

print(f"\nKEY FINDINGS:")
print(f"1. Average PnL during Fear: {fear_mean:.4f}")
print(f"2. Average PnL during Greed: {greed_mean:.4f}")
print(f"3. Win rate during Fear: {fear_winrate:.4f}")
print(f"4. Win rate during Greed: {greed_winrate:.4f}")
print(f"5. Performance difference: {greed_mean - fear_mean:.4f}")

# Save results.
# The t-test variables exist only if the significance test above actually ran
# (both groups non-empty), hence the lookup before referencing them.
sentiment_results = {
    'sentiment_performance': sentiment_performance,
    'statistical_test': {'t_stat': t_stat, 'p_value': p_value} if 't_stat' in locals() else None,
    'risk_metrics': risk_metrics,
    'key_findings': {
        'fear_avg_pnl': fear_mean,
        'greed_avg_pnl': greed_mean,
        'fear_winrate': fear_winrate,
        'greed_winrate': greed_winrate
    }
}

# Create the outputs folder on first run; open() would otherwise fail with
# FileNotFoundError after all the analysis has already been computed.
output_path = Path('../data/outputs/sentiment_analysis_results.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('wb') as f:
    pickle.dump(sentiment_results, f)

print("\n✅ Sentiment analysis complete! Results saved to outputs folder.")