# In [None]:
# Cell 1: Setup and Load All Data
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')

# Read the processed, merged dataset that every later cell works from
merged_data_path = '../data/processed/merged_data.csv'
merged_data = pd.read_csv(merged_data_path)
print(f"Loaded merged data: {merged_data.shape}")

# Cell 2: Advanced Trader Segmentation
print("=== ADVANCED TRADER SEGMENTATION ===")

# Statistics to compute per account, keyed by the raw column they summarise
per_account_stats = {
    'closedPnL': ['sum', 'mean', 'std', 'count', 'min', 'max'],
    'size': ['mean', 'sum', 'std'],
    'leverage': ['mean', 'std'],
    'is_profitable': ['mean', 'sum'],
    'sentiment_score': 'mean',
}
trader_metrics = merged_data.groupby('account').agg(per_account_stats).round(4)

# Collapse the two-level (column, statistic) header into "column_statistic"
trader_metrics.columns = [
    '_'.join(level_pair).strip() for level_pair in trader_metrics.columns
]
trader_metrics = trader_metrics.reset_index()

# Friendlier aliases for the aggregates the later cells rely on
metric_aliases = {
    'total_pnl': 'closedPnL_sum',
    'avg_pnl': 'closedPnL_mean',
    'win_rate': 'is_profitable_mean',
    'trade_count': 'closedPnL_count',
    'pnl_volatility': 'closedPnL_std',
}
for alias, aggregate_column in metric_aliases.items():
    trader_metrics[alias] = trader_metrics[aggregate_column]

# Mean PnL per unit of PnL volatility; infinite ratios (zero volatility) become 0
raw_sharpe = trader_metrics['avg_pnl'].div(trader_metrics['pnl_volatility'])
trader_metrics['sharpe_ratio'] = raw_sharpe.replace([np.inf, -np.inf], 0)

print(f"Analyzed {len(trader_metrics)} unique traders")
print("\nTrader metrics summary:")
summary_columns = ['total_pnl', 'win_rate', 'trade_count', 'sharpe_ratio']
print(trader_metrics[summary_columns].describe())

# Cell 3: Trader Clustering Analysis
print("=== TRADER CLUSTERING ANALYSIS ===")

# Select features for clustering. Missing statistics are replaced by 0 so
# that every account can be scaled and clustered.
clustering_features = ['total_pnl', 'win_rate', 'trade_count', 'avg_pnl', 'pnl_volatility']
clustering_data = trader_metrics[clustering_features].fillna(0)

# Standardize features before distance-based clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(clustering_data)

# KMeans cannot fit more clusters than there are samples, so every k used
# below is capped by the number of traders instead of assuming >= 10 accounts.
n_traders = len(clustering_data)

# Determine optimal number of clusters using elbow method
inertias = []
K_range = range(2, min(10, n_traders) + 1)
for k in K_range:
    # n_init is pinned because its default differs between scikit-learn releases
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(K_range, inertias, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.show()

# Use k=4 clusters (adjust based on elbow curve), or fewer if there are
# not enough traders to form 4 clusters.
optimal_k = min(4, n_traders)
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
trader_metrics['cluster'] = kmeans.fit_predict(X_scaled)

# Analyze clusters: mean of each (unscaled) feature per cluster
print(f"\nCluster Analysis (k={optimal_k}):")
cluster_summary = trader_metrics.groupby('cluster')[clustering_features].mean().round(4)
print(cluster_summary)

# Visualize clusters in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Top-left: number of traders per cluster
cluster_counts = trader_metrics['cluster'].value_counts().sort_index()
axes[0,0].bar(cluster_counts.index, cluster_counts.values, color='skyblue')
axes[0,0].set_title('Trader Distribution by Cluster')
axes[0,0].set_xlabel('Cluster')
axes[0,0].set_ylabel('Number of Traders')

# Top-right: PnL vs Win Rate, coloured by cluster
scatter = axes[0,1].scatter(trader_metrics['win_rate'], trader_metrics['total_pnl'],
                           c=trader_metrics['cluster'], cmap='viridis', alpha=0.7)
axes[0,1].set_xlabel('Win Rate')
axes[0,1].set_ylabel('Total PnL')
axes[0,1].set_title('Traders by Win Rate vs Total PnL')
plt.colorbar(scatter, ax=axes[0,1])

# Bottom-left: Trade Count vs PnL, coloured by cluster
scatter2 = axes[1,0].scatter(trader_metrics['trade_count'], trader_metrics['total_pnl'],
                            c=trader_metrics['cluster'], cmap='viridis', alpha=0.7)
axes[1,0].set_xlabel('Trade Count')
axes[1,0].set_ylabel('Total PnL')
axes[1,0].set_title('Traders by Trade Count vs Total PnL')
plt.colorbar(scatter2, ax=axes[1,0])

# Bottom-right: unrounded per-cluster feature means (features as rows)
cluster_chars = trader_metrics.groupby('cluster')[clustering_features].mean()
sns.heatmap(cluster_chars.T, annot=True, cmap='RdYlBu_r', ax=axes[1,1])
axes[1,1].set_title('Cluster Characteristics Heatmap')

plt.tight_layout()
plt.show()

# Cell 4: Predictive Modeling
print("=== PREDICTIVE MODELING ===")

# Keep only trades where the target and every mandatory predictor are present
required_columns = ['closedPnL', 'size', 'leverage', 'sentiment_score']
modeling_data = merged_data.dropna(subset=required_columns)

# Mandatory predictors first, then whichever optional time features exist
features = ['size', 'leverage', 'sentiment_score']
features += [
    optional for optional in ('hour', 'day_of_week')
    if optional in modeling_data.columns
]

X, y = modeling_data[features], modeling_data['closedPnL']

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the Random Forest regressor on the training split
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out split
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

# Rank predictors by the forest's importance scores, most important first
importance_table = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Horizontal bars with the most important feature at the top
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.xlabel('Importance')
plt.title('Feature Importance for PnL Prediction')
plt.gca().invert_yaxis()
plt.show()

# Cell 5: Sentiment Impact Deep Dive
print("=== SENTIMENT IMPACT DEEP DIVE ===")

# Tag every trade with the cluster assigned to its account
account_clusters = trader_metrics[['account', 'cluster']]
sentiment_cluster_analysis = merged_data.merge(account_clusters, on='account', how='left')

# Summary statistics for each (cluster, sentiment class) pair
cluster_sentiment_stats = {
    'closedPnL': ['mean', 'std', 'count'],
    'is_profitable': 'mean',
    'size': 'mean',
}
cluster_sentiment_performance = (
    sentiment_cluster_analysis
    .groupby(['cluster', 'Classification'])
    .agg(cluster_sentiment_stats)
    .round(4)
)

print("Performance by Cluster and Sentiment:")
print(cluster_sentiment_performance)

# 2x2 figure: two heatmaps on top, two box plots below
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Top-left: mean PnL, clusters as rows and sentiment classes as columns
pivot_pnl = sentiment_cluster_analysis.pivot_table(
    index='cluster',
    columns='Classification',
    values='closedPnL',
    aggfunc='mean',
)
sns.heatmap(pivot_pnl, annot=True, cmap='RdYlGn', ax=axes[0,0])
axes[0,0].set_title('Average PnL by Cluster and Sentiment')

# Top-right: share of profitable trades in the same layout
pivot_winrate = sentiment_cluster_analysis.pivot_table(
    index='cluster',
    columns='Classification',
    values='is_profitable',
    aggfunc='mean',
)
sns.heatmap(pivot_winrate, annot=True, cmap='RdYlGn', ax=axes[0,1])
axes[0,1].set_title('Win Rate by Cluster and Sentiment')

# Bottom row: one PnL box plot per sentiment; the panel stays blank when
# the data contains no trades for that sentiment
for bottom_ax, sentiment in zip(axes[1], ['Fear', 'Greed']):
    matches_sentiment = sentiment_cluster_analysis['Classification'] == sentiment
    sentiment_data = sentiment_cluster_analysis[matches_sentiment]
    if sentiment_data.empty:
        continue
    sns.boxplot(data=sentiment_data, x='cluster', y='closedPnL', ax=bottom_ax)
    bottom_ax.set_title(f'PnL Distribution by Cluster ({sentiment})')

plt.tight_layout()
plt.show()

# Cell 6: Market Regime Analysis
print("=== MARKET REGIME ANALYSIS ===")

# The whole analysis is skipped when the data carries no 'date' column
if 'date' in merged_data.columns:
    merged_data['date'] = pd.to_datetime(merged_data['date'])

    def _dominant_sentiment(labels):
        """Return the most frequent label of the day, or 'Unknown' if none."""
        modes = labels.mode()
        return modes.iloc[0] if len(modes) > 0 else 'Unknown'

    # One row per date: PnL statistics, traded size and prevailing sentiment
    daily_metrics = merged_data.groupby('date').agg({
        'closedPnL': ['sum', 'mean', 'std'],
        'size': 'sum',
        'Classification': _dominant_sentiment,
    })
    daily_metrics.columns = [
        'daily_pnl_sum',
        'daily_pnl_mean',
        'daily_pnl_std',
        'daily_volume',
        'dominant_sentiment',
    ]

    # 7-row rolling means of the daily PnL dispersion and the daily PnL total
    rolling_sources = (
        ('volatility_7d', 'daily_pnl_std'),
        ('pnl_trend_7d', 'daily_pnl_sum'),
    )
    for rolled_name, daily_name in rolling_sources:
        daily_metrics[rolled_name] = daily_metrics[daily_name].rolling(7).mean()

    # Regimes are the four quadrants around the medians of both rolling series
    vol_median = daily_metrics['volatility_7d'].median()
    trend_median = daily_metrics['pnl_trend_7d'].median()

    # Each side is compared explicitly (not negated) so that rows with a NaN
    # rolling value match no quadrant and fall through to 'Undefined'
    calm = daily_metrics['volatility_7d'] <= vol_median
    turbulent = daily_metrics['volatility_7d'] > vol_median
    rising = daily_metrics['pnl_trend_7d'] >= trend_median
    falling = daily_metrics['pnl_trend_7d'] < trend_median

    regime_rules = [
        ('Low_Vol_Positive', calm & rising),
        ('Low_Vol_Negative', calm & falling),
        ('High_Vol_Positive', turbulent & rising),
        ('High_Vol_Negative', turbulent & falling),
    ]
    choices = [label for label, _ in regime_rules]
    conditions = [mask for _, mask in regime_rules]
    daily_metrics['market_regime'] = np.select(conditions, choices, default='Undefined')

    # Attach each date's regime to every trade made on that date
    regime_by_date = daily_metrics[['market_regime']].reset_index()
    merged_data_regime = merged_data.merge(regime_by_date, on='date', how='left')

    # Trade-level performance inside each regime
    regime_stats = {
        'closedPnL': ['count', 'mean', 'std'],
        'is_profitable': 'mean',
        'size': 'mean',
    }
    regime_performance = merged_data_regime.groupby('market_regime').agg(regime_stats).round(4)

    print("Performance by Market Regime:")
    print(regime_performance)

    # Scatter of daily PnL over time, one colour per regime
    plt.figure(figsize=(15, 8))
    regime_colors = {
        'Low_Vol_Positive': 'green',
        'Low_Vol_Negative': 'lightcoral',
        'High_Vol_Positive': 'darkgreen',
        'High_Vol_Negative': 'red',
    }

    for regime in daily_metrics['market_regime'].unique():
        if regime == 'Undefined':
            continue
        regime_data = daily_metrics[daily_metrics['market_regime'] == regime]
        plt.scatter(
            regime_data.index,
            regime_data['daily_pnl_sum'],
            label=regime,
            color=regime_colors.get(regime, 'gray'),
            alpha=0.7,
        )

    plt.xlabel('Date')
    plt.ylabel('Daily PnL Sum')
    plt.title('Market Regimes and Daily PnL')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Cell 7: Risk-Adjusted Performance Analysis
print("=== RISK-ADJUSTED PERFORMANCE ANALYSIS ===")

# Per-account PnL statistics plus average trade size and leverage
risk_stats = {
    'closedPnL': ['mean', 'std', 'sum', 'count'],
    'size': 'mean',
    'leverage': 'mean',
}
trader_risk_metrics = merged_data.groupby('account').agg(risk_stats).round(4)
trader_risk_metrics.columns = [
    'avg_pnl',
    'pnl_std',
    'total_pnl',
    'trade_count',
    'avg_size',
    'avg_leverage',
]
trader_risk_metrics = trader_risk_metrics.reset_index()

# Sharpe-style ratio: mean PnL over PnL std; infinite ratios are mapped to 0
sharpe_raw = trader_risk_metrics['avg_pnl'].div(trader_risk_metrics['pnl_std'])
trader_risk_metrics['sharpe_ratio'] = sharpe_raw.replace([np.inf, -np.inf], 0)

# Downside deviation: std of each account's losing trades only
losing_trades = merged_data[merged_data['closedPnL'] < 0]
downside_returns = losing_trades.groupby('account')['closedPnL'].std()
downside_frame = downside_returns.rename('downside_std').reset_index()
trader_risk_metrics = trader_risk_metrics.merge(downside_frame, on='account', how='left')

# Accounts without a downside std get 0, which makes the ratio below infinite
# and therefore reported as 0
trader_risk_metrics['downside_std'] = trader_risk_metrics['downside_std'].fillna(0)
sortino_raw = trader_risk_metrics['avg_pnl'].div(trader_risk_metrics['downside_std'])
trader_risk_metrics['sortino_ratio'] = sortino_raw.replace([np.inf, -np.inf], 0)

# Maximum drawdown calculation (simplified)
def calculate_max_drawdown(account_data):
    """Return the worst relative drawdown of an account's cumulative PnL.

    The drawdown at each trade is ``(cumulative - peak) / peak``, where
    ``peak`` is the highest cumulative PnL reached so far. Rows are consumed
    in the order given; the caller is assumed to supply them chronologically
    (TODO confirm upstream ordering).

    Args:
        account_data: DataFrame with a numeric ``closedPnL`` column.

    Returns:
        The minimum (most negative) drawdown ratio; ``0`` when there are no
        rows; NaN when the cumulative PnL never reaches a positive peak, since
        a relative drawdown is undefined in that case.
    """
    cumulative = account_data['closedPnL'].cumsum()
    if cumulative.empty:
        return 0

    running_max = cumulative.expanding().max()
    # Dividing by a zero or negative peak yields inf or a sign-flipped ratio,
    # so those points are masked to NaN and skipped by min().
    positive_peak = running_max.where(running_max > 0)
    drawdown = (cumulative - running_max) / positive_peak
    return drawdown.min()

# One max-drawdown value per account, joined onto the risk table
max_drawdowns = merged_data.groupby('account').apply(calculate_max_drawdown)
drawdown_frame = max_drawdowns.reset_index().rename(columns={0: 'max_drawdown'})
trader_risk_metrics = trader_risk_metrics.merge(drawdown_frame, on='account', how='left')

print("Risk-Adjusted Performance Metrics:")
report_columns = ['account', 'sharpe_ratio', 'sortino_ratio', 'max_drawdown']
print(trader_risk_metrics[report_columns].head(10))

# Risk-return scatter: x = PnL std, y = mean PnL, colour = Sharpe ratio
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    trader_risk_metrics['pnl_std'],
    trader_risk_metrics['avg_pnl'],
    c=trader_risk_metrics['sharpe_ratio'],
    cmap='RdYlGn',
    alpha=0.7,
)
plt.colorbar(scatter, label='Sharpe Ratio')
plt.xlabel('PnL Standard Deviation (Risk)')
plt.ylabel('Average PnL (Return)')
plt.title('Risk-Return Profile of Traders')
plt.grid(True, alpha=0.3)
plt.show()

# Cell 8: Advanced Pattern Recognition
print("=== ADVANCED PATTERN RECOGNITION ===")

# Identify trading patterns
# Pattern 1: Sentiment contrarians (perform better when sentiment is opposite)
# Rows are accounts, columns are sentiment classes, values are mean closedPnL.
# NOTE(review): fill_value=0 makes an account with no trades in a class look
# like it averaged exactly 0 there, which feeds into fear_preference below;
# confirm this is the intended treatment.
trader_sentiment_performance = merged_data.groupby(['account', 'Classification'])['closedPnL'].mean().unstack(fill_value=0)

# Start from empty groups so both names are always defined, even when the
# data lacks a 'Fear' or 'Greed' class and the branch below is skipped.
contrarians = pd.Index([])
momentum_traders = pd.Index([])

if 'Fear' in trader_sentiment_performance.columns and 'Greed' in trader_sentiment_performance.columns:
    # Positive value: higher mean PnL under 'Fear' than under 'Greed'
    trader_sentiment_performance['fear_preference'] = (
        trader_sentiment_performance['Fear'] - trader_sentiment_performance['Greed']
    )

    # Accounts with a preference of exactly 0 belong to neither group
    contrarians = trader_sentiment_performance[trader_sentiment_performance['fear_preference'] > 0].index
    momentum_traders = trader_sentiment_performance[trader_sentiment_performance['fear_preference'] < 0].index

    print(f"Contrarian traders (better in Fear): {len(contrarians)}")
    print(f"Momentum traders (better in Greed): {len(momentum_traders)}")

    # Analyze contrarian vs momentum trader characteristics
    trader_types = pd.DataFrame({
        'account': list(contrarians) + list(momentum_traders),
        'type': ['Contrarian'] * len(contrarians) + ['Momentum'] * len(momentum_traders)
    })

    # Inner join keeps only the trades of accounts that received a type
    trader_type_analysis = merged_data.merge(trader_types, on='account', how='inner')

    type_performance = trader_type_analysis.groupby('type').agg({
        'closedPnL': ['mean', 'std', 'sum'],
        'leverage': 'mean',
        'size': 'mean',
        'is_profitable': 'mean'
    }).round(4)

    print("\nTrader Type Performance:")
    print(type_performance)

# Pattern 2: High-frequency vs Low-frequency traders
# Accounts at or above the 75th percentile of trade count are "high",
# accounts at or below the 25th percentile are "low".
trade_frequency = merged_data.groupby('account').size()
freq_quartiles = trade_frequency.quantile([0.25, 0.75])

high_freq_traders = trade_frequency[trade_frequency >= freq_quartiles[0.75]].index
low_freq_traders = trade_frequency[trade_frequency <= freq_quartiles[0.25]].index

# Vectorised membership tests instead of a per-row Python lambda
in_high_freq = merged_data['account'].isin(high_freq_traders)
in_low_freq = merged_data['account'].isin(low_freq_traders)
freq_analysis = merged_data[in_high_freq | in_low_freq].copy()
# An account matching both thresholds (equal quartiles) is labelled high
freq_analysis['frequency_type'] = np.where(
    freq_analysis['account'].isin(high_freq_traders),
    'High_Frequency',
    'Low_Frequency',
)

freq_performance = freq_analysis.groupby('frequency_type').agg({
    'closedPnL': ['mean', 'std', 'sum'],
    'leverage': 'mean',
    'is_profitable': 'mean'
}).round(4)

print(f"\nFrequency Analysis:")
print(f"High-frequency traders: {len(high_freq_traders)}")
print(f"Low-frequency traders: {len(low_freq_traders)}")
print(freq_performance)

# Cell 9: Portfolio-Level Analysis
print("=== PORTFOLIO-LEVEL ANALYSIS ===")

# Simulate a portfolio based on different strategies
if 'date' in merged_data.columns:
    # Strategy 1: Follow top performers (10 accounts with the largest total PnL)
    top_performers = trader_metrics.nlargest(10, 'total_pnl')['account'].tolist()

    # Strategy 2: Follow contrarians during fear.
    # `contrarians` is only bound when the pattern-recognition cell found both
    # 'Fear' and 'Greed' classes, so look it up defensively instead of
    # failing with a NameError.
    known_contrarians = globals().get('contrarians', [])
    if len(known_contrarians) > 0:
        top_contrarians = (
            trader_sentiment_performance.loc[known_contrarians]
            .nlargest(5, 'fear_preference')
            .index.tolist()
        )
    else:
        top_contrarians = []

    # With no contrarians available, that strategy falls back to the top 5
    # performers so the comparison still has two series
    portfolio_strategies = {
        'Top_Performers': top_performers,
        'Contrarians': top_contrarians if top_contrarians else top_performers[:5]
    }

    portfolio_results = {}
    # Daily PnL per strategy, computed once and reused for the plot below
    daily_pnl_by_strategy = {}

    for strategy, traders in portfolio_strategies.items():
        strategy_data = merged_data[merged_data['account'].isin(traders)]
        daily_portfolio = strategy_data.groupby('date')['closedPnL'].sum()
        daily_pnl_by_strategy[strategy] = daily_portfolio

        daily_std = daily_portfolio.std()
        portfolio_results[strategy] = {
            'total_return': daily_portfolio.sum(),
            'daily_volatility': daily_std,
            # A zero or NaN std fails the comparison and reports a ratio of 0
            'sharpe_ratio': daily_portfolio.mean() / daily_std if daily_std > 0 else 0,
            'max_daily_loss': daily_portfolio.min(),
            'max_daily_gain': daily_portfolio.max()
        }

    print("Portfolio Strategy Performance:")
    portfolio_df = pd.DataFrame(portfolio_results).T
    print(portfolio_df)

    # Plot cumulative PnL of each strategy over time
    plt.figure(figsize=(12, 8))
    for strategy, daily_portfolio in daily_pnl_by_strategy.items():
        cumulative_pnl = daily_portfolio.cumsum()
        plt.plot(cumulative_pnl.index, cumulative_pnl.values, label=strategy, linewidth=2)

    plt.xlabel('Date')
    plt.ylabel('Cumulative PnL')
    plt.title('Portfolio Strategy Performance Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Cell 10: Final Insights and Recommendations
print("=== FINAL INSIGHTS AND RECOMMENDATIONS ===")

# Headline numbers, gathered section by section before assembling the dict
profitable_mask = trader_metrics['total_pnl'] > 0
segmentation_insights = {
    'total_traders': len(trader_metrics),
    'profitable_traders': len(trader_metrics[profitable_mask]),
    'top_10_pct_pnl': trader_metrics['total_pnl'].quantile(0.9),
    'avg_trades_per_trader': trader_metrics['trade_count'].mean(),
}

fear_trades = merged_data[merged_data['Classification'] == 'Fear']
greed_trades = merged_data[merged_data['Classification'] == 'Greed']
pnl_sentiment_corr = merged_data[['closedPnL', 'sentiment_score']].corr()
sentiment_insights = {
    'fear_avg_pnl': fear_trades['closedPnL'].mean(),
    'greed_avg_pnl': greed_trades['closedPnL'].mean(),
    # Off-diagonal entry of the 2x2 correlation matrix
    'sentiment_correlation': pnl_sentiment_corr.iloc[0, 1],
}

risk_insights = {
    'avg_sharpe_ratio': trader_risk_metrics['sharpe_ratio'].mean(),
    'high_sharpe_threshold': trader_risk_metrics['sharpe_ratio'].quantile(0.8),
    'avg_max_drawdown': trader_risk_metrics['max_drawdown'].mean(),
}

insights = {
    'trader_segmentation': segmentation_insights,
    'sentiment_impact': sentiment_insights,
    'risk_patterns': risk_insights,
}

print("KEY INSIGHTS:")
print(f"1. {segmentation_insights['profitable_traders']}/{segmentation_insights['total_traders']} traders were profitable")
print(f"2. Top 10% threshold PnL: {segmentation_insights['top_10_pct_pnl']:.2f}")
print(f"3. Average PnL during Fear: {sentiment_insights['fear_avg_pnl']:.4f}")
print(f"4. Average PnL during Greed: {sentiment_insights['greed_avg_pnl']:.4f}")
print(f"5. PnL-Sentiment correlation: {sentiment_insights['sentiment_correlation']:.4f}")
print(f"6. Average Sharpe ratio: {risk_insights['avg_sharpe_ratio']:.4f}")

# Static recommendations, printed one per line under a header
print("\nRECOMMENDATIONS:")
recommendations = (
    "1. Focus on traders in the top-performing clusters for strategy insights",
    "2. Consider sentiment as a factor in trading strategies",
    "3. Implement risk management based on identified drawdown patterns",
    "4. Diversify across different trader types (contrarian vs momentum)",
    "5. Monitor market regime changes for strategy adaptation",
)
for recommendation in recommendations:
    print(recommendation)

# Save all results
import os

# Create the output folder first; open()/to_csv() fail with FileNotFoundError
# when the target directory is missing (e.g. on a fresh checkout).
output_dir = '../data/outputs'
os.makedirs(output_dir, exist_ok=True)

# Bundle of everything worth keeping from the analysis. portfolio_results is
# only defined when the portfolio cell ran its 'date' branch, so it may be None.
final_results = {
    'trader_metrics': trader_metrics,
    'cluster_analysis': cluster_summary,
    'model_performance': {'mse': mse, 'r2': r2, 'feature_importance': feature_importance},
    'insights': insights,
    'portfolio_results': portfolio_results if 'portfolio_results' in locals() else None
}

# Save to pickle file
with open(os.path.join(output_dir, 'final_analysis_results.pkl'), 'wb') as f:
    pickle.dump(final_results, f)

# Save key dataframes
trader_metrics.to_csv(os.path.join(output_dir, 'final_trader_metrics.csv'), index=False)
if 'trader_risk_metrics' in locals():
    trader_risk_metrics.to_csv(os.path.join(output_dir, 'trader_risk_metrics.csv'), index=False)

print("\n Final analysis complete! All results saved to outputs folder.")
print(" Check the following files:")
print("- final_analysis_results.pkl (comprehensive results)")
print("- final_trader_metrics.csv (trader performance metrics)")
print("- trader_risk_metrics.csv (risk-adjusted metrics)")