# In [None]:
# Cell 1: Setup and Imports
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data_loader import DataLoader
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: the seaborn-flavoured matplotlib theme and a
# 12x8 default canvas for figures that do not set their own size.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Cell 2: Load Data
# The loader is constructed with the two raw CSV paths (trade history first,
# fear/greed index second); the result of load_all_data() is unpacked into
# the two names every later cell works with.
print("Loading datasets...")
loader = DataLoader('../data/raw/historical_data.csv',
                    '../data/raw/fear_greed_index.csv')
trader_data, sentiment_data = loader.load_all_data()

# Cell 3: Trader Data Exploration
# Structural overview of the trader frame: dimensions, column labels and
# per-column dtypes, followed by a preview of the leading rows.
print("=== TRADER DATA EXPLORATION ===")
trader_shape = trader_data.shape
print(f"Shape: {trader_shape}")
trader_columns = list(trader_data.columns)
print(f"Columns: {trader_columns}")
# One print call with a newline separator emits the heading and the dtype
# table on consecutive lines.
print("\nData Types:", trader_data.dtypes, sep="\n")
print("\nFirst 5 rows:")
# Kept as the trailing expression so it is rendered when run as a notebook cell.
trader_data.head(5)

# Cell 4: Trader Data Summary Statistics
# describe() produces the summary table; the bare name on the last line
# keeps it as the cell's trailing expression so a notebook renders it.
print("=== TRADER DATA SUMMARY STATISTICS ===")
trader_summary = trader_data.describe()
trader_summary

# Cell 5: Sentiment Data Exploration
# Same structural overview as for the trader frame, applied to the
# sentiment frame: dimensions, column labels, dtypes, leading rows.
print("=== SENTIMENT DATA EXPLORATION ===")
sentiment_shape = sentiment_data.shape
print(f"Shape: {sentiment_shape}")
sentiment_columns = list(sentiment_data.columns)
print(f"Columns: {sentiment_columns}")
# Heading and dtype table are emitted on consecutive lines by one call.
print("\nData Types:", sentiment_data.dtypes, sep="\n")
print("\nFirst 5 rows:")
# Kept as the trailing expression so it is rendered when run as a notebook cell.
sentiment_data.head(5)

# Cell 6: Data Quality Assessment
# Reports per-column null counts for both frames, plus the null share of
# each trader column as a percentage rounded to two decimals.
print("=== DATA QUALITY ASSESSMENT ===")
print("Missing values in trader data:")
# Counted once and reused for the percentage table below.
trader_missing = trader_data.isnull().sum()
print(trader_missing)
print("Missing percentage:")
trader_missing_pct = trader_missing / len(trader_data) * 100
print(trader_missing_pct.round(2))

print("\nMissing values in sentiment data:")
sentiment_missing = sentiment_data.isnull().sum()
print(sentiment_missing)

# Cell 7: Unique Values Analysis
# Distinct-value counts for the account and symbol columns, and value
# frequencies for trade side (only if that column exists) and for the
# sentiment classification.
print("=== UNIQUE VALUES ANALYSIS ===")
trader_count = trader_data['account'].nunique()
print(f"Unique traders: {trader_count}")
symbol_count = trader_data['symbol'].nunique()
print(f"Unique symbols: {symbol_count}")
if 'side' in trader_data.columns:
    side_counts = trader_data['side'].value_counts()
    print(f"Trade sides: {side_counts}")
classification_counts = sentiment_data['Classification'].value_counts()
print(f"Sentiment classifications: {classification_counts}")

# Cell 8: Basic Visualizations
# 2x2 grid: histograms of closed PnL, trade size and (if present) leverage,
# plus a stacked-area view of sentiment classification counts per date.
def _draw_histogram(ax, column, bins, title, xlabel):
    """Histogram the non-null values of one trader_data column onto ``ax``."""
    ax.hist(trader_data[column].dropna(), bins=bins, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)


fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# PnL distribution (top-left)
_draw_histogram(axes[0, 0], 'closedPnL', 50,
                'Distribution of Closed PnL', 'Closed PnL')

# Size distribution (top-right)
_draw_histogram(axes[0, 1], 'size', 50,
                'Distribution of Trade Sizes', 'Size')

# Leverage distribution (bottom-left); the panel stays empty when the
# column is absent.
if 'leverage' in trader_data.columns:
    _draw_histogram(axes[1, 0], 'leverage', 30,
                    'Distribution of Leverage', 'Leverage')

# Sentiment over time (bottom-right). Note that the Date column of
# sentiment_data is converted to datetimes in place.
sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date'])
sentiment_counts = (
    sentiment_data
    .groupby(['Date', 'Classification'])
    .size()
    .unstack(fill_value=0)
)
sentiment_ax = axes[1, 1]
sentiment_counts.plot(kind='area', ax=sentiment_ax, alpha=0.7)
sentiment_ax.set_title('Sentiment Over Time')
sentiment_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

# Cell 9: Time Series Analysis
# Runs only when a 'time' column exists. Parses it to datetimes, derives a
# calendar 'date' column (both changes persist on trader_data), and prints
# the first ten rows of per-day aggregates rounded to four decimals.
if 'time' in trader_data.columns:
    trader_data['time'] = pd.to_datetime(trader_data['time'])
    trader_data['date'] = trader_data['time'].dt.date

    # Daily aggregations
    daily_agg_spec = {
        'closedPnL': ['sum', 'mean', 'count'],
        'size': 'sum',
    }
    # The leverage histogram in Cell 8 treats 'leverage' as an optional
    # column; mirror that here so a dataset without it does not make
    # .agg() raise KeyError. With the column present the spec (and thus the
    # output) is the same as before.
    if 'leverage' in trader_data.columns:
        daily_agg_spec['leverage'] = 'mean'

    daily_stats = trader_data.groupby('date').agg(daily_agg_spec).round(4)

    print("=== DAILY TRADING STATISTICS ===")
    print(daily_stats.head(10))

# Cell 10: Export Exploration Results
# Writes both frames (including any columns added by earlier cells) as CSV
# files without the index column.
from pathlib import Path  # cell-local import: only this cell needs it

# to_csv does not create missing parent directories and fails with OSError
# if the target folder is absent (e.g. on a fresh checkout), so make sure
# it exists first.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

trader_data.to_csv(processed_dir / 'trader_data_explored.csv', index=False)
sentiment_data.to_csv(processed_dir / 'sentiment_data_explored.csv', index=False)
print("Exploration complete! Data saved to processed folder.")