# Hull Tactical Market Prediction - Exploratory Data Analysis

This notebook explores the training data to understand:
- Target variable distribution and characteristics
- Feature distributions and correlations
- Temporal patterns and trends
- Missing value patterns
- Outliers and anomalies

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence every warning category for the rest of the session so that cell
# output stays uncluttered.
# NOTE(review): this also hides deprecation/runtime warnings raised by pandas,
# scipy and matplotlib — consider narrowing the filter if results look odd.
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn's white-grid theme and a wide default
# figure size, used by any figure that does not pass its own `figsize`.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
%matplotlib inline

## Load Data

In [None]:
# Locate the raw CSVs: one directory above the current working directory,
# inside data/raw.
data_dir = Path.cwd().parent.joinpath("data", "raw")
train_csv = data_dir.joinpath("train.csv")
test_csv = data_dir.joinpath("test.csv")

print(f"Loading data from {data_dir}")
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Report the dimensions of both frames and a preview of the column names.
print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
leading_columns = df_train.columns[:10].tolist()
print(f"\nColumns: {leading_columns}...")

# Last expression of the cell: rich display of the first training rows.
df_train.head()

## Feature Groups

The features are organized into groups:
- **D (Date):** D1-D9 - Date/time features
- **E (Economic):** E1-E20 - Economic indicators
- **I (Interest):** I1-I9 - Interest rate features
- **M (Market):** M1-M18 - Market features
- **P (Price):** P1-P13 - Price features
- **S (Spread):** S1-S12 - Spread features
- **V (Volatility):** V1-V13 - Volatility features

In [None]:
# Each feature group is identified by the (case-sensitive) first letter of its
# column names; lowercase columns such as `date_id` and the targets match none.
# NOTE(review): the group labels are this notebook's naming — confirm them
# against the dataset's official data dictionary.
group_prefixes = {
    'Date': 'D',
    'Economic': 'E',
    'Interest': 'I',
    'Market': 'M',
    'Price': 'P',
    'Spread': 'S',
    'Volatility': 'V',
}

# Build {group label -> list of matching training columns}, in column order.
feature_groups = {
    label: [name for name in df_train.columns if name.startswith(prefix)]
    for label, prefix in group_prefixes.items()
}

print("Feature Group Sizes:")
for group, cols in feature_groups.items():
    print(f"  {group:12s}: {len(cols):2d} features")

# Columns treated as targets (excluded from the feature set further down).
target_cols = ['forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
print(f"\nTarget columns: {target_cols}")

## Target Variable Analysis

In [None]:
# Print a banner, then let the notebook render the describe() table for the
# target columns as the cell's output.
print("Target Variable Statistics:", "=" * 80, sep="\n")
df_train.loc[:, target_cols].describe()

In [None]:
# Distribution plots: one histogram per target column, annotated with the
# column's median (red) and mean (green).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, col in zip(axes, target_cols):
    values = df_train[col]
    # Compute each statistic once and reuse it for both the line and its label.
    col_median = values.median()
    col_mean = values.mean()

    values.hist(bins=50, ax=ax, edgecolor='black', alpha=0.7)
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel('Value')
    ax.set_ylabel('Count')
    ax.axvline(col_median, color='red', linestyle='--',
               label=f'Median: {col_median:.4f}')
    ax.axvline(col_mean, color='green', linestyle='--',
               label=f'Mean: {col_mean:.4f}')
    ax.legend()

plt.tight_layout()
plt.show()

# Check for normality.
# The test is capped at SHAPIRO_MAX_N observations. When a column is longer,
# draw a seeded random sample instead of slicing the first rows: a head slice
# of time-ordered data would only test the earliest period.
SHAPIRO_MAX_N = 5000
SHAPIRO_MIN_N = 3      # scipy.stats.shapiro needs at least three observations
SHAPIRO_SEED = 42      # fixed seed keeps the cell reproducible on re-run

print("\nNormality Tests (Shapiro-Wilk):")
for col in target_cols:
    observed = df_train[col].dropna()
    if len(observed) > SHAPIRO_MAX_N:
        observed = observed.sample(n=SHAPIRO_MAX_N, random_state=SHAPIRO_SEED)
    if len(observed) < SHAPIRO_MIN_N:
        print(f"  {col:35s}: skipped (fewer than {SHAPIRO_MIN_N} non-null values)")
        continue
    stat, p_value = stats.shapiro(observed)
    print(f"  {col:35s}: p-value = {p_value:.6f} {'(Normal)' if p_value > 0.05 else '(Not Normal)'}")

## Temporal Analysis

In [None]:
# Two stacked panels: raw forward returns on top, rolling mean with a
# +/- one rolling-std band underneath.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(16, 10))
ax_raw, ax_roll = axes

date_ids = df_train['date_id']
fwd_returns = df_train['forward_returns']

# Top panel: the raw series.
ax_raw.plot(date_ids, fwd_returns, linewidth=0.8, alpha=0.7)
ax_raw.set_title('Forward Returns Over Time')

# Rolling statistics over consecutive rows (row order is taken as-is).
window = 21  # ~1 month
roller = fwd_returns.rolling(window=window)
rolling_mean = roller.mean()
rolling_std = roller.std()

# Bottom panel: rolling mean and the band around it.
ax_roll.plot(date_ids, rolling_mean, label=f'{window}-day Mean', linewidth=1.5)
ax_roll.fill_between(
    date_ids,
    rolling_mean - rolling_std,
    rolling_mean + rolling_std,
    alpha=0.3,
    label=f'{window}-day Std',
)
ax_roll.set_title(f'Forward Returns - {window}-Day Rolling Statistics')

# Decoration shared by both panels: axis labels, zero reference line, grid.
for panel in axes:
    panel.set_xlabel('Date ID')
    panel.set_ylabel('Forward Returns')
    panel.axhline(0, color='red', linestyle='--', linewidth=1, alpha=0.5)
    panel.grid(True, alpha=0.3)

ax_roll.legend()

plt.tight_layout()
plt.show()

## Missing Value Analysis

In [None]:
# Percentage of missing values per column, largest first, keeping only the
# columns that actually have gaps. `isnull().mean()` is the null fraction.
missing_pct = (df_train.isnull().mean() * 100).sort_values(ascending=False)
missing_pct = missing_pct[missing_pct > 0]

if len(missing_pct) > 0:
    print(f"Features with missing values: {len(missing_pct)}")
    print("\nTop 20 features by missing percentage:")
    print(missing_pct.head(20))

    # Bar chart of the (up to) 30 columns with the most missing data.
    fig, ax = plt.subplots(figsize=(12, 6))
    missing_pct.head(30).plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
    ax.set_title('Missing Value Percentage by Feature (Top 30)')
    ax.set_xlabel('Feature')
    ax.set_ylabel('Missing %')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    # The check mark was previously stored as mis-decoded UTF-8 bytes.
    print("✓ No missing values in the dataset!")

## Feature Correlation Analysis

In [None]:
# Features are every column except the date identifier and the target columns.
excluded_cols = {'date_id', *target_cols}
feature_cols = [c for c in df_train.columns if c not in excluded_cols]

# Correlate each feature with forward returns in a single pass. This replaces
# building the full feature-by-feature correlation matrix three times only to
# read one column from it.
signed_correlations = df_train[feature_cols].corrwith(df_train['forward_returns'])

# `correlations` holds the absolute values, strongest first.
correlations = signed_correlations.abs().sort_values(ascending=False)

print("Top 20 Features by Absolute Correlation with Forward Returns:")
print(correlations.head(20))

# Plot the signed correlations at both extremes, side by side.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Top positive correlations
top_pos = signed_correlations.sort_values(ascending=False).head(15)
top_pos.plot(kind='barh', ax=axes[0], color='green', edgecolor='black')
axes[0].set_title('Top 15 Positive Correlations with Forward Returns')
axes[0].set_xlabel('Correlation')
axes[0].grid(axis='x', alpha=0.3)

# Top negative correlations
top_neg = signed_correlations.sort_values(ascending=True).head(15)
top_neg.plot(kind='barh', ax=axes[1], color='red', edgecolor='black')
axes[1].set_title('Top 15 Negative Correlations with Forward Returns')
axes[1].set_xlabel('Correlation')
axes[1].grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## Feature Group Analysis

In [None]:
# For every feature group, average the absolute correlation between its
# columns and forward returns. Groups with no columns are left out.
available_cols = set(df_train.columns)
group_correlations = {}
for group, cols in feature_groups.items():
    present = [name for name in cols if name in available_cols]
    if not present:
        continue
    corr_matrix = df_train[present + ['forward_returns']].corr()
    target_corr = corr_matrix['forward_returns'].drop('forward_returns')
    group_correlations[group] = target_corr.abs().mean()

# Bar chart of the group averages, strongest group first.
ranked_groups = pd.Series(group_correlations).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
ranked_groups.plot.bar(color='purple', edgecolor='black')
plt.title('Average Absolute Correlation with Forward Returns by Feature Group')
plt.xlabel('Feature Group')
plt.ylabel('Average |Correlation|')
plt.xticks(rotation=45)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Same ranking as text.
print("\nFeature Group Importance (by avg correlation):")
by_strength = sorted(group_correlations.items(), key=lambda pair: pair[1], reverse=True)
for group, corr in by_strength:
    print(f"  {group:12s}: {corr:.4f}")

## Sharpe Ratio Analysis

In [None]:
# Rolling and overall Sharpe ratio.
#
# The Sharpe ratio is the mean of returns in excess of the risk-free rate
# divided by their standard deviation, conventionally annualized by the square
# root of the number of periods per year. The previous version divided the raw
# per-period mean of total returns by its std, which is not comparable to the
# +/-1 reference lines drawn below.
# NOTE(review): assumes `risk_free_rate` is a per-period rate in the same units
# as `forward_returns` and that rows are daily — confirm against the data docs.
window = 252  # ~1 year
TRADING_DAYS_PER_YEAR = 252
annualization = np.sqrt(TRADING_DAYS_PER_YEAR)

excess_returns = df_train['forward_returns'] - df_train['risk_free_rate']

rolling_excess = excess_returns.rolling(window=window)
# A zero-volatility window would give +/-inf; show it as a gap instead.
rolling_vol = rolling_excess.std().replace(0, np.nan)
rolling_sharpe = annualization * rolling_excess.mean() / rolling_vol

plt.figure(figsize=(14, 6))
plt.plot(df_train['date_id'], rolling_sharpe, linewidth=1.5, label=f'{window}-day Rolling Sharpe')
plt.axhline(0, color='red', linestyle='--', linewidth=1, alpha=0.5)
plt.axhline(1, color='green', linestyle='--', linewidth=1, alpha=0.5, label='Sharpe = 1')
plt.axhline(-1, color='orange', linestyle='--', linewidth=1, alpha=0.5, label='Sharpe = -1')
plt.title(f'Rolling Annualized Sharpe Ratio ({window}-day window)')
plt.xlabel('Date ID')
plt.ylabel('Annualized Sharpe Ratio')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Overall Sharpe ratio over the full training period (NaN if volatility is
# zero or undefined).
excess_vol = excess_returns.std()
overall_sharpe = (
    annualization * excess_returns.mean() / excess_vol
    if excess_vol > 0
    else float('nan')
)
print(f"\nOverall Sharpe Ratio (annualized): {overall_sharpe:.4f}")

## Summary Statistics

In [None]:
# Text recap of the dataset; relies on values computed in earlier cells
# (feature_cols, feature_groups, overall_sharpe, missing_pct).
rule = "=" * 80
fwd_returns = df_train['forward_returns']

print(rule)
print("Hull Tactical Market Prediction - Dataset Summary")
print(rule)

# Dataset dimensions.
print(f"Training samples: {len(df_train)}")
print(f"Test samples: {len(df_test)}")
print(f"Total features: {len(feature_cols)}")

# Feature count per group.
print("\nFeature Groups:")
for group, cols in feature_groups.items():
    print(f"  {group:12s}: {len(cols):2d} features")

# Headline statistics of the forward-returns target.
print("\nTarget Statistics:")
print("  Forward Returns:")
print(f"    Mean: {fwd_returns.mean():.6f}")
print(f"    Std:  {fwd_returns.std():.6f}")
print(f"    Min:  {fwd_returns.min():.6f}")
print(f"    Max:  {fwd_returns.max():.6f}")
print(f"  Sharpe Ratio: {overall_sharpe:.4f}")

print(f"\nMissing Values: {len(missing_pct)} features with missing data")
print(rule)