In [6]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# -----------------------------------
# PART A: Aggregate Contract-Level Data to Daily Data Using Moments
# -----------------------------------
# Read the contract-level rows and make 'Date' a proper datetime so it can be
# grouped and sorted chronologically.
df = pd.read_csv('qqq_merged_data_engineered.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Keep only columns that are fully populated (any column with an NA is dropped).
df = df.dropna(axis=1)

# Every remaining column other than 'Date' is summarised per day.
contract_feature_cols = [col for col in df.columns if col != 'Date']

# (suffix, reducer) pairs, applied in this order to each column.
# np.std uses its default ddof=0; skew/kurtosis use the scipy defaults.
_moment_reducers = (
    ('mean', np.mean),
    ('std', np.std),
    ('skew', skew),
    ('kurtosis', kurtosis),
)

# One output row per date: '<col>_<suffix>' for every column/reducer pair.
agg_rows = []
for day, contracts in df.groupby('Date'):
    day_row = {'Date': day}
    for name in contract_feature_cols:
        # Cast to float before reducing, exactly once per column.
        series_values = contracts[name].values.astype(float)
        day_row.update(
            {f'{name}_{suffix}': reducer(series_values)
             for suffix, reducer in _moment_reducers}
        )
    agg_rows.append(day_row)

daily_df = pd.DataFrame(agg_rows).sort_values('Date').reset_index(drop=True)
print("Aggregated daily data shape:", daily_df.shape)
print("Unique dates in aggregated daily data:", daily_df['Date'].nunique())

# Build the prediction target from the aggregated "Close/Last_mean" column.
if 'Close/Last_mean' not in daily_df.columns:
    raise ValueError("Expected aggregated column 'Close/Last_mean' not found.")

# Next_Close is the following row's aggregated close; the final row has no
# successor, so it ends up NaN and is removed.
daily_df['Next_Close'] = daily_df['Close/Last_mean'].shift(-1)
has_next_close = daily_df['Next_Close'].notna()
daily_df = daily_df[has_next_close].reset_index(drop=True)

# Binary Direction: 1 when the next aggregated close is strictly higher than
# the current one, otherwise 0 (ties count as 0).
daily_df['Direction'] = (
    daily_df['Next_Close'].gt(daily_df['Close/Last_mean']).astype(int)
)

# -----------------------------------
# PART B: Walk-Forward Sliding Window with Bootstrapping
# -----------------------------------
# For this example we treat each daily row as one sample.
# We need to design a walk-forward approach since there are only ~20 dates.
# We also use bootstrap sampling within each training window to alleviate the small sample size.

# Model inputs are every daily column except the date and the two
# target-related columns; original column order is kept.
excluded_cols = ['Date', 'Next_Close', 'Direction']
features = daily_df.columns[~daily_df.columns.isin(excluded_cols)].tolist()

# Walk-forward settings: number of leading days in the first training window
# (10 out of roughly 20 dates) and how many bootstrap models are fit per step.
initial_window, bootstrap_iterations = 10, 50

results = []

# Hyperparameters are the same for every bootstrap model, so define them once
# (fixed values; tune as needed).
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=50,
    max_depth=3,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Seeded generator for the bootstrap draws. The models already use a fixed
# random_state, so seeding the resampling as well makes the whole walk-forward
# run reproducible from one execution to the next.
bootstrap_rng = np.random.default_rng(42)

# Walk-forward: for each day index i from initial_window to the end, train on
# all earlier rows (expanding window) and forecast row i.
for i in range(initial_window, len(daily_df)):
    train_df = daily_df.iloc[:i]        # Rows 0 .. i-1.
    test_df = daily_df.iloc[i:i + 1]    # Row i is the forecast row.
    forecast_date = test_df['Date'].iloc[0]

    X_train = train_df[features].values
    y_train = train_df['Direction'].values
    X_test = test_df[features].values
    y_true = test_df['Direction'].values[0]

    # Fit the scaler on the training rows only, then apply it to the test row,
    # so no information from the forecast row enters the scaling statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    train_classes = np.unique(y_train)
    if train_classes.size < 2:
        # Degenerate window: only one label has ever been observed, so there is
        # nothing for a classifier to separate. Fall back to that label.
        avg_prob = float(train_classes[0])
    else:
        n_train = len(X_train_scaled)
        boot_probs = []
        for _ in range(bootstrap_iterations):
            # Resample the training rows with replacement. With so few rows a
            # draw can contain a single class; such a draw gives the model
            # nothing to discriminate (and, depending on the xgboost version,
            # an all-ones label vector can be rejected outright), so redraw
            # until both classes are present.
            indices = bootstrap_rng.choice(n_train, size=n_train, replace=True)
            while np.unique(y_train[indices]).size < 2:
                indices = bootstrap_rng.choice(n_train, size=n_train, replace=True)
            X_boot = X_train_scaled[indices]
            y_boot = y_train[indices]

            model = XGBClassifier(**xgb_params)
            model.fit(X_boot, y_boot)
            # Probability of class 1 for the single test row.
            boot_probs.append(model.predict_proba(X_test_scaled)[0, 1])

        # Ensemble forecast: mean class-1 probability over all bootstrap models.
        avg_prob = np.mean(boot_probs)

    # Fixed 0.5 decision threshold; it could instead be tuned on the training set.
    pred = 1 if avg_prob >= 0.5 else 0

    results.append({
        'Date': forecast_date,
        'True_Direction': y_true,
        'Predicted_Direction': pred,
        'Avg_Probability': avg_prob
    })
    print(f"Forecast for {forecast_date.strftime('%Y-%m-%d')}: True={y_true}, Pred={pred}, Avg_Prob={avg_prob:.3f}")

# Collect the per-day forecasts into a table and show it.
results_df = pd.DataFrame(results)
print("\nWalk-forward Bootstrapped Results:")
print(results_df)

# Per-class precision/recall/F1 of predicted vs. true direction over all
# walk-forward forecasts.
actual_direction = results_df['True_Direction']
predicted_direction = results_df['Predicted_Direction']
print("\nClassification Report (walk-forward):")
print(classification_report(actual_direction, predicted_direction))

# Persist the results table (CSV plus a joblib pickle of the same DataFrame).
# Only the results are written here; the fitted models are not saved.
results_df.to_csv('bootstrapped_direction_walkforward.csv', index=False)
joblib.dump(results_df, 'bootstrapped_direction_walkforward_results.pkl')


Aggregated daily data shape: (21, 173)
Unique dates in aggregated daily data: 21
Forecast for 2025-03-28: True=0, Pred=1, Avg_Prob=0.501
Forecast for 2025-03-31: True=1, Pred=0, Avg_Prob=0.472
Forecast for 2025-04-01: True=1, Pred=0, Avg_Prob=0.494
Forecast for 2025-04-02: True=0, Pred=1, Avg_Prob=0.554
Forecast for 2025-04-03: True=0, Pred=1, Avg_Prob=0.510
Forecast for 2025-04-04: True=1, Pred=0, Avg_Prob=0.494
Forecast for 2025-04-07: True=0, Pred=1, Avg_Prob=0.541
Forecast for 2025-04-08: True=1, Pred=1, Avg_Prob=0.512
Forecast for 2025-04-09: True=0, Pred=0, Avg_Prob=0.496
Forecast for 2025-04-10: True=1, Pred=0, Avg_Prob=0.458

Walk-forward Bootstrapped Results:
        Date  True_Direction  Predicted_Direction  Avg_Probability
0 2025-03-28               0                    1         0.500886
1 2025-03-31               1                    0         0.471660
2 2025-04-01               1                    0         0.494326
3 2025-04-02               0                    1      

['bootstrapped_direction_walkforward_results.pkl']