In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data and Prepare Features

In [None]:
# Read the preprocessed dataset from disk.
preprocessed_path = '../data/preprocessed_final.csv'
df = pd.read_csv(preprocessed_path)

# Quick sanity summary: overall dimensions, then only the leading column
# names so a wide frame does not flood the output.
n_preview_cols = 10
leading_columns = df.columns.tolist()[:n_preview_cols]
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {leading_columns}...")
df.head()

In [None]:
# Partition the columns into metadata, prediction targets and model features
# (same feature definition as used in the individual models).
metadata_cols = ['date', 'speaker', 'text', 'text_clean', 'text_clean_nostop']

# Any column whose name starts with one of these prefixes is a target/label.
target_prefixes = ('target_', 'class_')
target_cols = [name for name in df.columns if name.startswith(target_prefixes)]

# Whatever is neither metadata nor a target is treated as a feature;
# iterating df.columns keeps the original column order.
excluded_cols = set(metadata_cols) | set(target_cols)
feature_cols = [name for name in df.columns if name not in excluded_cols]

print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols[:15]}...")  # Show first 15

In [None]:
# Binarize the S&P 500 target columns: 1 where the value is strictly
# positive, 0 otherwise.
# NOTE(review): a missing (NaN) target compares False and is therefore
# labelled 0 - confirm the target columns contain no NaNs.
y_1d = df['target_SP500_1d'].gt(0).astype(int)
y_5d = df['target_SP500_5d'].gt(0).astype(int)

# Show the class balance of each binary target.
for title, labels in (
    ("1-Day Binary Target Distribution:", y_1d),
    (f"\n5-Day Binary Target Distribution:", y_5d),
):
    print(title)
    print(labels.value_counts())

In [None]:
# Build the feature matrix; missing feature values are replaced with 0.
X = df[feature_cols].fillna(0)

# Time-series style split (80/20): the first 80% of rows train, the remaining
# rows test. No shuffling is done, so row order is preserved.
# NOTE(review): assumes rows are already in chronological order - confirm.
split_index = int(len(X) * 0.8)
train_rows = slice(None, split_index)
test_rows = slice(split_index, None)

X_train, y_1d_train, y_5d_train = X.iloc[train_rows], y_1d.iloc[train_rows], y_5d.iloc[train_rows]
X_test, y_1d_test, y_5d_test = X.iloc[test_rows], y_1d.iloc[test_rows], y_5d.iloc[test_rows]

print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

## Load Pre-trained Models

Based on previous results:
- **Best 1-day model**: XGBoost (54.90% accuracy, 0.512 F1)
- **Best 5-day model**: Logistic Regression (58.26% accuracy, 0.440 F1)

In [None]:
# Load saved models
def load_pickle_or_none(path, loaded_msg, missing_msg):
    """Best-effort load of a pickled artifact.

    Parameters
    ----------
    path : str
        Location of the pickle file.
    loaded_msg : str
        Message printed when the artifact is loaded successfully.
    missing_msg : str
        Message printed when the file does not exist.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when the file is missing or cannot
        be unpickled (the caller is expected to build a replacement).

    Notes
    -----
    Only ``Exception`` subclasses are handled, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate (a bare ``except:`` would swallow them).
    A file that exists but fails to load is reported with the real reason
    instead of being mislabelled as "not found".

    SECURITY: ``pickle.load`` can execute arbitrary code; only load files
    produced by this project.
    """
    try:
        with open(path, 'rb') as f:
            artifact = pickle.load(f)
    except FileNotFoundError:
        print(missing_msg)
        return None
    except Exception as exc:
        # Corrupt file, incompatible library version, permission error, ...
        print(f"✗ Could not load {path} ({type(exc).__name__}: {exc}) - will rebuild")
        return None
    print(loaded_msg)
    return artifact


xgb_model = load_pickle_or_none(
    '../results/xgboost_model.pkl',
    "✓ Loaded XGBoost model",
    "✗ XGBoost model not found - will train new one",
)

lr_model = load_pickle_or_none(
    '../results/logistic_regression_model.pkl',
    "✓ Loaded Logistic Regression model",
    "✗ Logistic Regression model not found - will train new one",
)

scaler = load_pickle_or_none(
    '../results/scaler.pkl',
    "✓ Loaded scaler",
    "✗ Scaler not found - will create new one",
)

In [None]:
# If models don't exist, train them
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Fit a fresh scaler on the training rows only when none was loaded; either
# way, both splits are then transformed with that same scaler.
if scaler is None:
    scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost fallback: trained on the unscaled features against the 1-day target.
if xgb_model is None:
    print("Training XGBoost for 1-day predictions...")
    xgb_params = dict(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
    )
    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(X_train, y_1d_train)
    print("✓ XGBoost trained")

# Logistic Regression fallback: trained on the standardized features against
# the 5-day target.
if lr_model is None:
    print("Training Logistic Regression for 5-day predictions...")
    lr_params = dict(
        C=1.0,
        penalty='l2',
        solver='lbfgs',
        max_iter=1000,
        random_state=42,
    )
    lr_model = LogisticRegression(**lr_params)
    lr_model.fit(X_train_scaled, y_5d_train)
    print("✓ Logistic Regression trained")

## Generate Predictions from Individual Models

In [None]:
# Positive-class probabilities from each model on the test rows.
# XGBoost consumes the raw features, Logistic Regression the scaled ones.
xgb_proba_1d = xgb_model.predict_proba(X_test)[:, 1]
lr_proba_5d = lr_model.predict_proba(X_test_scaled)[:, 1]

# Decision cut-offs described as optimal in previous results
# (XGBoost 1-day and Logistic Regression 5-day respectively).
xgb_cutoff = 0.35
lr_cutoff = 0.45
xgb_pred_1d = (xgb_proba_1d >= xgb_cutoff).astype(int)
lr_pred_5d = (lr_proba_5d >= lr_cutoff).astype(int)

print("Individual Model Performance:")
print("="*60)

# Report each model against the target it was trained for.
for heading, truth, predicted in (
    ("XGBoost (1-day)", y_1d_test, xgb_pred_1d),
    ("Logistic Regression (5-day)", y_5d_test, lr_pred_5d),
):
    print(f"\n{heading}:")
    print(f"  Accuracy: {accuracy_score(truth, predicted):.4f}")
    print(f"  F1 Score: {f1_score(truth, predicted, average='weighted'):.4f}")

## Ensemble Strategy 1: Averaged Probabilities

In [None]:
# Combine the two models with an unweighted mean of their positive-class
# probabilities.
ensemble_proba_avg = (xgb_proba_1d + lr_proba_5d) / 2

# Sweep decision thresholds 0.30 .. 0.70 in steps of 0.05 and score each one
# against both the 1-day and the 5-day target.
# NOTE(review): the winning threshold is picked using the test labels, so the
# "best" figures printed here are optimistic.
print("Ensemble (Average Probabilities) - Testing Thresholds:")
print("="*60)

report_line = (
    "Threshold {threshold:.2f}: 1d={acc_1d:.4f} (F1={f1_1d:.3f}), "
    "5d={acc_5d:.4f} (F1={f1_5d:.3f}), Avg={avg_acc:.4f}"
)

best_acc = 0
best_threshold = 0.5
results = []

for threshold in np.arange(0.30, 0.71, 0.05):
    ensemble_pred = (ensemble_proba_avg >= threshold).astype(int)

    # One record per threshold; key order fixes the column order of results_df.
    row = {
        'threshold': threshold,
        'acc_1d': accuracy_score(y_1d_test, ensemble_pred),
        'acc_5d': accuracy_score(y_5d_test, ensemble_pred),
        'f1_1d': f1_score(y_1d_test, ensemble_pred, average='weighted'),
        'f1_5d': f1_score(y_5d_test, ensemble_pred, average='weighted'),
    }
    row['avg_acc'] = (row['acc_1d'] + row['acc_5d']) / 2
    results.append(row)

    # Strict '>' keeps the first threshold that reaches the maximum.
    if row['avg_acc'] > best_acc:
        best_acc, best_threshold = row['avg_acc'], threshold

    print(report_line.format(**row))

results_df = pd.DataFrame(results)
print(f"\nBest Average Accuracy: {best_acc:.4f} at threshold {best_threshold:.2f}")

## Ensemble Strategy 2: Weighted Average (Favor Better Model)

In [None]:
# Blend the two probability vectors with several fixed weightings, ranging
# from XGBoost-heavy to Logistic-Regression-heavy. For every weighting the
# decision threshold is tuned by average accuracy over both targets.
# NOTE(review): tuning uses the test labels, so results are optimistic.

print("Ensemble (Weighted Probabilities):")
print("="*60)

# (XGBoost weight, Logistic Regression weight, display label)
weight_configs = [
    (0.7, 0.3, "70% XGB, 30% LR"),
    (0.6, 0.4, "60% XGB, 40% LR"),
    (0.5, 0.5, "50% XGB, 50% LR"),
    (0.4, 0.6, "40% XGB, 60% LR"),
    (0.3, 0.7, "30% XGB, 70% LR"),
]

weighted_results = []

for w_xgb, w_lr, label in weight_configs:
    ensemble_proba_weighted = w_xgb * xgb_proba_1d + w_lr * lr_proba_5d

    # Threshold search: the first threshold reaching the highest average wins.
    best_acc_weighted, best_thresh_weighted = 0, 0.5
    for threshold in np.arange(0.30, 0.71, 0.05):
        trial_pred = (ensemble_proba_weighted >= threshold).astype(int)
        trial_avg = (
            accuracy_score(y_1d_test, trial_pred)
            + accuracy_score(y_5d_test, trial_pred)
        ) / 2
        if trial_avg > best_acc_weighted:
            best_acc_weighted, best_thresh_weighted = trial_avg, threshold

    # Full metric set at the selected threshold.
    ensemble_pred = (ensemble_proba_weighted >= best_thresh_weighted).astype(int)
    summary = {
        'weights': label,
        'w_xgb': w_xgb,
        'w_lr': w_lr,
        'threshold': best_thresh_weighted,
        'acc_1d': accuracy_score(y_1d_test, ensemble_pred),
        'acc_5d': accuracy_score(y_5d_test, ensemble_pred),
        'f1_1d': f1_score(y_1d_test, ensemble_pred, average='weighted'),
        'f1_5d': f1_score(y_5d_test, ensemble_pred, average='weighted'),
    }
    summary['avg_acc'] = (summary['acc_1d'] + summary['acc_5d']) / 2
    weighted_results.append(summary)

    print(f"{label} (threshold={best_thresh_weighted:.2f}):")
    print(f"  1-day: {summary['acc_1d']:.4f} (F1={summary['f1_1d']:.3f})")
    print(f"  5-day: {summary['acc_5d']:.4f} (F1={summary['f1_5d']:.3f})")
    print(f"  Average: {summary['avg_acc']:.4f}")
    print()

weighted_results_df = pd.DataFrame(weighted_results)

## Ensemble Strategy 3: Majority Voting (Unanimous Agreement of Both Models)

In [None]:
# Voting with two voters: the ensemble predicts 1 only when both models
# predict 1 (unanimous agreement on the positive class); otherwise it predicts 0.
both_positive = np.logical_and(xgb_pred_1d == 1, lr_pred_5d == 1)
ensemble_vote = both_positive.astype(int)

print("Ensemble (Majority Voting):")
print("="*60)

# Score the voted prediction against each target.
acc_1d_vote, acc_5d_vote = (
    accuracy_score(truth, ensemble_vote) for truth in (y_1d_test, y_5d_test)
)
f1_1d_vote, f1_5d_vote = (
    f1_score(truth, ensemble_vote, average='weighted') for truth in (y_1d_test, y_5d_test)
)

print(f"1-day: Accuracy={acc_1d_vote:.4f}, F1={f1_1d_vote:.3f}")
print(f"5-day: Accuracy={acc_5d_vote:.4f}, F1={f1_5d_vote:.3f}")
print(f"Average: {(acc_1d_vote + acc_5d_vote) / 2:.4f}")

# How often the vote yields each class.
print(f"\nPrediction distribution:")
print(pd.Series(ensemble_vote).value_counts())

## Final Comparison: Individual vs Ensemble Models

In [None]:
# Calculate baselines: always predict the class that is most frequent in the
# training labels, then score that constant prediction on the test labels.
majority_pred_1d = np.full(len(y_1d_test), y_1d_train.mode()[0])
majority_pred_5d = np.full(len(y_5d_test), y_5d_train.mode()[0])
baseline_1d = accuracy_score(y_1d_test, majority_pred_1d)
baseline_5d = accuracy_score(y_5d_test, majority_pred_5d)

# Real weighted F1 of the constant predictors (previously hard-coded to 0.0).
# zero_division=0 scores the never-predicted class as 0 without a warning.
baseline_f1_1d = f1_score(y_1d_test, majority_pred_1d, average='weighted', zero_division=0)
baseline_f1_5d = f1_score(y_5d_test, majority_pred_5d, average='weighted', zero_division=0)

# Best ensemble from weighted approach (highest average accuracy).
best_weighted = weighted_results_df.loc[weighted_results_df['avg_acc'].idxmax()]
best_ensemble_proba = best_weighted['w_xgb'] * xgb_proba_1d + best_weighted['w_lr'] * lr_proba_5d
best_ensemble_pred = (best_ensemble_proba >= best_weighted['threshold']).astype(int)


def _both_horizon_metrics(pred):
    """Score one prediction vector against both the 1-day and 5-day test targets.

    Returns a dict keyed by the comparison table's metric column names.
    """
    acc_1d = accuracy_score(y_1d_test, pred)
    acc_5d = accuracy_score(y_5d_test, pred)
    return {
        '1-Day Acc': acc_1d,
        '1-Day F1': f1_score(y_1d_test, pred, average='weighted'),
        '5-Day Acc': acc_5d,
        '5-Day F1': f1_score(y_5d_test, pred, average='weighted'),
        'Avg Acc': (acc_1d + acc_5d) / 2,
    }


print("="*90)
print(" "*25 + "FINAL MODEL COMPARISON")
print("="*90)

# Each baseline is defined for a single horizon only. Cells that do not apply
# are NaN rather than 0.0 so they are not mistaken for real scores. That
# includes 'Avg Acc': a one-horizon accuracy must not compete with genuine
# two-horizon averages. idxmax() skips NaN, and NaN fails a `> 0` filter just
# as the old 0.0 placeholder did.
NOT_APPLICABLE = np.nan

comparison = pd.DataFrame([
    {
        'Model': 'Baseline (1-day)',
        '1-Day Acc': baseline_1d,
        '1-Day F1': baseline_f1_1d,
        '5-Day Acc': NOT_APPLICABLE,
        '5-Day F1': NOT_APPLICABLE,
        'Avg Acc': NOT_APPLICABLE,
        'Strategy': 'Always predict majority class'
    },
    {
        'Model': 'Baseline (5-day)',
        '1-Day Acc': NOT_APPLICABLE,
        '1-Day F1': NOT_APPLICABLE,
        '5-Day Acc': baseline_5d,
        '5-Day F1': baseline_f1_5d,
        'Avg Acc': NOT_APPLICABLE,
        'Strategy': 'Always predict majority class'
    },
    {
        'Model': 'XGBoost (1-day specialist)',
        **_both_horizon_metrics(xgb_pred_1d),
        'Strategy': 'Threshold=0.35'
    },
    {
        'Model': 'LogReg (5-day specialist)',
        **_both_horizon_metrics(lr_pred_5d),
        'Strategy': 'Threshold=0.45'
    },
    {
        # Metrics were already computed when the weighting was evaluated.
        'Model': 'Ensemble (Best Weighted)',
        '1-Day Acc': best_weighted['acc_1d'],
        '1-Day F1': best_weighted['f1_1d'],
        '5-Day Acc': best_weighted['acc_5d'],
        '5-Day F1': best_weighted['f1_5d'],
        'Avg Acc': best_weighted['avg_acc'],
        'Strategy': f"{best_weighted['weights']}, threshold={best_weighted['threshold']:.2f}"
    }
])

print("\n" + comparison.to_string(index=False))
print("\n" + "="*90)

# Highlight the best row per metric (NaN cells are ignored by idxmax).
best_1d_idx = comparison['1-Day Acc'].idxmax()
best_5d_idx = comparison['5-Day Acc'].idxmax()
best_avg_idx = comparison['Avg Acc'].idxmax()

print("\nBEST PERFORMERS:")
print(f"  1-Day Prediction: {comparison.loc[best_1d_idx, 'Model']} ({comparison.loc[best_1d_idx, '1-Day Acc']:.4f})")
print(f"  5-Day Prediction: {comparison.loc[best_5d_idx, 'Model']} ({comparison.loc[best_5d_idx, '5-Day Acc']:.4f})")
print(f"  Overall (Average): {comparison.loc[best_avg_idx, 'Model']} ({comparison.loc[best_avg_idx, 'Avg Acc']:.4f})")
print("="*90)

## Visualization: Model Performance Comparison

In [None]:
def _accuracy_panel(ax, column, baseline, color, title, xlim):
    """Draw one horizontal-bar accuracy panel with its baseline marker.

    Only rows of `comparison` whose `column` value is greater than 0 are
    drawn. Returns the plotted subset and the bar container.
    """
    shown = comparison[comparison[column] > 0].copy()
    bars = ax.barh(shown['Model'], shown[column], color=color, alpha=0.7)
    ax.axvline(baseline, color='red', linestyle='--', label=f'Baseline ({baseline:.3f})')
    ax.set_xlabel('Accuracy')
    ax.set_title(title)
    ax.legend()
    ax.set_xlim(xlim)
    return shown, bars


fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: 1-day accuracy; right panel: 5-day accuracy.
models_for_plot, bars1 = _accuracy_panel(
    ax1, '1-Day Acc', baseline_1d, 'steelblue',
    '1-Day Return Prediction Performance', [0.45, 0.60],
)
models_for_plot_5d, bars2 = _accuracy_panel(
    ax2, '5-Day Acc', baseline_5d, 'darkgreen',
    '5-Day Return Prediction Performance', [0.45, 0.65],
)

# Write the figure to disk before showing it.
plt.tight_layout()
plt.savefig('../results/ensemble_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved visualization to results/ensemble_comparison.png")

## Save Best Ensemble Model

In [None]:
# Persist the per-row test-set outputs: both models' probabilities, the best
# weighted blend and its thresholded prediction, and the two true labels.
prediction_columns = {
    'xgb_proba_1d': xgb_proba_1d,
    'lr_proba_5d': lr_proba_5d,
    'ensemble_proba': best_ensemble_proba,
    'ensemble_pred': best_ensemble_pred,
    'true_1d': y_1d_test.values,
    'true_5d': y_5d_test.values,
}
ensemble_results = pd.DataFrame(prediction_columns)

ensemble_results.to_csv('../results/ensemble_predictions.csv', index=False)
print("✓ Saved ensemble predictions to results/ensemble_predictions.csv")

# Ensemble configuration: blend weights, blend threshold, the individual
# models' thresholds, and the metrics recorded for the best weighting.
performance = {
    saved_key: best_weighted[source_key]
    for saved_key, source_key in (
        ('1d_accuracy', 'acc_1d'),
        ('5d_accuracy', 'acc_5d'),
        ('avg_accuracy', 'avg_acc'),
        ('1d_f1', 'f1_1d'),
        ('5d_f1', 'f1_5d'),
    )
}
ensemble_config = {
    'xgb_weight': best_weighted['w_xgb'],
    'lr_weight': best_weighted['w_lr'],
    'threshold': best_weighted['threshold'],
    'xgb_threshold': 0.35,
    'lr_threshold': 0.45,
    'performance': performance,
}

with open('../results/ensemble_config.pkl', 'wb') as f:
    pickle.dump(ensemble_config, f)
print("✓ Saved ensemble configuration to results/ensemble_config.pkl")

# Echo the configuration: scalar settings first, then the nested metrics.
print("\nEnsemble Configuration:")
for key, value in ensemble_config.items():
    if key == 'performance':
        continue
    print(f"  {key}: {value}")
print("  Performance:")
for metric, score in performance.items():
    print(f"    {metric}: {score:.4f}")