In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
import lightgbm as lgb

# 1️⃣ Load preprocessed feature data
# NOTE(review): absolute, machine-specific path — consider a configurable DATA_DIR.
df = pd.read_csv(r"C:\Users\mrity\OneDrive\Desktop\Alternative-Data-Alpha-Mining\data\raw\processed\ffff.csv", parse_dates=['date'])

# Put rows in chronological order (ticker as tie-breaker) before anything else.
# Both the per-ticker shift/rolling features below and the later unshuffled
# train/test split silently depend on row order; without an explicit sort they
# are only correct if the CSV happens to be stored date-ordered.
df = df.sort_values(['date', 'ticker'], kind='stable').reset_index(drop=True)

# 2️⃣ Advanced Feature Engineering
# Zero out weak sentiment readings: values with |sentiment| <= 0.2 become 0.
df['reddit_filtered'] = df['reddit_sentiment'].where(df['reddit_sentiment'].abs() > 0.2, 0)
df['news_filtered'] = df['news_sentiment'].where(df['news_sentiment'].abs() > 0.2, 0)

# Per-ticker rolling means over the previous 3 and 7 rows. shift(1) keeps the
# current row's own value out of its rolling window.
for col in ['trend_score', 'reddit_sentiment', 'news_sentiment']:
    df[f'{col}_roll3'] = df.groupby('ticker')[col].transform(lambda x: x.shift(1).rolling(3, min_periods=1).mean())
    df[f'{col}_roll7'] = df.groupby('ticker')[col].transform(lambda x: x.shift(1).rolling(7, min_periods=1).mean())

# Short-window minus long-window trend average (computed once; the original
# notebook repeated this assignment).
df['trend_momentum'] = df['trend_score_roll3'] - df['trend_score_roll7']

# Lagged return statistics. No min_periods here, so the first rows of each
# ticker are NaN and get dropped in the cleaning step below.
df['return_roll3'] = df.groupby('ticker')['return'].transform(lambda x: x.shift(1).rolling(3).mean())
df['return_delta'] = df.groupby('ticker')['return'].diff()

# +1 when reddit sentiment and trend score are both positive, -1 when both are
# negative, 0 otherwise.
df['signal_agreement'] = ((df['reddit_sentiment'] > 0) & (df['trend_score'] > 0)).astype(int) - \
                         ((df['reddit_sentiment'] < 0) & (df['trend_score'] < 0)).astype(int)

# ⏱ Rolling volatility (risk): std of the previous 5 returns per ticker
df['return_volatility_5'] = df.groupby('ticker')['return'].transform(lambda x: x.shift(1).rolling(5).std())

# 🧠 Interaction terms
df['reddit_trend_agree'] = df['reddit_sentiment'] * df['trend_score']
df['news_trend_agree'] = df['news_sentiment'] * df['trend_score']

# 3️⃣ Clean and prepare
# Every column except identifiers, the raw return and the targets is a feature.
exclude_cols = ['date', 'ticker', 'return', 'target_return', 'target_up']
feature_cols = [col for col in df.columns if col not in exclude_cols]
df = df.dropna(subset=feature_cols + ['target_up'])
X = df[feature_cols]
y = df['target_up']

# 4️⃣ Train/test split
# shuffle=False: the first 80% of rows train the models, the last 20% test them.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# 5️⃣ Class weights
# 'balanced' weights computed on the training labels only, keyed by class id.
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=y_train,
)
class_weights_dict = dict(zip((0, 1), weights))

# 6️⃣ Train Random Forest
# Fitted on every feature; its importances drive the selection step below.
rf = RandomForestClassifier(
    random_state=42,
    class_weight=class_weights_dict,
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=1,
)
rf.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of class 1.
rf_probs = rf.predict_proba(X_test)[:, 1]

# 7️⃣ Feature Importance & Selection (Top 15)
importances = pd.Series(rf.feature_importances_, index=feature_cols)
ranked_importances = importances.sort_values(ascending=False)
top_features = ranked_importances.head(15).index.tolist()
X_train_sel = X_train[top_features]
X_test_sel = X_test[top_features]

# 8️⃣ Train LightGBM
# Trained only on the features the random forest ranked highest.
lgb_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced')
lgb_model.fit(X_train_sel, y_train)
lgb_probs = lgb_model.predict_proba(X_test_sel)[:, 1]

# 9️⃣ Evaluate at Different Thresholds
# Each pair is (low, high): a prediction counts as "confident" only when the
# class-1 probability is below `low` (predict 0) or above `high` (predict 1).
thresholds = [(0.35, 0.65), (0.4, 0.6), (0.45, 0.55)]

print("🔍 Evaluation of Confident Predictions Only")
for low, high in thresholds:
    for name, probs in [('Random Forest', rf_probs), ('LightGBM', lgb_probs)]:
        confident_up = probs > high
        confident_down = probs < low
        confident_mask = confident_up | confident_down

        # Restrict both labels and predictions to the confident subset.
        y_true = y_test[confident_mask]
        y_pred = np.where(confident_up, 1, 0)[confident_mask]

        print(f"\n📊 {name} | Thresholds: low={low}, high={high}")
        print(f"Confident Predictions: {len(y_true)} / {len(y_test)}")

        # With no confident predictions there is nothing to score, and the
        # sklearn metric helpers cannot work with empty input.
        if len(y_true) == 0:
            print("No confident predictions at these thresholds; skipping metrics.")
            continue

        # labels=[0, 1] keeps the matrix 2x2 even if only one class shows up
        # in the confident subset.
        print(confusion_matrix(y_true, y_pred, labels=[0, 1]))
        print(classification_report(y_true, y_pred))


[LightGBM] [Info] Number of positive: 49084, number of negative: 45360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3435
[LightGBM] [Info] Number of data points in the train set: 94444, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
🔍 Evaluation of Confident Predictions Only

📊 Random Forest | Thresholds: low=0.35, high=0.65
Confident Predictions: 89 / 23612
[[28 16]
 [41  4]]
              precision    recall  f1-score   support

           0       0.41      0.64      0.50        44
           1       0.20      0.09      0.12        45

    accuracy                           0.36        89
   macro avg       0.30      0.36      0.31        89
weighted avg       0.30  

In [20]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight

def rolling_backtest_calendar(df, feature_cols, label_col='target_up',
                              train_days=60, test_days=20,
                              thresholds=(0.35, 0.65)):
    """Walk-forward backtest over rolling windows of distinct dates.

    The distinct values of ``df['date']`` are split into consecutive windows:
    ``train_days`` dates for training followed by ``test_days`` dates for
    testing, with the window start advancing by ``test_days`` each step. In
    each window a random forest is fitted on all ``feature_cols``, its 15 most
    important features are kept, a second forest is fitted on those, and only
    test rows whose class-1 probability is outside ``thresholds`` are traded.

    Windows are skipped when either side is empty, when the training labels
    contain a single class, or when no test row is confident.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``ticker``, ``target_return``, ``label_col``
        and every column in ``feature_cols``. Not modified.
    feature_cols : list of str
        Candidate feature columns.
    label_col : str
        Binary (0/1) label column.
    train_days, test_days : int
        Window lengths, counted in distinct dates present in ``df``.
    thresholds : tuple of (low, high)
        Probability below ``low`` means predict 0 (short); above ``high``
        means predict 1 (long).

    Returns
    -------
    pandas.DataFrame
        One row per confident prediction with columns ``date``, ``ticker``,
        ``pred``, ``true``, ``target_return`` and ``pnl`` (``target_return``
        for pred == 1, its negation otherwise). When no window produces a
        trade, an empty frame with these same columns is returned so that
        downstream column access still works.

    Notes
    -----
    NOTE(review): if ``target_return`` is a forward-looking return, the labels
    of the last training date may overlap the first test date — confirm
    whether a gap between the windows is needed.
    """
    result_columns = ['date', 'ticker', 'pred', 'true', 'target_return', 'pnl']

    df = df.sort_values(['date']).reset_index(drop=True)
    unique_dates = df['date'].sort_values().unique()
    window_frames = []

    for start_idx in range(0, len(unique_dates) - train_days - test_days + 1, test_days):
        # Calendar-based rolling window (bounds are inclusive on both ends)
        train_start = unique_dates[start_idx]
        train_end = unique_dates[start_idx + train_days - 1]
        test_start = unique_dates[start_idx + train_days]
        test_end = unique_dates[start_idx + train_days + test_days - 1]

        train_df = df[(df['date'] >= train_start) & (df['date'] <= train_end)]
        test_df = df[(df['date'] >= test_start) & (df['date'] <= test_end)]

        if train_df.empty or test_df.empty:
            continue

        X_train_full = train_df[feature_cols]
        y_train = train_df[label_col]
        X_test_full = test_df[feature_cols]
        y_test = test_df[label_col].values
        tickers = test_df['ticker'].values
        test_dates = test_df['date'].values

        # Skip if one class only: balanced weights for classes [0, 1] cannot
        # be computed and the model would be degenerate.
        if len(np.unique(y_train)) == 1:
            continue

        # Compute class weights on this window's training labels
        weights = class_weight.compute_class_weight('balanced', classes=np.array([0,1]), y=y_train)
        class_weights_dict = {0: weights[0], 1: weights[1]}

        # Feature selection: initial RF on all features, keep the top 15
        rf_initial = RandomForestClassifier(n_estimators=100, max_depth=10,
                                            min_samples_split=5, min_samples_leaf=1,
                                            class_weight=class_weights_dict, random_state=42)
        rf_initial.fit(X_train_full, y_train)
        top_features = pd.Series(rf_initial.feature_importances_, index=feature_cols)\
                        .sort_values(ascending=False).head(15).index.tolist()

        # Retrain final RF with top features
        rf = RandomForestClassifier(n_estimators=100, max_depth=10,
                                    min_samples_split=5, min_samples_leaf=1,
                                    class_weight=class_weights_dict, random_state=42)
        rf.fit(X_train_full[top_features], y_train)
        rf_probs = rf.predict_proba(X_test_full[top_features])[:, 1]

        # Confident predictions
        low, high = thresholds
        confident_up = rf_probs > high
        confident_down = rf_probs < low
        confident_mask = confident_up | confident_down
        if confident_mask.sum() == 0:
            continue

        y_pred_conf = np.where(confident_up[confident_mask], 1, 0)
        target_returns = test_df['target_return'].values[confident_mask]

        # One frame per window instead of one dict per trade; long trades earn
        # the target return, short trades earn its negation.
        window_frames.append(pd.DataFrame({
            'date': test_dates[confident_mask],
            'ticker': tickers[confident_mask],
            'pred': y_pred_conf,
            'true': y_test[confident_mask],
            'target_return': target_returns,
            'pnl': target_returns * np.where(y_pred_conf == 1, 1, -1),
        }, columns=result_columns))

        print(f"✅ Trained {train_start} → {train_end} | Tested {test_start} → {test_end} | Trades: {len(y_pred_conf)}")

    if not window_frames:
        # Keep the schema stable even when nothing was traded.
        return pd.DataFrame(columns=result_columns)
    return pd.concat(window_frames, ignore_index=True)

def compute_portfolio_metrics(results_df, capital=100000):
    """Summarise a trade log as portfolio-level performance metrics.

    Trades are averaged per date (equal weight across all trades on that
    date) to form a daily return series, which is then compounded.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Trade log with at least ``date`` and ``pnl`` columns. ``date`` may be
        any type ``pandas.to_datetime`` accepts. The frame is not modified.
    capital : int or float
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    dict
        ``'Sharpe'`` (mean/std of the daily series scaled by sqrt(252)),
        ``'Max Drawdown'`` (most negative peak-to-trough ratio of the
        compounded curve), ``'CAGR'`` (compounded value annualised over the
        calendar days between first and last date) and ``'Total Return'``.
        All four are NaN for an empty trade log; ``'CAGR'`` is NaN when every
        trade falls on one date, because there is no time span to annualise.
    """
    empty_metrics = {
        'Sharpe': np.nan,
        'Max Drawdown': np.nan,
        'CAGR': np.nan,
        'Total Return': np.nan
    }
    if results_df.empty:
        return empty_metrics

    # Convert dates on a copy so the caller's frame is left untouched.
    trades = results_df.assign(date=pd.to_datetime(results_df['date']))
    daily_pnl = trades.groupby('date')['pnl'].mean()  # equally weighted across tickers
    if daily_pnl.empty:
        # Every date was missing/unparseable, so no daily series exists.
        return empty_metrics
    cumulative = (1 + daily_pnl).cumprod()

    # Sharpe (annualized); the epsilon avoids division by zero for a flat series
    sharpe = np.sqrt(252) * daily_pnl.mean() / (daily_pnl.std() + 1e-8)
    # Max Drawdown
    max_dd = (cumulative / cumulative.cummax() - 1).min()
    # CAGR (annualized) — undefined when the log spans zero calendar days
    total_days = (cumulative.index[-1] - cumulative.index[0]).days
    final_value = cumulative.iloc[-1]
    cagr = final_value**(365/total_days) - 1 if total_days > 0 else np.nan
    # Total return
    total_return = final_value - 1

    return {
        'Sharpe': sharpe,
        'Max Drawdown': max_dd,
        'CAGR': cagr,
        'Total Return': total_return
    }


In [21]:
# Example: run the rolling backtest, save the trade log, then print summary metrics.
results = rolling_backtest_calendar(df, feature_cols)

# The trade log is written before the metrics are computed.
backtest_output_path = r'C:\Users\mrity\OneDrive\Desktop\Alternative-Data-Alpha-Mining\data\backtest_result'
results.to_csv(backtest_output_path)

metrics = compute_portfolio_metrics(results)
print(metrics)


✅ Trained 2020-01-10 00:00:00 → 2020-04-06 00:00:00 | Tested 2020-04-07 00:00:00 → 2020-05-05 00:00:00 | Trades: 1721
✅ Trained 2020-02-10 00:00:00 → 2020-05-05 00:00:00 | Tested 2020-05-06 00:00:00 → 2020-06-03 00:00:00 | Trades: 616
✅ Trained 2020-03-10 00:00:00 → 2020-06-03 00:00:00 | Tested 2020-06-04 00:00:00 → 2020-07-01 00:00:00 | Trades: 767
✅ Trained 2020-04-07 00:00:00 → 2020-07-01 00:00:00 | Tested 2020-07-02 00:00:00 → 2020-07-30 00:00:00 | Trades: 189
✅ Trained 2020-05-06 00:00:00 → 2020-07-30 00:00:00 | Tested 2020-07-31 00:00:00 → 2020-08-27 00:00:00 | Trades: 202
✅ Trained 2020-06-04 00:00:00 → 2020-08-27 00:00:00 | Tested 2020-08-28 00:00:00 → 2020-09-25 00:00:00 | Trades: 345
✅ Trained 2020-07-02 00:00:00 → 2020-09-25 00:00:00 | Tested 2020-09-28 00:00:00 → 2020-10-23 00:00:00 | Trades: 52
✅ Trained 2020-07-31 00:00:00 → 2020-10-23 00:00:00 | Tested 2020-10-26 00:00:00 → 2020-11-20 00:00:00 | Trades: 173
✅ Trained 2020-08-28 00:00:00 → 2020-11-20 00:00:00 | Tested 202