# Libraries

In [None]:
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBClassifier

# Suppress every warning for the rest of the session. This is a blanket filter:
# it also hides deprecation and convergence warnings from all imported libraries.
warnings.filterwarnings("ignore")

# Read and split data

In [None]:
df = pd.read_parquet("../data/feature/stock_eod_features.parquet")

# The cross-validation further down uses TimeSeriesSplit, which cuts folds purely by
# row position. Put the rows in chronological order first so that "earlier rows" really
# means "earlier dates". The stable sort keeps the original order within a date.
df = df.sort_values("date", kind="stable").reset_index(drop=True)

# Split dataset based on time - keep 10% test for final model evaluation
cutoff_date = df["date"].quantile(0.9)
print("Train/Test cutoff date:", cutoff_date.strftime('%Y-%m-%d'))

# Rows dated exactly on the cutoff go to the test set
train_df = df[df["date"] < cutoff_date].copy()
test_df  = df[df["date"] >= cutoff_date].copy()

In [None]:
# Correlate every candidate feature with the target, leaving out the identifier
# columns (and the target itself) from the feature list
exclude = ['date', 'symbol', 'target']

is_feature = ~train_df.columns.isin(exclude)
feat_cols = train_df.columns[is_feature].tolist()

corr_matrix = train_df[feat_cols + ['target']].corr()
corr_with_target = corr_matrix['target'].sort_values(ascending=False)
corr_with_target

# Preprocessing

In [None]:
# One-hot encode the 'symbol' categorical feature
train_df["symbol"] = train_df["symbol"].astype("category")
test_df["symbol"]  = test_df["symbol"].astype("category")

train_df = pd.get_dummies(train_df, columns=["symbol"], drop_first=False)
test_df  = pd.get_dummies(test_df,  columns=["symbol"], drop_first=False)

train_df = train_df.reindex(sorted(train_df.columns), axis=1)

# The two frames were encoded independently, so their dummy columns can differ when a
# symbol occurs in only one of them. Force the test frame onto the training layout:
# dummies missing from test are added as all-zero columns, and dummies for symbols
# never seen in training are dropped (the model has no input for them anyway).
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

train_df.head()

In [None]:
# Assemble model inputs: every column other than the date and the label is a feature
target_column = 'target'
exclude = ['date', target_column]

feature_columns = [name for name in train_df.columns if name not in exclude]

# Both frames are indexed with the same column list so the matrices share one layout
X_train, y_train = train_df[feature_columns].values, train_df[target_column].values
X_test, y_test = test_df[feature_columns].values, test_df[target_column].values

# Baseline model

In [None]:
%%time
# Majority-class baseline: refit a DummyClassifier on each TimeSeriesSplit training fold
# and score its accuracy on the matching validation fold
tscv = TimeSeriesSplit(n_splits=5)
dummy_fold_acc = []

for fold, (tr_idx, val_idx) in enumerate(tscv.split(X_train), start=1):
    dummy = DummyClassifier(strategy="most_frequent")
    dummy.fit(X_train[tr_idx], y_train[tr_idx])

    fold_accuracy = accuracy_score(y_train[val_idx], dummy.predict(X_train[val_idx]))
    dummy_fold_acc.append(fold_accuracy)

# Report the average accuracy over the folds
print("Dummy | Mean CV accuracy:", sum(dummy_fold_acc) / len(dummy_fold_acc))

In [None]:
# Find the top features using permutation importance.
#
# The importance is measured on rows the model was NOT fitted on. Permuting the
# training rows of a 500-tree model mostly shows what it memorised, whereas a held-out
# slice shows which features help on unseen data. The hold-out is the last 20% of the
# training rows. This assumes X_train is in chronological row order, the same
# assumption TimeSeriesSplit makes elsewhere in this notebook.
holdout_start = int(len(X_train) * 0.8)
X_fit, y_fit = X_train[:holdout_start], y_train[:holdout_start]
X_holdout, y_holdout = X_train[holdout_start:], y_train[holdout_start:]

xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
).fit(X_fit, y_fit)


result = permutation_importance(
    xgb,
    X_holdout,
    y_holdout,
    n_repeats=10,
    random_state=42,
    scoring="accuracy"
)

# Mean accuracy drop per feature over the repeats, largest first
perm_importance = pd.Series(result.importances_mean, index=feature_columns)
perm_importance = perm_importance.sort_values(ascending=False)

# Optional: print the output
#perm_importance

In [None]:
# Keep the K most important non-symbol features plus every symbol dummy column
K = 20

is_symbol = perm_importance.index.str.startswith("symbol_")

# Symbol dummies are always retained, listed in importance order
ticker_columns = perm_importance.index[is_symbol].tolist()

non_symbol_importance = perm_importance[~is_symbol]
top_k_non_symbol = non_symbol_importance.iloc[:K].index.tolist()

selected_features = top_k_non_symbol + ticker_columns
print("Final selected features:", selected_features)

In [None]:
def run_ts_cv(model, X, y, n_splits=5, model_name="model"):
    """
    Runs TimeSeriesSplit CV for any classifier.
    Computes:
        - Accuracy
        - ROC-AUC (from the second column of ``predict_proba``)

    The model is refitted in place on each training fold, so after the call it
    holds the fit from the last fold.

    :Arguments:
        model: model to train; must expose fit, predict and predict_proba
        X: feature matrix (indexable by integer position arrays)
        y: target vector (indexable by integer position arrays)
        n_splits: number of CV splits
        model_name: name of the model (for printing)

    :Return: Results dict with keys "model", "acc_scores", "auc_scores",
        "mean_acc" and "mean_auc". A fold whose validation labels contain a single
        class contributes NaN to "auc_scores" and is skipped in "mean_auc".
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)

    acc_scores = []
    auc_scores = []

    for train_idx, val_idx in tscv.split(X):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        acc_scores.append(accuracy_score(y_val, y_pred))

        # ROC-AUC is undefined when the validation fold contains a single class.
        # Record NaN (ignored by nanmean below) rather than passing such a fold to
        # roc_auc_score.
        if len(np.unique(y_val)) < 2:
            auc_scores.append(np.nan)
        else:
            y_prob = model.predict_proba(X_val)[:, 1]
            auc_scores.append(roc_auc_score(y_val, y_prob))

    mean_acc = np.mean(acc_scores)
    mean_auc = np.nanmean(auc_scores)

    print(f"\n{model_name} | Mean ACC = {mean_acc:.4f}, Mean AUC = {mean_auc:.4f}")

    return {
        "model": model_name,
        "acc_scores": acc_scores,
        "auc_scores": auc_scores,
        "mean_acc": mean_acc,
        "mean_auc": mean_auc
    }

In [None]:
# Reference configurations, one per model family. Each estimator uses random_state=42;
# the boosted models are capped at 100 rounds and the LightGBM / CatBoost logs are silenced.
rf_baseline = RandomForestClassifier(n_jobs=-1, random_state=42)

xgb_baseline = XGBClassifier(n_estimators=100, n_jobs=-1, random_state=42)

lgbm_baseline = LGBMClassifier(n_estimators=100, n_jobs=-1, random_state=42, verbose=-1)

cat_baseline = CatBoostClassifier(iterations=100, random_state=42, verbose=False)

In [None]:
# Evaluate every baseline on the full feature set with the same CV protocol
baseline_models = {
    "RandomForest": rf_baseline,
    "XGBoost": xgb_baseline,
    "LightGBM": lgbm_baseline,
    "CatBoost": cat_baseline,
}

# Keyed by model name so these scores stay available after the TOP-features cell
# reassigns rf_results / xgb_results / lgbm_results / cat_results
all_features_results = {
    name: run_ts_cv(model, X_train, y_train, model_name=f"{name} (ALL features)")
    for name, model in baseline_models.items()
}

# Per-model names kept for any code that still reads them
rf_results = all_features_results["RandomForest"]
xgb_results = all_features_results["XGBoost"]
lgbm_results = all_features_results["LightGBM"]
cat_results = all_features_results["CatBoost"]

In [None]:
# Build the reduced training matrix from the selected feature columns
X_train_sel = train_df[selected_features].values

# Evaluate the same baselines on the reduced feature set
baseline_models = {
    "RandomForest": rf_baseline,
    "XGBoost": xgb_baseline,
    "LightGBM": lgbm_baseline,
    "CatBoost": cat_baseline,
}

# Stored under their own name so they can be compared with the ALL-features scores
# instead of only replacing them
top_features_results = {
    name: run_ts_cv(model, X_train_sel, y_train, model_name=f"{name} (TOP features)")
    for name, model in baseline_models.items()
}

# Per-model names kept for any code that still reads them; as before, they now refer
# to the TOP-features runs
rf_results = top_features_results["RandomForest"]
xgb_results = top_features_results["XGBoost"]
lgbm_results = top_features_results["LightGBM"]
cat_results = top_features_results["CatBoost"]