In [None]:
import os
import sys
import pandas as pd

# Put the current working directory at the front of the import path so the
# `src.*` imports below resolve when the notebook is run from the project root.
ROOT = os.path.abspath(os.curdir)
if sys.path.count(ROOT) == 0:
    sys.path.insert(0, ROOT)

from src.training.trainer import ModelTrainer
from src.data.feature_store import FeatureEngineer

# Input locations: the processed feature table is preferred; the raw price
# CSV is only read when the processed file is absent.
proc_parquet = os.path.join('data', 'processed', 'features.parquet')
raw_csv = os.path.join('data', 'raw', 'prices.csv')

if not os.path.exists(proc_parquet):
    # No processed features on disk: build features and target from raw prices.
    prices = pd.read_csv(raw_csv, parse_dates=['date'])
    fe = FeatureEngineer({})
    df = fe.create_target(fe.create_features(prices))
else:
    df = pd.read_parquet(proc_parquet)

# Drop every row that still contains a missing value.
df = df.dropna()
# Parameters handed to the model: three-class objective scored with
# multi-class log loss.
model_params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
}

# Trainer configuration. The key names suggest MLflow tracking/registry
# settings; how they are consumed is defined by ModelTrainer (not shown here).
config = {
    'mlflow_uri': 'sqlite:///mlflow.db',
    'experiment_name': 'stock_prediction',
    'registered_model_name': 'stock_predictor',
    'model_params': model_params,
}

trainer = ModelTrainer(config)
result = trainer.train(df, model_type='lgbm')
n_features = len(result['feature_cols'])
print('Trained model with', n_features, 'features')


In [None]:
# 1) Dataset split and feature list
from sklearn.model_selection import TimeSeriesSplit

# Order rows chronologically (date first, symbol as tie-breaker). The
# TimeSeriesSplit folds, the "last 10%" holdout and the `tail(...)` subset used
# in later cells all slice by row position, so row position has to follow time.
# Sorting by symbol first would make the end of the frame "the last symbols"
# instead of "the most recent dates" and let validation rows overlap training
# dates.
df = df.sort_values(['date', 'symbol']).reset_index(drop=True)

# Every column that is not in this exclusion list is treated as a model input.
non_feature_cols = ['symbol', 'date', 'target', 'target_return', 'future_return']
feature_cols = [c for c in df.columns if c not in non_feature_cols]
X = df[feature_cols]
y = df['target']
print('Features:', len(feature_cols), 'Rows:', len(df))
print('Feature sample:', feature_cols[:10])


In [None]:
# 2) TimeSeriesSplit CV
from sklearn.metrics import accuracy_score, f1_score

# Enumerate the 3 time-ordered splits of X and record the size of each side.
ts = TimeSeriesSplit(n_splits=3)
folds = []
fold0_train = None  # training indices of the first fold, kept for the preview below
for fold, (tr, va) in enumerate(ts.split(X)):
    if fold == 0:
        fold0_train = tr
    folds.append({'fold': fold, 'train': len(tr), 'val': len(va)})
print(folds)

# Show a small preview of indices.
# The preview is labelled "Fold 0", so it must use the indices captured from
# fold 0; after the loop `tr` refers to the *last* fold (and is undefined if
# the loop body never ran).
if fold0_train is not None and len(fold0_train):
    fold0_head = fold0_train[:5]
else:
    fold0_head = []
print('Fold 0 train head:', fold0_head)


In [None]:
# 3) Baseline model (LightGBM via ModelTrainer)
from src.training.trainer import ModelTrainer

# NOTE(review): this looks like a repeat of the configuration and training
# call in the first cell (same values, same `train` arguments) — confirm that a
# second training run is intended.

# Settings whose key names suggest MLflow tracking/registry targets; their
# exact use is defined by ModelTrainer.
tracking = {
    'mlflow_uri': 'sqlite:///mlflow.db',
    'experiment_name': 'stock_prediction',
    'registered_model_name': 'stock_predictor',
}

# Model parameters for the three-class baseline.
booster_params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
}

config = {**tracking, 'model_params': booster_params}
trainer = ModelTrainer(config)
res = trainer.train(df, model_type='lgbm')

n_cv_folds = len(res['cv_scores'])
print('CV folds:', n_cv_folds)


In [None]:
# 4) Evaluate baseline on a holdout (last 10%)
# The split is positional: the first 90% of rows are "train", the rest "test".
cut = int(len(df) * 0.9)
X_train, X_test = X.iloc[:cut], X.iloc[cut:]
y_train, y_test = y.iloc[:cut], y.iloc[cut:]

# Use the trained model from trainer res
# NOTE(review): `res` came from `trainer.train(df, ...)` on the full frame.
# Unless ModelTrainer holds out the tail internally, these holdout rows were
# seen during fitting and this accuracy is optimistic — confirm.
model = res['model']

# Score with exactly the columns (and column order) the trainer reports having
# used, rather than the feature list recomputed in this notebook; the two are
# not guaranteed to match, and the feature-importance cell already relies on
# `res['feature_cols']`.
X_eval = df.iloc[cut:][res['feature_cols']]
probs = model.predict(X_eval)

# Column index of the largest probability, shifted by -1.
# Assumes probability columns 0/1/2 correspond to labels -1/0/1 (the label set
# used by the confusion-matrix cell) — TODO confirm against the trainer's encoding.
preds = probs.argmax(axis=1) - 1
acc = (preds == y_test.values).mean()
print('Holdout accuracy:', round(float(acc), 4))


In [None]:
# 5) Confusion matrix
from sklearn.metrics import confusion_matrix
# Confusion matrix over the three labels, in the fixed order -1, 0, 1.
class_labels = [-1, 0, 1]
cm = confusion_matrix(y_test, preds, labels=class_labels)
print('Confusion matrix (rows=true, cols=pred):')
print(cm)

# Per-class accuracy: diagonal count divided by the row total. Row totals are
# floored at 1 so a class with no true samples yields 0 instead of dividing by zero.
row_totals = cm.sum(axis=1).clip(min=1)
per_class_acc = (cm.diagonal() / row_totals).round(3)
print('Per-class accuracy (-1,0,1):', per_class_acc)


In [None]:
# 6) Feature importance (top 20)
import pandas as pd
import numpy as np

# Gain-based importance from the model, paired with the trainer's feature
# names, sorted from largest to smallest and cut to the top 20 rows.
importance = model.feature_importance(importance_type='gain')
fi = (
    pd.DataFrame({'feature': res['feature_cols'], 'importance': importance})
    .sort_values('importance', ascending=False)
    .head(20)
)
print(fi)


In [None]:
# 7) Probability sanity check (this does not measure calibration)
# The first prediction's class probabilities are expected to sum to ~1.
row0 = probs[0]
row0_total = float(row0.sum())
print('First prob vector:', row0, 'Sum:', row0_total)

# Summary statistics of the highest class probability per row.
max_conf = probs.max(axis=1)
max_conf_summary = pd.Series(max_conf).describe()
print('Max confidence stats:', max_conf_summary)


In [None]:
# 8) Backtesting integration (optional)
from src.training.backtester import Backtester

# Backtester settings; the meaning of each key is defined by Backtester (not
# shown here).
cfg = {
    'initial_capital': 100000,
    'transaction_cost': 0.0005,
    'train_window_days': 60,
    'test_window_days': 20,
    'retrain_frequency_days': 20,
    'confidence_threshold': 0.5
}

# Use a reduced subset to keep runtime moderate
# `feature_cols` is "every column except symbol/date/target columns", so it
# already contains 'close' whenever the frame has that column. Concatenating
# the lists directly would select 'close' twice and produce duplicate column
# labels; `dict.fromkeys` removes repeats while keeping first-seen order.
sub_cols = list(dict.fromkeys(['symbol', 'date', 'close'] + feature_cols))
sub = df[sub_cols].tail(5000).copy()
bt = Backtester(cfg)
summary = bt.run(sub, model, feature_cols)

# Print the summary without the two bulky entries.
print({k: v for k, v in summary.items() if k not in ['daily_values','all_trades']})


In [None]:
# 9) Report the most recent MLflow run (this cell does not write any artifacts itself)
import mlflow
import mlflow.lightgbm

# Look up the most recent MLflow run known to this process and report its id,
# or say so when there is none.
run = mlflow.last_active_run()
if run is None:
    print('No active MLflow run context available here.')
else:
    print('Last MLflow run:', run.info.run_id)


In [None]:
# 10) Simple hyperparameter sweep placeholder
# (For a real run, implement in src/training/hyperparameter.py)
leaves = [15, 31]
lrs = [0.05, 0.1]

# Full grid of (num_leaves, learning_rate) pairs, num_leaves varying slowest.
grid = [(n_leaves, rate) for n_leaves in leaves for rate in lrs]

results = []
for nl, lr in grid:
    print('Trying num_leaves=', nl, 'lr=', lr)
    # Placeholder: nothing is trained here, so the score stays empty.
    entry = {'num_leaves': nl, 'learning_rate': lr, 'score': None}
    results.append(entry)
print('Sweep grid size:', len(results))


In [None]:
# 11) Persist evaluation split for reproducibility
out_eval = 'data/processed/holdout_eval.parquet'

# Build the frame once; both the parquet path and the CSV fallback write it.
eval_frame = pd.DataFrame({'y_true': y_test.values, 'y_pred': preds})

# The output directory is not guaranteed to exist (for example when features
# were rebuilt from the raw CSV). Without this, both writes below would fail
# and the fallback message would wrongly blame parquet support.
os.makedirs(os.path.dirname(out_eval), exist_ok=True)

try:
    eval_frame.to_parquet(out_eval, index=False)
    print('Wrote', out_eval)
except Exception as e:
    # Best-effort fallback: if the parquet write fails for any reason, keep
    # the results as CSV and report the original error.
    out_eval_csv = 'data/processed/holdout_eval.csv'
    eval_frame.to_csv(out_eval_csv, index=False)
    print('Parquet unavailable, wrote', out_eval_csv, 'Error:', e)


In [None]:
# 12) Summary
# Recap of values computed in earlier cells: the first two CV scores, the
# holdout accuracy, and the five most important features.
print('Training complete.')

cv_preview = res['cv_scores'][:2]
print('CV scores sample:', cv_preview)

holdout_acc = round(float(acc), 4)
print('Holdout accuracy:', holdout_acc)

print('Top-5 features by importance:')
top5 = fi.head(5)
print(top5)

print('Artifacts saved (if MLflow configured).')
