# ML Pipeline for Trading Bot

This notebook provides a runnable skeleton ML pipeline for a trading bot. It includes: data loading, EDA, preprocessing, feature engineering, labeling, a simple model training example (sklearn), and a minimal backtester for signal evaluation.

Notes:
- This notebook is designed to be a starting point.
- It uses CSVs in the `data/` folder in this workspace. Update `DATA_PATH` if needed.
- After exploration, you can swap the model with LightGBM/XGBoost or LSTM-based models.

In [None]:
# Install minimal packages if running in a fresh environment (uncomment to run)
# !pip install -q pandas numpy scikit-learn matplotlib mplfinance ta lightgbm

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import classification_report, accuracy_score

pd.set_option('display.max_columns', 200)

In [None]:
# ---- DATA PATH: locate a CSV inside the workspace data/ folder ----
import glob

WORKDIR = '/workspaces/POC_Safem0de_IS'

# Prefer <WORKDIR>/data when that directory exists; otherwise fall back to a
# data/ folder under the current working directory.
data_dir = os.path.join(WORKDIR, 'data')
if not os.path.isdir(data_dir):
    data_dir = os.path.join(os.getcwd(), 'data')

# Sorted so that "the first CSV" is the same file on every run.
csv_files = []
if os.path.isdir(data_dir):
    csv_files = sorted(glob.glob(os.path.join(data_dir, '*.csv')))
if not csv_files:
    raise FileNotFoundError(f'No CSV files found in data directory: {data_dir}. Please add CSVs to the data/ folder or update DATA_PATH.')

print('Found data files:')
for i, f in enumerate(csv_files):
    print(i, '-', f)

# Default: the first CSV in sorted order. Use csv_files[n] to select another.
DATA_PATH = csv_files[0]
print('Loading', DATA_PATH)
df = pd.read_csv(DATA_PATH)

# When the file has at least six columns, the first six are renamed by
# position to the names the rest of the notebook expects; any further columns
# keep their original names.
if len(df.columns) >= 6:
    df.columns = ['datetime', 'open', 'high', 'low', 'close', 'volume', *df.columns[6:]]

# Parse timestamps, order rows chronologically, and index by timestamp.
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values('datetime').set_index('datetime')

print('shape', df.shape)
df.head()

## Quick EDA
Check data ranges and missing values.

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# isna().mean() is the fraction (0..1) of missing values in each column.
print('Missing per column:', df.isna().mean())

# Basic plot of close price (last 500 points)
_ = df['close'].iloc[-500:].plot(title='Close (last 500 points)', figsize=(12,4))

## Feature Engineering
Create a few robust features: returns, moving averages, rolling volatility, bar range, volume, and a simple RSI implementation (no external `ta` required).

In [None]:
def add_basic_features(df):
    """Return a copy of ``df`` extended with basic technical-indicator columns.

    Reads the ``close``, ``high``, ``low`` and ``volume`` columns and appends,
    in this order: ``ret_1``, ``ret_3``, ``ret_24`` (percentage change of close
    over 1/3/24 rows), ``ma_5``, ``ma_20``, ``ma_50`` (rolling means of close),
    ``ema_12``, ``ema_26`` (exponential means of close), ``std_20`` (rolling
    standard deviation of close), ``range`` (high minus low), ``vol`` (copy of
    volume) and ``rsi14`` (RSI built from 14-row simple rolling means).

    The input frame is not modified. Every row containing a NaN in any column
    is dropped from the result, which removes the warm-up rows of the longest
    window and also any row whose 14-row window had no price change at all
    (average gain and average loss both zero, making the RSI ratio NaN).
    """
    out = df.copy()
    close = out['close']

    # Percentage returns over several look-back lengths.
    for lag in (1, 3, 24):
        out[f'ret_{lag}'] = close.pct_change(lag)

    # Simple moving averages.
    for window in (5, 20, 50):
        out[f'ma_{window}'] = close.rolling(window).mean()

    # Exponential moving averages (recursive form, adjust=False).
    for span in (12, 26):
        out[f'ema_{span}'] = close.ewm(span=span, adjust=False).mean()

    # Volatility, bar range and volume.
    out['std_20'] = close.rolling(20).std()
    out['range'] = out['high'] - out['low']
    out['vol'] = out['volume']

    # RSI: split the row-to-row change into gains and (positive) losses,
    # average each over 14 rows, then map their ratio onto a 0-100 scale.
    change = close.diff()
    avg_gain = change.clip(lower=0).rolling(14).mean()
    avg_loss = (-1 * change.clip(upper=0)).rolling(14).mean()
    relative_strength = avg_gain / avg_loss
    out['rsi14'] = 100 - (100 / (1 + relative_strength))

    return out.dropna()

# Build the engineered-feature table from the raw price frame. The returned
# frame is shorter than df because add_basic_features drops rows with NaNs.
df_feat = add_basic_features(df)
print('after features:', df_feat.shape)
# Bare last expression so the notebook renders the first rows.
df_feat.head()

## Labeling (supervised target)
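We'll create a simple three-class classification target: bullish (1) when the return over the next N periods exceeds a threshold, bearish (-1) when it falls below the negative of that threshold, and neutral (0) otherwise. This is a straightforward target for a signal-based bot.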

In [None]:
def create_labels(df, horizon=6, ret_threshold=0.001):
    """Return a copy of ``df`` with forward-return columns and a 3-class label.

    Args:
        df: frame with a ``close`` column; it is not modified.
        horizon: number of rows to look ahead for the future close.
        ret_threshold: minimum absolute forward return for a directional label.

    Returns:
        A new frame with three added columns: ``future_close`` (close shifted
        back by ``horizon`` rows), ``future_ret`` (future_close / close - 1)
        and ``label`` (1 if future_ret > ret_threshold, -1 if
        future_ret < -ret_threshold, else 0). Rows containing any NaN are
        dropped, which removes the trailing rows that have no future close.
    """
    future_close = df['close'].shift(-horizon)
    future_ret = future_close / df['close'] - 1

    # Start every row as neutral (0), then overwrite the directional rows.
    out = df.assign(future_close=future_close, future_ret=future_ret, label=0)

    # Applied in this order so the bearish rule is written last.
    rules = (
        (future_ret > ret_threshold, 1),
        (future_ret < -ret_threshold, -1),
    )
    for mask, value in rules:
        out.loc[mask, 'label'] = value

    return out.dropna()

# Label every feature row by its return 12 rows ahead, using a +/-0.2% band
# for the neutral class. The last 12 rows have no future close and are dropped
# inside create_labels.
df_labeled = create_labels(df_feat, horizon=12, ret_threshold=0.002)  # e.g., 12 hours ahead for H1 data
print('labeled shape:', df_labeled.shape)
# Class balance of the -1/0/1 target (bare expression so the notebook shows it).
df_labeled['label'].value_counts()

## Train / Test split (time-series aware) and feature selection
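We'll use a simple chronological split (no shuffling) to reduce leakage, then train a RandomForest classifier as a baseline. Because each label looks ahead several periods, rows just before the split boundary still draw on prices from the test period unless they are purged.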

In [None]:
# Select model inputs: only the engineered indicator columns are kept. Raw
# price/volume columns and the forward-looking helpers listed in drop_cols
# (which would leak the answer) are never used as features.
drop_cols = ['future_close','future_ret','label']
features = [
    c for c in df_labeled.columns
    if c not in drop_cols
    and c in ['ret_1','ret_3','ret_24','ma_5','ma_20','ma_50','ema_12','ema_26','std_20','rsi14']
]
X = df_labeled[features].copy()
y = df_labeled['label'].copy()

# Each label is computed from the close LABEL_HORIZON rows ahead, so the last
# LABEL_HORIZON rows before any train/test boundary are labeled with prices
# from the other side of that boundary. They are purged to keep the evaluation
# free of look-ahead. Keep this equal to the `horizon` passed to create_labels.
LABEL_HORIZON = 12

# time split: last 20% as test, with the LABEL_HORIZON rows before it purged
split_index = int(len(X) * 0.8)
train_end = max(split_index - LABEL_HORIZON, 0)
X_train, X_test = X.iloc[:train_end], X.iloc[split_index:]
y_train, y_test = y.iloc[:train_end], y.iloc[split_index:]

print('X_train, X_test shapes', X_train.shape, X_test.shape)

# baseline model
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred))
print('Accuracy:', accuracy_score(y_test, pred))

# cross-validated time series score (quick); `gap` leaves LABEL_HORIZON rows
# out between each training fold and its validation fold for the same reason
# as the purge above (requires scikit-learn >= 0.24).
tscv = TimeSeriesSplit(n_splits=5, gap=LABEL_HORIZON)
scores = cross_val_score(clf, X, y, cv=tscv, scoring='accuracy', n_jobs=-1)
print('CV accuracy (TimeSeriesSplit):', scores, 'mean=', np.mean(scores))

## Simple Backtester (Signal-based)
Convert model predictions to position signals and simulate P&L simply (no transaction costs by default).

In [None]:
def backtest_signals(df_indexed, signals, price_col='close', hold_period=12, transaction_cost=0.0):
    """Simulate a naive per-row signal strategy next to a buy-and-hold curve.

    Args:
        df_indexed: price frame whose index lines up with ``signals``.
        signals: array-like of -1/0/1 (short/flat/long), one per row of
            ``df_indexed``.
        price_col: column of ``df_indexed`` used as the traded price.
        hold_period: exit price is taken this many rows after the signal row.
        transaction_cost: fractional cost subtracted from the return of each
            row that actually trades (signal != 0). Flat rows pay nothing.

    Returns:
        DataFrame indexed like ``df_indexed`` with columns ``signal``,
        ``price``, ``entry_price``, ``exit_price``, ``trade_ret``,
        ``strategy_equity`` and ``market_equity``.

    Note:
        Every row produces its own trade from row+1 to row+hold_period, and
        these overlapping per-row returns are compounded one after another.
        ``strategy_equity`` is therefore a rough signal-quality indicator, not
        the equity of a single non-overlapping position.
    """
    res = pd.DataFrame(index=df_indexed.index)
    res['signal'] = signals
    res['price'] = df_indexed[price_col]
    # Enter one row after the signal (approximating the next bar's open) and
    # exit hold_period rows after the signal.
    res['entry_price'] = res['price'].shift(-1)
    res['exit_price'] = res['price'].shift(-hold_period)

    gross_ret = (res['exit_price'] / res['entry_price'] - 1) * res['signal']
    # Only rows that open a position pay the cost; charging it on flat rows
    # would make "do nothing" lose money on every bar.
    cost = transaction_cost * res['signal'].ne(0)
    res['trade_ret'] = gross_ret - cost

    # Rows without a complete entry/exit pair (the tail) contribute 0 return.
    res['strategy_equity'] = (1 + res['trade_ret'].fillna(0)).cumprod()
    res['market_equity'] = (1 + res['price'].pct_change().fillna(0)).cumprod()
    return res

# Generate signals for the test period. The classifier was fit on -1/0/1
# labels, so its predicted classes are used directly as short/flat/long signals.
pred_proba = clf.predict(X_test)  # predicted class labels (not probabilities)
test_index = X_test.index
signals = pd.Series(pred_proba, index=test_index)

# Backtest against the raw price rows that share the test-set index.
bt = backtest_signals(
    df.loc[test_index],
    signals,
    hold_period=12,
    transaction_cost=0.000,
)

print('Strategy final equity (relative):', bt['strategy_equity'].iloc[-1])
print('Market final equity (relative):', bt['market_equity'].iloc[-1])

# Overlay the strategy curve and the buy-and-hold curve on one figure.
plt.figure(figsize=(12,5))
for curve_col, curve_label in (('strategy_equity', 'Strategy'), ('market_equity', 'Market')):
    plt.plot(bt[curve_col], label=curve_label)
plt.legend()
plt.title('Equity Curves (Test Period)')
plt.show()

## Next steps / Recommendations
- Replace RandomForest baseline with LightGBM or XGBoost for tabular features for speed and better performance.
- Implement walk-forward validation and hyperparameter tuning (Bayesian or Optuna).
- Add transaction costs, slippage, and leverage to the backtester.
- For sequence models: reuse the LSTM/CNN models explored earlier in the `POC_*` notebooks and compare them against this baseline.
- Add a paper-trade adapter to a broker API (ccxt for crypto, OANDA/IB for FX).
- Create CI tests for data loading and feature functions.