# 02_feature_engineering_v2.2_pro_fixed

Fixed, GPU-aware feature engineering notebook.

Key fixes:
- Disables Stumpy/Numba GPU to avoid TSFresh crashes while keeping system GPU available for LightGBM
- Safe oscillator building (no duplicate columns)
- Auto-detection of oscillator column names for composites
- Efficient TSFresh extraction (CPU) + LightGBM training (GPU if available)


In [1]:
# 0) Environment safety for TSFresh / Stumpy (RUN this first)
import os, warnings
# Prevent stumpy/numba from initializing CUDA kernels (avoids Windows/numba crash)
os.environ["STUMPY_USE_GPU"] = "false"
os.environ["NUMBA_DISABLE_CUDA"] = "1"
warnings.filterwarnings('ignore')
print('Environment flags set: STUMPY_USE_GPU=false, NUMBA_DISABLE_CUDA=1')

Environment flags set: STUMPY_USE_GPU=false, NUMBA_DISABLE_CUDA=1


In [2]:
# 1) Paths & lightweight imports
from pathlib import Path
import numpy as np, pandas as pd, math, json
ROOT = Path('.').resolve()
DATA = ROOT/'data'
INTERIM = DATA/'interim'
PROCESSED = DATA/'processed'
MODELS = ROOT/'models'
REPORTS = ROOT/'reports'
for p in [INTERIM, PROCESSED, MODELS, REPORTS]: p.mkdir(parents=True, exist_ok=True)
print('Paths prepared:', INTERIM, PROCESSED, MODELS, REPORTS)

Paths prepared: E:\NuFinTech\notebooks\data\interim E:\NuFinTech\notebooks\data\processed E:\NuFinTech\notebooks\models E:\NuFinTech\notebooks\reports


In [3]:
# 2) Load data (use processed synthetic if present otherwise try combined_files.zip)
import zipfile, io
synth = PROCESSED / 'synthetic_trades.parquet'
if synth.exists():
    df = pd.read_parquet(synth)
    print('Loaded synthetic:', synth)
else:
    zpath = Path('data') / 'raw' / 'combined_files.zip'
    if zpath.exists():
        z = zipfile.ZipFile(zpath)
        df = None
        for n in z.namelist():
            if n.lower().endswith('.parquet'):
                df = pd.read_parquet(io.BytesIO(z.read(n))); break
            if n.lower().endswith('.csv'):
                df = pd.read_csv(io.BytesIO(z.read(n))); break
        if df is None:
            raise FileNotFoundError('No csv/parquet in zip')
        print('Loaded from zip:', zpath)
    else:
        # generate small synthetic for quick runs
        def gen(n=3000):
            rng = np.random.default_rng(42)
            rows = []
            for i in range(n):
                net = float(rng.normal(0,1)*(1+rng.random()*3))
                flip = float(rng.normal(0,0.6))
                pnl = float(rng.normal(0,1))
                rows.append({'trade_id': int(i),'entry_date': pd.Timestamp('2024-01-01')+pd.Timedelta(days=i//10),
                             'net_gex': net, 'gamma_flip_strike_perc': flip, 'pnl': pnl})
            df = pd.DataFrame(rows); df['label'] = (df['pnl']>0).astype(int); return df
        df = gen(3000); df.to_parquet(synth, index=False); print('Synthetic generated')

print('rows:', len(df))
df.head()

Loaded synthetic: E:\NuFinTech\notebooks\data\processed\synthetic_trades.parquet
rows: 2000


Unnamed: 0,trade_id,symbol,entry_date,exit_date,entry_cost,exit_cost,pnl,net_gex,gamma_flip_strike_perc,label
0,0,AAPL,2024-01-01,2024-01-01 09:00:00,98.048965,98.69782,0.12784,-5.504625,0.470282,1
1,1,GOOG,2024-01-01,2024-01-01 09:00:00,100.777792,100.066031,1.127241,-0.054636,0.439699,1
2,2,GOOG,2024-01-01,2024-01-01 05:00:00,100.87845,99.950074,-0.184862,-3.242045,-0.479441,0
3,3,GOOG,2024-01-01,2024-01-01 04:00:00,99.647866,100.532309,0.365444,6.681929,-0.214164,1
4,4,GOOG,2024-01-02,2024-01-02 02:00:00,99.487757,99.186227,0.615979,2.035117,-0.203208,1


In [4]:
# 3) Safe oscillator builder - build per-column and merge by key (no duplicate names)
ROLL_Z, MOM_N = 100, 5
def build_osc(df, col):
    s = df[col].astype(float)
    out = pd.DataFrame({'trade_id': df['trade_id']})
    out[f'{col}_mean'] = s.rolling(ROLL_Z, min_periods=10).mean()
    out[f'{col}_std'] = s.rolling(ROLL_Z, min_periods=10).std().replace(0,1e-6)
    out[f'{col}_z'] = (s - out[f'{col}_mean']) / out[f'{col}_std']
    out[f'{col}_osc'] = np.tanh(out[f'{col}_z'])
    out[f'{col}_mom'] = (s - s.shift(MOM_N)) / (s.shift(MOM_N).abs() + 1e-9)
    out[f'{col}_mom_tanh'] = np.tanh(out[f'{col}_mom'])
    return out

cols = ['net_gex','gamma_flip_strike_perc']
frames = []
for c in cols:
    if c not in df.columns:
        print('missing column', c); continue
    tmp = build_osc(df, c)
    frames.append(tmp)
# merge safely on trade_id
from functools import reduce
osc = reduce(lambda left,right: left.merge(right, on='trade_id', how='outer'), frames).drop_duplicates('trade_id')
osc = osc.merge(df[['trade_id','label']], on='trade_id', how='left')
osc.to_parquet(PROCESSED/'oscillators_v2_fixed.parquet', index=False)
print('Oscillators written', osc.shape)
osc.head()

Oscillators written (2000, 14)


Unnamed: 0,trade_id,net_gex_mean,net_gex_std,net_gex_z,net_gex_osc,net_gex_mom,net_gex_mom_tanh,gamma_flip_strike_perc_mean,gamma_flip_strike_perc_std,gamma_flip_strike_perc_z,gamma_flip_strike_perc_osc,gamma_flip_strike_perc_mom,gamma_flip_strike_perc_mom_tanh,label
0,0,,,,,,,,,,,,,1
1,1,,,,,,,,,,,,,1
2,2,,,,,,,,,,,,,0
3,3,,,,,,,,,,,,,1
4,4,,,,,,,,,,,,,1


In [5]:
# 4) Semantically rich composites - auto-detect oscillator columns (no hard-coded names)
merged = pd.read_parquet(PROCESSED/'oscillators_v2_fixed.parquet')

def find_col_for(prefix, key):
    # find first column that contains prefix AND key
    for col in merged.columns:
        if prefix in col and key in col:
            return col
    return None

for c in cols:
    osc_col = find_col_for(c, '_osc')
    z_col = find_col_for(c, '_z')
    mom_col = find_col_for(c, '_mom_tanh')
    if not osc_col or not z_col or not mom_col:
        print(f'Skipping composites for {c} - missing columns', osc_col, z_col, mom_col)
        continue
    merged[f'{c}_stability'] = 1 - merged[osc_col].abs()
    merged[f'{c}_vol_potential'] = (1 - merged[z_col].abs()) * merged[mom_col].abs()
    merged[f'{c}_instability'] = merged[mom_col].abs() * (1 - merged[z_col].abs())

# Interaction term if both exist
try:
    gex_vol = [cc for cc in merged.columns if 'net_gex' in cc and 'vol_potential' in cc][0]
    gamma_vol = [cc for cc in merged.columns if 'gamma_flip' in cc and 'vol_potential' in cc][0]
    merged['gex_gamma_interact'] = merged[gex_vol] * merged[gamma_vol]
except Exception:
    print('Could not build interaction term (columns missing)')

merged = merged.replace([np.inf, -np.inf], np.nan).fillna(0).clip(-1e6, 1e6)
merged.to_parquet(PROCESSED/'merged_composites_v2_fixed.parquet', index=False)
print('Merged composites saved', merged.shape)
merged.head()

Merged composites saved (2000, 21)


Unnamed: 0,trade_id,net_gex_mean,net_gex_std,net_gex_z,net_gex_osc,net_gex_mom,net_gex_mom_tanh,gamma_flip_strike_perc_mean,gamma_flip_strike_perc_std,gamma_flip_strike_perc_z,...,gamma_flip_strike_perc_mom,gamma_flip_strike_perc_mom_tanh,label,net_gex_stability,net_gex_vol_potential,net_gex_instability,gamma_flip_strike_perc_stability,gamma_flip_strike_perc_vol_potential,gamma_flip_strike_perc_instability,gex_gamma_interact
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# 5) TSFresh extraction (CPU-safe) - build small long-form per trade
import numpy as _np
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters

merged = pd.read_parquet(PROCESSED/'merged_composites_v2_fixed.parquet')
ids = merged['trade_id'].unique()
MAX_IDS = min(2000, len(ids))
ids = ids[:MAX_IDS]
WINDOW = 20
rows = []
for tid in ids:
    row = merged[merged.trade_id == tid].iloc[0]
    for t in range(WINDOW):
        rec = {'id': int(tid), 'time': int(t)}
        for c in merged.columns:
            if c in ['trade_id','label']: continue
            rec[c] = float(row[c]) * (1.0 + 0.01 * _np.random.randn())
        rows.append(rec)
ts = pd.DataFrame(rows)
print('Long form shape:', ts.shape)

fc = EfficientFCParameters()
feat = extract_features(ts, column_id='id', column_sort='time', default_fc_parameters=fc, n_jobs=4)
feat = feat.fillna(0)
feat.to_parquet(PROCESSED/'tsfresh_eff_features_fixed.parquet')
print('TSFresh features saved', feat.shape)


Long form shape: (40000, 21)


Feature Extraction: 100%|███████████████████████████████████████████████████████████| 20/20 [11:39<00:00, 34.98s/it]


TSFresh features saved (2000, 14763)


In [7]:
# 6) Merge features + selection
feat = pd.read_parquet(PROCESSED/'tsfresh_eff_features_fixed.parquet')
comp = pd.read_parquet(PROCESSED/'merged_composites_v2_fixed.parquet').set_index('trade_id')
comp = comp.loc[comp.index.intersection(feat.index)]
X_full = pd.concat([comp.drop(columns=['label'], errors='ignore'), feat.reindex(comp.index)], axis=1).fillna(0)
y = comp['label']
X_full = X_full.replace([np.inf, -np.inf], np.nan).fillna(0).clip(-1e6, 1e6)
X_full.to_parquet(PROCESSED/'features_full_v2_fixed.parquet')
print('features_full_v2_fixed saved', X_full.shape)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=300, n_jobs=4, random_state=42)
rf.fit(X_full, y)
imp = pd.Series(rf.feature_importances_, index=X_full.columns).sort_values(ascending=False)
TOP = 60
sel = imp.head(TOP).index.tolist()
pd.Series(sel).to_csv(MODELS/'selected_features_v2_fixed.csv', index=False)
X_sel = X_full[sel]
X_sel.to_parquet(PROCESSED/'features_train_v2_fixed.parquet')
print('Saved selected features', X_sel.shape)


features_full_v2_fixed saved (2000, 14782)
Saved selected features (2000, 60)


In [12]:
# --- Fixed LightGBM Training (handles GPU + special character feature names) ---
import re, joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Load features
X = pd.read_parquet(PROCESSED / 'features_train_v2_fixed.parquet')
y = y.reindex(X.index).fillna(0).astype(int)

# --- Sanitize feature names for LightGBM ---
def sanitize_columns(df):
    safe_cols = []
    for c in df.columns:
        new_c = re.sub(r'[^A-Za-z0-9_]+', '_', c)  # replace all special chars
        safe_cols.append(new_c)
    df.columns = safe_cols
    return df

X = sanitize_columns(X)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

use_gpu = False
auc_lgb = None

try:
    import lightgbm as lgb
    params = {
        'n_estimators': 1000,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'n_jobs': -1,
        'device': 'gpu'  # try GPU first
    }

    lgb_clf = lgb.LGBMClassifier(**params)
    callbacks = [lgb.early_stopping(50, verbose=False)]

    try:
        lgb_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)
        use_gpu = True
    except Exception as e:
        print("⚠️ GPU training failed:", e)
        params['device'] = 'cpu'
        lgb_clf = lgb.LGBMClassifier(**params)
        lgb_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)

    p_val = lgb_clf.predict_proba(X_val)[:, 1]
    auc_lgb = roc_auc_score(y_val, p_val)
    print(f"✅ LightGBM AUC: {auc_lgb:.4f} | GPU used: {use_gpu}")
    joblib.dump(lgb_clf, MODELS / 'lgb_model_v2_final.pkl')

except Exception as e:
    print("❌ LightGBM training failed:", e)

# --- RandomForest baseline ---
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=400, n_jobs=4, random_state=42)
rf.fit(X_train, y_train)
p_rf = rf.predict_proba(X_val)[:, 1]
auc_rf = roc_auc_score(y_val, p_rf)
print(f"✅ RF AUC: {auc_rf:.4f}")

# --- Ensemble stack if LightGBM succeeded ---
if auc_lgb:
    import numpy as np
    stack = np.mean([p_val, p_rf], axis=0)
    auc_stack = roc_auc_score(y_val, stack)
    print(f"✅ Stacked AUC: {auc_stack:.4f}")
else:
    auc_stack = auc_rf

# Save results
pd.Series({
    'auc_lgb': auc_lgb,
    'auc_rf': auc_rf,
    'auc_stack': auc_stack
}).to_csv(REPORTS / 'metrics_v2_final.csv')


[LightGBM] [Info] Number of positive: 788, number of negative: 812
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 15300
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 60
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 60 dense feature groups (0.09 MB) transferred to GPU in 0.001810 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492500 -> initscore=-0.030002
[LightGBM] [Info] Start training from score -0.030002
✅ LightGBM AUC: 0.5633 | GPU used: True
✅ RF AUC: 0.5746
✅ Stacked AUC: 0.5798
