# 0.2 — Feature Engineering
Build richer features on top of the previously engineered per-trader datasets, ready for modelling.

In [None]:
import pandas as pd
import numpy as np
import os

# Root directory; each trader's files live under f'{BASE}/{trader}/data/engineered/'.
BASE = '.'

# Trader profiles handled by this notebook. Each name doubles as the trader's
# folder name and as the prefix of its CSV files.
TRADERS = [
    'calm_trader',
    'loss_averse_trader',
    'overtrader',
    'revenge_trader',
]


## Load Processed Data

In [None]:
# Per-trader input frames, keyed by trader name. The 'timestamp' column is
# parsed to datetimes on load; a short preview is printed for each trader.
dfs = {}
for t in TRADERS:
    csv_path = f'{BASE}/{t}/data/engineered/{t}_engineered.csv'
    dfs[t] = pd.read_csv(csv_path, parse_dates=['timestamp'])
    print(f"{t}: {dfs[t].shape}")
    print(dfs[t].head(2))


## Encoding Map Reference
The same encoding map is applied consistently across all trader datasets; it is loaded from `encoding_map.json` and printed below for reference.

In [None]:
import json
# Load the shared encoding map and pretty-print it for reference.
encoding_path = f'{BASE}/encoding_map.json'
with open(encoding_path) as f:
    enc = json.load(f)
encoding_text = json.dumps(enc, indent=2)
print(encoding_text)


## Additional Feature Engineering

In [None]:
def _signed_streaks(wins):
    """Return the signed run length ending at each outcome in *wins*.

    A win (``1``) extends a positive run or starts a new one at ``+1``; a
    loss (``0``) extends a negative run or starts a new one at ``-1``. Any
    other value (e.g. NaN) resets the run to ``0``.
    """
    streaks, run = [], 0
    for outcome in wins:
        if outcome == 1:
            # Continue a winning run, otherwise restart at +1 (never at 0).
            run = run + 1 if run > 0 else 1
        elif outcome == 0:
            # Continue a losing run, otherwise restart at -1 (never at 0).
            run = run - 1 if run < 0 else -1
        else:
            run = 0
        streaks.append(run)
    return streaks


def engineer(df):
    """Return a copy of *df* with rolling, cumulative, streak and risk features.

    The input frame is not modified. It must provide the columns ``win``,
    ``profit_loss``, ``quantity``, ``entry_price``, ``timestamp`` (datetime
    dtype), ``asset_encoded`` and ``side_encoded``.

    Added columns:
        rolling_win_rate_10, rolling_avg_pnl_10, rolling_std_pnl_10:
            statistics over the last 10 rows (fewer at the start).
        cum_wins, cum_losses, cum_pnl: running totals.
        streak: signed length of the current win (+) / loss (-) run.
        trade_value: ``quantity * entry_price``.
        return_pct: ``profit_loss / trade_value`` as a fraction (not
            multiplied by 100); NaN where ``trade_value`` is 0.
        day_of_week: 0 = Monday.
        asset_side: string ``"<asset_encoded>_<side_encoded>"``.

    NOTE(review): all row-order-dependent features assume the rows are
    already in chronological order; nothing here sorts by ``timestamp``.
    NOTE(review): the rolling, cumulative and streak features include the
    current row's own outcome, so they leak the target if ``win`` is what
    gets predicted. Consider shifting by one row before modelling.
    """
    df = df.copy()

    # ── Rolling features (window=10) ──────────────────────────
    df['rolling_win_rate_10']  = df['win'].rolling(10, min_periods=1).mean()
    df['rolling_avg_pnl_10']   = df['profit_loss'].rolling(10, min_periods=1).mean()
    # A single observation has no sample std (NaN); report it as 0 instead.
    df['rolling_std_pnl_10']   = df['profit_loss'].rolling(10, min_periods=1).std().fillna(0)

    # ── Cumulative features ────────────────────────────────────
    df['cum_wins']   = df['win'].cumsum()
    df['cum_losses'] = (1 - df['win']).cumsum()
    df['cum_pnl']    = df['profit_loss'].cumsum()

    # ── Streak features ────────────────────────────────────────
    # A flipped outcome restarts the run at ±1; the previous inline logic
    # recorded 0 when the run being broken had length 1.
    df['streak'] = _signed_streaks(df['win'])

    # ── Risk features ──────────────────────────────────────────
    df['trade_value'] = df['quantity'] * df['entry_price']
    # Replace zero notional with NaN so the division yields NaN, not ±inf.
    df['return_pct']  = df['profit_loss'] / df['trade_value'].replace(0, np.nan)

    # ── Day of week ────────────────────────────────────────────
    df['day_of_week'] = df['timestamp'].dt.dayofweek  # 0=Mon

    # ── Interaction ────────────────────────────────────────────
    df['asset_side'] = df['asset_encoded'].astype(str) + '_' + df['side_encoded'].astype(str)

    return df

# Run engineer() on every loaded frame; results are keyed by trader name and
# the resulting shape and column list are printed for each trader.
engineered = {}
for t in TRADERS:
    source_frame = dfs[t]
    engineered[t] = engineer(source_frame)
    print(f"{t}: {engineered[t].shape} columns: {list(engineered[t].columns)}")


## Feature Importance Preview (Correlation with Win)

In [None]:
import matplotlib.pyplot as plt

# Numeric columns correlated against 'win'. 'price_change_pct', 'hour' and
# 'month' are not created in this notebook; presumably they come from the
# loaded CSVs.
# NOTE(review): the rolling and streak features include the current trade's
# own outcome, and 'return_pct' is derived from profit_loss, so their
# correlation with 'win' is likely inflated.
num_feats = [
    'quantity', 'price_change_pct',
    'rolling_win_rate_10', 'rolling_avg_pnl_10', 'rolling_std_pnl_10',
    'streak', 'return_pct', 'day_of_week',
    'asset_encoded', 'side_encoded', 'hour', 'month',
]

# One panel per trader on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
for ax, t in zip(axes.flat, TRADERS):
    feature_frame = engineered[t][num_feats + ['win']]
    corrs = feature_frame.corr()['win'].drop('win').sort_values()
    # Red for negative correlations, green for everything else.
    colors = ['#4CAF50' if not (v < 0) else '#F44336' for v in corrs]
    ax.barh(corrs.index, corrs.values, color=colors, alpha=0.85)
    ax.axvline(0, color='black', linewidth=0.8)
    ax.set_title(f'{t} — Feature Correlation w/ Win')
    ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()


## Save Fully Engineered Data

In [None]:
# Write each trader's feature table into the same folder its input CSV was
# read from, without the row index.
for t in TRADERS:
    out = f'{BASE}/{t}/data/engineered/{t}_full_features.csv'
    full_features = engineered[t]
    full_features.to_csv(out, index=False)
    print(f"Saved: {out}")
