# 02 — Feature Engineering
Build the complete feature matrix: wavelet denoising, technical indicators,
volatility microstructure, order flow, Smart Money Concepts, and multi-timeframe (MTF) alignment.

In [None]:
# Install the third-party packages into the notebook runtime via an IPython
# shell escape; -q keeps pip's output quiet.
!pip install -q torch xgboost ccxt PyWavelets pandas-ta hmmlearn numba scikit-learn pyyaml tqdm pyarrow

In [None]:
# Environment setup for Colab: mount Drive, make the scalp2 checkout importable,
# configure logging, and load the project config.
from google.colab import drive
# Mount Google Drive at /content/drive; the processed-data path set at the end
# of this cell lives under this mount point.
drive.mount('/content/drive')

import sys, os
REPO_DIR = '/content/scalp2'
# Clone only when the checkout is not already present, so re-running the cell
# does not attempt a second clone.
# NOTE(review): <YOUR_USERNAME> in the URL below is a placeholder and must be
# replaced with the real GitHub account before this line can succeed.
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/<YOUR_USERNAME>/scalp2.git {REPO_DIR}
# Put the checkout at the front of sys.path so the `scalp2` imports below
# resolve against it.
sys.path.insert(0, REPO_DIR)

import logging
# INFO-level root logging with timestamp, logger name, and level in each record.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s')

from scalp2.config import load_config
config = load_config(f'{REPO_DIR}/config.yaml')
# Override whatever config.yaml specified so processed data is read from and
# written to the mounted Drive (presumably so artifacts outlive the Colab
# session — confirm).
config.data.processed_dir = '/content/drive/MyDrive/scalp2/data/processed'

In [None]:
import pandas as pd


def _load_clean(timeframe):
    """Read the cleaned BTC_USDT parquet for one timeframe from the processed dir."""
    return pd.read_parquet(f'{config.data.processed_dir}/BTC_USDT_{timeframe}_clean.parquet')


# Load the cleaned OHLCV frames for every timeframe used downstream.
df_15m = _load_clean('15m')
df_1h = _load_clean('1h')
df_4h = _load_clean('4h')

# Report bar counts; labels are pre-padded so the numbers line up.
for _label, _frame in (('15m:', df_15m), ('1h: ', df_1h), ('4h: ', df_4h)):
    print(f'{_label} {len(_frame)} bars')

In [None]:
from scalp2.features.builder import build_features, drop_warmup_nans, get_feature_columns


def _build_logged(label, frame):
    """Announce the timeframe being processed, then build its features."""
    print(f'Building {label} features...')
    return build_features(frame, config.features)


# Run the same feature pipeline independently on each timeframe.
df_15m_feat = _build_logged('15m', df_15m)
df_1h_feat = _build_logged('1h', df_1h)
df_4h_feat = _build_logged('4h', df_4h)

# Summarise how many feature columns the 15m frame ended up with.
print('15m features: {} columns'.format(len(get_feature_columns(df_15m_feat))))

In [None]:
from scalp2.data.mtf_builder import build_mtf_dataset

# Combine the three per-timeframe feature frames into a single dataset.
df_full = build_mtf_dataset(df_15m_feat, df_1h_feat, df_4h_feat)
print('Full dataset: {} rows x {} columns'.format(*df_full.shape))

# Remove the warmup rows, then report the remaining dimensions.
df_full = drop_warmup_nans(df_full)
print('After warmup removal: {} rows x {} columns'.format(*df_full.shape))

# Column names that will be fed to the model, plus the frame's memory footprint.
feature_cols = get_feature_columns(df_full)
print('Model features: {}'.format(len(feature_cols)))
print('Memory: {:.1f} MB'.format(df_full.memory_usage(deep=True).sum() / 1e6))

In [None]:
import json

# Persist the full feature matrix as parquet in the processed-data directory.
output_path = f'{config.data.processed_dir}/BTC_USDT_features.parquet'
df_full.to_parquet(output_path)
print('Saved feature matrix to {}'.format(output_path))

# Persist the list of model feature column names alongside the matrix.
_columns_path = f'{config.data.processed_dir}/feature_columns.json'
with open(_columns_path, 'w') as fh:
    json.dump(feature_cols, fh)
print('Saved {} feature column names'.format(len(feature_cols)))