# 02 — Data Preprocessing

Loads the raw data, engineers features, creates the target variable, splits the data temporally, and normalizes the features.
Saves the processed splits for use by subsequent notebooks.

In [1]:
import sys
import os
import json
import pickle
from pathlib import Path

# ── Project configuration ──────────────────────────────────────────────────────
# Name of the sub-folder under config/ whose JSON files drive this run.
# "default_run" is the full-training configuration; change it to use another one.
PROJECT_FOLDER = "default_run"


def find_project_root(start: Path) -> Path:
    """Locate the project root from ``start``.

    Walks from ``start`` upwards and returns the first directory that contains
    both a ``config/`` and a ``src/`` sub-directory (the two things this
    notebook needs from the root). If no such directory exists, falls back to
    ``start.parent``, which is what this cell used unconditionally before.

    Searching for markers instead of always taking the parent makes the cell
    safe to re-run: after the ``os.chdir`` below the working directory *is* the
    root, and taking its parent again would climb out of the project.
    """
    for candidate in (start, *start.parents):
        if (candidate / "config").is_dir() and (candidate / "src").is_dir():
            return candidate
    return start.parent


# ── Set up paths ───────────────────────────────────────────────────────────────
PROJECT_ROOT = find_project_root(Path(os.getcwd()))

# Keep exactly one entry for the root at the front of sys.path so that
# re-running the cell does not pile up duplicates.
_root_entry = str(PROJECT_ROOT)
if _root_entry in sys.path:
    sys.path.remove(_root_entry)
sys.path.insert(0, _root_entry)

# Project code is run with the root as working directory.
os.chdir(PROJECT_ROOT)

DATA_CONFIG = PROJECT_ROOT / "config" / PROJECT_FOLDER / "data_config.json"
MODEL_CONFIG = PROJECT_ROOT / "config" / PROJECT_FOLDER / "model_config.json"

print(f"Project root  : {PROJECT_ROOT}")
print(f"Project folder: {PROJECT_FOLDER}")

Project root  : /Users/sharannaribole/Documents/github/stock-return-classifier
Project folder: default_run


In [2]:
from src.utils.config_loader import ConfigLoader

# Build the run configuration from the data and model config files
# (ConfigLoader is given plain string paths).
config_paths = (str(DATA_CONFIG), str(MODEL_CONFIG))
config = ConfigLoader(*config_paths)
print(config)

ConfigLoader(ticker=SPY, project=default_run)


## Load Raw Data

In [3]:
import pandas as pd

# The raw parquet file is named after the project folder and lives in data/raw/.
raw_path = PROJECT_ROOT.joinpath("data", "raw", f"{PROJECT_FOLDER}_raw.parquet")
raw_df = pd.read_parquet(raw_path)

print(f"Loaded: {raw_path.name}")
print(f"Shape : {raw_df.shape}")

# First and last index entries give the covered date range.
first_stamp, last_stamp = raw_df.index[0], raw_df.index[-1]
print(f"Date range: {first_stamp.date()} to {last_stamp.date()}")
raw_df.head()

Loaded: default_run_raw.parquet
Shape : (5119, 5)
Date range: 2005-10-10 to 2026-02-13


Unnamed: 0_level_0,Adj_Close,High,Low,Volume,VIX_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005-10-10,81.43998,82.202192,81.233979,52677000,15.55
2005-10-11,81.323257,81.982468,81.247722,75629800,15.63
2005-10-12,80.684631,81.803914,80.622833,100510400,16.219999
2005-10-13,80.636574,81.082915,80.258899,99052900,16.469999
2005-10-14,81.488052,81.584187,80.72584,88651000,14.87


## Feature Engineering

In [4]:
from src.features.engineer import FeatureEngineer

# Derive the feature frame from the raw frame using the loaded configuration.
engineer = FeatureEngineer(config)
feat_df = engineer.create_features(raw_df)

# Quick sanity report: column names, resulting shape, and total missing cells.
feature_names = list(feat_df.columns)
print(f"Features created: {feature_names}")
print(f"Shape after feature engineering: {feat_df.shape}")
missing_cells = feat_df.isnull().sum().sum()
print(f"NaN values: {missing_cells}")
feat_df.head()

Features created: ['Volume', 'VIX_Close', 'Close', 'BB_High', 'BB_Low', 'BB_Width', 'BB_Position', 'EMA_8', 'EMA_21', 'ADX', 'RSI', 'MACD_line', 'MACD_signal', 'MACD_hist', 'Stoch_K', 'Stoch_D', 'ROC_3', 'ROC_5', 'Price_Return_1', 'Price_Return_5', 'IBS', 'ATR_pct']
Shape after feature engineering: (5086, 22)
NaN values: 0


Unnamed: 0_level_0,Volume,VIX_Close,Close,BB_High,BB_Low,BB_Width,BB_Position,EMA_8,EMA_21,ADX,...,MACD_signal,MACD_hist,Stoch_K,Stoch_D,ROC_3,ROC_5,Price_Return_1,Price_Return_5,IBS,ATR_pct
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-11-25,15270000,10.88,87.29734,87.568458,81.828762,0.067766,0.952764,86.256928,84.853488,45.565989,...,1.058224,0.199608,95.053026,94.713918,1.089334,1.997785,0.078727,1.997785,0.78048,0.902802
2005-11-28,54498500,11.84,86.679314,87.683476,82.155275,0.065099,0.818356,86.350792,85.019472,45.766173,...,1.094454,0.144921,79.15152,89.239926,-0.055412,0.879062,-0.707956,0.879062,0.154474,0.913894
2005-11-29,51738900,11.89,86.583183,87.740606,82.507406,0.061477,0.778831,86.402434,85.161628,45.952059,...,1.116188,0.086935,76.678135,83.627561,-0.739992,0.262348,-0.110903,0.262348,0.0,0.899976
2005-11-30,56007200,12.06,86.116272,87.699293,82.886565,0.056426,0.671076,86.338843,85.248413,44.942194,...,1.117703,0.00606,64.66475,73.498135,-1.352926,-0.704621,-0.539263,-0.704621,0.097564,0.910279
2005-12-01,65468200,11.24,86.995193,87.846631,83.078444,0.055793,0.821434,86.484698,85.407212,44.350997,...,1.117941,0.000952,85.541843,75.628243,0.364424,-0.267658,1.020622,-0.267658,0.676193,0.928053


## Target Creation

In [5]:
from src.features.target import TargetCreator

# Label the engineered frame; the result carries a "target" column.
target_creator = TargetCreator.from_config(config)
labeled_df = target_creator.create_target(feat_df)

# Print the label summary, one "key: value" line per entry.
target_info = target_creator.get_target_info(labeled_df)
print("Target class distribution:")
for stat_name, stat_value in target_info.items():
    print(f"  {stat_name}: {stat_value}")
print(f"\nShape after target creation: {labeled_df.shape}")

# Most recent rows: price next to its assigned label.
labeled_df[["Close", "target"]].tail(10)

Target class distribution:
  total: 5085
  threshold_pct: 1.0
  class_0: 4390
  class_1: 695
  class_0_pct: 86.33
  class_1_pct: 13.67

Shape after target creation: (5085, 23)


Unnamed: 0_level_0,Close,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2026-01-30,691.969971,0
2026-02-02,695.409973,0
2026-02-03,689.530029,0
2026-02-04,686.190002,0
2026-02-05,677.619995,1
2026-02-06,690.619995,0
2026-02-09,693.950012,0
2026-02-10,692.119995,0
2026-02-11,691.960022,0
2026-02-12,681.27002,0


## Temporal Data Splits

In [6]:
from src.data.splitter import DataSplitter

# Split the labeled frame into a training frame, validation folds, and a test frame.
splitter = DataSplitter(config)
train_df, val_folds, test_df = splitter.split(labeled_df)
split_info = splitter.get_split_info(train_df, val_folds, test_df)

# Report the split summary, one "key: value" line per entry.
print("Split information:")
for field_name, field_value in split_info.items():
    print(f"  {field_name}: {field_value}")

Split information:
  train_start: 2005-11-25
  train_end: 2025-02-12
  train_size: 4834
  n_val_folds: 5
  val_fold_sizes: [805, 805, 805, 805, 805]
  val_train_sizes: [809, 1614, 2419, 3224, 4029]
  test_start: 2025-02-13
  test_end: 2026-02-12
  test_size: 251


## Normalization

In [7]:
from src.features.normalizer import Normalizer

# Normalize all three splits in one call; the method comes from the config.
normalizer = Normalizer.from_config(config)
norm_train, norm_val_folds, norm_test = normalizer.fit_transform(
    train_df, val_folds, test_df
)

print(f"Normalization method: {normalizer.method}")
print(f"Train shape  : {norm_train.shape}")
print(f"Val folds    : {len(norm_val_folds)}")
print(f"Test shape   : {norm_test.shape}")

# Class counts of the target column in the training split.
train_balance = norm_train["target"].value_counts()
print()
print("Train target balance:")
print(train_balance)

# Share of class-1 rows in percent; .get(1, 0) covers a split with no class-1 rows.
class_1_rate = train_balance.get(1, 0) / len(norm_train) * 100
print(f"  Class 1 rate: {class_1_rate:.1f}%")

Normalization method: rolling
Train shape  : (4834, 23)
Val folds    : 5
Test shape   : (251, 23)

Train target balance:
target
0    4164
1     670
Name: count, dtype: int64
  Class 1 rate: 13.9%


## Save Processed Data

In [8]:
# All artifacts of this run are written to data/processed/<PROJECT_FOLDER>/.
processed_dir = PROJECT_ROOT.joinpath("data", "processed", PROJECT_FOLDER)
processed_dir.mkdir(parents=True, exist_ok=True)

# Save splits: train and test as parquet, the validation folds as a pickle.
for split_frame, file_name in ((norm_train, "train.parquet"), (norm_test, "test.parquet")):
    split_frame.to_parquet(processed_dir / file_name)

with open(processed_dir / "val_folds.pkl", "wb") as folds_file:
    pickle.dump(norm_val_folds, folds_file)

# Save metadata
# NOTE(review): the feature-engineering output lists a "Close" column but no
# "Adj_Close", so the "Adj_Close" exclusion below may be a stale name — confirm
# whether "Close" is meant to be kept as a feature.
non_feature_cols = ("target", "Adj_Close")
feature_cols = [col for col in norm_train.columns if col not in non_feature_cols]

# target_info here summarises the normalized training split only.
train_target_info = target_creator.get_target_info(norm_train)
metadata = {
    "project_folder": PROJECT_FOLDER,
    "feature_cols": feature_cols,
    "target_info": train_target_info,
    "split_info": split_info,
    "normalization_method": normalizer.method,
}
with open(processed_dir / "metadata.json", "w") as metadata_file:
    json.dump(metadata, metadata_file, indent=2)

# Summary of what was written.
print(f"Saved to: {processed_dir}")
print(f"  train.parquet    : {norm_train.shape}")
print(f"  test.parquet     : {norm_test.shape}")
print(f"  val_folds.pkl    : {len(norm_val_folds)} folds")
print(f"  metadata.json    : {len(feature_cols)} features")

Saved to: /Users/sharannaribole/Documents/github/stock-return-classifier/data/processed/default_run
  train.parquet    : (4834, 23)
  test.parquet     : (251, 23)
  val_folds.pkl    : 5 folds
  metadata.json    : 22 features
