# Q1 Ablation Study

This notebook orchestrates the feature-engineering ablation for the stock price movement research question. Each preprocessing profile is defined by one JSON configuration file under `assets/configs/q1/`.

In [1]:
from pathlib import Path
import os, sys
# Project root = parent of the notebook's working directory. It is put on
# sys.path so the project packages imported in later cells can be resolved.
MAIN_PATH = Path.cwd().parent
# Guard the append: re-running this cell must not stack duplicate entries.
if str(MAIN_PATH) not in sys.path:
    sys.path.append(str(MAIN_PATH))

In [2]:
from __future__ import annotations

import warnings
from pathlib import Path
from typing import Mapping, Sequence

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

from data_processings.datasets import StockMarketDataset
from data_processings.pipeline_builder import build_pipeline_from_config, load_pipeline_config
from data_processings.transforms import DFXPipeline
from models import build_model


In [3]:
# Every *.json file in this directory is treated as one profile, keyed by its
# file stem; sorting the listing keeps the profile order deterministic.
CONFIG_DIR = MAIN_PATH / "assets" / "configs" / "q1"
PROFILE_CONFIGS = {
    config_file.stem: config_file
    for config_file in sorted(CONFIG_DIR.glob("*.json"))
}
# Fail fast: without at least one profile there is nothing to run.
if len(PROFILE_CONFIGS) == 0:
    raise RuntimeError("No preprocessing configs found under assets/configs/q1")

# Single loader instance shared by every profile run.
dataset_loader = StockMarketDataset()


In [4]:
def split_time_series_frame(df: pd.DataFrame, split_cfg: Mapping[str, object]) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Cut ``df`` into a leading train part and a trailing test part.

    The frame is split positionally (no shuffling): the first rows form the
    train set and the remaining rows form the test set.

    Args:
        df: Frame to split; row order is taken as-is.
        split_cfg: Split settings. ``"method"`` (default ``"time"``) must be
            ``"time"``; ``"test_size"`` (default ``0.2``) is the fraction of
            rows placed in the test part.

    Returns:
        ``(train, test)`` as independent copies of the two slices.

    Raises:
        ValueError: If ``split_cfg["method"]`` is anything other than ``"time"``.
    """
    if split_cfg.get("method", "time") != "time":
        raise ValueError("Only time-based splits are supported in this notebook")

    n_rows = len(df)
    holdout_fraction = float(split_cfg.get("test_size", 0.2))
    cutoff = int(n_rows * (1 - holdout_fraction))

    # Clamp the cut point into [1, n_rows - 1]; the lower bound wins when the
    # two bounds conflict (very short frames).
    if cutoff > n_rows - 1:
        cutoff = n_rows - 1
    if cutoff < 1:
        cutoff = 1

    leading = df.iloc[:cutoff].copy()
    trailing = df.iloc[cutoff:].copy()
    return leading, trailing


def sanitize_features_target(features: pd.DataFrame, target: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
    """Drop every row that has a missing or infinite value in features or target.

    Features and target are joined column-wise (aligned on their index),
    ``+inf``/``-inf`` are turned into ``NaN``, and any row containing a ``NaN``
    is removed, so the returned pair always shares the same index.

    Args:
        features: Feature frame.
        target: Target series. If its name is falsy (e.g. ``None``) the
            returned series is named ``"target"``.

    Returns:
        ``(cleaned_features, cleaned_target)`` restricted to the surviving rows.
    """
    target_name = target.name or "target"

    # Join the target under a label that cannot clash with a feature column.
    # With a clashing label, selecting the target back out would yield a
    # two-column DataFrame and the drop would also remove the real feature.
    temp_key = target_name
    suffix = 0
    while temp_key in features.columns:
        suffix += 1
        temp_key = f"__{target_name}_{suffix}__"

    combined = pd.concat([features, target.rename(temp_key)], axis=1)
    combined = combined.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    # Hand the target back under its public name, not the temporary label.
    cleaned_target = combined[temp_key].rename(target_name)
    cleaned_features = combined.drop(columns=[temp_key])
    return cleaned_features, cleaned_target


def load_dataset_for_config(cfg: Mapping[str, object]) -> pd.DataFrame:
    """Load the raw dataset described by ``cfg["dataset_options"]``.

    Args:
        cfg: Profile configuration; ``"dataset_options"`` defaults to an empty
            mapping when absent.

    Returns:
        Whatever the module-level ``dataset_loader.load`` returns for the options.

    Raises:
        ValueError: If the dataset options contain no ``"tickers"`` entry.
    """
    dataset_options = cfg.get("dataset_options", {})
    has_tickers = "tickers" in dataset_options
    if has_tickers:
        return dataset_loader.load(dataset_options)
    raise ValueError("StockMarketDataset requires 'dataset_options.tickers'")


def prepare_pipeline_frames(raw_df: pd.DataFrame, pipeline: DFXPipeline, split_cfg: Mapping[str, object]) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Run the pipeline's global stage, split by time, then fit on train only.

    Steps, in order:
      1. ``pipeline.apply_global`` on the full raw frame.
      2. Replace infinities with ``NaN`` and drop rows containing any ``NaN``.
      3. Split the cleaned frame with ``split_time_series_frame``.
      4. ``pipeline.fit_transform`` on the train part, ``pipeline.transform``
         on the test part.

    Args:
        raw_df: Unprocessed dataset.
        pipeline: Pipeline exposing ``apply_global``, ``fit_transform`` and
            ``transform``.
        split_cfg: Settings forwarded to ``split_time_series_frame``.

    Returns:
        ``(train_df, test_df)`` after transformation.
    """
    globally_processed = pipeline.apply_global(raw_df)
    finite_rows = globally_processed.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
    train_part, test_part = split_time_series_frame(finite_rows, split_cfg)
    # The fit must happen on the train part before the test part is transformed.
    fitted_train = pipeline.fit_transform(train_part)
    transformed_test = pipeline.transform(test_part)
    return fitted_train, transformed_test


def prepare_feature_matrices(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    target_column: str,
    drop_columns: Sequence[str]
) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Turn processed train/test frames into numeric feature matrices and targets.

    For each frame the target column and ``drop_columns`` are removed, only
    numeric columns are kept, and rows with missing/infinite values are
    dropped via ``sanitize_features_target``. The test features are finally
    re-indexed to the train feature columns (columns absent from test are
    filled with ``0.0``; extra test columns are discarded).

    Args:
        train_df: Processed training frame.
        test_df: Processed test frame.
        target_column: Name of the label column; must exist in both frames.
        drop_columns: Additional columns to exclude; missing names are ignored.

    Returns:
        ``(features_train, y_train, features_test, y_test)``.

    Raises:
        KeyError: If ``target_column`` is missing from either frame.
    """
    for frame in (train_df, test_df):
        if target_column not in frame.columns:
            raise KeyError(f"Target column '{target_column}' missing from processed frames.")

    excluded = [*drop_columns, target_column]

    def _numeric_features(frame: pd.DataFrame) -> pd.DataFrame:
        # Non-feature columns out first, then keep numeric dtypes only.
        return frame.drop(columns=excluded, errors="ignore").select_dtypes(include=[np.number])

    features_train, y_train = sanitize_features_target(
        _numeric_features(train_df), train_df[target_column].copy()
    )
    features_test, y_test = sanitize_features_target(
        _numeric_features(test_df), test_df[target_column].copy()
    )
    # Align the test columns (and their order) with what the model is fit on.
    features_test = features_test.reindex(columns=features_train.columns, fill_value=0.0)
    return features_train, y_train, features_test, y_test


def evaluate_model(
    model_key: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    metrics: Sequence[str],
) -> dict[str, float]:
    """Fit the model named by ``model_key`` and score it on the test split.

    The model comes from ``build_model(model_key)``; features are passed to it
    as ``float64`` NumPy arrays.

    Args:
        model_key: Identifier handed to ``build_model``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        metrics: Metric names to compute; ``"accuracy"`` and ``"f1"`` are
            supported.

    Returns:
        Mapping from metric name to its value on the test split.

    Raises:
        ValueError: If ``metrics`` contains an unsupported name. This is
            raised only after the model has been fit and has predicted.
    """
    estimator = build_model(model_key)
    estimator.fit(X_train.to_numpy(dtype=np.float64), y_train.to_numpy())
    predictions = estimator.predict(X_test.to_numpy(dtype=np.float64))

    # Lazy scorers: a metric is computed only when it is actually requested.
    scorers = {
        "accuracy": lambda: accuracy_score(y_test, predictions),
        "f1": lambda: f1_score(y_test, predictions, zero_division=0),
    }

    scores: dict[str, float] = {}
    for metric in metrics:
        scorer = scorers.get(metric)
        if scorer is None:
            raise ValueError(f"Unsupported metric: {metric}")
        scores[metric] = scorer()
    return scores


def run_profile(profile_name: str, cfg_path: Path) -> list[dict[str, object]]:
    """Run every configured model for one preprocessing profile.

    Loads the profile configuration, builds its pipeline, loads and processes
    the dataset, and evaluates each model listed under ``cfg["models"]``.

    Args:
        profile_name: Label stored in the ``"config"`` field of each record.
        cfg_path: Path of the profile's configuration file.

    Returns:
        One record per model with the keys ``config``, ``model``,
        ``train_samples``, ``test_samples``, ``num_features`` plus one key per
        computed metric. An empty list is returned (and a ``RuntimeWarning``
        is emitted) when too few rows survive preprocessing.
    """
    # Minimum number of usable rows required to evaluate a profile at all.
    min_train_rows, min_test_rows = 10, 5

    cfg = load_pipeline_config(cfg_path)
    pipeline, metadata = build_pipeline_from_config(cfg)
    raw_df = load_dataset_for_config(cfg)
    split_cfg = cfg.get("split", {})
    train_df, test_df = prepare_pipeline_frames(raw_df, pipeline, split_cfg)

    # Pipeline metadata takes precedence over the raw config for these settings.
    target_column = str(metadata.get("target_column", cfg.get("target_column", "target")))
    drop_columns = metadata.get("drop_columns", cfg.get("drop_columns", [])) or []
    models = cfg.get("models", [])
    metrics = cfg.get("metrics", ["accuracy"])

    X_train, y_train, X_test, y_test = prepare_feature_matrices(train_df, test_df, target_column, drop_columns)
    if len(X_train) < min_train_rows or len(X_test) < min_test_rows:
        # Make the skip visible: otherwise the profile silently disappears
        # from the ablation results table.
        warnings.warn(
            f"Skipping profile '{profile_name}': {len(X_train)} train / {len(X_test)} test rows "
            f"after preprocessing (need at least {min_train_rows} / {min_test_rows}).",
            RuntimeWarning,
            stacklevel=2,
        )
        return []

    records: list[dict[str, object]] = []
    for model_key in models:
        metric_values = evaluate_model(model_key, X_train, y_train, X_test, y_test, metrics)
        record = {
            "config": profile_name,
            "model": model_key,
            "train_samples": len(X_train),
            "test_samples": len(X_test),
            "num_features": X_train.shape[1],
        }
        record.update(metric_values)
        records.append(record)

    return records


In [5]:
# Collect the per-model records of every discovered profile.
all_records: list[dict[str, object]] = []
for profile_name, cfg_path in PROFILE_CONFIGS.items():
    all_records += run_profile(profile_name, cfg_path)

results_df = pd.DataFrame(all_records)
# Sort by whichever of the key columns exist; an empty frame has none.
sort_columns = [column for column in ("config", "model") if column in results_df.columns]
if sort_columns and not results_df.empty:
    results_df = results_df.sort_values(sort_columns).reset_index(drop=True)
results_df


Unnamed: 0,config,model,train_samples,test_samples,num_features,accuracy,f1
0,P0_baseline,decision_tree,2572,644,36,0.532609,0.426667
1,P0_baseline,gradient_boosting,2572,644,36,0.506211,0.487097
2,P0_baseline,logistic_regression,2572,644,36,0.559006,0.0
3,P0_baseline,naive_bayes,2572,644,36,0.526398,0.442413
4,P0_baseline,random_forest,2572,644,36,0.531056,0.044304
5,P0_baseline,svm,2572,644,36,0.559006,0.006993
6,P1_median_mode_no_indicator,decision_tree,2572,644,36,0.532609,0.426667
7,P1_median_mode_no_indicator,gradient_boosting,2572,644,36,0.506211,0.487097
8,P1_median_mode_no_indicator,logistic_regression,2572,644,36,0.559006,0.0
9,P1_median_mode_no_indicator,naive_bayes,2572,644,36,0.526398,0.442413


## Notes
- All preprocessing and model definitions are sourced from the per-profile JSON files under `assets/configs/q1/`.
- Extend the experiment by editing the configuration (e.g., add feature sets or models) and re-running the notebook.
