# Q1 Ablation Study

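This notebook orchestrates the ablation study for the stock price movement research question (Q1). Depending on the experiment's `ablation_axis` setting, it varies either the preprocessing profile or the feature-set combination while holding everything else fixed; all settings come from the shared configuration in `assets/config.json`.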

In [1]:
from pathlib import Path
import os, sys
# Put the parent of the current working directory on the import path
# (presumably where the project modules imported by the next cell live).
# The membership check keeps this cell idempotent: re-running it does not
# append another duplicate entry to sys.path each time.
_project_root = str(Path(os.getcwd()).parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
from __future__ import annotations

from typing import Mapping, Sequence

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

from data_processings import (
    append_target,
    apply_base_preprocessing,
    apply_feature_sets,
    apply_post_split_transforms,
    balance_training_dataframe,
    get_experiment_config,
    get_preprocessing_config,
    load_config,
    load_stock_market_data,
    select_feature_columns,
)
from models import build_model

In [3]:
# Shared project configuration.
# NOTE(review): CONFIG is not referenced again in the visible cells; the
# get_*_config helpers below are called without it — confirm before removing.
CONFIG = load_config()
# Key of the experiment this notebook runs.
EXPERIMENT_KEY = "q1_stock_movement"
# Experiment settings. The functions below read "dataset_options", "split",
# "metrics", "models", "ablation_axis", "ablation_sets_key" and
# "feature_sets_fixed" from this mapping.
EXPERIMENT_CFG = get_experiment_config(EXPERIMENT_KEY)
# Dataset identifier passed to every data_processings helper call.
DATASET_KEY = EXPERIMENT_CFG["dataset"]
# Preprocessing settings for that dataset; run_ablation looks up the list of
# ablation variants in it under the experiment's "ablation_sets_key".
PREPROCESSING_CFG = get_preprocessing_config(DATASET_KEY)


def load_dataset() -> pd.DataFrame:
    """Load the raw stock-market frame described by the experiment's ``dataset_options``.

    The ``tickers`` entry is passed positionally to ``load_stock_market_data``;
    every other entry of ``dataset_options`` is forwarded as a keyword argument.

    Raises:
        ValueError: if ``dataset_options`` has no (or an empty) ``tickers`` entry.
    """
    dataset_options = EXPERIMENT_CFG.get("dataset_options", {})
    if not dataset_options.get("tickers"):
        raise ValueError("Experiment dataset options must define 'tickers'")

    # Copy before popping so the shared experiment configuration is not mutated.
    loader_kwargs = dict(dataset_options)
    symbols = loader_kwargs.pop("tickers")
    return load_stock_market_data(symbols, **loader_kwargs)


def sanitize_features_target(
    features: pd.DataFrame,
    target: pd.Series,
) -> tuple[pd.DataFrame, pd.Series]:
    """Drop every row with a missing or infinite value in the features or the target.

    Features and target are joined column-wise, +/-inf is treated as missing,
    and any row containing a missing value is removed from both.

    Args:
        features: Feature matrix.
        target: Label series; an unnamed (or falsy-named) series is called
            ``"target"`` in the returned result.

    Returns:
        ``(features, target)`` restricted to the surviving rows.
    """
    label = target.name if target.name else "target"
    joined = pd.concat([features, target.rename(label)], axis=1)
    # Map +/-inf to NaN first so one dropna pass removes both kinds of bad rows.
    complete_rows = joined.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
    return complete_rows.drop(columns=[label]), complete_rows[label]


def split_time_series_frame(
    df: pd.DataFrame,
    split_cfg: Mapping[str, object],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split a frame by row position into a leading train part and a trailing test part.

    Args:
        df: Frame to split. NOTE(review): rows are cut purely by position, so
            this assumes ``df`` is already in chronological order — confirm
            against the loader.
        split_cfg: Split settings. ``"method"`` (default ``"time"``) must be
            ``"time"``; ``"test_size"`` (default ``0.2``) is the fraction of
            rows placed in the test part.

    Returns:
        ``(train, test)`` as independent copies.

    Raises:
        ValueError: if ``split_cfg["method"]`` is anything other than ``"time"``.
    """
    method = split_cfg.get("method", "time")
    if method != "time":
        raise ValueError("Only time-based splits are supported in this notebook")

    n_rows = len(df)
    holdout_fraction = float(split_cfg.get("test_size", 0.2))
    cut = int(n_rows * (1 - holdout_fraction))
    # Clamp the cut point into [1, n_rows - 1]; the lower bound is applied last.
    cut = min(cut, n_rows - 1)
    cut = max(1, cut)

    leading = df.iloc[:cut].copy()
    trailing = df.iloc[cut:].copy()
    return leading, trailing


def prepare_labelled_frame(
    base_df: pd.DataFrame,
    feature_sets: Sequence[str],
    profile_config: Mapping[str, object],
) -> pd.DataFrame:
    """Add the requested feature sets and the target column, then drop incomplete rows.

    Args:
        base_df: Frame passed to ``apply_feature_sets``.
        feature_sets: Names of the feature sets to apply.
        profile_config: Configuration forwarded as ``config_override`` to both
            ``apply_feature_sets`` and ``append_target``.

    Returns:
        The labelled frame with every row containing NaN or +/-inf removed.
    """
    with_features = apply_feature_sets(
        base_df, DATASET_KEY, feature_sets, config_override=profile_config
    )
    with_target = append_target(
        with_features, DATASET_KEY, config_override=profile_config
    )
    # Treat +/-inf as missing so a single dropna removes all unusable rows.
    finite_only = with_target.replace([np.inf, -np.inf], np.nan)
    return finite_only.dropna(axis=0)


def evaluate_model(
    model_key: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    metrics: Sequence[str],
) -> dict:
    """Fit the model named ``model_key`` and score its test-set predictions.

    Args:
        model_key: Identifier handed to ``build_model``.
        X_train, y_train: Training features and labels (passed to ``fit`` as
            plain arrays via ``.values``).
        X_test, y_test: Evaluation features and labels.
        metrics: Metric names to compute; ``"accuracy"`` and ``"f1"`` are
            supported. NOTE(review): ``f1_score`` is called with its default
            averaging, so this presumably expects a binary target — confirm.

    Returns:
        Mapping from metric name to score.

    Raises:
        ValueError: if ``metrics`` contains an unsupported name. This is
            checked before the model is built or trained, so a typo in the
            configuration fails immediately instead of after a full fit.
    """
    # Materialise once: the names are iterated twice (validation, then scoring).
    metrics = list(metrics)
    supported_metrics = ("accuracy", "f1")
    for metric in metrics:
        if metric not in supported_metrics:
            raise ValueError(f"Unsupported metric: {metric}")

    model = build_model(model_key)
    model.fit(X_train.values, y_train.values)
    predictions = model.predict(X_test.values)

    results: dict[str, float] = {}
    for metric in metrics:
        if metric == "accuracy":
            results["accuracy"] = accuracy_score(y_test, predictions)
        else:  # "f1" — the only other name that passes validation above
            results["f1"] = f1_score(y_test, predictions, zero_division=0)
    return results


# Minimum row counts below which a configuration is skipped instead of evaluated.
_MIN_LABELLED_ROWS = 10
_MIN_TRAIN_ROWS = 10
_MIN_TEST_ROWS = 5


def _evaluate_configuration(
    base_df: pd.DataFrame,
    feature_sets: Sequence[str],
    profile_config: Mapping[str, object],
    profile_label: str,
    split_cfg: Mapping[str, object],
    metrics: Sequence[str],
    models: Sequence[str],
) -> list[dict[str, object]]:
    """Evaluate every model on one (preprocessing profile, feature sets) configuration.

    Returns one result record per model, or an empty list (after emitting a
    warning) when the configuration leaves too few rows to evaluate.
    """
    import warnings  # local import so this block does not depend on the import cell

    labelled = prepare_labelled_frame(base_df, feature_sets, profile_config)
    if len(labelled) < _MIN_LABELLED_ROWS:
        warnings.warn(
            f"Skipping profile '{profile_label}' with feature sets {list(feature_sets)}: "
            f"only {len(labelled)} labelled rows",
            stacklevel=2,
        )
        return []

    train_df, test_df = split_time_series_frame(labelled, split_cfg)
    # Balancing is applied to the training part only; the test part is untouched.
    train_df = balance_training_dataframe(train_df, DATASET_KEY, config_override=profile_config)
    train_df, test_df = apply_post_split_transforms(train_df, test_df, profile_config)

    train_features, y_train = select_feature_columns(
        train_df,
        DATASET_KEY,
        config_override=profile_config,
    )
    test_features, y_test = select_feature_columns(
        test_df,
        DATASET_KEY,
        config_override=profile_config,
    )

    train_features, y_train = sanitize_features_target(train_features, y_train)
    test_features, y_test = sanitize_features_target(test_features, y_test)
    if len(train_features) < _MIN_TRAIN_ROWS or len(test_features) < _MIN_TEST_ROWS:
        warnings.warn(
            f"Skipping profile '{profile_label}' with feature sets {list(feature_sets)}: "
            f"{len(train_features)} train / {len(test_features)} test rows after cleaning",
            stacklevel=2,
        )
        return []

    # Give the test matrix exactly the training columns, in the same order;
    # columns missing from the test frame are filled with 0.0.
    test_features = test_features.reindex(columns=train_features.columns, fill_value=0.0)

    records: list[dict[str, object]] = []
    for model_key in models:
        metric_values = evaluate_model(model_key, train_features, y_train, test_features, y_test, metrics)
        record = {
            "preproc_profile": profile_label,
            "feature_sets": ", ".join(feature_sets),
            "model": model_key,
            "train_samples": len(train_features),
            "test_samples": len(test_features),
            "num_features": train_features.shape[1],
        }
        record.update(metric_values)
        records.append(record)
    return records


def run_ablation(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Run the configured ablation and collect one result row per (variant, model).

    The experiment's ``ablation_axis`` selects what is varied:

    * ``"preprocessing"``: each profile listed under ``ablation_sets_key`` in the
      preprocessing configuration is applied, with the feature sets fixed to
      ``feature_sets_fixed`` (default ``["technical"]``).
    * anything else (default ``"features"``): base preprocessing is applied once
      and each feature-set combination listed under ``ablation_sets_key`` is
      evaluated; these rows are labelled with profile ``"default"``.

    Args:
        raw_df: Raw dataset handed to ``apply_base_preprocessing``.

    Returns:
        Frame with the columns ``preproc_profile``, ``feature_sets``, ``model``,
        ``train_samples``, ``test_samples``, ``num_features`` plus one column per
        metric. Configurations with too few rows are skipped with a warning, so
        the frame is empty (no columns) if nothing could be evaluated.

    Raises:
        KeyError: if ``ablation_sets_key`` is missing from the experiment
            configuration, or the preprocessing configuration defines nothing
            under that key.
    """
    ablation_axis = EXPERIMENT_CFG.get("ablation_axis", "features")
    ablation_sets_key = EXPERIMENT_CFG.get("ablation_sets_key")
    if not ablation_sets_key:
        raise KeyError("Experiment configuration missing 'ablation_sets_key'")

    split_cfg = EXPERIMENT_CFG.get("split", {})
    metrics = EXPERIMENT_CFG.get("metrics", ["accuracy"])
    models = EXPERIMENT_CFG.get("models", [])

    records: list[dict[str, object]] = []

    if ablation_axis == "preprocessing":
        profile_names = PREPROCESSING_CFG.get(ablation_sets_key)
        if not profile_names:
            raise KeyError(f"No preprocessing profiles defined under key '{ablation_sets_key}'")

        feature_sets_fixed = EXPERIMENT_CFG.get("feature_sets_fixed") or ["technical"]

        for profile_name in profile_names:
            # Base preprocessing is redone per profile because the profile changes it.
            base_df, profile_config = apply_base_preprocessing(
                raw_df,
                DATASET_KEY,
                profile_name=profile_name,
            )
            records.extend(
                _evaluate_configuration(
                    base_df,
                    feature_sets_fixed,
                    profile_config,
                    profile_name,
                    split_cfg,
                    metrics,
                    models,
                )
            )
    else:
        feature_combinations = PREPROCESSING_CFG.get(ablation_sets_key)
        if not feature_combinations:
            raise KeyError(f"No feature sets defined under key '{ablation_sets_key}'")

        # Base preprocessing does not depend on the feature sets: do it once.
        base_df, base_config = apply_base_preprocessing(raw_df, DATASET_KEY)

        for feature_sets in feature_combinations:
            records.extend(
                _evaluate_configuration(
                    base_df,
                    feature_sets,
                    base_config,
                    "default",
                    split_cfg,
                    metrics,
                    models,
                )
            )

    return pd.DataFrame(records)


In [4]:
raw_df = load_dataset()
ablation_results = run_ablation(raw_df)

# Sort by whichever key columns exist; a result frame built from zero records
# has no columns at all, in which case sorting is skipped.
sort_columns = [
    key
    for key in ("preproc_profile", "feature_sets", "model")
    if key in ablation_results
]
if len(sort_columns) > 0:
    ablation_results = ablation_results.sort_values(by=sort_columns)

# Cell output: the sorted table shown with a fresh 0..n-1 index. The re-indexed
# frame is only displayed, not stored back into `ablation_results`.
ablation_results.reset_index(drop=True)


Unnamed: 0,preproc_profile,feature_sets,model,train_samples,test_samples,num_features,accuracy,f1
0,P0_baseline,"technical, sentiment, macro",decision_tree,3102,644,44,0.496894,0.327801
1,P0_baseline,"technical, sentiment, macro",gradient_boosting,3102,644,44,0.535714,0.339956
2,P0_baseline,"technical, sentiment, macro",logistic_regression,3102,644,44,0.526398,0.272076
3,P0_baseline,"technical, sentiment, macro",naive_bayes,3102,644,44,0.487578,0.537815
4,P0_baseline,"technical, sentiment, macro",random_forest,3102,644,44,0.554348,0.172911
5,P0_baseline,"technical, sentiment, macro",svm,3102,644,44,0.529503,0.456014
6,P1_median_mode_no_indicator,"technical, sentiment, macro",decision_tree,3102,644,36,0.498447,0.339468
7,P1_median_mode_no_indicator,"technical, sentiment, macro",gradient_boosting,3102,644,36,0.534161,0.342105
8,P1_median_mode_no_indicator,"technical, sentiment, macro",logistic_regression,3102,644,36,0.526398,0.272076
9,P1_median_mode_no_indicator,"technical, sentiment, macro",naive_bayes,3102,644,36,0.487578,0.537815


## Notes
- All preprocessing and model definitions are sourced from `assets/config.json`.
- Extend the experiment by editing the configuration (e.g., add feature sets or models) and re-running the notebook.
