# Model Generation

In [None]:
import os
import pandas as pd

# Render floats in DataFrame output with two digits of precision.
pd.set_option("display.precision", 2)

# Directory and file names of the pre-split datasets. The percentages are the
# split sizes noted by the original author.
PROCESSED_PATH = "data/processed/"
TRAIN_DATA_NAME = "train_data.csv"  # 60%
VALIDATION_DATA_NAME = "validation_data.csv"  # 20%
TEST_DATA_NAME = "test_data.csv"  # 20%

# Only the train and validation splits are loaded here; TEST_DATA_NAME is
# declared above but not read in this cell.
TRAIN_DATA_FILE, VALIDATION_DATA_FILE = (
    os.path.join(PROCESSED_PATH, file_name)
    for file_name in (TRAIN_DATA_NAME, VALIDATION_DATA_NAME)
)

# Read each split and use its "Timestamp" column as the row index.
train_data = pd.read_csv(TRAIN_DATA_FILE).set_index("Timestamp")
valid_data = pd.read_csv(VALIDATION_DATA_FILE).set_index("Timestamp")

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Registry of candidate classifiers as (short name, estimator) pairs, in the
# order they are evaluated. Every entry is a supervised classifier: each one
# is trained with labels via ``fit(X, y)``.
models = [
    # Lightweight models
    ("LDA", LinearDiscriminantAnalysis()),
    ("KNN", KNeighborsClassifier()),
    ("TREE", DecisionTreeClassifier()),
    ("NB", GaussianNB()),
    # More advanced models, each limited to 25 estimators
    ("GBM", GradientBoostingClassifier(n_estimators=25)),
    ("XGB", XGBClassifier(n_estimators=25, eval_metric="logloss")),
    ("CAT", CatBoostClassifier(silent=True, n_estimators=25)),
    ("RF", RandomForestClassifier(n_estimators=25)),
]

In [None]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score


def evaluate_models(
    train_data,
    validation_data,
    target_column="signal",
    visualize=True,
    estimators=None,
):
    """Fit each candidate model and compare train vs. validation accuracy.

    Every column except ``target_column`` is used as a feature. For each
    ``(name, model)`` pair the model is fitted on the training data, scored
    with ``accuracy_score`` on both datasets, and a one-line summary is
    printed together with the time spent in ``fit``.

    Args:
        train_data: DataFrame holding the training features and target.
        validation_data: DataFrame with the same columns, used for the
            held-out score (reported under the "Test" labels).
        target_column: Name of the label column in both DataFrames.
        visualize: When true, draw a heatmap of the resulting score table.
        estimators: Optional iterable of ``(name, model)`` pairs to evaluate.
            Defaults to the module-level ``models`` list.

    Returns:
        DataFrame indexed by model name with the columns "Train Score" and
        "Test Score".

    Note:
        ``fit`` is called directly on the supplied model objects (they are
        not cloned), so they are left fitted after this call.
    """
    if estimators is None:
        # Preserve the original behaviour of reading the global registry.
        estimators = models

    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]
    X_test = validation_data.drop(columns=[target_column])
    y_test = validation_data[target_column]

    print("Model Evaluation Summary")
    # str() guards against non-string column labels, which str.join rejects.
    print(f"Features: {', '.join(map(str, X_train.columns))}")
    print(f"Target: {target_column}")
    print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")

    results = {"Train Score": [], "Test Score": []}
    model_names = []

    for name, model in estimators:
        model_names.append(name)

        # Time only the fit call; perf_counter is monotonic, so the measured
        # interval is not affected by system clock adjustments.
        fit_start = time.perf_counter()
        fitted_model = model.fit(X_train, y_train)
        train_time = time.perf_counter() - fit_start

        # accuracy_score expects (y_true, y_pred).
        train_score = accuracy_score(y_train, fitted_model.predict(X_train))
        results["Train Score"].append(train_score)

        test_score = accuracy_score(y_test, fitted_model.predict(X_test))
        results["Test Score"].append(test_score)

        print(
            f"{name}: Train: {train_score:.3f}, Test: {test_score:.3f} [{train_time:.2f}s]"
        )

    results_df = pd.DataFrame(results, index=model_names)

    # Skip the plot when nothing was evaluated; there is nothing to draw.
    if visualize and not results_df.empty:
        plt.figure(figsize=(10, 6))
        sns.heatmap(
            results_df,
            vmin=0.5,
            vmax=1.0,
            center=0.75,
            annot=True,
            fmt=".3f",
            cmap="Blues",
            linewidths=1,
        )
        plt.title("Model Performance Comparison")
        plt.tight_layout()
        plt.show()

    return results_df