---
---
# **1. Imports**
---
---

In [1]:
import glob
import joblib
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.base import clone

In [2]:
# Reproducibility
np.random.seed(42)
random.seed(42)

## 1.1 File Imports

In [3]:
# Models
best_lr = joblib.load("../models/classification/model_package_LogisticRegression_Tuned.pkl")
best_rf = joblib.load("../models/classification/model_package_RandomForest_Tuned.pkl")
best_xgb = joblib.load("../models/classification/model_package_XGBoost_Tuned.pkl")

models = {
    "LogisticRegression_Tuned": best_lr,
    "RandomForest_Tuned": best_rf,
    "XGBoost_Tuned": best_xgb
}

# Validation data
df = pd.read_csv("../qws1_dataset/validation_data_classification.csv")
target = 'Class'

In [4]:
X = df.drop(columns=[target])
y = df[target]

## 1.2 Official Metrics:
- Primary: F1-score
- Secondary: Accuracy, Precision, Recall, AUC

Information:

"Implement k-fold or time-based" is about validating models. Instead of training once and testing on a single split, back-testing is used to check how the model performs across multiple subsets of the data:

    - More reliable performance estimates;
    - Insight into variability.
Rule objective:

1. If the task is about model evaluation, run back-tests on all candidate models and compare metrics.
2. If the task is about pipeline validation, run the best model.

In this case it's about model evaluation so it's better to analyze all candidate models

---
Difference Between Fold and TimeSeriesSplit.

KFold Cross-Validation:
- Splits data randomly into k folds. 
- Each fold is used once as test, others as train.

TimeSeriesSplit:
- Splits data sequentially (train on past, test on future).
- No shuffling (time order matters).

---
---
# **2. BACK-TEST**
---
---

In [5]:
print(models)

{'LogisticRegression_Tuned': {'model': Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 LogisticRegression(C=100, class_weight='balanced',
                                    max_iter=1000, random_state=42))]), 'features': ['Response Time', 'Availability', 'Throughput', 'Successability', 'Reliability', 'Compliance', 'Best Practices', 'Latency', 'Documentation'], 'classes': array([1, 2, 3, 4])}, 'RandomForest_Tuned': {'model': RandomForestClassifier(class_weight='balanced', n_estimators=200,
                       random_state=42), 'features': ['Response Time', 'Availability', 'Throughput', 'Successability', 'Reliability', 'Compliance', 'Best Practices', 'Latency', 'Documentation'], 'classes': array([1, 2, 3, 4])}, 'XGBoost_Tuned': {'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
     

In [6]:
n_splits = 5
splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []
results_folds = []

for model_name, model_info in models.items():
    fold_metrics = []
    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        m = clone(model_info['model'])
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        y_pred_proba = m.predict_proba(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")
        prec = precision_score(y_test, y_pred, average="weighted")
        rec = recall_score(y_test, y_pred, average="weighted")
        all_classes = np.unique(y)
        proba_df = pd.DataFrame(y_pred_proba, columns=m.classes_)
        proba_df = proba_df.reindex(columns=all_classes, fill_value=0)
        y_true_df = pd.get_dummies(y_test)
        y_true_df = y_true_df.reindex(columns=all_classes, fill_value=0)

        auc = roc_auc_score(y_true_df, proba_df, average="macro", multi_class="ovr")

        fold_metrics.append({"Fold": fold+1, "Accuracy": acc, "F1": f1, "Precision": prec, "Recall": rec, "AUC": auc})
        results_folds.append({"Model": model_name, "Fold": fold+1, "Accuracy": acc, "F1": f1, "Precision": prec, "Recall": rec, "AUC": auc})

    df_metrics = pd.DataFrame(fold_metrics)
    avg = df_metrics.mean(numeric_only=True)
    std = df_metrics.std(numeric_only=True)

    results_summary.append({
        "Model": model_name,
        "Accuracy": avg["Accuracy"], "Accuracy_std": std["Accuracy"],
        "F1": avg["F1"], "F1_std": std["F1"],
        "Precision": avg["Precision"], "Precision_std": std["Precision"],
        "Recall": avg["Recall"], "Recall_std": std["Recall"],
        "AUC": avg["AUC"], "AUC_std": std["AUC"]
    })

summary_df = pd.DataFrame(results_summary)
fold_df = pd.DataFrame(results_folds)

summary_df.to_csv("backtest_summary_classification.csv", index=False)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  _warn_prf(average, modi

In [7]:
fold_df

Unnamed: 0,Model,Fold,Accuracy,F1,Precision,Recall,AUC
0,LogisticRegression_Tuned,1,0.285714,0.238095,0.214286,0.285714,0.691667
1,LogisticRegression_Tuned,2,0.714286,0.714286,0.857143,0.714286,0.95
2,LogisticRegression_Tuned,3,0.428571,0.333333,0.285714,0.428571,0.691667
3,LogisticRegression_Tuned,4,0.833333,0.777778,0.75,0.833333,0.9
4,LogisticRegression_Tuned,5,0.5,0.433333,0.388889,0.5,
5,RandomForest_Tuned,1,0.285714,0.285714,0.285714,0.285714,0.625
6,RandomForest_Tuned,2,0.714286,0.704762,0.761905,0.714286,0.925
7,RandomForest_Tuned,3,0.571429,0.419048,0.333333,0.571429,0.9
8,RandomForest_Tuned,4,0.666667,0.638889,0.722222,0.666667,0.9
9,RandomForest_Tuned,5,0.5,0.466667,0.444444,0.5,


In [8]:
summary_df

Unnamed: 0,Model,Accuracy,Accuracy_std,F1,F1_std,Precision,Precision_std,Recall,Recall_std,AUC,AUC_std
0,LogisticRegression_Tuned,0.552381,0.220415,0.499365,0.236587,0.499206,0.287204,0.552381,0.220415,0.808333,0.136253
1,RandomForest_Tuned,0.547619,0.168359,0.503016,0.169377,0.509524,0.220401,0.547619,0.168359,0.8375,0.142156
2,XGBoost_Tuned,0.419048,0.153012,0.373424,0.151622,0.357143,0.172608,0.419048,0.153012,0.698437,0.150533
