---
---
# **1. Imports**
---
---

In [1]:
import glob
import joblib
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold  # Used in this project
# from sklearn.model_selection import TimeSeriesSplit  # Learning note: use this for time-dependent data
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.base import clone

In [2]:
# Reproducibility: seed Python's built-in `random` module and NumPy's legacy
# global generator with the same value so stochastic steps repeat across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

## 1.1 File Imports

In [3]:
# Models: deserialize the three saved regression pipelines, in a fixed order.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files
# must come from a trusted source.
best_lr, best_rf, best_xgb = (
    joblib.load(pkl_path)
    for pkl_path in (
        "../models/regression/best_lr_pipeline.pkl",
        "../models/regression/best_rf_pipeline.pkl",
        "../models/regression/best_xgb_pipeline.pkl",
    )
)

# Display name -> loaded pipeline.
models = dict(
    LinearRegression_Tuned=best_lr,
    RandomForest_Tuned=best_rf,
    XGBoost_Tuned=best_xgb,
)

# Validation data: every column except `target` is a feature.
target = "WsRF: Web Service Relevancy Function (%)"
df = pd.read_csv("../qws1_dataset/validation_data_regression.csv")
y_val = df[target]
X_val = df.drop(columns=[target])

## 1.2 Official Metrics:
- **$R^2$** – primary metric
- **RMSE and MAE** – secondary metrics

Information:

"Implement k-fold or time-based" refers to how models are validated. Instead of training once and testing on a single split, back-testing checks how the model performs across multiple subsets of the data, which gives:

- More reliable performance estimates;
- Insight into the variability of those estimates.

Rule objective:

1. If the task is about model evaluation, run back-tests on all candidate models and compare metrics.
2. If the task is about pipeline validation, run the best model.

In this case the task is model evaluation, so all candidate models are back-tested and compared.

---
Difference between KFold and TimeSeriesSplit.

KFold Cross-Validation:
- Splits data into k folds (randomly when `shuffle=True`, as used in this notebook; otherwise into consecutive blocks).
- Each fold is used once as test, others as train.

TimeSeriesSplit:
- Splits data sequentially (train on past, test on future).
- No shuffling (time order matters).

---
---
# **2. BACK-TEST**
---
---

In [4]:
# Configuration
# `df` and `target` are defined in section 1.1 (File Imports). The target
# column name is reused from there rather than re-declared, so the literal
# lives in exactly one place and cannot drift between cells.
n_splits = 5  # number of cross-validation folds

# Split the validation frame into features (X) and the target column (y)
X = df.drop(columns=[target])
y = df[target]

# Candidate models for the back-test, all with library-default hyperparameters
# (only the random seed is fixed where the estimator accepts one).
# NOTE(review): this rebinds `models`. The pipelines loaded in section 1.1 stay
# reachable only as `best_lr`, `best_rf` and `best_xgb` and are NOT back-tested
# below. Confirm that evaluating the default estimators is what is intended.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}

In [5]:
# Split strategy
# Learning note: for time-dependent data use
#     splitter = TimeSeriesSplit(n_splits=n_splits)
# so that each split trains on past data and tests on future data.
# The data in this project is not time-dependent, so a shuffled KFold is used.
splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)

results_summary = []  # one aggregated row per model
results_folds = []    # one row per (model, fold) pair

for model_name, model in models.items():
    fold_metrics = []  # this model's per-fold scores only

    for fold, (train_idx, test_idx) in enumerate(splitter.split(X)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        # Train a fresh clone per fold so nothing fitted carries over.
        m = clone(model)
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Folds are reported 1-based; the same record feeds both collections.
        fold_record = {"Fold": fold + 1, "RMSE": rmse, "MAE": mae, "R²": r2}
        fold_metrics.append(fold_record)
        results_folds.append({"Model": model_name, **fold_record})

    # Mean and standard deviation (pandas default, ddof=1) across the folds.
    df_metrics = pd.DataFrame(fold_metrics)
    avg = df_metrics.mean(numeric_only=True)
    std = df_metrics.std(numeric_only=True)

    results_summary.append(
        {
            "Model": model_name,
            "RMSE": avg["RMSE"],
            "RMSE_std": std["RMSE"],
            "MAE": avg["MAE"],
            "MAE_std": std["MAE"],
            "R²": avg["R²"],
            "R²_std": std["R²"],
        }
    )

fold_df = pd.DataFrame(results_folds)
summary_df = pd.DataFrame(results_summary)
# Only the aggregated table is written to disk; per-fold rows stay in memory.
summary_df.to_csv("backtest_summary.csv", index=False)

In [6]:
# Per-fold metrics: one row per (model, fold) pair with RMSE, MAE and R².
fold_df

Unnamed: 0,Model,Fold,RMSE,MAE,R²
0,LinearRegression,1,2.754124,2.510302,0.922793
1,LinearRegression,2,26.71537,12.047063,-4.95164
2,LinearRegression,3,3.51323,2.697946,0.301621
3,LinearRegression,4,4.04366,3.323906,0.64129
4,LinearRegression,5,2.321351,1.743562,0.958513
5,RandomForest,1,8.073145,6.598571,0.3366
6,RandomForest,2,6.501659,5.065714,0.647497
7,RandomForest,3,2.518395,2.291429,0.641139
8,RandomForest,4,5.986248,5.386667,0.213854
9,RandomForest,5,8.752096,7.005,0.410271


In [7]:
# Per-model summary: mean and standard deviation of each metric across folds.
summary_df

Unnamed: 0,Model,RMSE,RMSE_std,MAE,MAE_std,R²,R²_std
0,LinearRegression,7.869547,10.556159,4.464556,4.276077,-0.425485,2.543918
1,RandomForest,6.366309,2.427678,5.269476,1.850848,0.449872,0.190881
2,XGBoost,7.835782,1.883153,6.472271,1.915784,0.026155,0.485539
