In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

import joblib

In [2]:
# Read the participant summary CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/participant_summary_dataset.csv")

In [3]:
# Target: the Mean_Accuracy column.
y = df['Mean_Accuracy']

# Features: every column except the target and the other listed non-feature columns.
X = df.drop(columns=['Mean_Accuracy', 'Gender', 'Participant_ID', 'Mean_ResponseTime'])

In [4]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [5]:
# Fit an XGBoost regressor (library defaults apart from the fixed seed) on the training rows.
xgb = XGBRegressor(random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_xgb = xgb.predict(X_test)

In [6]:
# Function to evaluate models
def evaluate_model(name, y_true, y_pred):
    """Summarise regression error for one model as a dict of metrics.

    Parameters
    ----------
    name : str
        Label stored under the "Model" key.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys "Model", "RMSE", "MAE" and "MAPE (%)". MAPE is the mean absolute
        percentage error over the samples whose true value is non-zero; it is
        NaN when no such sample exists.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # Work on plain arrays so the comparison is positional rather than
    # aligned on pandas index labels.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # A percentage error is undefined for a zero target. Padding the
    # denominator with a tiny epsilon would turn each such row into an
    # enormous term that dominates the mean, so those rows are left out.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    else:
        # No sample with a non-zero target: report "undefined" explicitly.
        mape = float("nan")

    return {
        "Model": name,
        "RMSE": rmse,
        "MAE": mae,
        "MAPE (%)": mape
    }

# Score the pooled model on the held-out rows and show the metrics as a one-row table.
results = [evaluate_model("XGBoost", y_test, y_pred_xgb)]
results_df = pd.DataFrame(results)
print(results_df)


     Model      RMSE       MAE   MAPE (%)
0  XGBoost  0.104975  0.088001  10.969399


In [7]:
# Persist the fitted pooled model. Kept as the cell's last expression so the
# list of written paths returned by joblib.dump is echoed as the cell output.
joblib.dump(value=xgb, filename="../models/performance_predictor.pkl")

['../models/performance_predictor.pkl']

In [8]:
# Per-gender participant summaries, read in the same order as before (male, then female).
df_male, df_female = map(
    pd.read_csv,
    ("../data/participant_summary_male.csv", "../data/participant_summary_female.csv"),
)

# Quick check
print(df_male.shape, df_female.shape)

(22, 83) (16, 83)


In [9]:
# Drop non-feature columns; the same column list is applied to both cohorts.
X_male, X_female = (
    cohort.drop(columns=["Participant_ID", "Gender", "Mean_Accuracy", "Mean_ResponseTime"])
    for cohort in (df_male, df_female)
)

# Targets: Mean_Accuracy for each cohort.
y_male, y_female = df_male["Mean_Accuracy"], df_female["Mean_Accuracy"]


In [10]:
# Hold out 30% of each cohort (fixed seed) so that error on unseen rows can be
# compared with error on the rows each model was fitted to.
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_male, y_male, random_state=42, test_size=0.3
)
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_female, y_female, random_state=42, test_size=0.3
)

# Both cohorts use the same XGBoost configuration; the two estimators are
# built from a single spec so their hyperparameters cannot drift apart.
model_male, model_female = (
    XGBRegressor(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    for _cohort in range(2)
)

# Male model
model_male.fit(X_train_m, y_train_m)

# Female model (left as the cell's final expression, so its repr is the cell output)
model_female.fit(X_train_f, y_train_f)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
def evaluate(name, model, X_train, y_train, X_test, y_test):
    """Score a fitted model on its training data and on held-out data.

    Parameters
    ----------
    name : label stored under the "Model" key.
    model : object exposing ``predict(X)``.
    X_train, y_train : features and targets the model is scored on as "Train".
    X_test, y_test : features and targets the model is scored on as "Test".

    Returns
    -------
    dict
        RMSE and MAE for both splits, plus MAPE (in percent) for the test
        split only.
    """
    def _rmse(truth, guess):
        # Root of the mean squared error.
        return np.sqrt(mean_squared_error(truth, guess))

    # Error on the rows the model was scored against as training data
    fitted = model.predict(X_train)
    train_rmse = _rmse(y_train, fitted)
    train_mae = mean_absolute_error(y_train, fitted)

    # Error on the held-out rows
    held_out = model.predict(X_test)
    test_rmse = _rmse(y_test, held_out)
    test_mae = mean_absolute_error(y_test, held_out)

    # MAPE over the test rows whose target is non-zero, which avoids any
    # division by zero.
    actual = np.array(y_test)
    predicted = np.array(held_out)
    keep = actual != 0
    test_mape = np.mean(np.abs((actual[keep] - predicted[keep]) / actual[keep])) * 100

    return {
        "Model": name,
        "Train RMSE": train_rmse,
        "Train MAE": train_mae,
        "Test RMSE": test_rmse,
        "Test MAE": test_mae,
        "Test MAPE (%)": test_mape,
    }

# One metrics row per gender-specific model, each scored on its own train and test split.
# NOTE(review): in the recorded run below, train error is orders of magnitude
# smaller than test error for both models.
results = [
    evaluate("Male", model_male, X_train_m, y_train_m, X_test_m, y_test_m),
    evaluate("Female", model_female, X_train_f, y_train_f, X_test_f, y_test_f),
]

results_df = pd.DataFrame(results)
print(results_df)

    Model  Train RMSE  Train MAE  Test RMSE  Test MAE  Test MAPE (%)
0    Male    0.000463   0.000288   0.077229  0.065037       9.144673
1  Female    0.000503   0.000309   0.085389  0.070056      10.168799


In [14]:
# Write each gender-specific model to its own file (male first, then female).
joblib.dump(value=model_male, filename="../models/model_male.pkl")

# Final expression of the cell: its return value is what the cell displays.
joblib.dump(value=model_female, filename="../models/model_female.pkl")


['../models/model_female.pkl']