In [24]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
import optuna
import warnings
from gplearn.genetic import SymbolicRegressor
from pysr import PySRRegressor

# English: Import models and tools
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [16]:
def remove_highly_correlated_features(df, threshold=0.95):
    """
    Finds and removes one of each pair of highly correlated features in a dataframe.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input dataframe with numerical features.
    threshold : float, optional
        The correlation threshold above which a feature is considered redundant. 
        Defaults to 0.95.

    Returns:
    --------
    pandas.DataFrame
        A new dataframe with highly correlated features removed.
    list
        A list of the column names that were dropped.
    """
    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()
    
    # --- Step 1: Remove zero-variance columns ---
    # These columns have no predictive power and can cause issues with correlation calculation.
    cols_to_drop_zerovar = df_copy.columns[df_copy.nunique() <= 1]
    if not cols_to_drop_zerovar.empty:
        df_copy.drop(columns=cols_to_drop_zerovar, inplace=True)
        print(f"Removed {len(cols_to_drop_zerovar)} columns with zero or single unique values: {cols_to_drop_zerovar.tolist()}")
    
    # --- Step 2: Calculate the correlation matrix ---
    # Use .abs() because a strong negative correlation (-0.95) is as redundant as a strong positive one.
    corr_matrix = df_copy.corr().abs()
    
    # --- Step 3: Identify one of each highly correlated pair ---
    # Select the upper triangle of the correlation matrix to avoid duplicates
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # Find features with correlation greater than the threshold
    cols_to_drop_corr = [column for column in upper.columns if any(upper[column] > threshold)]
    
    # --- Step 4: Drop the identified features ---
    df_reduced = df_copy.drop(columns=cols_to_drop_corr)
    
    # Combine all dropped columns for the report
    all_dropped_cols = cols_to_drop_zerovar.tolist() + cols_to_drop_corr
    
    return df_reduced, all_dropped_cols

## Without augmentation

In [17]:
dataset = pd.read_csv('../data/processed/studentlife_2014.csv')

In [18]:
dataset.dropna(inplace=True)

In [21]:
dataset

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,organizational_deadlines,organizational_days_until_next_deadline,environmental_weekday,individual_personality_extraversion,individual_personality_agreeableness,individual_personality_conscientiousness,individual_personality_neuroticism,individual_personality_openness,individual_previous_stress_level,individual_days_since_previous_stress_measurement
1,4,2013-03-28,1,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,...,0.0,11.0,3,1,4,0,15,17,0.0,1.0
2,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,...,0.0,10.0,4,1,4,0,15,17,1.0,1.0
3,4,2013-04-02,2,-1.525000,1.0,-3.6,44.291667,53.0,32.0,0.0,...,0.0,6.0,1,1,4,0,15,17,1.0,4.0
4,4,2013-04-03,2,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,...,0.0,5.0,2,1,4,0,15,17,2.0,1.0
5,4,2013-04-04,1,1.929167,8.6,-2.2,47.041667,58.0,33.0,0.0,...,0.0,4.0,3,1,4,0,15,17,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,0,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,...,0.0,3.0,1,14,13,-1,5,23,0.0,1.0
644,59,2013-05-22,0,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,...,0.0,2.0,2,14,13,-1,5,23,0.0,1.0
645,59,2013-05-23,0,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,...,0.0,1.0,3,14,13,-1,5,23,0.0,1.0
646,59,2013-05-24,1,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,...,1.0,5.0,4,14,13,-1,5,23,0.0,1.0


In [None]:
# Preparar los datos
X = dataset.drop(columns=['user_id', 'stress_level', 'date'])
y = dataset['stress_level']
groups = dataset['user_id']

# --- 2. CONFIGURACIÓN DEL EXPERIMENTO ---
n_splits = 5  # Usamos 2 pliegues para una demostración rápida
gkf = GroupKFold(n_splits=n_splits)
results_list = []

min_class, max_class = y.min(), y.max() # Para el clipping posterior

# --- 3. BUCLE DE VALIDACIÓN CRUZADA ---
print(f"--- Iniciando validación cruzada con {n_splits} pliegues ---")
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"\n--- Procesando Pliegue {fold + 1}/{n_splits} ---")
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # --- Experimento A: gplearn ---
    print("  - Entrenando gplearn...")
    # Usamos SymbolicClassifier ya que stress_level es una categoría
    gp_model = SymbolicRegressor(population_size=2000,
                           generations=15, 
                           stopping_criteria=0.01,
                           p_crossover=0.7, 
                           p_subtree_mutation=0.1,
                           p_hoist_mutation=0.05, 
                           p_point_mutation=0.1,
                           max_samples=0.9, 
                           verbose=0,
                           parsimony_coefficient=0.001, 
                           random_state=42)
    gp_model.fit(X_train, y_train)
    # <-- CORRECCIÓN: Convertir predicciones de regresión a clases
    raw_gp_preds = gp_model.predict(X_test)
    rounded_gp_preds = np.round(raw_gp_preds)
    gp_preds = np.clip(rounded_gp_preds, min_class, max_class).astype(int)
    
    gp_f1 = f1_score(y_test, gp_preds, average='weighted', zero_division=0)
    
    # Guardar resultados de gplearn
    results_list.append({
        'Fold': fold + 1,
        'Algorithm': 'gplearn',
        'F1-Score (weighted)': gp_f1,
        'Best Formula': str(gp_model._program)
    })
    print(f"  - gplearn F1-Score: {gp_f1:.4f}")

    # --- Experimento B: PySR ---
    # PySR es opcional y requiere una instalación más compleja.
    try:
        
        print("  - Entrenando PySR...")
        pysr_model = PySRRegressor(
            niterations=30,  # Menos iteraciones que gplearn, ya que es más eficiente
            binary_operators=["+", "-", "*", "/"],
            unary_operators=["sin", "cos", "exp", "log"],
            model_selection="best", # Selecciona la mejor fórmula que equilibra complejidad y precisión
            # El `loss` define cómo se mide el error
            loss="L2DistLoss()",
            procs=0,
            verbosity=0
        )
        pysr_model.fit(X_train, y_train)

        # <-- INICIO DE LA CORRECCIÓN ---
        # 1. Obtener las predicciones continuas del regresor
        raw_predictions = pysr_model.predict(X_test)
        
        # 2. Redondear al entero más cercano para convertir a clase
        rounded_predictions = np.round(raw_predictions)
        
        # 3. Asegurar que las predicciones estén dentro del rango de clases válidas (clipping)
        pysr_preds = np.clip(rounded_predictions, min_class, max_class).astype(int)
        # <-- FIN DE LA CORRECCIÓN ---

        pysr_f1 = f1_score(y_test, pysr_preds, average='weighted', zero_division=0)
        
        # Guardar resultados de PySR
        results_list.append({
            'Fold': fold + 1,
            'Algorithm': 'PySR',
            'F1-Score (weighted)': pysr_f1,
            'Best Formula': pysr_model.get_best()["equation"] # Obtener la mejor fórmula
        })
        print(f"  - PySR F1-Score: {pysr_f1:.4f}")

    except ImportError:
        print("  - PySR no está instalado. Saltando este experimento.")
        results_list.append({
            'Fold': fold + 1,
            'Algorithm': 'PySR',
            'F1-Score (weighted)': np.nan,
            'Best Formula': 'Not executed'
        })

# --- 4. PRESENTACIÓN DE RESULTADOS ---
print("\n--- Resultados Finales del Experimento ---")
results_df = pd.DataFrame(results_list)
display(results_df)

print("\n--- Resumen de Rendimiento Promedio ---")
summary = results_df.groupby('Algorithm')['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
display(summary)

--- Iniciando validación cruzada con 5 pliegues ---

--- Procesando Pliegue 1/5 ---
  - Entrenando gplearn...
  - gplearn F1-Score: 0.3384
  - Entrenando PySR...




  - PySR F1-Score: 0.3384

--- Procesando Pliegue 2/5 ---
  - Entrenando gplearn...
  - gplearn F1-Score: 0.1939
  - Entrenando PySR...




  - PySR F1-Score: 0.1939

--- Procesando Pliegue 3/5 ---
  - Entrenando gplearn...
  - gplearn F1-Score: 0.1886
  - Entrenando PySR...




  - PySR F1-Score: 0.1886

--- Procesando Pliegue 4/5 ---
  - Entrenando gplearn...
  - gplearn F1-Score: 0.1480
  - Entrenando PySR...




  - PySR F1-Score: 0.1480

--- Procesando Pliegue 5/5 ---
  - Entrenando gplearn...
  - gplearn F1-Score: 0.3231
  - Entrenando PySR...




  - PySR F1-Score: 0.3277

--- Resultados Finales del Experimento ---


Unnamed: 0,Fold,Algorithm,F1-Score (weighted),Best Formula
0,1,gplearn,0.338443,"div(X30, X22)"
1,1,PySR,0.338443,exp(sin(individual_personality_openness) * 0.4...
2,2,gplearn,0.193939,"div(X3, X22)"
3,2,PySR,0.193939,exp(sin(individual_personality_agreeableness) ...
4,3,gplearn,0.188569,"div(X15, X22)"
5,3,PySR,0.188569,exp(sin(individual_personality_agreeableness /...
6,4,gplearn,0.147951,"div(X8, X22)"
7,4,PySR,0.147951,exp(sin(individual_personality_agreeableness) ...
8,5,gplearn,0.323096,"div(X22, X22)"
9,5,PySR,0.327654,(sin(individual_personality_agreeableness) * -...



--- Resumen de Rendimiento Promedio ---


Unnamed: 0,Algorithm,mean,std
0,PySR,0.239311,0.087481
1,gplearn,0.2384,0.086347


In [27]:
# English: Prepare data for modeling
X = dataset.drop(columns=['user_id', 'stress_level', 'date'])
y = dataset['stress_level']
groups = dataset['user_id']

# --- 2. EXPERIMENT CONFIGURATION ---
n_splits = 5  # Using 2 splits for a quick demonstration; recommend 5 for final results
gkf = GroupKFold(n_splits=n_splits)
results_list = []

# English: Define the models to be tested in a dictionary
models_to_test = {
    # English: Logistic Regression needs scaled data, so we use a Pipeline
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(random_state=42, max_iter=1000, solver='liblinear'))
    ]),
    # English: The "big three" of gradient boosting
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
    "LightGBM": LGBMClassifier(random_state=42, verbose=-1),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0, iterations=200)
}

# --- 3. CROSS-VALIDATION LOOP ---
print(f"--- Starting cross-validation with {n_splits} folds ---")
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"\n--- Processing Fold {fold + 1}/{n_splits} ---")
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # English: Iterate through each model defined above
    for name, model in models_to_test.items():
        print(f"  - Training {name}...")
        
        # English: Fit the model on the training data for the current fold
        model.fit(X_train, y_train)
        
        # English: Make predictions on the test data
        preds = model.predict(X_test)
        
        # English: Calculate the weighted F1-score
        f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
        
        # English: Store the results
        results_list.append({
            'Fold': fold + 1,
            'Algorithm': name,
            'F1-Score (weighted)': f1
        })
        print(f"  - {name} F1-Score: {f1:.4f}")

# --- 4. RESULTS PRESENTATION ---
print("\n--- Final Experiment Results ---")
results_df = pd.DataFrame(results_list)
display(results_df)

print("\n--- Average Performance Summary ---")
summary = results_df.groupby('Algorithm')['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
display(summary)

--- Starting cross-validation with 5 folds ---

--- Processing Fold 1/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.5509
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.3889
  - Training LightGBM...
  - LightGBM F1-Score: 0.3799
  - Training CatBoost...
  - CatBoost F1-Score: 0.4075

--- Processing Fold 2/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4879
  - Training XGBoost...
  - XGBoost F1-Score: 0.4953
  - Training LightGBM...


Parameters: { "use_label_encoder" } are not used.



  - LightGBM F1-Score: 0.4779
  - Training CatBoost...
  - CatBoost F1-Score: 0.4614

--- Processing Fold 3/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4584
  - Training XGBoost...
  - XGBoost F1-Score: 0.4468
  - Training LightGBM...


Parameters: { "use_label_encoder" } are not used.



  - LightGBM F1-Score: 0.4039
  - Training CatBoost...
  - CatBoost F1-Score: 0.4466

--- Processing Fold 4/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3452
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.3338
  - Training LightGBM...
  - LightGBM F1-Score: 0.3586
  - Training CatBoost...
  - CatBoost F1-Score: 0.4204

--- Processing Fold 5/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4345
  - Training XGBoost...
  - XGBoost F1-Score: 0.4317
  - Training LightGBM...


Parameters: { "use_label_encoder" } are not used.



  - LightGBM F1-Score: 0.4760
  - Training CatBoost...
  - CatBoost F1-Score: 0.4228

--- Final Experiment Results ---


Unnamed: 0,Fold,Algorithm,F1-Score (weighted)
0,1,Logistic Regression,0.550871
1,1,XGBoost,0.388927
2,1,LightGBM,0.379858
3,1,CatBoost,0.407457
4,2,Logistic Regression,0.487889
5,2,XGBoost,0.495305
6,2,LightGBM,0.477943
7,2,CatBoost,0.461417
8,3,Logistic Regression,0.458394
9,3,XGBoost,0.446847



--- Average Performance Summary ---


Unnamed: 0,Algorithm,mean,std
0,CatBoost,0.431755,0.021786
1,LightGBM,0.419253,0.055064
2,Logistic Regression,0.455384,0.075415
3,XGBoost,0.419302,0.061094


## With augmentation

In [None]:
# --- 2. EXPERIMENT CONFIGURATION ---
window_sizes = [3, 4, 5]
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# English: This is the main list to collect results from ALL experiments
all_results_list = []

# --- 3. MAIN EXPERIMENT LOOP ---
for window_size in window_sizes:
    print(f"\n========================================================")
    print(f"--- Starting Experiment for Window Size: {window_size} ---")
    print(f"========================================================")
    
    try:
        # English: Load the dataset for the current window size
        # NOTE: You must have these files in the specified path
        # dataset = pd.read_csv(f'../data/augmented/studentlife_2014_{window_size}.csv')
        

        dataset = pd.read_csv(f'../data/augmented/studentlife_2014_{window_size}.csv')
        # --- End of placeholder block ---

        dataset.dropna(inplace=True)

        # English: Prepare data for modeling
        X = dataset.drop(columns=['user_id', 'stress_level', 'date'])
        y = dataset['stress_level']
        groups = dataset['user_id']
        min_class, max_class = y.min(), y.max()

    except FileNotFoundError:
        print(f"Error: Data file for window size {window_size} not found. Skipping.")
        continue

    # --- Cross-validation loop for the current dataset ---
    for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
        print(f"\n--- Processing Fold {fold + 1}/{n_splits} for window {window_size} ---")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # --- Experiment A: gplearn ---
        if True:
            print("  - Training gplearn...")
            gp_model = SymbolicRegressor(population_size=1000, generations=10, 
                                       parsimony_coefficient=0.001, random_state=42, verbose=0)
            gp_model.fit(X_train, y_train)
            
            raw_gp_preds = gp_model.predict(X_test)
            rounded_gp_preds = np.round(raw_gp_preds)
            gp_preds = np.clip(rounded_gp_preds, min_class, max_class).astype(int)
            gp_f1 = f1_score(y_test, gp_preds, average='weighted', zero_division=0)
            
            all_results_list.append({'Window Size': window_size, 'Fold': fold + 1, 'Algorithm': 'gplearn', 
                                     'F1-Score (weighted)': gp_f1, 'Best Formula': str(gp_model._program)})
            print(f"  - gplearn F1-Score: {gp_f1:.4f}")

        # --- Experiment B: PySR ---
        if True:
            print("  - Training PySR...")
            pysr_model = PySRRegressor(niterations=10, binary_operators=["+", "-", "*", "/"],
                                       model_selection="best", procs=0, verbosity=0)
            pysr_model.fit(X_train, y_train)
            
            raw_pysr_preds = pysr_model.predict(X_test)
            rounded_pysr_preds = np.round(raw_pysr_preds)
            pysr_preds = np.clip(rounded_pysr_preds, min_class, max_class).astype(int)
            pysr_f1 = f1_score(y_test, pysr_preds, average='weighted', zero_division=0)
            
            all_results_list.append({'Window Size': window_size, 'Fold': fold + 1, 'Algorithm': 'PySR',
                                     'F1-Score (weighted)': pysr_f1, 
                                     'Best Formula': pysr_model.get_best()["equation"] if pysr_model.get_best() else "No formula found"})
            print(f"  - PySR F1-Score: {pysr_f1:.4f}")

# --- 4. FINAL RESULTS PRESENTATION ---
print("\n\n================================================")
print("--- Final Combined Experiment Results ---")
print("================================================")

if not all_results_list:
    print("No results were generated. Please check data paths and library installations.")
else:
    results_df = pd.DataFrame(all_results_list)
    
    # English: Display the full results table
    print("\n--- Full Results Table ---")
    display(results_df)

    # English: Display the summary table
    print("\n--- Average Performance Summary ---")
    summary = results_df.groupby(['Window Size', 'Algorithm'])['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
    display(summary)


--- Starting Experiment for Window Size: 3 ---

--- Processing Fold 1/5 for window 3 ---
  - Training gplearn...
  - gplearn F1-Score: 0.1667
  - Training PySR...




ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [40]:
# English: This script assumes you have already run the necessary import statements
# for pandas, numpy, GroupKFold, f1_score, and all the required models.

# --- 1. EXPERIMENT CONFIGURATION ---
window_sizes = [3, 4, 5]
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# English: This is the main list to collect results from ALL experiments
all_results_list = []

# English: Define the models to be tested in a dictionary (outside the loop)
models_to_test = {
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(random_state=42, max_iter=100000, solver='liblinear'))
    ]),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
    "LightGBM": LGBMClassifier(random_state=42, verbose=-1),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0, iterations=200)
}

# --- 2. MAIN EXPERIMENT LOOP ---
for window_size in window_sizes:
    print(f"\n========================================================")
    print(f"--- Starting Experiment for Window Size: {window_size} ---")
    print(f"========================================================")
    
    try:
        # English: Load the dataset for the current window size
        # NOTE: You must have these files in the specified path
        # dataset = pd.read_csv(f'../data/augmented/studentlife_2014_{window_size}.csv')
        
        # --- This is a placeholder for your data loading. Remove for production. ---
        # Creating a dummy dataframe for demonstration purposes as I can't access local files.
        # Replace this block with your pd.read_csv line.
        dataset = pd.read_csv(f'../data/augmented/studentlife_2014_{window_size}.csv')
        # --- End of placeholder block ---

        dataset.dropna(inplace=True)

        # English: Prepare data for modeling
        X = dataset.drop(columns=['user_id', 'stress_level', 'date'])
        y = dataset['stress_level']
        groups = dataset['user_id']

    except FileNotFoundError:
        print(f"Error: Data file for window size {window_size} not found. Skipping.")
        continue

    # --- Cross-validation loop for the current dataset ---
    for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
        print(f"\n--- Processing Fold {fold + 1}/{n_splits} for window {window_size} ---")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # English: Iterate through each model defined above
        for name, model in models_to_test.items():
            print(f"  - Training {name}...")
            
            # English: Fit the model on the training data for the current fold
            model.fit(X_train, y_train)
            
            # English: Make predictions on the test data
            preds = model.predict(X_test)
            
            # English: Calculate the weighted F1-score
            f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
            
            # English: Store the results, including the window size
            all_results_list.append({
                'Window Size': window_size,
                'Fold': fold + 1,
                'Algorithm': name,
                'F1-Score (weighted)': f1
            })
            print(f"  - {name} F1-Score: {f1:.4f}")

# --- 3. FINAL RESULTS PRESENTATION ---
print("\n\n================================================")
print("--- Final Combined Experiment Results ---")
print("================================================")

if not all_results_list:
    print("No results were generated. Please check data paths.")
else:
    results_df = pd.DataFrame(all_results_list)
    
    # English: Display the full results table
    print("\n--- Full Results Table ---")
    display(results_df)

    # English: Display the summary table, grouped by window size and algorithm
    print("\n--- Average Performance Summary ---")
    summary = results_df.groupby(['Window Size', 'Algorithm'])['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
    display(summary)



--- Starting Experiment for Window Size: 3 ---

--- Processing Fold 1/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4639
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.3838
  - Training LightGBM...
  - LightGBM F1-Score: 0.4006
  - Training CatBoost...
  - CatBoost F1-Score: 0.3884

--- Processing Fold 2/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3906
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.5291
  - Training LightGBM...
  - LightGBM F1-Score: 0.4909
  - Training CatBoost...
  - CatBoost F1-Score: 0.5376

--- Processing Fold 3/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4723
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.4824
  - Training LightGBM...
  - LightGBM F1-Score: 0.5351
  - Training CatBoost...
  - CatBoost F1-Score: 0.5474

--- Processing Fold 4/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4176
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.3521
  - Training LightGBM...
  - LightGBM F1-Score: 0.3604
  - Training CatBoost...
  - CatBoost F1-Score: 0.3797

--- Processing Fold 5/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3676
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.5325
  - Training LightGBM...
  - LightGBM F1-Score: 0.4970
  - Training CatBoost...
  - CatBoost F1-Score: 0.4717

--- Starting Experiment for Window Size: 4 ---

--- Processing Fold 1/5 for window 4 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4234
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.4539
  - Training LightGBM...
  - LightGBM F1-Score: 0.4006
  - Training CatBoost...
  - CatBoost F1-Score: 0.4069

--- Processing Fold 2/5 for window 4 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4287
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.5611
  - Training LightGBM...
  - LightGBM F1-Score: 0.5186
  - Training CatBoost...
  - CatBoost F1-Score: 0.4576

--- Processing Fold 3/5 for window 4 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3790
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.5442
  - Training LightGBM...
  - LightGBM F1-Score: 0.4966
  - Training CatBoost...
  - CatBoost F1-Score: 0.4439

--- Processing Fold 4/5 for window 4 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3994
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.4520
  - Training LightGBM...
  - LightGBM F1-Score: 0.3701
  - Training CatBoost...
  - CatBoost F1-Score: 0.4104

--- Processing Fold 5/5 for window 4 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3358
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.4445
  - Training LightGBM...
  - LightGBM F1-Score: 0.4603
  - Training CatBoost...
  - CatBoost F1-Score: 0.4062

--- Starting Experiment for Window Size: 5 ---

--- Processing Fold 1/5 for window 5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3747
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.3775
  - Training LightGBM...
  - LightGBM F1-Score: 0.3556
  - Training CatBoost...
  - CatBoost F1-Score: 0.4397

--- Processing Fold 2/5 for window 5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4190
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.5625
  - Training LightGBM...
  - LightGBM F1-Score: 0.5589
  - Training CatBoost...
  - CatBoost F1-Score: 0.5623

--- Processing Fold 3/5 for window 5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3855
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.3861
  - Training LightGBM...
  - LightGBM F1-Score: 0.4268
  - Training CatBoost...
  - CatBoost F1-Score: 0.4548

--- Processing Fold 4/5 for window 5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3469
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.4858
  - Training LightGBM...
  - LightGBM F1-Score: 0.3955
  - Training CatBoost...
  - CatBoost F1-Score: 0.4222

--- Processing Fold 5/5 for window 5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3554
  - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



  - XGBoost F1-Score: 0.4145
  - Training LightGBM...
  - LightGBM F1-Score: 0.3956
  - Training CatBoost...
  - CatBoost F1-Score: 0.4219


--- Final Combined Experiment Results ---

--- Full Results Table ---


Unnamed: 0,Window Size,Fold,Algorithm,F1-Score (weighted)
0,3,1,Logistic Regression,0.463857
1,3,1,XGBoost,0.383822
2,3,1,LightGBM,0.400554
3,3,1,CatBoost,0.38842
4,3,2,Logistic Regression,0.39057
5,3,2,XGBoost,0.529074
6,3,2,LightGBM,0.490909
7,3,2,CatBoost,0.537644
8,3,3,Logistic Regression,0.472312
9,3,3,XGBoost,0.482445



--- Average Performance Summary ---


Unnamed: 0,Window Size,Algorithm,mean,std
0,3,CatBoost,0.464977,0.079462
1,3,LightGBM,0.456799,0.073069
2,3,Logistic Regression,0.422393,0.045412
3,3,XGBoost,0.455983,0.083514
4,4,CatBoost,0.425,0.024064
5,4,LightGBM,0.449224,0.062836
6,4,Logistic Regression,0.393279,0.037756
7,4,XGBoost,0.491146,0.056556
8,5,CatBoost,0.460171,0.0587
9,5,LightGBM,0.426494,0.078216


In [None]:
# English: Suppress warnings for a cleaner output
warnings.filterwarnings("ignore", category=UserWarning)

# --- 2. DATA LOADING & PREPARATION ---
window_size = 3
print(f"--- Loading and preparing data for window size: {window_size} ---")

try:
    # English: Load the specific dataset for the experiment
    dataset = pd.read_csv(f'../data/augmented/studentlife_2014_{window_size}.csv')
    dataset.dropna(inplace=True)

    # English: Prepare data for modeling
    X = dataset.drop(columns=['user_id', 'stress_level', 'date'])
    y = dataset['stress_level']
    groups = dataset['user_id']
    print("Data loaded successfully.")

except FileNotFoundError:
    print(f"Error: Data file for window size {window_size} not found. Please check the path.")
    # Stop execution if data is not found
    # In a real script, you might exit or raise an error here.
    # For this example, we'll create dummy data to proceed.
    print("Creating dummy data to continue demonstration...")
    num_samples = 150
    X = pd.DataFrame(np.random.rand(num_samples, 20), columns=[f'feature_{i}' for i in range(20)])
    y = pd.Series(np.random.randint(0, 3, num_samples))
    groups = pd.Series(np.repeat(np.arange(10), num_samples // 10))

# --- 3. EXPERIMENT CONFIGURATION ---
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# English: Define the models and their exhaustive parameter grids for GridSearchCV
models_to_tune = [
    {
        "name": "XGBoost",
        "estimator": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
        "param_grid": {
            'n_estimators': [200, 500, 750],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 0.9, 1.0],
            'gamma': [0, 0.1, 0.5]
        }
    },
    {
        "name": "CatBoost",
        "estimator": CatBoostClassifier(random_state=42, verbose=0),
        "param_grid": {
            'iterations': [250, 500, 750],
            'learning_rate': [0.01, 0.05, 0.1],
            'depth': [4, 6, 8],
            'l2_leaf_reg': [1, 3, 5],
            'border_count': [32, 64, 128]
        }
    }
]

# English: This list will store the final results of the grid searches
all_results_list = []

# --- 4. GRID SEARCH EXECUTION LOOP ---
for config in models_to_tune:
    model_name = config["name"]
    estimator = config["estimator"]
    param_grid = config["param_grid"]
    
    print(f"\n========================================================")
    print(f"--- Starting Exhaustive GridSearchCV for {model_name} ---")
    print(f"========================================================")
    
    # English: Set up GridSearchCV
    # It uses GroupKFold for the cross-validation strategy to respect user groups.
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring='f1_weighted',
        cv=gkf,
        n_jobs=-1,  # Use all available CPU cores to speed up the process
        verbose=2   # Show detailed progress updates
    )
    
    # English: Run the grid search. This is the time-consuming part.
    # The 'groups' parameter is passed to the splitter within GridSearchCV.
    grid_search.fit(X, y, groups=groups)
    
    # English: Store the best results found by the search
    all_results_list.append({
        'Algorithm': model_name,
        'Best F1-Score (weighted)': grid_search.best_score_,
        'Best Parameters': grid_search.best_params_
    })

# --- 5. FINAL RESULTS PRESENTATION ---
print("\n\n================================================")
print("--- Final GridSearchCV Results ---")
print("================================================")

if not all_results_list:
    print("No results were generated. Please check the experiment setup.")
else:
    results_df = pd.DataFrame(all_results_list)
    
    # English: Display the summary table
    # We use a custom print for better display of the 'Best Parameters' dictionary
    for index, row in results_df.iterrows():
        print(f"Algorithm: {row['Algorithm']}")
        print(f"  - Best F1-Score (avg): {row['Best F1-Score (weighted)']:.4f}")
        print(f"  - Best Hyperparameters: {row['Best Parameters']}")
        print("-" * 30)
    
    results_df.to_csv('results/results_ws3_gscv.csv')

--- Loading and preparing data for window size: 3 ---
Data loaded successfully.

--- Starting Exhaustive GridSearchCV for XGBoost ---
Fitting 5 folds for each of 729 candidates, totalling 3645 fits


KeyboardInterrupt: 

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.9; total time=   4.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   4.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   4.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   4.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   4.4s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.9; total time=   4.6s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.9; total time=   4.6s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   4.7s


## Old

In [4]:
dataset.columns

Index(['user_id', 'date', 'stress_level', 'environmental_temperature_mean',
       'environmental_temperature_max', 'environmental_temperature_min',
       'environmental_humidity_mean', 'environmental_humidity_max',
       'environmental_humidity_min', 'environmental_precipitation',
       'environmental_cloudcover', 'individual_sleep_duration',
       'individual_sleep_rate', 'organizational_social_interaction',
       'organizational_social_voice_sum', 'organizational_social_voice_count',
       'organizational_social_voice_mean', 'organizational_social_voice_max',
       'individual_minutes_stationary', 'individual_minutes_walking',
       'individual_minutes_running', 'individual_minutes_unknown',
       'environmental_minutes_silence', 'environmental_minutes_voice',
       'environmental_minutes_noise', 'environmental_minutes_unknown',
       'organizational_work_hours', 'deadlines', 'days_until_next_deadline',
       'weekday', 'individual_personality_extraversion',
       'indi

In [5]:
rename_map = {
    'deadlines': 'organizational_deadlines',
    'days_until_next_deadline': 'organizational_days_until_next_deadline',
    'weekday': 'environmental_weekday'
}

dataset = dataset.rename(columns=rename_map)

In [7]:
def generate_features_for_columns(df, feature_columns, window_size, feature_function):
    """
    Applies a feature generation function to a list of specified columns.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input dataframe.
    feature_columns : list
        A list of column names to generate features for.
    window_size : int
        The rolling window size to use.
    feature_function : function
        The function to apply (e.g., add_stress_rolling_features).

    Returns:
    --------
    pandas.DataFrame
        The dataframe enriched with all the new features.
    """
    df_enriched = df.copy()
    
    # Track original columns to avoid creating features on features
    original_cols = set(df_enriched.columns)
    
    for col in feature_columns:
        if col in original_cols:
            print(f"Generating features for column: '{col}' with window size {window_size}...")
            df_enriched = feature_function(df_enriched, window_size, col)
        else:
            print(f"Warning: Column '{col}' not found in the initial dataframe. Skipping.")
            
    print("\nFeature generation complete.")
    return df_enriched



In [12]:
enriched_df.describe()

Unnamed: 0,user_id,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,environmental_cloudcover,...,stress_level_rolling_q75_3d,stress_level_rolling_range_3d,stress_level_rolling_iqr_3d,stress_level_rolling_cv_3d,stress_level_rolling_trend_slope_3d,stress_level_rolling_direction_changes_3d,stress_level_rolling_entropy_3d,stress_level_rolling_zscore_3d,stress_level_rolling_time_since_peak_3d,stress_level_rolling_time_since_trough_3d
count,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,...,647.0,647.0,647.0,600.0,600.0,647.0,647.0,600.0,647.0,647.0
mean,33.62037,1.154321,8.512854,14.699537,3.327778,68.407986,88.521605,43.833333,2.281636,48.63098,...,1.268934,0.476043,0.238022,0.442335,0.013333,0.0,0.418856,0.015321,0.744977,0.76507
std,17.982157,0.742368,5.562435,6.753744,4.765486,12.982973,12.694466,13.07971,3.664127,31.175947,...,0.625293,0.603625,0.301813,0.581263,0.798468,0.0,0.493753,0.475369,0.436211,0.424283
min,4.0,0.0,-1.525,1.0,-6.1,44.291667,53.0,19.0,0.0,0.041667,...,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,-0.707107,0.0,0.0
25%,17.0,1.0,3.854167,9.0,-0.6,58.75,80.0,35.0,0.0,27.25,...,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,33.0,1.0,7.454167,14.1,2.8,67.791667,94.0,40.0,0.1,39.083333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,51.0,2.0,13.508333,20.5,6.8,78.958333,99.0,54.0,2.3,77.375,...,1.75,1.0,0.5,0.471405,0.0,0.0,1.0,0.0,1.0,1.0
max,59.0,2.0,18.45,26.4,13.9,94.25,100.0,84.0,15.0,99.916667,...,2.0,2.0,1.0,1.414214,2.0,0.0,1.0,0.707107,1.0,1.0


In [18]:
np.random.seed(24091993)

# English: Suppress Optuna's trial logs for a cleaner output
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore", category=UserWarning)

# enriched_df = enriched_df[enriched_df['user_id'] != 59]# dataset.copy()

# English: Filter out users who do not have all three stress classes
#user_class_diversity = enriched_df.groupby('user_id')['stress_level'].nunique()
#complete_users = user_class_diversity[user_class_diversity == 3].index
#df_complete_stress = enriched_df[enriched_df['user_id'].isin(complete_users)].copy()

# English: From the remaining users, select the top 20 by response count
#user_counts_filtered = df_complete_stress['user_id'].value_counts()
#num_top_users = min(20, len(user_counts_filtered))
#top_users_from_complete = user_counts_filtered.head(num_top_users).index
#df_final_selection = df_complete_stress[df_complete_stress['user_id'].isin(top_users_from_complete)].copy()

# English: Now, handle NaNs and Infs
enriched_df_filled = enriched_df.dropna()
#enriched_df_filled.replace([np.inf, -np.inf], 0, inplace=True)
df_model = enriched_df_filled.sort_values(by='date').reset_index(drop=True)

# English: Define X, Y, and groups for the entire process
Y = df_model['stress_level']
X = df_model.drop(columns=['stress_level', 'user_id', 'date'])

correlation_threshold = 0.98

# Apply the function
X, dropped_columns = remove_highly_correlated_features(X, threshold=correlation_threshold)
print("List of dropped columns:", dropped_columns)

groups = df_model['user_id']

# ==============================================================================
# STAGE 1: GLOBAL FEATURE SELECTION WITH RFECV
# ==============================================================================
print("--- STAGE 1: Finding the globally optimal set of features with RFECV ---")

# English: Define your feature domains based on their prefixes
# (Adjust these lists based on your actual column names)
environmental_cols = [col for col in X.columns if 'environmental_' in col]
individual_cols = [col for col in X.columns if 'individual_' in col]
organizational_cols = [col for col in X.columns if 'organizational_' in col]
stress_history_cols = [col for col in X.columns if 'stress_level_' in col] # Assuming lagged features start with this

feature_domains = {
    "environmental": environmental_cols,
    "individual": individual_cols,
    "organizational": organizational_cols,
    "stress_history": stress_history_cols
}

best_features_per_domain = {}
N_FEATURES_PER_DOMAIN = 1

for domain, cols in feature_domains.items():
    print(f"\n--- Running RFECV for domain: {domain} ({len(cols)} features) ---")
    if not cols:
        print("No columns found for this domain. Skipping.")
        continue
        
    X_domain = X[cols]
    
    # Initialize RFECV for this domain
    estimator = XGBClassifier(objective='multiclass', random_state=24091993, n_jobs=-1)
    cv_strategy = GroupKFold(n_splits=5)
    rfecv_domain = RFECV(
        estimator=estimator,
        step=1,
        cv=cv_strategy,
        scoring='f1_weighted',
        n_jobs=-1,
        min_features_to_select=N_FEATURES_PER_DOMAIN # Select at least N
    )
    
    # Fit on the domain-specific data
    rfecv_domain.fit(X_domain, Y, groups=groups)
    
    # Store the best features for this domain
    selected_cols = X_domain.columns[rfecv_domain.support_].tolist()
    best_features_per_domain[domain] = selected_cols
    print(f"Selected {len(selected_cols)} features for {domain}: {selected_cols}")

# --- Combine the best features from all domains ---
final_selected_features = []
for domain_features in best_features_per_domain.values():
    final_selected_features.extend(domain_features)

# Remove duplicates if any feature was selected in multiple domains
final_selected_features = list(dict.fromkeys(final_selected_features)) 

print(f"\n--- Final combined set of {len(final_selected_features)} features ---")
print(final_selected_features)

# Now, use this `final_selected_features` list to create your final X,
# and proceed with hyperparameter tuning and model evaluation.
X_selected = X[final_selected_features]


# ==============================================================================
# STAGE 2: GLOBAL HYPERPARAMETER TUNING WITH OPTUNA (ON SELECTED FEATURES)
# ==============================================================================
print("\n--- STAGE 2: Finding optimal hyperparameters with Optuna on selected features ---")

def objective(trial, x_data, y_data, group_data):
    param = {
        'verbosity': 0, 'objective': 'multiclass', 'random_state': 24091993,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }
    
    gkf = GroupKFold(n_splits=5)
    f1_scores = []
    for train_idx, test_idx in gkf.split(x_data, y_data, groups=group_data):
        X_train, X_test = x_data.iloc[train_idx], x_data.iloc[test_idx]
        y_train, y_test = y_data.iloc[train_idx], y_data.iloc[test_idx]
        
        # English: Apply class weights inside the objective function
        class_weights_fold = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        sample_weights_fold = np.array([class_weights_fold[cls] for cls in y_train])
        
        model = XGBClassifier(**param)
        model.fit(X_train, y_train, sample_weight=sample_weights_fold)
        preds = model.predict(X_test)
        f1_scores.append(f1_score(y_test, preds, average='weighted', zero_division=0))
        
    return np.mean(f1_scores)

# English: Run Optuna study on the data with ONLY the selected features
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_selected, Y, groups), n_trials=50)

best_params = study.best_trial.params
print("\nBest hyperparameters found:", best_params)

# ==============================================================================
# STAGE 3: FINAL UNBIASED EVALUATION
# ==============================================================================
print(f"\n--- STAGE 3: Final evaluation using {len(final_selected_features)} best features and optimal hyperparameters ---")

n_splits = 5
gkf_final = GroupKFold(n_splits=n_splits)
all_accuracies = []
all_f1_scores = []

for fold, (train_idx, test_idx) in enumerate(gkf_final.split(X_selected, Y, groups=groups)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")
    
    # English: Use the pre-selected features (X_selected) for splitting
    X_train, X_test = X_selected.iloc[train_idx], X_selected.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
    # English: Compute sample weights for the current training fold
    class_weights = compute_class_weight('balanced', classes=np.unique(Y_train), y=Y_train)
    sample_weights = np.array([class_weights[cls] for cls in Y_train])    

    # English: Initialize model with the best global parameters
    model = XGBClassifier(objective='multiclass', random_state=24091993, **best_params)
    model.fit(X_train, Y_train, sample_weight=sample_weights)
    
    # English: Evaluate the model
    predictions = model.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions, average='weighted', zero_division=0)
    
    all_accuracies.append(accuracy)
    all_f1_scores.append(f1)
    
    print(f"Fold Accuracy: {accuracy:.4f}")
    print(f"Fold F1-Score (Weighted): {f1:.4f}")

# English: Display final results
print("\n--- Final Cross-Validation Results ---")
print(f"Mean Accuracy: {np.mean(all_accuracies):.4f} ± {np.std(all_accuracies):.4f}")
print(f"Mean F1-Score (Weighted): {np.mean(all_f1_scores):.4f} ± {np.std(all_f1_scores):.4f}")


Removed 18 columns with zero or single unique values: ['environmental_minutes_unknown', 'environmental_minutes_unknown_rolling_mean_3d', 'environmental_minutes_unknown_rolling_std_3d', 'environmental_minutes_unknown_rolling_min_3d', 'environmental_minutes_unknown_rolling_max_3d', 'environmental_minutes_unknown_rolling_median_3d', 'environmental_minutes_unknown_rolling_q25_3d', 'environmental_minutes_unknown_rolling_q75_3d', 'environmental_minutes_unknown_rolling_range_3d', 'environmental_minutes_unknown_rolling_iqr_3d', 'environmental_minutes_unknown_rolling_cv_3d', 'environmental_minutes_unknown_rolling_trend_slope_3d', 'environmental_minutes_unknown_rolling_direction_changes_3d', 'environmental_minutes_unknown_rolling_entropy_3d', 'environmental_minutes_unknown_rolling_zscore_3d', 'environmental_minutes_unknown_rolling_time_since_peak_3d', 'environmental_minutes_unknown_rolling_time_since_trough_3d', 'stress_level_rolling_direction_changes_3d']
List of dropped columns: ['environmenta

In [19]:
# English: Import the datetime library at the top of your script
import datetime

# --- Option 2: Append results to a log file with a timestamp ---

# English: Define the output filename
results_log_filename = 'experiment_log.txt'

# English: Get the current timestamp
timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# English: Open the file in append mode ('a') to add new results
with open(results_log_filename, 'a') as f:
    print(f"Appending results to {results_log_filename}...")
    
    f.write(f"\n-----------------------------------------------------\n")
    f.write(f"\n-----------------------------------------------------\n")
    
    # English: Write a separator and timestamp for this run
    f.write(f"\n--- Experiment Run: {timestamp} ---\n")
    
    # English: Write the metrics
    f.write(f"Mean Accuracy: {np.mean(all_accuracies):.4f} ± {np.std(all_accuracies):.4f}\n")
    f.write(f"Mean F1-Score (Weighted): {np.mean(all_f1_scores):.4f} ± {np.std(all_f1_scores):.4f}\n")
    f.write("\nBest hyperparameters found: " + str(best_params))
    f.write(f"\n--- Final combined set of {len(final_selected_features)} features ---")
    f.write(str(final_selected_features))
    
    f.write(f"\n-----------------------------------------------------\n")
    f.write(f"\n-----------------------------------------------------\n")
print("Results successfully logged.")

Appending results to experiment_log.txt...
Results successfully logged.


In [20]:
np.random.seed(24091993)

# English: Suppress Optuna's trial logs for a cleaner output
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore", category=UserWarning)

enriched_df = enriched_df[enriched_df['user_id'] != 59]# dataset.copy()

# English: Filter out users who do not have all three stress classes
#user_class_diversity = enriched_df.groupby('user_id')['stress_level'].nunique()
#complete_users = user_class_diversity[user_class_diversity == 3].index
#df_complete_stress = enriched_df[enriched_df['user_id'].isin(complete_users)].copy()

# English: From the remaining users, select the top 20 by response count
#user_counts_filtered = df_complete_stress['user_id'].value_counts()
#num_top_users = min(20, len(user_counts_filtered))
#top_users_from_complete = user_counts_filtered.head(num_top_users).index
#df_final_selection = df_complete_stress[df_complete_stress['user_id'].isin(top_users_from_complete)].copy()

# English: Now, handle NaNs and Infs
enriched_df_filled = enriched_df.dropna()
#enriched_df_filled.replace([np.inf, -np.inf], 0, inplace=True)
df_model = enriched_df_filled.sort_values(by='date').reset_index(drop=True)

# English: Define X, Y, and groups for the entire process
Y = df_model['stress_level']
X = df_model.drop(columns=['stress_level', 'user_id', 'date'])

correlation_threshold = 0.98

# Apply the function
X, dropped_columns = remove_highly_correlated_features(X, threshold=correlation_threshold)
print("List of dropped columns:", dropped_columns)

groups = df_model['user_id']

# ==============================================================================
# STAGE 1: GLOBAL FEATURE SELECTION WITH RFECV
# ==============================================================================
print("--- STAGE 1: Finding the globally optimal set of features with RFECV ---")
        
# Initialize RFECV for this domain
estimator = XGBClassifier(objective='multiclass', random_state=24091993, n_jobs=-1)
cv_strategy = GroupKFold(n_splits=5)
rfecv = RFECV(
    estimator=estimator,
    step=1,
    cv=cv_strategy,
    scoring='f1_weighted',
    n_jobs=-1,
    min_features_to_select=1 # Select at least N
)

# Fit on the domain-specific data
rfecv.fit(X, Y, groups=groups)

# Store the best features for this domain
selected_cols = X.columns[rfecv.support_].tolist()

# Remove duplicates if any feature was selected in multiple domains
final_selected_features = selected_cols

print(f"\n--- Final combined set of {len(final_selected_features)} features ---")
print(final_selected_features)

# Now, use this `final_selected_features` list to create your final X,
# and proceed with hyperparameter tuning and model evaluation.
X_selected = X[final_selected_features]


# ==============================================================================
# STAGE 2: GLOBAL HYPERPARAMETER TUNING WITH OPTUNA (ON SELECTED FEATURES)
# ==============================================================================
print("\n--- STAGE 2: Finding optimal hyperparameters with Optuna on selected features ---")

def objective(trial, x_data, y_data, group_data):
    param = {
        'verbosity': 0, 'objective': 'multiclass', 'random_state': 24091993,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }
    
    gkf = GroupKFold(n_splits=5)
    f1_scores = []
    for train_idx, test_idx in gkf.split(x_data, y_data, groups=group_data):
        X_train, X_test = x_data.iloc[train_idx], x_data.iloc[test_idx]
        y_train, y_test = y_data.iloc[train_idx], y_data.iloc[test_idx]
        
        # English: Apply class weights inside the objective function
        class_weights_fold = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        sample_weights_fold = np.array([class_weights_fold[cls] for cls in y_train])
        
        model = XGBClassifier(**param)
        model.fit(X_train, y_train, sample_weight=sample_weights_fold)
        preds = model.predict(X_test)
        f1_scores.append(f1_score(y_test, preds, average='weighted', zero_division=0))
        
    return np.mean(f1_scores)

# English: Run Optuna study on the data with ONLY the selected features
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_selected, Y, groups), n_trials=50)

best_params = study.best_trial.params
print("\nBest hyperparameters found:", best_params)

# ==============================================================================
# STAGE 3: FINAL UNBIASED EVALUATION
# ==============================================================================
print(f"\n--- STAGE 3: Final evaluation using {len(final_selected_features)} best features and optimal hyperparameters ---")

n_splits = 5
gkf_final = GroupKFold(n_splits=n_splits)
all_accuracies = []
all_f1_scores = []

for fold, (train_idx, test_idx) in enumerate(gkf_final.split(X_selected, Y, groups=groups)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")
    
    # English: Use the pre-selected features (X_selected) for splitting
    X_train, X_test = X_selected.iloc[train_idx], X_selected.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
    # English: Compute sample weights for the current training fold
    class_weights = compute_class_weight('balanced', classes=np.unique(Y_train), y=Y_train)
    sample_weights = np.array([class_weights[cls] for cls in Y_train])    

    # English: Initialize model with the best global parameters
    model = XGBClassifier(objective='multiclass', random_state=24091993, **best_params)
    model.fit(X_train, Y_train, sample_weight=sample_weights)
    
    # English: Evaluate the model
    predictions = model.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions, average='weighted', zero_division=0)
    
    all_accuracies.append(accuracy)
    all_f1_scores.append(f1)
    
    print(f"Fold Accuracy: {accuracy:.4f}")
    print(f"Fold F1-Score (Weighted): {f1:.4f}")

# English: Display final results
print("\n--- Final Cross-Validation Results ---")
print(f"Mean Accuracy: {np.mean(all_accuracies):.4f} ± {np.std(all_accuracies):.4f}")
print(f"Mean F1-Score (Weighted): {np.mean(all_f1_scores):.4f} ± {np.std(all_f1_scores):.4f}")


Removed 18 columns with zero or single unique values: ['environmental_minutes_unknown', 'environmental_minutes_unknown_rolling_mean_3d', 'environmental_minutes_unknown_rolling_std_3d', 'environmental_minutes_unknown_rolling_min_3d', 'environmental_minutes_unknown_rolling_max_3d', 'environmental_minutes_unknown_rolling_median_3d', 'environmental_minutes_unknown_rolling_q25_3d', 'environmental_minutes_unknown_rolling_q75_3d', 'environmental_minutes_unknown_rolling_range_3d', 'environmental_minutes_unknown_rolling_iqr_3d', 'environmental_minutes_unknown_rolling_cv_3d', 'environmental_minutes_unknown_rolling_trend_slope_3d', 'environmental_minutes_unknown_rolling_direction_changes_3d', 'environmental_minutes_unknown_rolling_entropy_3d', 'environmental_minutes_unknown_rolling_zscore_3d', 'environmental_minutes_unknown_rolling_time_since_peak_3d', 'environmental_minutes_unknown_rolling_time_since_trough_3d', 'stress_level_rolling_direction_changes_3d']
List of dropped columns: ['environmenta

In [21]:
# English: Import the datetime library at the top of your script
import datetime

# --- Option 2: Append results to a log file with a timestamp ---

# English: Define the output filename
results_log_filename = 'experiment_log.txt'

# English: Get the current timestamp
timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# English: Open the file in append mode ('a') to add new results
with open(results_log_filename, 'a') as f:
    print(f"Appending results to {results_log_filename}...")
    
    f.write(f"\n-----------------------------------------------------\n")
    f.write(f"\n-----------------------------------------------------\n")
    # English: Write a separator and timestamp for this run
    f.write(f"\n--- Experiment Run: {timestamp} ---\n")
    
    # English: Write the metrics
    f.write(f"Mean Accuracy: {np.mean(all_accuracies):.4f} ± {np.std(all_accuracies):.4f}\n")
    f.write(f"Mean F1-Score (Weighted): {np.mean(all_f1_scores):.4f} ± {np.std(all_f1_scores):.4f}\n")
    f.write("\nBest hyperparameters found: " + str(best_params))
    f.write(f"\n--- Final combined set of {len(final_selected_features)} features ---")
    f.write(str(final_selected_features))
    f.write(f"\n-----------------------------------------------------\n")
    f.write(f"\n-----------------------------------------------------\n")
print("Results successfully logged.")

Appending results to experiment_log.txt...
Results successfully logged.


In [17]:
np.mean(all_f1_scores)

0.5158066523448012

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,stress_level_rolling_q75_3d,stress_level_rolling_range_3d,stress_level_rolling_iqr_3d,stress_level_rolling_cv_3d,stress_level_rolling_trend_slope_3d,stress_level_rolling_direction_changes_3d,stress_level_rolling_entropy_3d,stress_level_rolling_zscore_3d,stress_level_rolling_time_since_peak_3d,stress_level_rolling_time_since_trough_3d
3,4,2013-03-28,0,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,...,0.75,1.0,0.5,1.414214,1.0,0.0,1.0,-7.071068e-01,0.0,1.0
4,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,...,1.00,0.0,0.0,0.000000,0.0,0.0,0.0,-1.000000e+08,1.0,1.0
2,4,2013-04-03,2,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,...,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000e+08,1.0,1.0
5,4,2013-04-04,0,1.929167,8.6,-2.2,47.041667,58.0,33.0,0.0,...,1.75,1.0,0.5,0.471405,1.0,0.0,1.0,7.071068e-01,0.0,1.0
6,4,2013-04-05,2,3.525000,9.9,-2.0,58.875000,78.0,40.0,0.0,...,1.50,2.0,1.0,1.414214,-2.0,0.0,1.0,-7.071068e-01,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,1,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,...,1.75,1.0,0.5,0.471405,-1.0,0.0,1.0,-7.071068e-01,1.0,0.0
644,59,2013-05-22,1,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,...,1.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000e+00,1.0,1.0
645,59,2013-05-23,1,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,...,1.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000e+00,1.0,1.0
646,59,2013-05-24,2,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,...,1.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000e+00,1.0,1.0
