In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../data/processed/studentlife2014_interpolated.csv')

In [3]:
data

Unnamed: 0,stress_level,user_id,date,individual_sleep_duration,individual_sleep_rate,organizational_social_interaction,organizational_deadlines,organizational_days_until_next_deadline,individual_minutes_stationary,individual_minutes_walking,individual_minutes_running,environmental_minutes_silence,environmental_minutes_voice,environmental_minutes_noise,organizational_social_voice_sum,organizational_social_voice_count,environmental_weekday
0,1.0,4,2013-03-27,6.0,2.0,3.0,0.0,12.0,505.0,39.0,19.0,352.0,179.0,277.0,25142.0,41.0,2
1,2.0,4,2013-03-28,6.0,2.0,3.0,0.0,11.0,633.0,57.0,29.0,410.0,268.0,255.0,25256.0,37.0,3
2,2.0,4,2013-03-29,6.0,2.0,3.0,0.0,10.0,592.0,76.0,42.0,368.0,293.0,288.0,28051.0,39.0,4
3,2.0,4,2013-03-30,7.0,3.0,3.0,0.0,9.0,593.0,49.0,23.0,493.0,136.0,230.0,17375.0,33.0,5
4,2.0,4,2013-03-31,7.0,3.0,3.0,0.0,8.0,621.0,68.0,17.0,413.0,240.0,281.0,26301.0,37.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851,1.0,59,2013-05-23,2.0,2.0,4.0,0.0,1.0,555.0,53.0,7.0,203.0,47.0,370.0,11873.0,53.0,3
852,2.0,59,2013-05-24,8.0,2.0,4.0,1.0,5.0,1330.0,46.0,12.0,399.0,178.0,836.0,30018.0,92.0,4
853,2.0,59,2013-05-25,12.0,2.0,4.0,0.0,4.0,1319.0,74.0,14.0,306.0,148.0,975.0,22226.0,69.0,5
854,1.0,59,2013-05-26,16.0,1.0,4.0,0.0,3.0,1062.0,185.0,14.0,299.0,77.0,898.0,14204.0,46.0,6


In [12]:
def add_previous_stress_features(df, user_col='user_id', date_col='date', stress_col='stress_level'):
    """
    Adds two new columns to the dataframe:
    1. individual_previous_stress_level: The previous stress level for the same user.
    2. days_since_previous_measurement: The number of days passed since the last measurement.

    Parameters:
    -----------
    df : pd.DataFrame
        The input dataframe, which must contain user ID, date, and stress level columns.
    user_col : str
        The name of the user identifier column.
    date_col : str
        The name of the date column (must be a datetime type).
    stress_col : str
        The name of the stress level column.

    Returns:
    --------
    pd.DataFrame
        A new dataframe with the two added features.
    """
    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()

    # Ensure data is sorted chronologically for each user
    df_copy = df_copy.sort_values(by=[user_col, date_col])

    # --- Create the new features using groupby and shift ---

    # 1. Get the previous stress level for each user (lag of 1)
    df_copy['individual_previous_stress_level'] = df_copy.groupby(user_col)[stress_col].shift(1)

    # 2. Calculate the days since the last measurement
    # First, get the previous date for each user
    previous_date = df_copy.groupby(user_col)[date_col].shift(1)
    # Then, calculate the difference in days
    # df_copy['individual_days_since_previous_stress_measurement'] = (df_copy[date_col] - previous_date).dt.days

    return df_copy

In [16]:
binary = False
if binary:
    data['stress_level'] = data['stress_level'].apply(lambda x: 0 if x < 2 else 1)
else:
    data['stress_level'] = data['stress_level'].apply(lambda x: 0 if x == 1 else 1 if x == 2 else 2)

In [17]:
data = add_previous_stress_features(data)

In [18]:
data.columns

Index(['stress_level', 'user_id', 'date', 'individual_sleep_duration',
       'individual_sleep_rate', 'organizational_social_interaction',
       'organizational_deadlines', 'organizational_days_until_next_deadline',
       'individual_minutes_stationary', 'individual_minutes_walking',
       'individual_minutes_running', 'environmental_minutes_silence',
       'environmental_minutes_voice', 'environmental_minutes_noise',
       'organizational_social_voice_sum', 'organizational_social_voice_count',
       'environmental_weekday', 'individual_previous_stress_level'],
      dtype='object')

In [20]:
# --- 1. DATA PREPARATION ---
from sklearn.utils import class_weight
import numpy as np
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold, GroupShuffleSplit
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, f1_score, classification_report
import optuna
import warnings
from gplearn.genetic import SymbolicRegressor, SymbolicClassifier
from pysr import PySRRegressor
from sklearn.utils import class_weight

# English: Import models and tools
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# --- 1. DATA PREPARATION ---

# --- THE FIX: Add a control flag for the filtering logic ---
FILTER_FOR_TOP_USERS = True # Set to False to use all users

# English: Initial data cleaning
df = data.copy()#dataset.drop(columns=['individual_previous_stress_level', 'individual_days_since_previous_stress_measurement'])
# map stress levels from 1 2 3 to 0 1 2


df.dropna(inplace=True)

random_seed= 3052011

np.random.seed(random_seed)
# English: Optional filtering block
if FILTER_FOR_TOP_USERS:
    print("--- Filtering for the top 20 users with the most responses ---")
    
    # Step 1: Get the response counts for each user
    user_counts = df['user_id'].value_counts()
    
    # Step 2: Get the list of the top 20 user IDs
    # We use .index to get the user_id values
    top_20_users = user_counts.head(20).index
    
    # Step 3: Filter the dataframe to keep only these top users
    # .isin() checks which rows have a 'user_id' that is in our list
    df_filtered = df[df['user_id'].isin(top_20_users)].copy()
    
    print(f"Original number of users: {df['user_id'].nunique()}")
    print(f"Number of users after filtering: {df_filtered['user_id'].nunique()}")
    
    # English: Prepare data for modeling using the filtered dataframe
    X = df_filtered.drop(columns=['user_id', 'stress_level', 'date'])
    y = df_filtered['stress_level']
    groups = df_filtered['user_id']
    
else:
    print("--- Using all available users (no filtering) ---")
    
    # English: Prepare data for modeling using the original dataframe
    X = df[['individual_previous_stress_level']]#df.drop(columns=['user_id', 'stress_level', 'date'])
    y = df['stress_level']
    groups = df['user_id']


# --- The rest of your experiment pipeline remains exactly the same ---
# --- 2. EXPERIMENT CONFIGURATION ---
# n_splits = ...
# models_to_test = { ... }
# ...


# --- 2. EXPERIMENT CONFIGURATION ---
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
results_list = []

# --- THE FIX: Add class weighting parameters to the models ---
# English: Define the models to be tested in a dictionary
models_to_test = {
    # English: For scikit-learn compatible models like Logistic Regression, we use the `class_weight` parameter.
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(random_state=random_seed, max_iter=10000, solver='liblinear', class_weight='balanced'))
    ]),
    
    # English: For XGBoost, the parameter is `scale_pos_weight`, but it's more complex for multiclass.
    # The best approach for XGBoost is to calculate weights manually and pass them to .fit().
    # However, we will handle this inside the loop for a more robust calculation per fold.
    "XGBoost": XGBClassifier(random_state=random_seed),
    
    # English: For LightGBM, the parameter is `class_weight`.
    "LightGBM": LGBMClassifier(random_state=random_seed, verbose=-1, class_weight='balanced'),
    
    # English: For CatBoost, the parameter is `auto_class_weights`.
    "CatBoost": CatBoostClassifier(random_state=random_seed, verbose=0, auto_class_weights='Balanced')
}


# --- 3. CROSS-VALIDATION LOOP ---
print(f"--- Starting cross-validation with {n_splits} folds ---")
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"\n--- Processing Fold {fold + 1}/{n_splits} ---")
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # --- THE FIX for XGBoost: Calculate sample weights for the current training fold ---
    # This is the most robust way to handle class imbalance with XGBoost in a CV setting.
    xgb_sample_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

    # English: Iterate through each model defined above
    for name, model in models_to_test.items():
        print(f"  - Training {name}...")
        
        # English: Fit the model on the training data for the current fold
        if name == "XGBoost":
            # Pass the calculated sample weights to the fit method for XGBoost
            model.fit(X_train, y_train, sample_weight=xgb_sample_weights)
        else:
            # Other models handle balancing internally via their parameters
            model.fit(X_train, y_train)
        
        # (The rest of the prediction and evaluation logic remains the same)
        preds = model.predict(X_test)
        f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
        results_list.append({'Fold': fold + 1, 'Algorithm': name, 'F1-Score (weighted)': f1})
        print(f"  - {name} F1-Score: {f1:.4f}")


# --- 4. RESULTS PRESENTATION ---
print("\n--- Final Experiment Results ---")
results_df = pd.DataFrame(results_list)
display(results_df)

print("\n--- Average Performance Summary ---")
summary = results_df.groupby('Algorithm')['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
display(summary)

--- Filtering for the top 20 users with the most responses ---
Original number of users: 22
Number of users after filtering: 20
--- Starting cross-validation with 5 folds ---

--- Processing Fold 1/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3706
  - Training XGBoost...
  - XGBoost F1-Score: 0.3853
  - Training LightGBM...
  - LightGBM F1-Score: 0.3872
  - Training CatBoost...
  - CatBoost F1-Score: 0.4214

--- Processing Fold 2/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3750
  - Training XGBoost...
  - XGBoost F1-Score: 0.3871
  - Training LightGBM...
  - LightGBM F1-Score: 0.3701
  - Training CatBoost...
  - CatBoost F1-Score: 0.3791

--- Processing Fold 3/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4606
  - Training XGBoost...
  - XGBoost F1-Score: 0.4656
  - Training LightGBM...
  - LightGBM F1-Score: 0.4731
  - Training CatBoost...
  - CatBoost F1-Score: 0.4697

--- Processing Fo

Unnamed: 0,Fold,Algorithm,F1-Score (weighted)
0,1,Logistic Regression,0.370602
1,1,XGBoost,0.385267
2,1,LightGBM,0.387167
3,1,CatBoost,0.421396
4,2,Logistic Regression,0.37501
5,2,XGBoost,0.387081
6,2,LightGBM,0.370146
7,2,CatBoost,0.379123
8,3,Logistic Regression,0.460639
9,3,XGBoost,0.465616



--- Average Performance Summary ---


Unnamed: 0,Algorithm,mean,std
0,CatBoost,0.399451,0.049771
1,LightGBM,0.401112,0.043561
2,Logistic Regression,0.397389,0.037043
3,XGBoost,0.392618,0.043106


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.metrics import f1_score, classification_report
from sklearn.utils import class_weight
import xgboost as xgb
import catboost as catb
import warnings

# --- 0. CONFIGURACIÓN INICIAL ---
warnings.filterwarnings('ignore')
# Suponemos que el DataFrame original se llama 'df_original'
# df_original = pd.read_csv('tu_archivo_de_datos.csv') 

# --- 1. INGENIERÍA Y PREPARACIÓN DE CARACTERÍSTICAS (VERSIÓN AVANZADA CON ROLLING FEATURES) ---
print("--- Paso 1: Iniciando la ingeniería de características avanzadas ---")

# Usar una copia para evitar modificar el dataframe original
df = data.copy() # Asumiendo que el dataframe se llama 'data'

# 1.1. Pre-procesamiento básico
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['user_id', 'date']).reset_index(drop=True)

# 1.2. Creación de la variable objetivo 'stress_change'
df['stress_change'] = np.where(df['stress_level'] != df['individual_previous_stress_level'], 1, 0)
df.loc[df['individual_previous_stress_level'].isna(), 'stress_change'] = np.nan

# 1.3. Creación de características de diferencia (cambio inmediato)
features_to_diff = [
    'environmental_temperature_mean', 'environmental_humidity_mean', 'environmental_precipitation',
    'individual_sleep_duration', 'organizational_work_hours', 'organizational_days_until_next_deadline'
]
for col in features_to_diff:
    df[f'{col}_diff_1d'] = df.groupby('user_id')[col].diff() # Nombramos como _diff_1d para claridad

# 1.4. Creación de características de ventana deslizante (tendencias, volatilidad, acumulación)
print("Calculando características de ventana deslizante (rolling features)...")
window_size = '3D'  # Ventana de 3 días

features_for_rolling = [
    'individual_sleep_duration',
    'organizational_work_hours',
    'environmental_temperature_mean',
    'environmental_precipitation',
    'individual_minutes_walking',
    'individual_minutes_stationary'
]

# Definimos las agregaciones que queremos calcular en la ventana
aggregations = {
    'mean', # Tendencia
    'std',  # Volatilidad
    'sum',  # Acumulación
    'min',  # Mínimo en el periodo
    'max'   # Máximo en el periodo
}

# El cálculo se hace por grupo de usuario para no mezclar datos
# Usamos un índice temporal para que .rolling() funcione correctamente
df_rolling_features = df.set_index('date').groupby('user_id')[features_for_rolling].rolling(window=window_size).agg(aggregations)

# Limpiamos los nombres de las nuevas columnas (ej. de ('individual_sleep_duration', 'mean') a 'individual_sleep_duration_rolling_3d_mean')
df_rolling_features.columns = [f'{col[0]}_rolling_{window_size}_{col[1]}' for col in df_rolling_features.columns]

# Eliminamos el multi-índice para poder unirlo al dataframe principal
df_rolling_features = df_rolling_features.reset_index(level='user_id', drop=True)

# Unimos las nuevas características al dataframe principal
df = df.set_index('date').join(df_rolling_features).reset_index()
df = df.sort_values(['user_id', 'date']).reset_index(drop=True)


# 1.5. Selección final de características y limpieza
print("Seleccionando el conjunto final de características...")
final_features = [
    # Rasgos de personalidad (estáticos)
    'individual_personality_extraversion', 'individual_personality_agreeableness',
    'individual_personality_conscientiousness', 'individual_personality_neuroticism',
    'individual_personality_openness',
    
    # Contexto temporal y organizacional (no-diferenciadas)
    'individual_days_since_previous_stress_measurement',
    'environmental_weekday',
    'organizational_deadlines',

    # Características de cambio inmediato (diferencia 1 día)
    *[f'{col}_diff_1d' for col in features_to_diff],
    
    # NUEVAS Características de ventana deslizante
    *df_rolling_features.columns.tolist()
]

# Creamos el DataFrame final para el modelo
df_model = df[['user_id', 'stress_change'] + final_features].copy()

# Eliminamos todas las filas que tengan NaN en la columna objetivo o en las nuevas características.
# Esto elimina la primera fila de cada usuario (por el _diff) y las primeras N filas (por el rolling)
df_model.dropna(subset=['stress_change'], inplace=True)
df_model.dropna(subset=df_rolling_features.columns.tolist(), inplace=True)

# Rellenamos cualquier otro posible NaN con 0 (estrategia conservadora)
df_model.fillna(0, inplace=True)

# Convertimos el target a entero
df_model['stress_change'] = df_model['stress_change'].astype(int)

# 1.6. Preparar datos para el modelado
X = df_model[final_features]
y = df_model['stress_change']
groups = df_model['user_id']

print(f"Preparación finalizada. Entrenando con {len(X.columns)} características.")
print(f"Número de muestras válidas tras crear rolling features: {len(df_model)}")
print("--------------------------------------------------\n")


# --- LOS PASOS 2, 3, 4 y 5 CONTINÚAN EXACTAMENTE IGUAL ---
# (Pega aquí el resto de tu script desde "--- 2. CONFIGURACIÓN DEL EXPERIMENTO...")
# ...


# --- 2. CONFIGURACIÓN DEL EXPERIMENTO DE VALIDACIÓN CRUZADA ---
print("--- Paso 2: Configurando el experimento de validación cruzada ---")
random_seed = 42
np.random.seed(random_seed)
n_splits = 5  # Usamos 5 folds, un estándar robusto
gkf = GroupKFold(n_splits=n_splits)
results_list = []

# Definimos los modelos a probar
models_to_test = {
    "XGBoost": xgb.XGBClassifier(random_state=random_seed, use_label_encoder=False, eval_metric='logloss'),
    "CatBoost": catb.CatBoostClassifier(random_state=random_seed, verbose=0, auto_class_weights='Balanced')
}
print(f"Modelos a comparar: {list(models_to_test.keys())}")
print(f"Estrategia de validación: GroupKFold con {n_splits} splits.")
print("--------------------------------------------------\n")


# --- 3. BUCLE DE VALIDACIÓN CRUZADA (GroupKFold) ---
print(f"--- Paso 3: Iniciando validación cruzada... ---")
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Obtenemos los IDs de los participantes en cada conjunto para verificar que no hay solapamiento
    train_groups = set(groups.iloc[train_idx])
    test_groups = set(groups.iloc[test_idx])
    
    print(f"\nFold {fold+1}/{n_splits}:")
    print(f"  Train users: {len(train_groups)}, Test users: {len(test_groups)}, Overlap: {len(train_groups.intersection(test_groups))}")

    for name, model in models_to_test.items():
        if name == "XGBoost":
            # XGBoost requiere que los pesos se calculen y pasen explícitamente
            sample_weights = class_weight.compute_sample_weight('balanced', y=y_train)
            model.fit(X_train, y_train, sample_weight=sample_weights)
        else:
            # CatBoost gestiona los pesos internamente con auto_class_weights='Balanced'
            model.fit(X_train, y_train)
        
        preds = model.predict(X_test)
        f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
        results_list.append({'Fold': fold+1, 'Algorithm': name, 'F1-Score (weighted)': f1})
        print(f"  - {name} F1-Score: {f1:.4f}")

print("--------------------------------------------------\n")


# --- 4. PRESENTACIÓN DE RESULTADOS DE LA VALIDACIÓN CRUZADA ---
print("--- Paso 4: Resumen de rendimiento promedio en validación cruzada ---")
results_df = pd.DataFrame(results_list)
summary = results_df.groupby('Algorithm')['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
summary.rename(columns={'mean': 'F1-Score Medio', 'std': 'Desv. Estándar'}, inplace=True)
print(summary.to_string(index=False))
print("--------------------------------------------------\n")


# --- 5. ANÁLISIS DETALLADO Y EVALUACIÓN FINAL DEL MEJOR MODELO ---
print("--- Paso 5: Análisis final del mejor modelo en un conjunto de prueba aislado ---")
best_model_name = summary.loc[summary['F1-Score Medio'].idxmax()]['Algorithm']
best_model_score = summary.loc[summary['F1-Score Medio'].idxmax()]['F1-Score Medio']
print(f"Mejor modelo identificado: {best_model_name} (F1-Score Promedio: {best_model_score:.4f})")

# Separación final usando GroupShuffleSplit para crear un conjunto de prueba con participantes no vistos
gss = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=random_seed)
final_train_idx, final_test_idx = next(gss.split(X, y, groups=groups))

X_train_final, X_test_final = X.iloc[final_train_idx], X.iloc[final_test_idx]
y_train_final, y_test_final = y.iloc[final_train_idx], y.iloc[final_test_idx]

print(f"\nSeparación final de datos: {len(X_train_final)} muestras de entrenamiento, {len(X_test_final)} de prueba.")
print(f"Usuarios para entrenamiento: {len(set(groups.iloc[final_train_idx]))}, Usuarios para prueba: {len(set(groups.iloc[final_test_idx]))}")

# Re-entrenamos el mejor modelo con el conjunto de entrenamiento final
best_model_config = models_to_test[best_model_name]

print(f"\nRe-entrenando {best_model_name} en el conjunto de entrenamiento final...")
if best_model_name == "XGBoost":
    final_weights = class_weight.compute_sample_weight('balanced', y=y_train_final)
    best_model_config.fit(X_train_final, y_train_final, sample_weight=final_weights)
else:
    best_model_config.fit(X_train_final, y_train_final)

# Evaluación final
print("\n--- Reporte de Clasificación Final ---")
final_predictions = best_model_config.predict(X_test_final)
target_names = ['Sin Cambio de Estrés (0)', 'Con Cambio de Estrés (1)']
report = classification_report(y_test_final, final_predictions, target_names=target_names)
print(report)
print("==================================================")


--- Paso 1: Iniciando la ingeniería de características avanzadas ---
Calculando características de ventana deslizante (rolling features)...
Seleccionando el conjunto final de características...
Preparación finalizada. Entrenando con 44 características.
Número de muestras válidas tras crear rolling features: 7422
--------------------------------------------------

--- Paso 2: Configurando el experimento de validación cruzada ---
Modelos a comparar: ['XGBoost', 'CatBoost']
Estrategia de validación: GroupKFold con 5 splits.
--------------------------------------------------

--- Paso 3: Iniciando validación cruzada... ---

Fold 1/5:
  Train users: 19, Test users: 5, Overlap: 0
  - XGBoost F1-Score: 0.4033
  - CatBoost F1-Score: 0.4461

Fold 2/5:
  Train users: 19, Test users: 5, Overlap: 0
  - XGBoost F1-Score: 0.4168
  - CatBoost F1-Score: 0.4973

Fold 3/5:
  Train users: 20, Test users: 4, Overlap: 0
  - XGBoost F1-Score: 0.4927
  - CatBoost F1-Score: 0.5043

Fold 4/5:
  Train users: 19