In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('../data/processed/studentlife_2014.csv')

In [4]:
data

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,organizational_deadlines,organizational_days_until_next_deadline,environmental_weekday,individual_personality_extraversion,individual_personality_agreeableness,individual_personality_conscientiousness,individual_personality_neuroticism,individual_personality_openness,individual_previous_stress_level,individual_days_since_previous_stress_measurement
0,4,2013-03-27,0,0.466667,7.2,-6.1,64.125000,75.0,46.0,0.0,...,0.0,12.0,2,1,4,0,15,17,,
1,4,2013-03-28,1,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,...,0.0,11.0,3,1,4,0,15,17,0.0,1.0
2,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,...,0.0,10.0,4,1,4,0,15,17,1.0,1.0
3,4,2013-04-02,1,-1.525000,1.0,-3.6,44.291667,53.0,32.0,0.0,...,0.0,6.0,1,1,4,0,15,17,1.0,4.0
4,4,2013-04-03,2,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,...,0.0,5.0,2,1,4,0,15,17,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,0,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,...,0.0,3.0,1,14,13,-1,5,23,0.0,1.0
644,59,2013-05-22,0,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,...,0.0,2.0,2,14,13,-1,5,23,0.0,1.0
645,59,2013-05-23,0,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,...,0.0,1.0,3,14,13,-1,5,23,0.0,1.0
646,59,2013-05-24,1,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,...,1.0,5.0,4,14,13,-1,5,23,0.0,1.0


In [5]:
data.columns

Index(['user_id', 'date', 'stress_level', 'environmental_temperature_mean',
       'environmental_temperature_max', 'environmental_temperature_min',
       'environmental_humidity_mean', 'environmental_humidity_max',
       'environmental_humidity_min', 'environmental_precipitation',
       'environmental_cloudcover', 'individual_sleep_duration',
       'individual_sleep_rate', 'organizational_social_interaction',
       'organizational_social_voice_sum', 'organizational_social_voice_count',
       'organizational_social_voice_mean', 'organizational_social_voice_max',
       'individual_minutes_stationary', 'individual_minutes_walking',
       'individual_minutes_running', 'individual_minutes_unknown',
       'environmental_minutes_silence', 'environmental_minutes_voice',
       'environmental_minutes_noise', 'environmental_minutes_unknown',
       'organizational_work_hours', 'organizational_deadlines',
       'organizational_days_until_next_deadline', 'environmental_weekday',
       

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.metrics import f1_score, classification_report
from sklearn.utils import class_weight
import xgboost as xgb
import catboost as catb
import warnings

# --- 0. CONFIGURACIÓN INICIAL ---
warnings.filterwarnings('ignore')
# Suponemos que el DataFrame original se llama 'df_original'
# df_original = pd.read_csv('tu_archivo_de_datos.csv') 

# --- 1. INGENIERÍA Y PREPARACIÓN DE CARACTERÍSTICAS (VERSIÓN AVANZADA CON ROLLING FEATURES) ---
print("--- Paso 1: Iniciando la ingeniería de características avanzadas ---")

# Usar una copia para evitar modificar el dataframe original
df = data.copy() # Asumiendo que el dataframe se llama 'data'

# 1.1. Pre-procesamiento básico
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['user_id', 'date']).reset_index(drop=True)

# 1.2. Creación de la variable objetivo 'stress_change'
df['stress_change'] = np.where(df['stress_level'] != df['individual_previous_stress_level'], 1, 0)
df.loc[df['individual_previous_stress_level'].isna(), 'stress_change'] = np.nan

# 1.3. Creación de características de diferencia (cambio inmediato)
features_to_diff = [
    'environmental_temperature_mean', 'environmental_humidity_mean', 'environmental_precipitation',
    'individual_sleep_duration', 'organizational_work_hours', 'organizational_days_until_next_deadline'
]
for col in features_to_diff:
    df[f'{col}_diff_1d'] = df.groupby('user_id')[col].diff() # Nombramos como _diff_1d para claridad

# 1.4. Creación de características de ventana deslizante (tendencias, volatilidad, acumulación)
print("Calculando características de ventana deslizante (rolling features)...")
window_size = '3D'  # Ventana de 3 días

features_for_rolling = [
    'individual_sleep_duration',
    'organizational_work_hours',
    'environmental_temperature_mean',
    'environmental_precipitation',
    'individual_minutes_walking',
    'individual_minutes_stationary'
]

# Definimos las agregaciones que queremos calcular en la ventana
aggregations = {
    'mean', # Tendencia
    'std',  # Volatilidad
    'sum',  # Acumulación
    'min',  # Mínimo en el periodo
    'max'   # Máximo en el periodo
}

# El cálculo se hace por grupo de usuario para no mezclar datos
# Usamos un índice temporal para que .rolling() funcione correctamente
df_rolling_features = df.set_index('date').groupby('user_id')[features_for_rolling].rolling(window=window_size).agg(aggregations)

# Limpiamos los nombres de las nuevas columnas (ej. de ('individual_sleep_duration', 'mean') a 'individual_sleep_duration_rolling_3d_mean')
df_rolling_features.columns = [f'{col[0]}_rolling_{window_size}_{col[1]}' for col in df_rolling_features.columns]

# Eliminamos el multi-índice para poder unirlo al dataframe principal
df_rolling_features = df_rolling_features.reset_index(level='user_id', drop=True)

# Unimos las nuevas características al dataframe principal
df = df.set_index('date').join(df_rolling_features).reset_index()
df = df.sort_values(['user_id', 'date']).reset_index(drop=True)


# 1.5. Selección final de características y limpieza
print("Seleccionando el conjunto final de características...")
final_features = [
    # Rasgos de personalidad (estáticos)
    'individual_personality_extraversion', 'individual_personality_agreeableness',
    'individual_personality_conscientiousness', 'individual_personality_neuroticism',
    'individual_personality_openness',
    
    # Contexto temporal y organizacional (no-diferenciadas)
    'individual_days_since_previous_stress_measurement',
    'environmental_weekday',
    'organizational_deadlines',

    # Características de cambio inmediato (diferencia 1 día)
    *[f'{col}_diff_1d' for col in features_to_diff],
    
    # NUEVAS Características de ventana deslizante
    *df_rolling_features.columns.tolist()
]

# Creamos el DataFrame final para el modelo
df_model = df[['user_id', 'stress_change'] + final_features].copy()

# Eliminamos todas las filas que tengan NaN en la columna objetivo o en las nuevas características.
# Esto elimina la primera fila de cada usuario (por el _diff) y las primeras N filas (por el rolling)
df_model.dropna(subset=['stress_change'], inplace=True)
df_model.dropna(subset=df_rolling_features.columns.tolist(), inplace=True)

# Rellenamos cualquier otro posible NaN con 0 (estrategia conservadora)
df_model.fillna(0, inplace=True)

# Convertimos el target a entero
df_model['stress_change'] = df_model['stress_change'].astype(int)

# 1.6. Preparar datos para el modelado
X = df_model[final_features]
y = df_model['stress_change']
groups = df_model['user_id']

print(f"Preparación finalizada. Entrenando con {len(X.columns)} características.")
print(f"Número de muestras válidas tras crear rolling features: {len(df_model)}")
print("--------------------------------------------------\n")


# --- LOS PASOS 2, 3, 4 y 5 CONTINÚAN EXACTAMENTE IGUAL ---
# (Pega aquí el resto de tu script desde "--- 2. CONFIGURACIÓN DEL EXPERIMENTO...")
# ...


# --- 2. CONFIGURACIÓN DEL EXPERIMENTO DE VALIDACIÓN CRUZADA ---
print("--- Paso 2: Configurando el experimento de validación cruzada ---")
random_seed = 42
np.random.seed(random_seed)
n_splits = 5  # Usamos 5 folds, un estándar robusto
gkf = GroupKFold(n_splits=n_splits)
results_list = []

# Definimos los modelos a probar
models_to_test = {
    "XGBoost": xgb.XGBClassifier(random_state=random_seed, use_label_encoder=False, eval_metric='logloss'),
    "CatBoost": catb.CatBoostClassifier(random_state=random_seed, verbose=0, auto_class_weights='Balanced')
}
print(f"Modelos a comparar: {list(models_to_test.keys())}")
print(f"Estrategia de validación: GroupKFold con {n_splits} splits.")
print("--------------------------------------------------\n")


# --- 3. BUCLE DE VALIDACIÓN CRUZADA (GroupKFold) ---
print(f"--- Paso 3: Iniciando validación cruzada... ---")
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Obtenemos los IDs de los participantes en cada conjunto para verificar que no hay solapamiento
    train_groups = set(groups.iloc[train_idx])
    test_groups = set(groups.iloc[test_idx])
    
    print(f"\nFold {fold+1}/{n_splits}:")
    print(f"  Train users: {len(train_groups)}, Test users: {len(test_groups)}, Overlap: {len(train_groups.intersection(test_groups))}")

    for name, model in models_to_test.items():
        if name == "XGBoost":
            # XGBoost requiere que los pesos se calculen y pasen explícitamente
            sample_weights = class_weight.compute_sample_weight('balanced', y=y_train)
            model.fit(X_train, y_train, sample_weight=sample_weights)
        else:
            # CatBoost gestiona los pesos internamente con auto_class_weights='Balanced'
            model.fit(X_train, y_train)
        
        preds = model.predict(X_test)
        f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
        results_list.append({'Fold': fold+1, 'Algorithm': name, 'F1-Score (weighted)': f1})
        print(f"  - {name} F1-Score: {f1:.4f}")

print("--------------------------------------------------\n")


# --- 4. PRESENTACIÓN DE RESULTADOS DE LA VALIDACIÓN CRUZADA ---
print("--- Paso 4: Resumen de rendimiento promedio en validación cruzada ---")
results_df = pd.DataFrame(results_list)
summary = results_df.groupby('Algorithm')['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
summary.rename(columns={'mean': 'F1-Score Medio', 'std': 'Desv. Estándar'}, inplace=True)
print(summary.to_string(index=False))
print("--------------------------------------------------\n")


# --- 5. ANÁLISIS DETALLADO Y EVALUACIÓN FINAL DEL MEJOR MODELO ---
print("--- Paso 5: Análisis final del mejor modelo en un conjunto de prueba aislado ---")
best_model_name = summary.loc[summary['F1-Score Medio'].idxmax()]['Algorithm']
best_model_score = summary.loc[summary['F1-Score Medio'].idxmax()]['F1-Score Medio']
print(f"Mejor modelo identificado: {best_model_name} (F1-Score Promedio: {best_model_score:.4f})")

# Separación final usando GroupShuffleSplit para crear un conjunto de prueba con participantes no vistos
gss = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=random_seed)
final_train_idx, final_test_idx = next(gss.split(X, y, groups=groups))

X_train_final, X_test_final = X.iloc[final_train_idx], X.iloc[final_test_idx]
y_train_final, y_test_final = y.iloc[final_train_idx], y.iloc[final_test_idx]

print(f"\nSeparación final de datos: {len(X_train_final)} muestras de entrenamiento, {len(X_test_final)} de prueba.")
print(f"Usuarios para entrenamiento: {len(set(groups.iloc[final_train_idx]))}, Usuarios para prueba: {len(set(groups.iloc[final_test_idx]))}")

# Re-entrenamos el mejor modelo con el conjunto de entrenamiento final
best_model_config = models_to_test[best_model_name]

print(f"\nRe-entrenando {best_model_name} en el conjunto de entrenamiento final...")
if best_model_name == "XGBoost":
    final_weights = class_weight.compute_sample_weight('balanced', y=y_train_final)
    best_model_config.fit(X_train_final, y_train_final, sample_weight=final_weights)
else:
    best_model_config.fit(X_train_final, y_train_final)

# Evaluación final
print("\n--- Reporte de Clasificación Final ---")
final_predictions = best_model_config.predict(X_test_final)
target_names = ['Sin Cambio de Estrés (0)', 'Con Cambio de Estrés (1)']
report = classification_report(y_test_final, final_predictions, target_names=target_names)
print(report)
print("==================================================")


--- Paso 1: Iniciando la ingeniería de características avanzadas ---
Calculando características de ventana deslizante (rolling features)...
Seleccionando el conjunto final de características...
Preparación finalizada. Entrenando con 44 características.
Número de muestras válidas tras crear rolling features: 7422
--------------------------------------------------

--- Paso 2: Configurando el experimento de validación cruzada ---
Modelos a comparar: ['XGBoost', 'CatBoost']
Estrategia de validación: GroupKFold con 5 splits.
--------------------------------------------------

--- Paso 3: Iniciando validación cruzada... ---

Fold 1/5:
  Train users: 19, Test users: 5, Overlap: 0
  - XGBoost F1-Score: 0.4033
  - CatBoost F1-Score: 0.4461

Fold 2/5:
  Train users: 19, Test users: 5, Overlap: 0
  - XGBoost F1-Score: 0.4168
  - CatBoost F1-Score: 0.4973

Fold 3/5:
  Train users: 20, Test users: 4, Overlap: 0
  - XGBoost F1-Score: 0.4927
  - CatBoost F1-Score: 0.5043

Fold 4/5:
  Train users: 19