In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold, GroupShuffleSplit
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, f1_score, classification_report
import optuna
import warnings
from gplearn.genetic import SymbolicRegressor, SymbolicClassifier
from pysr import PySRRegressor
from sklearn.utils import class_weight

# English: Import models and tools
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
def remove_highly_correlated_features(df, threshold=0.95):
    """
    Finds and removes one of each pair of highly correlated features in a dataframe.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input dataframe with numerical features.
    threshold : float, optional
        The correlation threshold above which a feature is considered redundant. 
        Defaults to 0.95.

    Returns:
    --------
    pandas.DataFrame
        A new dataframe with highly correlated features removed.
    list
        A list of the column names that were dropped.
    """
    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()
    
    # --- Step 1: Remove zero-variance columns ---
    # These columns have no predictive power and can cause issues with correlation calculation.
    cols_to_drop_zerovar = df_copy.columns[df_copy.nunique() <= 1]
    if not cols_to_drop_zerovar.empty:
        df_copy.drop(columns=cols_to_drop_zerovar, inplace=True)
        print(f"Removed {len(cols_to_drop_zerovar)} columns with zero or single unique values: {cols_to_drop_zerovar.tolist()}")
    
    # --- Step 2: Calculate the correlation matrix ---
    # Use .abs() because a strong negative correlation (-0.95) is as redundant as a strong positive one.
    corr_matrix = df_copy.corr().abs()
    
    # --- Step 3: Identify one of each highly correlated pair ---
    # Select the upper triangle of the correlation matrix to avoid duplicates
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # Find features with correlation greater than the threshold
    cols_to_drop_corr = [column for column in upper.columns if any(upper[column] > threshold)]
    
    # --- Step 4: Drop the identified features ---
    df_reduced = df_copy.drop(columns=cols_to_drop_corr)
    
    # Combine all dropped columns for the report
    all_dropped_cols = cols_to_drop_zerovar.tolist() + cols_to_drop_corr
    
    return df_reduced, all_dropped_cols

In [3]:
dataset = pd.read_csv('../data/processed/studentlife_2014.csv')

In [5]:
dataset.columns

Index(['user_id', 'date', 'stress_level', 'environmental_temperature_mean',
       'environmental_temperature_max', 'environmental_temperature_min',
       'environmental_humidity_mean', 'environmental_humidity_max',
       'environmental_humidity_min', 'environmental_precipitation',
       'environmental_cloudcover', 'individual_sleep_duration',
       'individual_sleep_rate', 'organizational_social_interaction',
       'organizational_social_voice_sum', 'organizational_social_voice_count',
       'organizational_social_voice_mean', 'organizational_social_voice_max',
       'individual_minutes_stationary', 'individual_minutes_walking',
       'individual_minutes_running', 'individual_minutes_unknown',
       'environmental_minutes_silence', 'environmental_minutes_voice',
       'environmental_minutes_noise', 'environmental_minutes_unknown',
       'organizational_work_hours', 'organizational_deadlines',
       'organizational_days_until_next_deadline', 'environmental_weekday',
       

## Only from stress

In [12]:
# --- 1. DATA PREPARATION FOR THE INERTIA-ONLY EXPERIMENT ---
# English: Define the two features of interest for this baseline model
features_of_interest = [
    'individual_previous_stress_level',
]

# English: Select only the necessary columns and drop rows with missing values
# (the first entry for each user will be NaN for these features)
df_inertia = dataset[['user_id', 'stress_level'] + features_of_interest].copy()
df_inertia.dropna(inplace=True)

# English: Prepare data for modeling using only the inertia features
X = df_inertia[features_of_interest]
y = df_inertia['stress_level']
groups = df_inertia['user_id']

print(f"--- Experiment Setup: Training on {len(X.columns)} inertia-based features only. ---")
print(f"Number of samples after dropping NaNs: {len(df_inertia)}")


# --- 2. EXPERIMENT CONFIGURATION ---
random_seed = 3052011
np.random.seed(random_seed)
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
results_list = []

# English: Define the models to be tested
models_to_test = {
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(random_state=random_seed, max_iter=10000, class_weight='balanced'))
    ]),
    "XGBoost": XGBClassifier(random_state=random_seed),
    "LightGBM": LGBMClassifier(random_state=random_seed, verbose=-1, class_weight='balanced'),
    "CatBoost": CatBoostClassifier(random_state=random_seed, verbose=0, auto_class_weights='Balanced')
}

# --- 3. CROSS-VALIDATION LOOP ---
print(f"--- Starting cross-validation with {n_splits} folds ---")
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    xgb_sample_weights = class_weight.compute_sample_weight('balanced', y=y_train)

    for name, model in models_to_test.items():
        if name == "XGBoost":
            model.fit(X_train, y_train, sample_weight=xgb_sample_weights)
        else:
            model.fit(X_train, y_train)
        
        preds = model.predict(X_test)
        f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
        results_list.append({'Fold': fold, 'Algorithm': name, 'F1-Score (weighted)': f1})

# --- 4. RESULTS PRESENTATION ---
print("\n--- Average Performance Summary (Inertia Features Only) ---")
results_df = pd.DataFrame(results_list)
summary = results_df.groupby('Algorithm')['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
display(summary)


# --- 5. DETAILED ANALYSIS OF THE BEST MODEL ---
print("\n\n================================================")
print("--- Detailed Analysis of the Best Performing Model (Inertia Features) ---")
print("================================================")

best_model_name = summary.loc[summary['mean'].idxmax()]['Algorithm']
best_model_score = summary.loc[summary['mean'].idxmax()]['mean']
print(f"Best performing model identified: {best_model_name} (Average F1-Score: {best_model_score:.4f})")

best_model_config = models_to_test[best_model_name]

gss = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=random_seed)
final_train_idx, final_test_idx = next(gss.split(X, y, groups=groups))

X_train_final, X_test_final = X.iloc[final_train_idx], X.iloc[final_test_idx]
y_train_final, y_test_final = y.iloc[final_train_idx], y.iloc[final_test_idx]

print(f"\n--- Retraining {best_model_name} on the final training set ---")
if best_model_name == "XGBoost":
    final_xgb_weights = class_weight.compute_sample_weight('balanced', y=y_train_final)
    best_model_config.fit(X_train_final, y_train_final, sample_weight=final_xgb_weights)
else:
    best_model_config.fit(X_train_final, y_train_final)

print("\n--- Final Classification Report ---")
final_predictions = best_model_config.predict(X_test_final)
target_names = [f'Stress Level {i}' for i in sorted(y.unique())]
report = classification_report(y_test_final, final_predictions, target_names=target_names)
print(report)

--- Experiment Setup: Training on 1 inertia-based features only. ---
Number of samples after dropping NaNs: 624
--- Starting cross-validation with 5 folds ---

--- Average Performance Summary (Inertia Features Only) ---


Unnamed: 0,Algorithm,mean,std
0,CatBoost,0.564583,0.073767
1,LightGBM,0.564583,0.073767
2,Logistic Regression,0.564583,0.073767
3,XGBoost,0.564583,0.073767




--- Detailed Analysis of the Best Performing Model (Inertia Features) ---
Best performing model identified: CatBoost (Average F1-Score: 0.5646)

--- Retraining CatBoost on the final training set ---

--- Final Classification Report ---
                precision    recall  f1-score   support

Stress Level 0       0.46      0.50      0.48        22
Stress Level 1       0.64      0.63      0.64        79
Stress Level 2       0.62      0.61      0.62        62

      accuracy                           0.61       163
     macro avg       0.57      0.58      0.58       163
  weighted avg       0.61      0.61      0.61       163



In [13]:
# English: This code block should be added at the end of your previous script,
# after you have generated the classification report for your best model.

# --- 6. SANITY CHECK: COMPARE AGAINST A NAIVE PERSISTENCE MODEL ---
print("\n\n================================================")
print("--- Sanity Check: Performance of a Naive Persistence Model ---")
print("================================================")

# English: The persistence model's predictions are simply the previous day's stress levels.
# We will use the same final test set for a fair comparison.
# X_test_final and y_test_final are available from the previous step.

# English: Get the 'individual_previous_stress_level' for the test set.
# We need to select it from the original dataframe using the test index.
persistence_preds = X_test_final['individual_previous_stress_level'].astype(int)

# English: Calculate the F1-score for this naive model
persistence_f1_score = f1_score(y_test_final, persistence_preds, average='weighted', zero_division=0)

print(f"F1-Score of Naive Persistence Model (predicting yesterday's stress): {persistence_f1_score:.4f}")

# English: Retrieve the score of your best ML model for direct comparison
# This assumes 'final_predictions' holds the predictions from your best ML model
best_model_f1_score = f1_score(y_test_final, final_predictions, average='weighted', zero_division=0)

print(f"F1-Score of your Best ML Model ({best_model_name}): {best_model_f1_score:.4f}")

# English: Conclude based on the comparison
if best_model_f1_score > persistence_f1_score:
    print("\nConclusion: Your ML model IS learning patterns beyond simple persistence. It provides value.")
else:
    print("\nConclusion: WARNING! Your ML model is NOT outperforming a simple persistence baseline.")
    
# English: You can also print the classification report for the persistence model to see its weaknesses
print("\n--- Classification Report for Naive Persistence Model ---")
persistence_report = classification_report(y_test_final, persistence_preds, target_names=target_names)
print(persistence_report)





--- Sanity Check: Performance of a Naive Persistence Model ---
F1-Score of Naive Persistence Model (predicting yesterday's stress): 0.6083
F1-Score of your Best ML Model (CatBoost): 0.6083


--- Classification Report for Naive Persistence Model ---
                precision    recall  f1-score   support

Stress Level 0       0.46      0.50      0.48        22
Stress Level 1       0.64      0.63      0.64        79
Stress Level 2       0.62      0.61      0.62        62

      accuracy                           0.61       163
     macro avg       0.57      0.58      0.58       163
  weighted avg       0.61      0.61      0.61       163



In [14]:
# English: This code can also be run at the end of your script.

print("\n\n================================================")
print("--- Qualitative Analysis of Model Predictions ---")
print("================================================")

# English: Create a comparison dataframe
comparison_df = pd.DataFrame({
    'y_true': y_test_final,
    'y_persistence': persistence_preds,
    'y_model_pred': final_predictions
})

# English: Find cases where the model correctly predicted a CHANGE in stress
model_is_smart = comparison_df[
    (comparison_df['y_true'] != comparison_df['y_persistence']) &  # Stress level changed
    (comparison_df['y_true'] == comparison_df['y_model_pred'])   # Model predicted it correctly
]

print(f"\nModel correctly predicted a change in stress {len(model_is_smart)} times.")
if not model_is_smart.empty:
    print("Examples where the model was smart:")
    display(model_is_smart.head())

# English: Find cases where the model failed to predict a change that persistence would have missed anyway
model_missed_change = comparison_df[
    (comparison_df['y_true'] != comparison_df['y_persistence']) &  # Stress level changed
    (comparison_df['y_true'] != comparison_df['y_model_pred'])   # Model also missed it
]
print(f"\nModel failed to predict a change in stress {len(model_missed_change)} times.")
if not model_missed_change.empty:
    print("Examples where the model missed a change:")
    display(model_missed_change.head())




--- Qualitative Analysis of Model Predictions ---


ValueError: Per-column arrays must each be 1-dimensional

## Stress + metrics derived from stress

# Old

## Without augmentation

In [22]:
dataset = pd.read_csv('../data/processed/studentlife_2014.csv')

In [23]:
# dataset.dropna(inplace=True)

In [24]:
dataset

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,environmental_cloudcover,individual_sleep_duration,individual_sleep_rate,organizational_social_interaction,organizational_social_voice_sum,organizational_social_voice_count,organizational_social_voice_mean,organizational_social_voice_max,individual_minutes_stationary,individual_minutes_walking,individual_minutes_running,individual_minutes_unknown,environmental_minutes_silence,environmental_minutes_voice,environmental_minutes_noise,environmental_minutes_unknown,organizational_work_hours,organizational_deadlines,organizational_days_until_next_deadline,environmental_weekday,individual_personality_extraversion,individual_personality_agreeableness,individual_personality_conscientiousness,individual_personality_neuroticism,individual_personality_openness,individual_previous_stress_level,individual_days_since_previous_stress_measurement
0,4,2013-03-27,0,0.466667,7.2,-6.1,64.125000,75.0,46.0,0.0,27.791667,6.0,2.0,3.0,25142.0,41.0,613.219512,3469.0,505.0,39.0,19.0,5.0,352.0,179.0,277.0,0.0,5.0,0.0,12.0,2,1,4,0,15,17,,
1,4,2013-03-28,1,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,84.541667,6.0,2.0,3.0,25256.0,37.0,682.594595,3328.0,633.0,57.0,29.0,3.0,410.0,268.0,255.0,0.0,5.0,0.0,11.0,3,1,4,0,15,17,0.0,1.0
2,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,27.250000,6.0,2.0,3.0,28051.0,39.0,719.256410,4280.0,592.0,76.0,42.0,10.0,368.0,293.0,288.0,0.0,3.0,0.0,10.0,4,1,4,0,15,17,1.0,1.0
3,4,2013-04-02,1,-1.525000,1.0,-3.6,44.291667,53.0,32.0,0.0,37.500000,8.0,2.0,3.0,20964.0,36.0,582.333333,4034.0,635.0,53.0,28.0,4.0,518.0,195.0,176.0,0.0,4.0,0.0,6.0,1,1,4,0,15,17,1.0,4.0
4,4,2013-04-03,1,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,28.583333,8.0,2.0,3.0,29059.0,50.0,581.180000,2884.0,564.0,57.0,23.0,2.0,387.0,300.0,269.0,0.0,3.0,0.0,5.0,2,1,4,0,15,17,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,0,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,59.125000,2.0,2.0,4.0,31359.0,78.0,402.038462,3528.0,1346.0,55.0,28.0,11.0,468.0,189.0,783.0,0.0,3.0,0.0,3.0,1,14,13,-1,5,23,0.0,1.0
644,59,2013-05-22,0,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,96.000000,2.0,2.0,4.0,18770.0,49.0,383.061224,2638.0,1344.0,61.0,14.0,16.0,462.0,124.0,849.0,0.0,1.0,0.0,2.0,2,14,13,-1,5,23,0.0,1.0
645,59,2013-05-23,0,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,93.666667,2.0,2.0,4.0,11873.0,53.0,224.018868,2518.0,555.0,53.0,7.0,5.0,203.0,47.0,370.0,0.0,2.0,0.0,1.0,3,14,13,-1,5,23,0.0,1.0
646,59,2013-05-24,1,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,99.708333,8.0,2.0,4.0,30018.0,92.0,326.282609,3195.0,1330.0,46.0,12.0,24.0,399.0,178.0,836.0,0.0,2.0,1.0,5.0,4,14,13,-1,5,23,0.0,1.0


In [25]:
pd.set_option('display.max_columns', None)
dataset.describe()

Unnamed: 0,user_id,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,environmental_cloudcover,individual_sleep_duration,individual_sleep_rate,organizational_social_interaction,organizational_social_voice_sum,organizational_social_voice_count,organizational_social_voice_mean,organizational_social_voice_max,individual_minutes_stationary,individual_minutes_walking,individual_minutes_running,individual_minutes_unknown,environmental_minutes_silence,environmental_minutes_voice,environmental_minutes_noise,environmental_minutes_unknown,organizational_work_hours,organizational_deadlines,organizational_days_until_next_deadline,environmental_weekday,individual_personality_extraversion,individual_personality_agreeableness,individual_personality_conscientiousness,individual_personality_neuroticism,individual_personality_openness,individual_previous_stress_level,individual_days_since_previous_stress_measurement
count,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,591.0,591.0,648.0,648.0,648.0,648.0,648.0,648.0,624.0,624.0
mean,33.62037,0.800926,8.512854,14.699537,3.327778,68.407986,88.521605,43.833333,2.281636,48.63098,7.063272,1.935185,3.03858,19066.973765,31.466049,662.832093,3804.962963,650.459877,34.279321,9.324074,14.166667,552.192901,184.385802,130.234568,0.0,4.408951,0.461929,7.020305,2.016975,4.040123,8.822531,7.192901,6.439815,23.845679,0.804487,2.049679
std,17.982157,0.399612,5.562435,6.753744,4.765486,12.982973,12.694466,13.07971,3.664127,31.175947,2.448987,0.846191,0.988368,9815.475593,16.103378,458.478208,2391.010654,155.752454,23.720502,21.754139,32.386909,132.500263,110.337319,118.209585,0.0,3.820677,0.718866,5.758966,1.375887,5.984389,5.860115,6.387402,6.269039,4.994056,0.396913,1.985283
min,4.0,0.0,-1.525,1.0,-6.1,44.291667,53.0,19.0,0.0,0.041667,0.0,1.0,1.0,161.0,1.0,80.5,91.0,20.0,0.0,0.0,0.0,37.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,-5.0,-12.0,-7.0,-4.0,11.0,0.0,1.0
25%,17.0,1.0,3.854167,9.0,-0.6,58.75,80.0,35.0,0.0,27.25,6.0,1.0,2.0,12022.0,22.0,408.991935,2353.5,614.75,19.0,1.0,3.0,482.0,101.0,52.0,0.0,1.0,0.0,3.0,1.0,-1.0,6.0,3.0,2.0,21.0,1.0,1.0
50%,33.0,1.0,7.454167,14.1,2.8,67.791667,94.0,40.0,0.1,39.083333,7.0,2.0,3.0,18763.0,30.0,602.978865,3534.0,655.0,30.0,4.0,6.0,565.0,179.5,103.0,0.0,3.0,0.0,5.0,2.0,3.0,10.0,8.0,5.0,24.0,1.0,1.0
75%,51.0,1.0,13.508333,20.5,6.8,78.958333,99.0,54.0,2.3,77.375,8.0,2.0,4.0,25406.75,40.0,795.75,4661.75,677.0,43.0,13.0,12.0,630.0,262.25,172.0,0.0,6.0,1.0,9.0,3.0,9.0,13.0,10.0,11.0,26.0,1.0,3.0
max,59.0,1.0,18.45,26.4,13.9,94.25,100.0,84.0,15.0,99.916667,17.0,4.0,5.0,56133.0,109.0,8578.0,22576.0,1346.0,194.0,424.0,375.0,1075.0,615.0,849.0,0.0,12.0,3.0,33.0,4.0,16.0,15.0,19.0,18.0,34.0,1.0,15.0


In [26]:
dataset

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,environmental_cloudcover,individual_sleep_duration,individual_sleep_rate,organizational_social_interaction,organizational_social_voice_sum,organizational_social_voice_count,organizational_social_voice_mean,organizational_social_voice_max,individual_minutes_stationary,individual_minutes_walking,individual_minutes_running,individual_minutes_unknown,environmental_minutes_silence,environmental_minutes_voice,environmental_minutes_noise,environmental_minutes_unknown,organizational_work_hours,organizational_deadlines,organizational_days_until_next_deadline,environmental_weekday,individual_personality_extraversion,individual_personality_agreeableness,individual_personality_conscientiousness,individual_personality_neuroticism,individual_personality_openness,individual_previous_stress_level,individual_days_since_previous_stress_measurement
0,4,2013-03-27,0,0.466667,7.2,-6.1,64.125000,75.0,46.0,0.0,27.791667,6.0,2.0,3.0,25142.0,41.0,613.219512,3469.0,505.0,39.0,19.0,5.0,352.0,179.0,277.0,0.0,5.0,0.0,12.0,2,1,4,0,15,17,,
1,4,2013-03-28,1,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,84.541667,6.0,2.0,3.0,25256.0,37.0,682.594595,3328.0,633.0,57.0,29.0,3.0,410.0,268.0,255.0,0.0,5.0,0.0,11.0,3,1,4,0,15,17,0.0,1.0
2,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,27.250000,6.0,2.0,3.0,28051.0,39.0,719.256410,4280.0,592.0,76.0,42.0,10.0,368.0,293.0,288.0,0.0,3.0,0.0,10.0,4,1,4,0,15,17,1.0,1.0
3,4,2013-04-02,1,-1.525000,1.0,-3.6,44.291667,53.0,32.0,0.0,37.500000,8.0,2.0,3.0,20964.0,36.0,582.333333,4034.0,635.0,53.0,28.0,4.0,518.0,195.0,176.0,0.0,4.0,0.0,6.0,1,1,4,0,15,17,1.0,4.0
4,4,2013-04-03,1,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,28.583333,8.0,2.0,3.0,29059.0,50.0,581.180000,2884.0,564.0,57.0,23.0,2.0,387.0,300.0,269.0,0.0,3.0,0.0,5.0,2,1,4,0,15,17,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,0,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,59.125000,2.0,2.0,4.0,31359.0,78.0,402.038462,3528.0,1346.0,55.0,28.0,11.0,468.0,189.0,783.0,0.0,3.0,0.0,3.0,1,14,13,-1,5,23,0.0,1.0
644,59,2013-05-22,0,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,96.000000,2.0,2.0,4.0,18770.0,49.0,383.061224,2638.0,1344.0,61.0,14.0,16.0,462.0,124.0,849.0,0.0,1.0,0.0,2.0,2,14,13,-1,5,23,0.0,1.0
645,59,2013-05-23,0,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,93.666667,2.0,2.0,4.0,11873.0,53.0,224.018868,2518.0,555.0,53.0,7.0,5.0,203.0,47.0,370.0,0.0,2.0,0.0,1.0,3,14,13,-1,5,23,0.0,1.0
646,59,2013-05-24,1,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,99.708333,8.0,2.0,4.0,30018.0,92.0,326.282609,3195.0,1330.0,46.0,12.0,24.0,399.0,178.0,836.0,0.0,2.0,1.0,5.0,4,14,13,-1,5,23,0.0,1.0


In [27]:
# --- 1. DATA PREPARATION ---
from sklearn.utils import class_weight

# --- 1. DATA PREPARATION ---

# --- THE FIX: Add a control flag for the filtering logic ---
FILTER_FOR_TOP_USERS = False # Set to False to use all users

# English: Initial data cleaning
df = dataset.drop(columns=['individual_previous_stress_level', 'individual_days_since_previous_stress_measurement'])
df.dropna(inplace=True)

random_seed= 3052011

np.random.seed(random_seed)
# English: Optional filtering block
if FILTER_FOR_TOP_USERS:
    print("--- Filtering for the top 20 users with the most responses ---")
    
    # Step 1: Get the response counts for each user
    user_counts = df['user_id'].value_counts()
    
    # Step 2: Get the list of the top 20 user IDs
    # We use .index to get the user_id values
    top_20_users = user_counts.head(20).index
    
    # Step 3: Filter the dataframe to keep only these top users
    # .isin() checks which rows have a 'user_id' that is in our list
    df_filtered = df[df['user_id'].isin(top_20_users)].copy()
    
    print(f"Original number of users: {df['user_id'].nunique()}")
    print(f"Number of users after filtering: {df_filtered['user_id'].nunique()}")
    
    # English: Prepare data for modeling using the filtered dataframe
    X = df_filtered.drop(columns=['user_id', 'stress_level', 'date'])
    y = df_filtered['stress_level']
    groups = df_filtered['user_id']
    
else:
    print("--- Using all available users (no filtering) ---")
    
    # English: Prepare data for modeling using the original dataframe
    X = df.drop(columns=['user_id', 'stress_level', 'date'])
    y = df['stress_level']
    groups = df['user_id']


# --- The rest of your experiment pipeline remains exactly the same ---
# --- 2. EXPERIMENT CONFIGURATION ---
# n_splits = ...
# models_to_test = { ... }
# ...


# --- 2. EXPERIMENT CONFIGURATION ---
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
results_list = []

# --- THE FIX: Add class weighting parameters to the models ---
# English: Define the models to be tested in a dictionary
models_to_test = {
    # English: For scikit-learn compatible models like Logistic Regression, we use the `class_weight` parameter.
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(random_state=random_seed, max_iter=10000, solver='liblinear', class_weight='balanced'))
    ]),
    
    # English: For XGBoost, the parameter is `scale_pos_weight`, but it's more complex for multiclass.
    # The best approach for XGBoost is to calculate weights manually and pass them to .fit().
    # However, we will handle this inside the loop for a more robust calculation per fold.
    "XGBoost": XGBClassifier(random_state=random_seed),
    
    # English: For LightGBM, the parameter is `class_weight`.
    "LightGBM": LGBMClassifier(random_state=random_seed, verbose=-1, class_weight='balanced'),
    
    # English: For CatBoost, the parameter is `auto_class_weights`.
    "CatBoost": CatBoostClassifier(random_state=random_seed, verbose=0, auto_class_weights='Balanced')
}


# --- 3. CROSS-VALIDATION LOOP ---
print(f"--- Starting cross-validation with {n_splits} folds ---")
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"\n--- Processing Fold {fold + 1}/{n_splits} ---")
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # --- THE FIX for XGBoost: Calculate sample weights for the current training fold ---
    # This is the most robust way to handle class imbalance with XGBoost in a CV setting.
    xgb_sample_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

    # English: Iterate through each model defined above
    for name, model in models_to_test.items():
        print(f"  - Training {name}...")
        
        # English: Fit the model on the training data for the current fold
        if name == "XGBoost":
            # Pass the calculated sample weights to the fit method for XGBoost
            model.fit(X_train, y_train, sample_weight=xgb_sample_weights)
        else:
            # Other models handle balancing internally via their parameters
            model.fit(X_train, y_train)
        
        # (The rest of the prediction and evaluation logic remains the same)
        preds = model.predict(X_test)
        f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
        results_list.append({'Fold': fold + 1, 'Algorithm': name, 'F1-Score (weighted)': f1})
        print(f"  - {name} F1-Score: {f1:.4f}")


# --- 4. RESULTS PRESENTATION ---
print("\n--- Final Experiment Results ---")
results_df = pd.DataFrame(results_list)
display(results_df)

print("\n--- Average Performance Summary ---")
summary = results_df.groupby('Algorithm')['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
display(summary)

--- Using all available users (no filtering) ---
--- Starting cross-validation with 5 folds ---

--- Processing Fold 1/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.5937
  - Training XGBoost...
  - XGBoost F1-Score: 0.5690
  - Training LightGBM...
  - LightGBM F1-Score: 0.5672
  - Training CatBoost...
  - CatBoost F1-Score: 0.5336

--- Processing Fold 2/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.7259
  - Training XGBoost...
  - XGBoost F1-Score: 0.7752
  - Training LightGBM...
  - LightGBM F1-Score: 0.7711
  - Training CatBoost...
  - CatBoost F1-Score: 0.8049

--- Processing Fold 3/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.7699
  - Training XGBoost...
  - XGBoost F1-Score: 0.8328
  - Training LightGBM...
  - LightGBM F1-Score: 0.8067
  - Training CatBoost...
  - CatBoost F1-Score: 0.7910

--- Processing Fold 4/5 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score

Unnamed: 0,Fold,Algorithm,F1-Score (weighted)
0,1,Logistic Regression,0.593683
1,1,XGBoost,0.569032
2,1,LightGBM,0.567177
3,1,CatBoost,0.5336
4,2,Logistic Regression,0.725876
5,2,XGBoost,0.775155
6,2,LightGBM,0.771054
7,2,CatBoost,0.804926
8,3,Logistic Regression,0.769912
9,3,XGBoost,0.832838



--- Average Performance Summary ---


Unnamed: 0,Algorithm,mean,std
0,CatBoost,0.673422,0.134431
1,LightGBM,0.686588,0.128174
2,Logistic Regression,0.65256,0.090085
3,XGBoost,0.691653,0.144263


In [28]:
# --- 5. DETAILED ANALYSIS OF THE BEST MODEL ---
# This is the new section you requested.
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import classification_report
print("\n\n================================================")
print("--- Detailed Analysis of the Best Performing Model ---")
print("================================================")

# English: Step 1: Identify the best model from the summary
best_model_name = summary.loc[summary['mean'].idxmax()]['Algorithm']
best_model_score = summary.loc[summary['mean'].idxmax()]['mean']
print(f"Best performing model identified: {best_model_name} (Average F1-Score: {best_model_score:.4f})")

# English: Get the untrained model configuration
best_model_config = models_to_test[best_model_name]

# English: Step 2: Perform a final, single train-test split that respects user groups
print("\n--- Performing a final train-test split for unbiased evaluation ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=random_seed)
final_train_idx, final_test_idx = next(gss.split(X, y, groups=groups))

X_train_final, X_test_final = X.iloc[final_train_idx], X.iloc[final_test_idx]
y_train_final, y_test_final = y.iloc[final_train_idx], y.iloc[final_test_idx]

print(f"Final training set size: {len(X_train_final)} samples")
print(f"Final test set size: {len(X_test_final)} samples")

# English: Step 3: Retrain the best model on the new, larger training set
print(f"\n--- Retraining {best_model_name} on the final training set ---")
if best_model_name == "XGBoost":
    final_xgb_weights = class_weight.compute_sample_weight('balanced', y=y_train_final)
    best_model_config.fit(X_train_final, y_train_final, sample_weight=final_xgb_weights)
else:
    best_model_config.fit(X_train_final, y_train_final)

# English: Step 4: Generate and display the final classification report
print("\n--- Final Classification Report ---")
final_predictions = best_model_config.predict(X_test_final)

# English: Define class names for a more readable report
target_names = [f'Stress Level {i}' for i in sorted(y.unique())]
report = classification_report(y_test_final, final_predictions, target_names=target_names)

print(report)



--- Detailed Analysis of the Best Performing Model ---
Best performing model identified: XGBoost (Average F1-Score: 0.6917)

--- Performing a final train-test split for unbiased evaluation ---
Final training set size: 422 samples
Final test set size: 169 samples

--- Retraining XGBoost on the final training set ---

--- Final Classification Report ---
                precision    recall  f1-score   support

Stress Level 0       0.19      0.22      0.21        32
Stress Level 1       0.81      0.79      0.80       137

      accuracy                           0.68       169
     macro avg       0.50      0.50      0.50       169
  weighted avg       0.70      0.68      0.69       169



## With augmentation

In [19]:
# English: This script assumes you have already run the necessary import statements
# for pandas, numpy, GroupKFold, f1_score, and all the required models.
# It is also assumed that the 'dataset' variable is loaded.
from sklearn.utils import class_weight
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# --- 1. EXPERIMENT CONFIGURATION ---
window_sizes = [3]
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# English: This is the main list to collect results from ALL experiments
all_results_list = []

# --- THE FIX: Add class weighting parameters to the models ---


random_seed= 3052011

np.random.seed(random_seed)

# English: Define the models to be tested in a dictionary
models_to_test = {
    # English: For scikit-learn compatible models, we use the `class_weight` parameter.
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(random_state=random_seed, max_iter=100000, solver='liblinear', class_weight='balanced'))
    ]),
    
    # English: XGBoost is a special case. We will handle its weighting inside the loop.
    "XGBoost": XGBClassifier(random_state=random_seed),
    
    # English: LightGBM also has a `class_weight` parameter.
    "LightGBM": LGBMClassifier(random_state=random_seed, verbose=-1, class_weight='balanced'),
    
    # English: CatBoost has its own specific parameter for automatic weighting.
    "CatBoost": CatBoostClassifier(random_state=random_seed, verbose=0, iterations=200, auto_class_weights='Balanced')
}




# --- 2. MAIN EXPERIMENT LOOP ---
for window_size in window_sizes:
    print(f"\n========================================================")
    print(f"--- Starting Experiment for Window Size: {window_size} ---")
    print(f"========================================================")
    
    try:
        # Creating a dummy dataframe for demonstration purposes as I can't access local files.
        # Replace this block with your pd.read_csv line.
        dataset = pd.read_csv(f'../data/augmented/studentlife_2014_{window_size}.csv')
        dataset = dataset.drop(columns=['individual_previous_stress_level', 'individual_days_since_previous_stress_measurement'])
        dataset.dropna(inplace=True)
        # --- End of placeholder block ---

        # English: Prepare data for modeling
        X = dataset.drop(columns=['user_id', 'stress_level', 'date'])
        y = dataset['stress_level']
        groups = dataset['user_id']

    except FileNotFoundError:
        print(f"Error: Data file for window size {window_size} not found. Skipping.")
        continue

    # --- Cross-validation loop for the current dataset ---
    for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
        print(f"\n--- Processing Fold {fold + 1}/{n_splits} for window {window_size} ---")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # --- THE FIX for XGBoost: Calculate sample weights for the current training fold ---
        # This is the most robust way to handle class imbalance with XGBoost in a CV setting.
        xgb_sample_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

        # English: Iterate through each model defined above
        for name, model in models_to_test.items():
            print(f"  - Training {name}...")
            
            # English: Fit the model on the training data for the current fold
            if name == "XGBoost":
                # Pass the calculated sample weights to the fit method for XGBoost
                model.fit(X_train, y_train, sample_weight=xgb_sample_weights)
            else:
                # Other models handle balancing internally via their initialization parameters
                model.fit(X_train, y_train)
            
            # English: Make predictions on the test data
            preds = model.predict(X_test)
            
            # English: Calculate the weighted F1-score
            f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
            
            # English: Store the results, including the window size
            all_results_list.append({
                'Window Size': window_size,
                'Fold': fold + 1,
                'Algorithm': name,
                'F1-Score (weighted)': f1
            })
            print(f"  - {name} F1-Score: {f1:.4f}")

# --- 3. FINAL RESULTS PRESENTATION ---
print("\n\n================================================")
print("--- Final Combined Experiment Results ---")
print("================================================")

if not all_results_list:
    print("No results were generated. Please check data paths.")
else:
    results_df = pd.DataFrame(all_results_list)
    
    # English: Display the full results table
    print("\n--- Full Results Table ---")
    display(results_df)

    # English: Display the summary table, grouped by window size and algorithm
    print("\n--- Average Performance Summary ---")
    summary = results_df.groupby(['Window Size', 'Algorithm'])['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
    display(summary)



--- Starting Experiment for Window Size: 3 ---

--- Processing Fold 1/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.6390
  - Training XGBoost...
  - XGBoost F1-Score: 0.7714
  - Training LightGBM...
  - LightGBM F1-Score: 0.7517
  - Training CatBoost...
  - CatBoost F1-Score: 0.8048

--- Processing Fold 2/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.5765
  - Training XGBoost...
  - XGBoost F1-Score: 0.6573
  - Training LightGBM...
  - LightGBM F1-Score: 0.5714
  - Training CatBoost...
  - CatBoost F1-Score: 0.6457

--- Processing Fold 3/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.5354
  - Training XGBoost...
  - XGBoost F1-Score: 0.7181
  - Training LightGBM...
  - LightGBM F1-Score: 0.7076
  - Training CatBoost...
  - CatBoost F1-Score: 0.7520

--- Processing Fold 4/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-S

Unnamed: 0,Window Size,Fold,Algorithm,F1-Score (weighted)
0,3,1,Logistic Regression,0.639037
1,3,1,XGBoost,0.771429
2,3,1,LightGBM,0.751673
3,3,1,CatBoost,0.804765
4,3,2,Logistic Regression,0.576485
5,3,2,XGBoost,0.657313
6,3,2,LightGBM,0.571387
7,3,2,CatBoost,0.645719
8,3,3,Logistic Regression,0.535413
9,3,3,XGBoost,0.718095



--- Average Performance Summary ---


Unnamed: 0,Window Size,Algorithm,mean,std
0,3,CatBoost,0.720045,0.075803
1,3,LightGBM,0.677607,0.071333
2,3,Logistic Regression,0.593616,0.039371
3,3,XGBoost,0.679489,0.064159


In [31]:
# English: This script assumes you have already run the necessary import statements
# for pandas, numpy, GroupKFold, f1_score, and all the required models.
# It is also assumed that the 'dataset' variable is loaded.
from sklearn.utils import class_weight
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# --- 1. EXPERIMENT CONFIGURATION ---
window_sizes = [3]
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# English: This is the main list to collect results from ALL experiments
all_results_list = []

random_seed = 3052011
np.random.seed(random_seed)

# English: Define the models to be tested in a dictionary
models_to_test = {
    # English: Logistic Regression still benefits from a scaler even if inputs are 0-1
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(random_state=random_seed, max_iter=100000, solver='liblinear', class_weight='balanced'))
    ]),
    "XGBoost": XGBClassifier(random_state=random_seed),
    "LightGBM": LGBMClassifier(random_state=random_seed, verbose=-1, class_weight='balanced'),
    "CatBoost": CatBoostClassifier(random_state=random_seed, verbose=0, iterations=200, auto_class_weights='Balanced')
}

# --- 2. MAIN EXPERIMENT LOOP ---
for window_size in window_sizes:
    print(f"\n========================================================")
    print(f"--- Starting Experiment for Window Size: {window_size} ---")
    print(f"========================================================")
    
    try:
        # Creating a dummy dataframe for demonstration purposes.
        # Replace this block with your actual pd.read_csv line.
        dataset = pd.read_csv(f'../data/augmented/studentlife_2014_interactions.csv')
        dataset = dataset.drop(columns=['individual_previous_stress_level', 'individual_days_since_previous_stress_measurement'])
        dataset.dropna(inplace=True)
        # --- End of placeholder block ---

        # --- THE FIX: Select ONLY the interaction features ---
        # English: Use a list comprehension to get all column names that start with 'interaction_'
        interaction_features = [col for col in dataset.columns if col.startswith('interaction_')]
        
        # English: Prepare data for modeling using only these selected features
        X = dataset[interaction_features]
        y = dataset['stress_level']
        groups = dataset['user_id']
        
        print(f"--- Experiment Setup: Training on {len(X.columns)} interaction features only. ---")

    except FileNotFoundError:
        print(f"Error: Data file 'studentlife_2014_interactions.csv' not found. Skipping.")
        continue

    # --- Cross-validation loop for the current dataset ---
    for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups)):
        print(f"\n--- Processing Fold {fold + 1}/{n_splits} for window {window_size} ---")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # English: Calculate sample weights for the current training fold
        xgb_sample_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

        # English: Iterate through each model defined above
        for name, model in models_to_test.items():
            print(f"  - Training {name}...")
            
            # English: Fit the model on the training data for the current fold
            if name == "XGBoost":
                model.fit(X_train, y_train, sample_weight=xgb_sample_weights)
            else:
                model.fit(X_train, y_train)
            
            # English: Make predictions on the test data
            preds = model.predict(X_test)
            
            # English: Calculate the weighted F1-score
            f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
            
            # English: Store the results
            all_results_list.append({
                'Window Size': window_size,
                'Fold': fold + 1,
                'Algorithm': name,
                'F1-Score (weighted)': f1
            })
            print(f"  - {name} F1-Score: {f1:.4f}")

# --- 3. FINAL RESULTS PRESENTATION ---
print("\n\n================================================")
print("--- Final Combined Experiment Results (Interaction Features Only) ---")
print("================================================")

if not all_results_list:
    print("No results were generated. Please check data paths.")
else:
    results_df = pd.DataFrame(all_results_list)
    
    # English: Display the full results table
    print("\n--- Full Results Table ---")
    display(results_df)

    # English: Display the summary table
    print("\n--- Average Performance Summary ---")
    summary = results_df.groupby(['Window Size', 'Algorithm'])['F1-Score (weighted)'].agg(['mean', 'std']).reset_index()
    display(summary)




--- Starting Experiment for Window Size: 3 ---
--- Experiment Setup: Training on 135 interaction features only. ---

--- Processing Fold 1/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3647
  - Training XGBoost...
  - XGBoost F1-Score: 0.3406
  - Training LightGBM...
  - LightGBM F1-Score: 0.4008
  - Training CatBoost...
  - CatBoost F1-Score: 0.3822

--- Processing Fold 2/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.4782
  - Training XGBoost...
  - XGBoost F1-Score: 0.4103
  - Training LightGBM...
  - LightGBM F1-Score: 0.3840
  - Training CatBoost...
  - CatBoost F1-Score: 0.3753

--- Processing Fold 3/5 for window 3 ---
  - Training Logistic Regression...
  - Logistic Regression F1-Score: 0.3821
  - Training XGBoost...
  - XGBoost F1-Score: 0.5411
  - Training LightGBM...
  - LightGBM F1-Score: 0.5191
  - Training CatBoost...
  - CatBoost F1-Score: 0.4644

--- Processing Fold 4/5 for window 3

Unnamed: 0,Window Size,Fold,Algorithm,F1-Score (weighted)
0,3,1,Logistic Regression,0.364657
1,3,1,XGBoost,0.340621
2,3,1,LightGBM,0.400751
3,3,1,CatBoost,0.382153
4,3,2,Logistic Regression,0.478192
5,3,2,XGBoost,0.410332
6,3,2,LightGBM,0.384029
7,3,2,CatBoost,0.375335
8,3,3,Logistic Regression,0.382073
9,3,3,XGBoost,0.541086



--- Average Performance Summary ---


Unnamed: 0,Window Size,Algorithm,mean,std
0,3,CatBoost,0.38927,0.067113
1,3,LightGBM,0.397107,0.081144
2,3,Logistic Regression,0.387333,0.05747
3,3,XGBoost,0.397222,0.09216


## Old

In [4]:
dataset.columns

Index(['user_id', 'date', 'stress_level', 'environmental_temperature_mean',
       'environmental_temperature_max', 'environmental_temperature_min',
       'environmental_humidity_mean', 'environmental_humidity_max',
       'environmental_humidity_min', 'environmental_precipitation',
       'environmental_cloudcover', 'individual_sleep_duration',
       'individual_sleep_rate', 'organizational_social_interaction',
       'organizational_social_voice_sum', 'organizational_social_voice_count',
       'organizational_social_voice_mean', 'organizational_social_voice_max',
       'individual_minutes_stationary', 'individual_minutes_walking',
       'individual_minutes_running', 'individual_minutes_unknown',
       'environmental_minutes_silence', 'environmental_minutes_voice',
       'environmental_minutes_noise', 'environmental_minutes_unknown',
       'organizational_work_hours', 'deadlines', 'days_until_next_deadline',
       'weekday', 'individual_personality_extraversion',
       'indi

In [5]:
rename_map = {
    'deadlines': 'organizational_deadlines',
    'days_until_next_deadline': 'organizational_days_until_next_deadline',
    'weekday': 'environmental_weekday'
}

dataset = dataset.rename(columns=rename_map)

In [7]:
def generate_features_for_columns(df, feature_columns, window_size, feature_function):
    """
    Applies a feature generation function to a list of specified columns.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input dataframe.
    feature_columns : list
        A list of column names to generate features for.
    window_size : int
        The rolling window size to use.
    feature_function : function
        The function to apply (e.g., add_stress_rolling_features).

    Returns:
    --------
    pandas.DataFrame
        The dataframe enriched with all the new features.
    """
    df_enriched = df.copy()
    
    # Track original columns to avoid creating features on features
    original_cols = set(df_enriched.columns)
    
    for col in feature_columns:
        if col in original_cols:
            print(f"Generating features for column: '{col}' with window size {window_size}...")
            df_enriched = feature_function(df_enriched, window_size, col)
        else:
            print(f"Warning: Column '{col}' not found in the initial dataframe. Skipping.")
            
    print("\nFeature generation complete.")
    return df_enriched



In [12]:
enriched_df.describe()

Unnamed: 0,user_id,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,environmental_cloudcover,...,stress_level_rolling_q75_3d,stress_level_rolling_range_3d,stress_level_rolling_iqr_3d,stress_level_rolling_cv_3d,stress_level_rolling_trend_slope_3d,stress_level_rolling_direction_changes_3d,stress_level_rolling_entropy_3d,stress_level_rolling_zscore_3d,stress_level_rolling_time_since_peak_3d,stress_level_rolling_time_since_trough_3d
count,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,...,647.0,647.0,647.0,600.0,600.0,647.0,647.0,600.0,647.0,647.0
mean,33.62037,1.154321,8.512854,14.699537,3.327778,68.407986,88.521605,43.833333,2.281636,48.63098,...,1.268934,0.476043,0.238022,0.442335,0.013333,0.0,0.418856,0.015321,0.744977,0.76507
std,17.982157,0.742368,5.562435,6.753744,4.765486,12.982973,12.694466,13.07971,3.664127,31.175947,...,0.625293,0.603625,0.301813,0.581263,0.798468,0.0,0.493753,0.475369,0.436211,0.424283
min,4.0,0.0,-1.525,1.0,-6.1,44.291667,53.0,19.0,0.0,0.041667,...,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,-0.707107,0.0,0.0
25%,17.0,1.0,3.854167,9.0,-0.6,58.75,80.0,35.0,0.0,27.25,...,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,33.0,1.0,7.454167,14.1,2.8,67.791667,94.0,40.0,0.1,39.083333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,51.0,2.0,13.508333,20.5,6.8,78.958333,99.0,54.0,2.3,77.375,...,1.75,1.0,0.5,0.471405,0.0,0.0,1.0,0.0,1.0,1.0
max,59.0,2.0,18.45,26.4,13.9,94.25,100.0,84.0,15.0,99.916667,...,2.0,2.0,1.0,1.414214,2.0,0.0,1.0,0.707107,1.0,1.0


In [18]:
np.random.seed(24091993)

# English: Suppress Optuna's trial logs for a cleaner output
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore", category=UserWarning)

# enriched_df = enriched_df[enriched_df['user_id'] != 59]# dataset.copy()

# English: Filter out users who do not have all three stress classes
#user_class_diversity = enriched_df.groupby('user_id')['stress_level'].nunique()
#complete_users = user_class_diversity[user_class_diversity == 3].index
#df_complete_stress = enriched_df[enriched_df['user_id'].isin(complete_users)].copy()

# English: From the remaining users, select the top 20 by response count
#user_counts_filtered = df_complete_stress['user_id'].value_counts()
#num_top_users = min(20, len(user_counts_filtered))
#top_users_from_complete = user_counts_filtered.head(num_top_users).index
#df_final_selection = df_complete_stress[df_complete_stress['user_id'].isin(top_users_from_complete)].copy()

# English: Now, handle NaNs and Infs
enriched_df_filled = enriched_df.dropna()
#enriched_df_filled.replace([np.inf, -np.inf], 0, inplace=True)
df_model = enriched_df_filled.sort_values(by='date').reset_index(drop=True)

# English: Define X, Y, and groups for the entire process
Y = df_model['stress_level']
X = df_model.drop(columns=['stress_level', 'user_id', 'date'])

correlation_threshold = 0.98

# Apply the function
X, dropped_columns = remove_highly_correlated_features(X, threshold=correlation_threshold)
print("List of dropped columns:", dropped_columns)

groups = df_model['user_id']

# ==============================================================================
# STAGE 1: GLOBAL FEATURE SELECTION WITH RFECV
# ==============================================================================
print("--- STAGE 1: Finding the globally optimal set of features with RFECV ---")

# English: Define your feature domains based on their prefixes
# (Adjust these lists based on your actual column names)
environmental_cols = [col for col in X.columns if 'environmental_' in col]
individual_cols = [col for col in X.columns if 'individual_' in col]
organizational_cols = [col for col in X.columns if 'organizational_' in col]
stress_history_cols = [col for col in X.columns if 'stress_level_' in col] # Assuming lagged features start with this

feature_domains = {
    "environmental": environmental_cols,
    "individual": individual_cols,
    "organizational": organizational_cols,
    "stress_history": stress_history_cols
}

best_features_per_domain = {}
N_FEATURES_PER_DOMAIN = 1

for domain, cols in feature_domains.items():
    print(f"\n--- Running RFECV for domain: {domain} ({len(cols)} features) ---")
    if not cols:
        print("No columns found for this domain. Skipping.")
        continue
        
    X_domain = X[cols]
    
    # Initialize RFECV for this domain
    estimator = XGBClassifier(objective='multiclass', random_state=24091993, n_jobs=-1)
    cv_strategy = GroupKFold(n_splits=5)
    rfecv_domain = RFECV(
        estimator=estimator,
        step=1,
        cv=cv_strategy,
        scoring='f1_weighted',
        n_jobs=-1,
        min_features_to_select=N_FEATURES_PER_DOMAIN # Select at least N
    )
    
    # Fit on the domain-specific data
    rfecv_domain.fit(X_domain, Y, groups=groups)
    
    # Store the best features for this domain
    selected_cols = X_domain.columns[rfecv_domain.support_].tolist()
    best_features_per_domain[domain] = selected_cols
    print(f"Selected {len(selected_cols)} features for {domain}: {selected_cols}")

# --- Combine the best features from all domains ---
final_selected_features = []
for domain_features in best_features_per_domain.values():
    final_selected_features.extend(domain_features)

# Remove duplicates if any feature was selected in multiple domains
final_selected_features = list(dict.fromkeys(final_selected_features)) 

print(f"\n--- Final combined set of {len(final_selected_features)} features ---")
print(final_selected_features)

# Now, use this `final_selected_features` list to create your final X,
# and proceed with hyperparameter tuning and model evaluation.
X_selected = X[final_selected_features]


# ==============================================================================
# STAGE 2: GLOBAL HYPERPARAMETER TUNING WITH OPTUNA (ON SELECTED FEATURES)
# ==============================================================================
print("\n--- STAGE 2: Finding optimal hyperparameters with Optuna on selected features ---")

def objective(trial, x_data, y_data, group_data):
    param = {
        'verbosity': 0, 'objective': 'multiclass', 'random_state': 24091993,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }
    
    gkf = GroupKFold(n_splits=5)
    f1_scores = []
    for train_idx, test_idx in gkf.split(x_data, y_data, groups=group_data):
        X_train, X_test = x_data.iloc[train_idx], x_data.iloc[test_idx]
        y_train, y_test = y_data.iloc[train_idx], y_data.iloc[test_idx]
        
        # English: Apply class weights inside the objective function
        class_weights_fold = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        sample_weights_fold = np.array([class_weights_fold[cls] for cls in y_train])
        
        model = XGBClassifier(**param)
        model.fit(X_train, y_train, sample_weight=sample_weights_fold)
        preds = model.predict(X_test)
        f1_scores.append(f1_score(y_test, preds, average='weighted', zero_division=0))
        
    return np.mean(f1_scores)

# English: Run Optuna study on the data with ONLY the selected features
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_selected, Y, groups), n_trials=50)

best_params = study.best_trial.params
print("\nBest hyperparameters found:", best_params)

# ==============================================================================
# STAGE 3: FINAL UNBIASED EVALUATION
# ==============================================================================
print(f"\n--- STAGE 3: Final evaluation using {len(final_selected_features)} best features and optimal hyperparameters ---")

n_splits = 5
gkf_final = GroupKFold(n_splits=n_splits)
all_accuracies = []
all_f1_scores = []

for fold, (train_idx, test_idx) in enumerate(gkf_final.split(X_selected, Y, groups=groups)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")
    
    # English: Use the pre-selected features (X_selected) for splitting
    X_train, X_test = X_selected.iloc[train_idx], X_selected.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
    # English: Compute sample weights for the current training fold
    class_weights = compute_class_weight('balanced', classes=np.unique(Y_train), y=Y_train)
    sample_weights = np.array([class_weights[cls] for cls in Y_train])    

    # English: Initialize model with the best global parameters
    model = XGBClassifier(objective='multiclass', random_state=24091993, **best_params)
    model.fit(X_train, Y_train, sample_weight=sample_weights)
    
    # English: Evaluate the model
    predictions = model.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions, average='weighted', zero_division=0)
    
    all_accuracies.append(accuracy)
    all_f1_scores.append(f1)
    
    print(f"Fold Accuracy: {accuracy:.4f}")
    print(f"Fold F1-Score (Weighted): {f1:.4f}")

# English: Display final results
print("\n--- Final Cross-Validation Results ---")
print(f"Mean Accuracy: {np.mean(all_accuracies):.4f} ± {np.std(all_accuracies):.4f}")
print(f"Mean F1-Score (Weighted): {np.mean(all_f1_scores):.4f} ± {np.std(all_f1_scores):.4f}")


Removed 18 columns with zero or single unique values: ['environmental_minutes_unknown', 'environmental_minutes_unknown_rolling_mean_3d', 'environmental_minutes_unknown_rolling_std_3d', 'environmental_minutes_unknown_rolling_min_3d', 'environmental_minutes_unknown_rolling_max_3d', 'environmental_minutes_unknown_rolling_median_3d', 'environmental_minutes_unknown_rolling_q25_3d', 'environmental_minutes_unknown_rolling_q75_3d', 'environmental_minutes_unknown_rolling_range_3d', 'environmental_minutes_unknown_rolling_iqr_3d', 'environmental_minutes_unknown_rolling_cv_3d', 'environmental_minutes_unknown_rolling_trend_slope_3d', 'environmental_minutes_unknown_rolling_direction_changes_3d', 'environmental_minutes_unknown_rolling_entropy_3d', 'environmental_minutes_unknown_rolling_zscore_3d', 'environmental_minutes_unknown_rolling_time_since_peak_3d', 'environmental_minutes_unknown_rolling_time_since_trough_3d', 'stress_level_rolling_direction_changes_3d']
List of dropped columns: ['environmenta

In [19]:
# English: Import the datetime library at the top of your script
import datetime

# --- Option 2: Append results to a log file with a timestamp ---

# English: Define the output filename
results_log_filename = 'experiment_log.txt'

# English: Get the current timestamp
timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# English: Open the file in append mode ('a') to add new results
with open(results_log_filename, 'a') as f:
    print(f"Appending results to {results_log_filename}...")
    
    f.write(f"\n-----------------------------------------------------\n")
    f.write(f"\n-----------------------------------------------------\n")
    
    # English: Write a separator and timestamp for this run
    f.write(f"\n--- Experiment Run: {timestamp} ---\n")
    
    # English: Write the metrics
    f.write(f"Mean Accuracy: {np.mean(all_accuracies):.4f} ± {np.std(all_accuracies):.4f}\n")
    f.write(f"Mean F1-Score (Weighted): {np.mean(all_f1_scores):.4f} ± {np.std(all_f1_scores):.4f}\n")
    f.write("\nBest hyperparameters found: " + str(best_params))
    f.write(f"\n--- Final combined set of {len(final_selected_features)} features ---")
    f.write(str(final_selected_features))
    
    f.write(f"\n-----------------------------------------------------\n")
    f.write(f"\n-----------------------------------------------------\n")
print("Results successfully logged.")

Appending results to experiment_log.txt...
Results successfully logged.


In [20]:
np.random.seed(24091993)

# English: Suppress Optuna's trial logs for a cleaner output
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore", category=UserWarning)

enriched_df = enriched_df[enriched_df['user_id'] != 59]# dataset.copy()

# English: Filter out users who do not have all three stress classes
#user_class_diversity = enriched_df.groupby('user_id')['stress_level'].nunique()
#complete_users = user_class_diversity[user_class_diversity == 3].index
#df_complete_stress = enriched_df[enriched_df['user_id'].isin(complete_users)].copy()

# English: From the remaining users, select the top 20 by response count
#user_counts_filtered = df_complete_stress['user_id'].value_counts()
#num_top_users = min(20, len(user_counts_filtered))
#top_users_from_complete = user_counts_filtered.head(num_top_users).index
#df_final_selection = df_complete_stress[df_complete_stress['user_id'].isin(top_users_from_complete)].copy()

# English: Now, handle NaNs and Infs
enriched_df_filled = enriched_df.dropna()
#enriched_df_filled.replace([np.inf, -np.inf], 0, inplace=True)
df_model = enriched_df_filled.sort_values(by='date').reset_index(drop=True)

# English: Define X, Y, and groups for the entire process
Y = df_model['stress_level']
X = df_model.drop(columns=['stress_level', 'user_id', 'date'])

correlation_threshold = 0.98

# Apply the function
X, dropped_columns = remove_highly_correlated_features(X, threshold=correlation_threshold)
print("List of dropped columns:", dropped_columns)

groups = df_model['user_id']

# ==============================================================================
# STAGE 1: GLOBAL FEATURE SELECTION WITH RFECV
# ==============================================================================
print("--- STAGE 1: Finding the globally optimal set of features with RFECV ---")
        
# Initialize RFECV for this domain
estimator = XGBClassifier(objective='multiclass', random_state=24091993, n_jobs=-1)
cv_strategy = GroupKFold(n_splits=5)
rfecv = RFECV(
    estimator=estimator,
    step=1,
    cv=cv_strategy,
    scoring='f1_weighted',
    n_jobs=-1,
    min_features_to_select=1 # Select at least N
)

# Fit on the domain-specific data
rfecv.fit(X, Y, groups=groups)

# Store the best features for this domain
selected_cols = X.columns[rfecv.support_].tolist()

# Remove duplicates if any feature was selected in multiple domains
final_selected_features = selected_cols

print(f"\n--- Final combined set of {len(final_selected_features)} features ---")
print(final_selected_features)

# Now, use this `final_selected_features` list to create your final X,
# and proceed with hyperparameter tuning and model evaluation.
X_selected = X[final_selected_features]


# ==============================================================================
# STAGE 2: GLOBAL HYPERPARAMETER TUNING WITH OPTUNA (ON SELECTED FEATURES)
# ==============================================================================
print("\n--- STAGE 2: Finding optimal hyperparameters with Optuna on selected features ---")

def objective(trial, x_data, y_data, group_data):
    param = {
        'verbosity': 0, 'objective': 'multiclass', 'random_state': 24091993,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }
    
    gkf = GroupKFold(n_splits=5)
    f1_scores = []
    for train_idx, test_idx in gkf.split(x_data, y_data, groups=group_data):
        X_train, X_test = x_data.iloc[train_idx], x_data.iloc[test_idx]
        y_train, y_test = y_data.iloc[train_idx], y_data.iloc[test_idx]
        
        # English: Apply class weights inside the objective function
        class_weights_fold = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        sample_weights_fold = np.array([class_weights_fold[cls] for cls in y_train])
        
        model = XGBClassifier(**param)
        model.fit(X_train, y_train, sample_weight=sample_weights_fold)
        preds = model.predict(X_test)
        f1_scores.append(f1_score(y_test, preds, average='weighted', zero_division=0))
        
    return np.mean(f1_scores)

# English: Run Optuna study on the data with ONLY the selected features
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_selected, Y, groups), n_trials=50)

best_params = study.best_trial.params
print("\nBest hyperparameters found:", best_params)

# ==============================================================================
# STAGE 3: FINAL UNBIASED EVALUATION
# ==============================================================================
print(f"\n--- STAGE 3: Final evaluation using {len(final_selected_features)} best features and optimal hyperparameters ---")

n_splits = 5
gkf_final = GroupKFold(n_splits=n_splits)
all_accuracies = []
all_f1_scores = []

for fold, (train_idx, test_idx) in enumerate(gkf_final.split(X_selected, Y, groups=groups)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")
    
    # English: Use the pre-selected features (X_selected) for splitting
    X_train, X_test = X_selected.iloc[train_idx], X_selected.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
    # English: Compute sample weights for the current training fold
    class_weights = compute_class_weight('balanced', classes=np.unique(Y_train), y=Y_train)
    sample_weights = np.array([class_weights[cls] for cls in Y_train])    

    # English: Initialize model with the best global parameters
    model = XGBClassifier(objective='multiclass', random_state=24091993, **best_params)
    model.fit(X_train, Y_train, sample_weight=sample_weights)
    
    # English: Evaluate the model
    predictions = model.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions, average='weighted', zero_division=0)
    
    all_accuracies.append(accuracy)
    all_f1_scores.append(f1)
    
    print(f"Fold Accuracy: {accuracy:.4f}")
    print(f"Fold F1-Score (Weighted): {f1:.4f}")

# English: Display final results
print("\n--- Final Cross-Validation Results ---")
print(f"Mean Accuracy: {np.mean(all_accuracies):.4f} ± {np.std(all_accuracies):.4f}")
print(f"Mean F1-Score (Weighted): {np.mean(all_f1_scores):.4f} ± {np.std(all_f1_scores):.4f}")


Removed 18 columns with zero or single unique values: ['environmental_minutes_unknown', 'environmental_minutes_unknown_rolling_mean_3d', 'environmental_minutes_unknown_rolling_std_3d', 'environmental_minutes_unknown_rolling_min_3d', 'environmental_minutes_unknown_rolling_max_3d', 'environmental_minutes_unknown_rolling_median_3d', 'environmental_minutes_unknown_rolling_q25_3d', 'environmental_minutes_unknown_rolling_q75_3d', 'environmental_minutes_unknown_rolling_range_3d', 'environmental_minutes_unknown_rolling_iqr_3d', 'environmental_minutes_unknown_rolling_cv_3d', 'environmental_minutes_unknown_rolling_trend_slope_3d', 'environmental_minutes_unknown_rolling_direction_changes_3d', 'environmental_minutes_unknown_rolling_entropy_3d', 'environmental_minutes_unknown_rolling_zscore_3d', 'environmental_minutes_unknown_rolling_time_since_peak_3d', 'environmental_minutes_unknown_rolling_time_since_trough_3d', 'stress_level_rolling_direction_changes_3d']
List of dropped columns: ['environmenta

In [21]:
# English: Import the datetime library at the top of your script
import datetime

# --- Option 2: Append results to a log file with a timestamp ---

# English: Define the output filename
results_log_filename = 'experiment_log.txt'

# English: Get the current timestamp
timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# English: Open the file in append mode ('a') to add new results
with open(results_log_filename, 'a') as f:
    print(f"Appending results to {results_log_filename}...")
    
    f.write(f"\n-----------------------------------------------------\n")
    f.write(f"\n-----------------------------------------------------\n")
    # English: Write a separator and timestamp for this run
    f.write(f"\n--- Experiment Run: {timestamp} ---\n")
    
    # English: Write the metrics
    f.write(f"Mean Accuracy: {np.mean(all_accuracies):.4f} ± {np.std(all_accuracies):.4f}\n")
    f.write(f"Mean F1-Score (Weighted): {np.mean(all_f1_scores):.4f} ± {np.std(all_f1_scores):.4f}\n")
    f.write("\nBest hyperparameters found: " + str(best_params))
    f.write(f"\n--- Final combined set of {len(final_selected_features)} features ---")
    f.write(str(final_selected_features))
    f.write(f"\n-----------------------------------------------------\n")
    f.write(f"\n-----------------------------------------------------\n")
print("Results successfully logged.")

Appending results to experiment_log.txt...
Results successfully logged.


In [17]:
np.mean(all_f1_scores)

0.5158066523448012

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,stress_level_rolling_q75_3d,stress_level_rolling_range_3d,stress_level_rolling_iqr_3d,stress_level_rolling_cv_3d,stress_level_rolling_trend_slope_3d,stress_level_rolling_direction_changes_3d,stress_level_rolling_entropy_3d,stress_level_rolling_zscore_3d,stress_level_rolling_time_since_peak_3d,stress_level_rolling_time_since_trough_3d
3,4,2013-03-28,0,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,...,0.75,1.0,0.5,1.414214,1.0,0.0,1.0,-7.071068e-01,0.0,1.0
4,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,...,1.00,0.0,0.0,0.000000,0.0,0.0,0.0,-1.000000e+08,1.0,1.0
2,4,2013-04-03,2,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,...,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000e+08,1.0,1.0
5,4,2013-04-04,0,1.929167,8.6,-2.2,47.041667,58.0,33.0,0.0,...,1.75,1.0,0.5,0.471405,1.0,0.0,1.0,7.071068e-01,0.0,1.0
6,4,2013-04-05,2,3.525000,9.9,-2.0,58.875000,78.0,40.0,0.0,...,1.50,2.0,1.0,1.414214,-2.0,0.0,1.0,-7.071068e-01,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,1,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,...,1.75,1.0,0.5,0.471405,-1.0,0.0,1.0,-7.071068e-01,1.0,0.0
644,59,2013-05-22,1,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,...,1.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000e+00,1.0,1.0
645,59,2013-05-23,1,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,...,1.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000e+00,1.0,1.0
646,59,2013-05-24,2,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,...,1.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000e+00,1.0,1.0
