In [60]:
import pandas as pd
import numpy as np
from scipy.stats import entropy, linregress

In [61]:
dataset = pd.read_csv('processed/studentlife_2014.csv')

In [62]:
dataset

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,organizational_deadlines,organizational_days_until_next_deadline,environmental_weekday,individual_personality_extraversion,individual_personality_agreeableness,individual_personality_conscientiousness,individual_personality_neuroticism,individual_personality_openness,individual_previous_stress_level,individual_days_since_previous_stress_measurement
0,4,2013-03-27,0,0.466667,7.2,-6.1,64.125000,75.0,46.0,0.0,...,0.0,12.0,2,1,4,0,15,17,,
1,4,2013-03-28,1,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,...,0.0,11.0,3,1,4,0,15,17,0.0,1.0
2,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,...,0.0,10.0,4,1,4,0,15,17,1.0,1.0
3,4,2013-04-02,2,-1.525000,1.0,-3.6,44.291667,53.0,32.0,0.0,...,0.0,6.0,1,1,4,0,15,17,1.0,4.0
4,4,2013-04-03,2,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,...,0.0,5.0,2,1,4,0,15,17,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,0,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,...,0.0,3.0,1,14,13,-1,5,23,0.0,1.0
644,59,2013-05-22,0,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,...,0.0,2.0,2,14,13,-1,5,23,0.0,1.0
645,59,2013-05-23,0,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,...,0.0,1.0,3,14,13,-1,5,23,0.0,1.0
646,59,2013-05-24,1,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,...,1.0,5.0,4,14,13,-1,5,23,0.0,1.0


In [63]:
dataset.columns

Index(['user_id', 'date', 'stress_level', 'environmental_temperature_mean',
       'environmental_temperature_max', 'environmental_temperature_min',
       'environmental_humidity_mean', 'environmental_humidity_max',
       'environmental_humidity_min', 'environmental_precipitation',
       'environmental_cloudcover', 'individual_sleep_duration',
       'individual_sleep_rate', 'organizational_social_interaction',
       'organizational_social_voice_sum', 'organizational_social_voice_count',
       'organizational_social_voice_mean', 'organizational_social_voice_max',
       'individual_minutes_stationary', 'individual_minutes_walking',
       'individual_minutes_running', 'individual_minutes_unknown',
       'environmental_minutes_silence', 'environmental_minutes_voice',
       'environmental_minutes_noise', 'environmental_minutes_unknown',
       'organizational_work_hours', 'organizational_deadlines',
       'organizational_days_until_next_deadline', 'environmental_weekday',
       

In [64]:
def add_stress_rolling_features(df, window_size, target_col, user_col='user_id', date_col='date', all_features=False, is_target_col=False):
    """
    Add extended rolling window features optimized for stress prediction, including trend, 
    variability, volatility, and complexity measures.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe containing user data.
    window_size : int
        Number of days for the rolling window.
    target_col : str
        Column name for the feature to aggregate.
    user_col : str, optional
        Column name for user identifier (default is 'user_id').
    date_col : str, optional
        Column name for date (default is 'date').
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame with new rolling statistical features.
    """
    df_copy = df.copy()
    df_copy = df_copy.sort_values(by=[user_col, date_col])
    
    grouped = df_copy.groupby(user_col)
    
    # --- Basic and Distributional Statistics ---
    if is_target_col:
        base_series = grouped[target_col].shift(1)
        rolling_obj = base_series.rolling(window=window_size - 1, min_periods=1)
    else:
        base_series = df_copy[target_col]
        rolling_obj = grouped[target_col].rolling(window=window_size, min_periods=1)
    
    df_copy[f'{target_col}_rolling_mean_{window_size}d'] = rolling_obj.mean().reset_index(level=0, drop=True)
    df_copy[f'{target_col}_rolling_std_{window_size}d'] = rolling_obj.std().reset_index(level=0, drop=True)
    df_copy[f'{target_col}_rolling_min_{window_size}d'] = rolling_obj.min().reset_index(level=0, drop=True)
    df_copy[f'{target_col}_rolling_max_{window_size}d'] = rolling_obj.max().reset_index(level=0, drop=True)
    df_copy[f'{target_col}_rolling_median_{window_size}d'] = rolling_obj.median().reset_index(level=0, drop=True)
    df_copy[f'{target_col}_rolling_q25_{window_size}d'] = rolling_obj.quantile(0.25).reset_index(level=0, drop=True)
    df_copy[f'{target_col}_rolling_q75_{window_size}d'] = rolling_obj.quantile(0.75).reset_index(level=0, drop=True)
    
    # --- Stress-Specific Features ---

    # Volatility and Variability
    df_copy[f'{target_col}_rolling_range_{window_size}d'] = df_copy[f'{target_col}_rolling_max_{window_size}d'] - df_copy[f'{target_col}_rolling_min_{window_size}d']
    df_copy[f'{target_col}_rolling_iqr_{window_size}d'] = df_copy[f'{target_col}_rolling_q75_{window_size}d'] - df_copy[f'{target_col}_rolling_q25_{window_size}d']
    df_copy[f'{target_col}_rolling_cv_{window_size}d'] = df_copy[f'{target_col}_rolling_std_{window_size}d'] / (df_copy[f'{target_col}_rolling_mean_{window_size}d'] + 1e-8)

    # Trend and Momentum
    def slope_func(x):
        if len(x) < 2: return np.nan
        return linregress(np.arange(len(x)), x).slope

    df_copy[f'{target_col}_rolling_trend_slope_{window_size}d'] = rolling_obj.apply(slope_func, raw=True).reset_index(level=0, drop=True)

    def direction_changes(x):
        if len(x) < 2: return 0
        # Calculate changes in sign of the first difference
        return np.sum(np.diff(np.sign(np.diff(x))) != 0)

    df_copy[f'{target_col}_rolling_direction_changes_{window_size}d'] = rolling_obj.apply(direction_changes, raw=True).reset_index(level=0, drop=True)


    # Complexity and Context
    def entropy_func(x):
        x_clean = x[~np.isnan(x)]
        if len(x_clean) < 2: return 0
        hist = np.histogram(x_clean, bins=max(2, min(len(x_clean), 5)), density=True)[0]
        return entropy(hist[hist > 0], base=2)

    df_copy[f'{target_col}_rolling_entropy_{window_size}d'] = rolling_obj.apply(entropy_func, raw=True).reset_index(level=0, drop=True)
    
    # Z-score of the last value within the window
    df_copy[f'{target_col}_rolling_zscore_{window_size}d'] = (base_series - df_copy[f'{target_col}_rolling_mean_{window_size}d']) / (df_copy[f'{target_col}_rolling_std_{window_size}d'] + 1e-8)

    # Days since the last peak (max value) in the window
    def time_since_peak(x):
        x_clean = x[~np.isnan(x)]
        if len(x_clean) == 0: return np.nan
        # Argmax sobre los datos limpios, pero el índice se basa en la longitud original para el contexto
        return len(x) - 1 - np.argmax(x)

    df_copy[f'{target_col}_rolling_time_since_peak_{window_size}d'] = rolling_obj.apply(time_since_peak, raw=True).reset_index(level=0, drop=True)
    
    def time_since_trough(x):
        if len(x) == 0: return 0
        return len(x) - 1 - np.argmin(x)
    df_copy[f'{target_col}_rolling_time_since_trough_{window_size}d'] = rolling_obj.apply(time_since_trough, raw=True).reset_index(level=0, drop=True)

    if all_features:
        df_copy[f'{target_col}_rolling_skew_{window_size}d'] = rolling_obj.skew().reset_index(level=0, drop=True)
        df_copy[f'{target_col}_rolling_kurt_{window_size}d'] = rolling_obj.kurt().reset_index(level=0, drop=True)
        
        # Acceleration (mean of second differences)
        def acceleration(x):
            if len(x) < 3: return 0
            return np.mean(np.diff(x, n=2))
            
        df_copy[f'{target_col}_rolling_acceleration_{window_size}d'] = rolling_obj.apply(acceleration, raw=True).reset_index(level=0, drop=True)

        # Rate of change (net change over the window)
        def rate_of_change(x):
            if len(x) < 2: return 0
            return x.iloc[-1] - x.iloc[0]

        df_copy[f'{target_col}_rolling_rate_of_change_{window_size}d'] = rolling_obj.apply(rate_of_change, raw=False).reset_index(level=0, drop=True)

        def mean_diff(x):
            x = x[~np.isnan(x)]
            if len(x) < 2: return 0
            return np.mean(np.diff(x))
        df_copy[f'{target_col}_rolling_mean_diff_{window_size}d'] = rolling_obj.apply(mean_diff, raw=True).reset_index(level=0, drop=True)

        # Diferencia simple del último valor con la mediana
        df_copy[f'{target_col}_last_vs_median_{window_size}d'] = df_copy[target_col] - df_copy[f'{target_col}_rolling_median_{window_size}d']

    return df_copy

In [65]:
def generate_features_for_columns(df, feature_columns, window_size, feature_function):
    """
    Applies a feature generation function to a list of specified columns.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input dataframe.
    feature_columns : list
        A list of column names to generate features for.
    window_size : int
        The rolling window size to use.
    feature_function : function
        The function to apply (e.g., add_stress_rolling_features).

    Returns:
    --------
    pandas.DataFrame
        The dataframe enriched with all the new features.
    """
    df_enriched = df.copy()
    
    # Track original columns to avoid creating features on features
    original_cols = set(df_enriched.columns)
    
    for col in feature_columns:
        if col in original_cols:
            print(f"Generating features for column: '{col}' with window size {window_size}...")
            df_enriched = feature_function(df_enriched, window_size, col)
        else:
            print(f"Warning: Column '{col}' not found in the initial dataframe. Skipping.")
            
    print("\nFeature generation complete.")
    return df_enriched



In [66]:
def remove_highly_correlated_features(df, threshold=0.95):
    """
    Finds and removes one of each pair of highly correlated features in a dataframe.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input dataframe with numerical features.
    threshold : float, optional
        The correlation threshold above which a feature is considered redundant. 
        Defaults to 0.95.

    Returns:
    --------
    pandas.DataFrame
        A new dataframe with highly correlated features removed.
    list
        A list of the column names that were dropped.
    """
    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()
    
    # --- Step 1: Remove zero-variance columns ---
    # These columns have no predictive power and can cause issues with correlation calculation.
    cols_to_drop_zerovar = df_copy.columns[df_copy.nunique() <= 1]
    if not cols_to_drop_zerovar.empty:
        df_copy.drop(columns=cols_to_drop_zerovar, inplace=True)
        print(f"Removed {len(cols_to_drop_zerovar)} columns with zero or single unique values: {cols_to_drop_zerovar.tolist()}")
    
    # --- Step 2: Calculate the correlation matrix ---
    # Use .abs() because a strong negative correlation (-0.95) is as redundant as a strong positive one.
    corr_matrix = df_copy.corr().abs()
    
    # --- Step 3: Identify one of each highly correlated pair ---
    # Select the upper triangle of the correlation matrix to avoid duplicates
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # Find features with correlation greater than the threshold
    cols_to_drop_corr = [column for column in upper.columns if any(upper[column] > threshold)]
    
    # --- Step 4: Drop the identified features ---
    df_reduced = df_copy.drop(columns=cols_to_drop_corr)
    
    # Combine all dropped columns for the report
    all_dropped_cols = cols_to_drop_zerovar.tolist() + cols_to_drop_corr
    
    return df_reduced, all_dropped_cols

In [None]:
# 1. Ensure your main feature generation function is defined
# The `add_stress_rolling_features` function from above should be defined here.

# 2. Define the list of columns to process
predictor_columns = [
    'environmental_temperature_mean',
    'environmental_temperature_max', 'environmental_temperature_min',
    'environmental_humidity_mean', 'environmental_humidity_max',
    'environmental_humidity_min', 'environmental_precipitation',
    'environmental_cloudcover', 'individual_sleep_duration', 'individual_sleep_rate',
    'organizational_social_interaction', 'organizational_social_voice_sum',
    'organizational_social_voice_count', 'organizational_social_voice_mean',
    'organizational_social_voice_max', 'individual_minutes_stationary',
    'individual_minutes_walking', 'individual_minutes_running',
    'individual_minutes_unknown', 'environmental_minutes_silence',
    'environmental_minutes_voice', 'environmental_minutes_noise',
    'environmental_minutes_unknown', 'organizational_work_hours',
    'organizational_deadlines', 'organizational_days_until_next_deadline'
]
# Define the target column separately
target_column = 'stress_level'

# 3. Define the window size and execute the feature generation process
# Let's assume your dataframe is named `dataset`
window_size = 3
enriched_df = dataset.copy()

# --- Step A: Generate features for all predictor variables ---
print(f"--- Generating features for {len(predictor_columns)} predictor columns... ---")
for col in predictor_columns:
    print(f"Processing: {col}")
    enriched_df = add_stress_rolling_features(
        df=enriched_df, 
        window_size=window_size, 
        target_col=col,
        is_target_col=False  # Use standard window [T, T-1, T-2]
    )

# --- Step B: Generate features for the historical stress level ---
print(f"\n--- Generating lagged features for the target column: '{target_column}'... ---")
if target_column in enriched_df.columns:
    enriched_df = add_stress_rolling_features(
        df=enriched_df,
        window_size=window_size,
        target_col=target_column,
        is_target_col=True  # Use lagged window [T-1, T-2]
    )
else:
    print(f"Warning: Target column '{target_column}' not found in the dataframe. Skipping.")

print("\n--- Feature generation complete. ---")

# The `enriched_df` now contains all the desired features,
# with the correct temporal context for both predictors and the target.

--- Generating features for 26 predictor columns... ---
Processing: environmental_temperature_mean
Processing: environmental_temperature_max
Processing: environmental_temperature_min
Processing: environmental_humidity_mean
Processing: environmental_humidity_max
Processing: environmental_humidity_min
Processing: environmental_precipitation
Processing: environmental_cloudcover
Processing: individual_sleep_duration
Processing: individual_sleep_rate
Processing: organizational_social_interaction
Processing: organizational_social_voice_sum
Processing: organizational_social_voice_count
Processing: organizational_social_voice_mean
Processing: organizational_social_voice_max
Processing: individual_minutes_stationary
Processing: individual_minutes_walking
Processing: individual_minutes_running
Processing: individual_minutes_unknown
Processing: environmental_minutes_silence
Processing: environmental_minutes_voice
Processing: environmental_minutes_noise
Processing: environmental_minutes_unknown
Pr

In [68]:
enriched_df

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,stress_level_rolling_q75_5d,stress_level_rolling_range_5d,stress_level_rolling_iqr_5d,stress_level_rolling_cv_5d,stress_level_rolling_trend_slope_5d,stress_level_rolling_direction_changes_5d,stress_level_rolling_entropy_5d,stress_level_rolling_zscore_5d,stress_level_rolling_time_since_peak_5d,stress_level_rolling_time_since_trough_5d
0,4,2013-03-27,0,0.466667,7.2,-6.1,64.125000,75.0,46.0,0.0,...,,,,,,,,,,
1,4,2013-03-28,1,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,...,0.00,0.0,0.00,,,0.0,0.000000,,1.0,1.0
2,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,...,0.75,1.0,0.50,1.414214,,1.0,1.000000,0.707107,2.0,2.0
3,4,2013-04-02,2,-1.525000,1.0,-3.6,44.291667,53.0,32.0,0.0,...,1.00,1.0,0.50,0.866025,,2.0,0.918296,0.577350,3.0,3.0
4,4,2013-04-03,2,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,...,1.25,2.0,0.50,0.816497,0.6,2.0,1.500000,1.224745,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,0,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,...,1.25,2.0,0.50,0.816497,-0.4,1.0,1.500000,-1.224745,2.0,0.0
644,59,2013-05-22,0,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,...,1.25,2.0,1.25,1.276569,-0.7,1.0,1.500000,-0.783349,3.0,1.0
645,59,2013-05-23,0,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,...,0.25,1.0,0.25,2.000000,-0.3,1.0,0.811278,-0.500000,3.0,2.0
646,59,2013-05-24,1,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,...,0.00,0.0,0.00,0.000000,0.0,0.0,0.000000,0.000000,3.0,3.0


In [69]:
enriched_df.describe()

Unnamed: 0,user_id,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,environmental_cloudcover,...,stress_level_rolling_q75_5d,stress_level_rolling_range_5d,stress_level_rolling_iqr_5d,stress_level_rolling_cv_5d,stress_level_rolling_trend_slope_5d,stress_level_rolling_direction_changes_5d,stress_level_rolling_entropy_5d,stress_level_rolling_zscore_5d,stress_level_rolling_time_since_peak_5d,stress_level_rolling_time_since_trough_5d
count,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,648.0,...,647.0,647.0,647.0,646.0,552.0,647.0,647.0,623.0,647.0,647.0
mean,33.62037,1.154321,8.512854,14.699537,3.327778,68.407986,88.521605,43.833333,2.281636,48.63098,...,1.401855,0.981453,0.489181,0.61486,0.012681,1.309119,0.782409,0.007576,2.18238,2.153014
std,17.982157,0.742368,5.562435,6.753744,4.765486,12.982973,12.694466,13.07971,3.664127,31.175947,...,0.574069,0.678937,0.45404,0.561456,0.290913,0.787063,0.499478,0.782488,1.042799,1.06583
min,4.0,0.0,-1.525,1.0,-6.1,44.291667,53.0,19.0,0.0,0.041667,...,0.0,0.0,0.0,0.0,-0.8,0.0,0.0,-1.5,0.0,0.0
25%,17.0,1.0,3.854167,9.0,-0.6,58.75,80.0,35.0,0.0,27.25,...,1.0,1.0,0.25,0.285714,-0.2,1.0,0.811278,-0.5,1.0,1.0
50%,33.0,1.0,7.454167,14.1,2.8,67.791667,94.0,40.0,0.1,39.083333,...,1.25,1.0,0.25,0.4,0.0,2.0,0.811278,0.0,3.0,3.0
75%,51.0,2.0,13.508333,20.5,6.8,78.958333,99.0,54.0,2.3,77.375,...,2.0,1.0,1.0,0.866025,0.2,2.0,1.0,0.5,3.0,3.0
max,59.0,2.0,18.45,26.4,13.9,94.25,100.0,84.0,15.0,99.916667,...,2.0,2.0,2.0,2.0,0.8,2.0,1.584963,1.5,3.0,3.0


In [70]:
enriched_df.to_csv(f'augmented/studentlife_2014_{window_size}.csv', index=False)