In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the preprocessed StudentLife table from its path relative to the
# notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='processed/studentlife_2014.csv')

In [3]:
# Preview the loaded frame: a bare last expression triggers the notebook's
# rich display, which shows the first and last rows with the middle elided.
dataset

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,organizational_deadlines,organizational_days_until_next_deadline,environmental_weekday,individual_personality_extraversion,individual_personality_agreeableness,individual_personality_conscientiousness,individual_personality_neuroticism,individual_personality_openness,individual_previous_stress_level,individual_days_since_previous_stress_measurement
0,4,2013-03-27,0,0.466667,7.2,-6.1,64.125000,75.0,46.0,0.0,...,0.0,12.0,2,1,4,0,15,17,,
1,4,2013-03-28,1,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,...,0.0,11.0,3,1,4,0,15,17,0.0,1.0
2,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,...,0.0,10.0,4,1,4,0,15,17,1.0,1.0
3,4,2013-04-02,1,-1.525000,1.0,-3.6,44.291667,53.0,32.0,0.0,...,0.0,6.0,1,1,4,0,15,17,1.0,4.0
4,4,2013-04-03,2,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,...,0.0,5.0,2,1,4,0,15,17,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,0,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,...,0.0,3.0,1,14,13,-1,5,23,0.0,1.0
644,59,2013-05-22,0,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,...,0.0,2.0,2,14,13,-1,5,23,0.0,1.0
645,59,2013-05-23,0,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,...,0.0,1.0,3,14,13,-1,5,23,0.0,1.0
646,59,2013-05-24,1,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,...,1.0,5.0,4,14,13,-1,5,23,0.0,1.0


In [4]:
# Split the columns of `dataset` into two groups that will later be crossed
# with each other: the personality-trait columns and every remaining feature.
personality_cols = [col for col in dataset.columns if 'individual_personality_' in col]

# Columns that must not be used as interaction inputs:
#   - 'Unnamed: 0' is the row counter carried over from the CSV (pandas gives
#     a header-less column this name); it carries no signal, and leaving it in
#     would produce spurious "interaction_<trait>_x_Unnamed: 0" features.
#   - 'user_id', 'date' and 'stress_level' are the identifier, the timestamp
#     and the target.
#   - the two previous-stress columns are excluded as in the original
#     selection (they contain missing values on each user's first record).
non_feature_cols = {
    'Unnamed: 0',
    'user_id',
    'stress_level',
    'date',
    'individual_previous_stress_level',
    'individual_days_since_previous_stress_measurement',
}
other_features_cols = [
    col for col in dataset.columns
    if col not in personality_cols and col not in non_feature_cols
]

In [5]:
from sklearn.preprocessing import MinMaxScaler

# One independent scaler per column group, so each group's fitted parameters
# stay available separately after this cell.
scaler_personality, scaler_features = MinMaxScaler(), MinMaxScaler()

# Personality traits: fit + transform, then wrap the resulting array back
# into a DataFrame that keeps the original column names and row index.
personality_scaled = scaler_personality.fit_transform(dataset[personality_cols])
df_personality_norm = pd.DataFrame(
    personality_scaled, index=dataset.index, columns=personality_cols
)

# Remaining features: same treatment with the second scaler.
features_scaled = scaler_features.fit_transform(dataset[other_features_cols])
df_features_norm = pd.DataFrame(
    features_scaled, index=dataset.index, columns=other_features_cols
)


In [6]:
# Build one interaction column for every (personality trait, feature) pair as
# the element-wise product of the two normalized columns. Each column is named
# "interaction_<trait>_x_<feature>", where <trait> is the last
# underscore-separated token of the personality column name.
interaction_features = {
    f"interaction_{trait_col.split('_')[-1]}_x_{feature_col}": (
        df_personality_norm[trait_col] * df_features_norm[feature_col]
    )
    for trait_col in df_personality_norm.columns
    for feature_col in df_features_norm.columns
}

# Assemble all interaction Series into a single frame in one step rather than
# adding columns to a DataFrame one at a time.
df_interactions = pd.DataFrame(interaction_features)


In [7]:
# Append the interaction columns to the right of the original columns.
# The original columns are taken from `dataset` as loaded (not the normalized
# copies); only the interaction columns are derived from min-max-scaled data.
df_final_enriched = pd.concat(objs=[dataset, df_interactions], axis="columns")


In [8]:
# Preview the enriched frame: original columns on the left, the generated
# "interaction_*" columns on the right.
df_final_enriched

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,interaction_openness_x_individual_minutes_running,interaction_openness_x_individual_minutes_unknown,interaction_openness_x_environmental_minutes_silence,interaction_openness_x_environmental_minutes_voice,interaction_openness_x_environmental_minutes_noise,interaction_openness_x_environmental_minutes_unknown,interaction_openness_x_organizational_work_hours,interaction_openness_x_organizational_deadlines,interaction_openness_x_organizational_days_until_next_deadline,interaction_openness_x_environmental_weekday
0,4,2013-03-27,0,0.466667,7.2,-6.1,64.125000,75.0,46.0,0.0,...,0.011690,0.003478,0.079166,0.075928,0.085113,0.0,0.094862,0.000000,0.089674,0.130435
1,4,2013-03-28,1,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,...,0.017842,0.002087,0.093742,0.113680,0.078353,0.0,0.094862,0.000000,0.081522,0.195652
2,4,2013-03-29,1,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,...,0.025841,0.006957,0.083187,0.124284,0.088493,0.0,0.047431,0.000000,0.073370,0.260870
3,4,2013-04-02,1,-1.525000,1.0,-3.6,44.291667,53.0,32.0,0.0,...,0.017227,0.002783,0.120885,0.082715,0.054079,0.0,0.071146,0.000000,0.040761,0.065217
4,4,2013-04-03,2,-1.150000,4.0,-4.2,45.833333,58.0,29.0,0.0,...,0.014151,0.001391,0.087962,0.127253,0.082655,0.0,0.047431,0.000000,0.032609,0.130435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,0,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,...,0.034454,0.015304,0.216637,0.160339,0.481180,0.0,0.094862,0.000000,0.032609,0.130435
644,59,2013-05-22,0,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,...,0.017227,0.022261,0.213622,0.105196,0.521739,0.0,0.000000,0.000000,0.016304,0.260870
645,59,2013-05-23,0,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,...,0.008614,0.006957,0.083438,0.039873,0.227377,0.0,0.047431,0.000000,0.000000,0.391304
646,59,2013-05-24,1,13.508333,19.4,6.9,94.250000,100.0,84.0,11.7,...,0.014766,0.033391,0.181955,0.151007,0.513750,0.0,0.047431,0.173913,0.065217,0.521739


In [9]:
# Persist the enriched frame as CSV without writing the row index.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout without 'augmented/' fails
# on this last cell.
output_path = Path('augmented/studentlife_2014_interactions.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_final_enriched.to_csv(output_path, index=False)