In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from utils.StudentLifeDataLoader import StudentLifeDataLoader
from functools import reduce

In [2]:
seed = 24091993


In [3]:

studentlife = StudentLifeDataLoader('studentlife')


In [4]:
stress_data = studentlife.get_stress_data()
social_data = studentlife.get_social_data()
sleep_data = studentlife.get_sleep_data()
conversation_data = studentlife.get_conversation_data()

In [5]:
conversation_data

Unnamed: 0,date,social_voice_sum,social_voice_count,social_voice_mean,social_voice_max,user_id
0,2013-03-27,21485,29,740.862069,3550,16
1,2013-03-28,29824,28,1065.142857,4993,16
2,2013-03-29,27317,31,881.193548,8375,16
3,2013-03-30,24311,51,476.686275,2097,16
4,2013-03-31,19829,35,566.542857,2563,16
...,...,...,...,...,...,...
1397,2013-05-26,14387,23,625.521739,4094,14
1398,2013-05-27,878,3,292.666667,666,14
1399,2013-05-28,28555,50,571.100000,4437,14
1400,2013-05-29,45498,70,649.971429,5165,14


In [6]:
stress_data.drop_duplicates(['date', 'user_id']).dropna(subset=['stress_level'])

Unnamed: 0,stress_level,user_id,date,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,environmental_cloudcover,extraversion,agreeableness,neuroticism,openness,conscientiousness
0,3,51,2013-04-03,2,1,2,1,1,0,0,1,9,10,4,28,10
1,2,51,2013-03-30,2,2,2,2,4,0,0,0,9,10,4,28,10
2,2,51,2013-04-04,2,2,2,1,1,1,0,0,9,10,4,28,10
3,2,51,2013-04-05,2,2,2,2,3,1,0,1,9,10,4,28,10
4,1,51,2013-04-06,2,1,2,1,2,0,0,0,9,10,4,28,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,3,33,2013-05-18,3,3,3,1,4,0,0,1,5,6,11,34,-7
876,3,33,2013-05-19,3,3,3,2,4,1,0,2,5,6,11,34,-7
877,3,33,2013-05-22,3,3,3,4,4,2,0,4,5,6,11,34,-7
878,2,33,2013-05-24,3,3,3,4,4,4,0,4,5,6,11,34,-7


In [7]:
df_merged = reduce(lambda left,right: pd.merge(left,right,on=['user_id', 'date'],
                                            how='outer'), [stress_data, social_data, sleep_data, conversation_data])

In [8]:
df_merged.drop_duplicates(['date', 'user_id']).dropna(subset=['stress_level'])

Unnamed: 0,stress_level,user_id,date,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,agreeableness,neuroticism,openness,conscientiousness,social_level,sleep_duration,social_voice_sum,social_voice_count,social_voice_mean,social_voice_max
0,1.0,4,2013-03-27,2.0,2.0,2.0,2.0,3.0,1.0,0.0,...,4.0,15.0,17.0,0.0,3.0,6.0,25142.0,41.0,613.219512,3469.0
1,1.0,4,2013-03-28,2.0,2.0,2.0,3.0,4.0,1.0,0.0,...,4.0,15.0,17.0,0.0,,6.0,25256.0,37.0,682.594595,3328.0
2,2.0,4,2013-03-29,2.0,2.0,2.0,3.0,4.0,2.0,0.0,...,4.0,15.0,17.0,0.0,3.0,6.0,28051.0,39.0,719.256410,4280.0
3,1.0,4,2013-03-30,2.0,2.0,2.0,2.0,4.0,0.0,0.0,...,4.0,15.0,17.0,0.0,3.0,7.0,17375.0,33.0,526.515152,2321.0
4,1.0,4,2013-03-31,2.0,2.0,2.0,1.0,2.0,1.0,0.0,...,4.0,15.0,17.0,0.0,3.0,7.0,26301.0,37.0,710.837838,4276.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1487,2.0,59,2013-06-03,4.0,3.0,4.0,3.0,4.0,2.0,0.0,...,13.0,5.0,23.0,-1.0,,,,,,
1488,2.0,59,2013-06-04,3.0,3.0,3.0,2.0,2.0,1.0,0.0,...,13.0,5.0,23.0,-1.0,3.0,13.0,,,,
1489,2.0,59,2013-06-05,3.0,3.0,3.0,2.0,4.0,1.0,0.0,...,13.0,5.0,23.0,-1.0,,,,,,
1490,2.0,59,2013-06-06,3.0,3.0,3.0,2.0,3.0,1.0,0.0,...,13.0,5.0,23.0,-1.0,3.0,15.0,,,,


In [9]:
df = df_merged.copy()

In [10]:
# Convert the 'date' column to datetime if it's not already in datetime format
df['date'] = pd.to_datetime(df['date'])

df.set_index('date', inplace=True)
# Group by 'user_id'
grouped = df.groupby('user_id')

In [11]:
# Define a function to filter each group
def filter_group(group):
    # Find the first and last non-null stress_level index
    first_non_null = group['stress_level'].first_valid_index()
    last_non_null = group['stress_level'].last_valid_index()
    
    # If either first_non_null or last_non_null is not found, return an empty DataFrame
    if first_non_null is None or last_non_null is None:
        return pd.DataFrame(columns=group.columns)
    
    # Slice the group based on first and last non-null index
    return group.loc[first_non_null:last_non_null]

def interpolate_column(group, column):
    group[column] = group[column].interpolate(method='nearest')
    return group

In [12]:
# Apply the filtering function to each group and concatenate the results
filtered_df = pd.concat([filter_group(group) for _, group in grouped])

# Reset the index if needed
filtered_df = filtered_df.reset_index()

In [13]:
# interpolate the social level by nearest for each user_id
# Group by 'user_id'
grouped = filtered_df.groupby('user_id')

# Apply the interpolation function to each group and concatenate the results
interpolated_df = grouped.apply(lambda x: interpolate_column(x, 'social_level'))

# Reset the index if needed
interpolated_df = interpolated_df.reset_index(drop=True)

grouped = interpolated_df.groupby('user_id')

# Apply the interpolation function to each group and concatenate the results
interpolated_df = grouped.apply(lambda x: interpolate_column(x, 'sleep_duration'))

# Reset the index if needed
interpolated_df = interpolated_df.reset_index(drop=True)

  interpolated_df = grouped.apply(lambda x: interpolate_column(x, 'social_level'))
  interpolated_df = grouped.apply(lambda x: interpolate_column(x, 'sleep_duration'))


In [14]:
interpolated_df.drop_duplicates(['date', 'user_id']).dropna(subset=['stress_level'])

Unnamed: 0,date,stress_level,user_id,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,agreeableness,neuroticism,openness,conscientiousness,social_level,sleep_duration,social_voice_sum,social_voice_count,social_voice_mean,social_voice_max
0,2013-03-27,1.0,4,2.0,2.0,2.0,2.0,3.0,1.0,0.0,...,4.0,15.0,17.0,0.0,3.0,6.0,25142.0,41.0,613.219512,3469.0
1,2013-03-28,1.0,4,2.0,2.0,2.0,3.0,4.0,1.0,0.0,...,4.0,15.0,17.0,0.0,3.0,6.0,25256.0,37.0,682.594595,3328.0
2,2013-03-29,2.0,4,2.0,2.0,2.0,3.0,4.0,2.0,0.0,...,4.0,15.0,17.0,0.0,3.0,6.0,28051.0,39.0,719.256410,4280.0
3,2013-03-30,1.0,4,2.0,2.0,2.0,2.0,4.0,0.0,0.0,...,4.0,15.0,17.0,0.0,3.0,7.0,17375.0,33.0,526.515152,2321.0
4,2013-03-31,1.0,4,2.0,2.0,2.0,1.0,2.0,1.0,0.0,...,4.0,15.0,17.0,0.0,3.0,7.0,26301.0,37.0,710.837838,4276.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1323,2013-06-03,2.0,59,4.0,3.0,4.0,3.0,4.0,2.0,0.0,...,13.0,5.0,23.0,-1.0,4.0,1.0,,,,
1324,2013-06-04,2.0,59,3.0,3.0,3.0,2.0,2.0,1.0,0.0,...,13.0,5.0,23.0,-1.0,3.0,13.0,,,,
1325,2013-06-05,2.0,59,3.0,3.0,3.0,2.0,4.0,1.0,0.0,...,13.0,5.0,23.0,-1.0,3.0,13.0,,,,
1326,2013-06-06,2.0,59,3.0,3.0,3.0,2.0,3.0,1.0,0.0,...,13.0,5.0,23.0,-1.0,3.0,15.0,,,,


In [15]:
interpolated_df = interpolated_df.dropna(subset=['stress_level'])

interpolated_df['weekday'] = interpolated_df['date'].dt.dayofweek

In [16]:
# stress level from 1 2 3 to 0 1 2
# interpolated_df['stress_level'] = interpolated_df['stress_level'].apply(lambda x: 0 if x == 1 else 1 if x == 2 else 2 if x == 3 else 3)

In [17]:
# machine learning classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, StratifiedGroupKFold, StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import cross_val_score

np.random.seed(seed)

model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.5,
    early_stopping_rounds=20,
    verbose=False,
    loss_function='MultiClass'
)

scaler = StandardScaler()
ambient_columns = [i for i in interpolated_df.columns if i.startswith('ambient_')]
X = interpolated_df.drop(columns=['stress_level', 'date'])
X = scaler.fit_transform(X)
y = interpolated_df['stress_level']


def kfold_cross_validation(model, X, y, n_folds=10):
    kf = KFold(n_splits=n_folds, shuffle=True)
    scores = cross_val_score(model, X, y, cv=kf, scoring='f1_weighted')
    return scores

scores = kfold_cross_validation(model, X, y)
print(scores)
print(scores.mean())


[0.48787309 0.50077418 0.50215794 0.46351072 0.56160537 0.40785202
 0.54955662 0.46287487 0.47773667 0.51292043]
0.49268619126607244


In [40]:
# machine learning classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, StratifiedGroupKFold, StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import cross_val_score

np.random.seed(seed)

model = CatBoostClassifier(verbose=False, random_seed=63,early_stopping_rounds=20,)

'''
model = CatBoostClassifier(
    iterations=2000,
    random_seed=63,
    learning_rate=0.5,
    early_stopping_rounds=20,
    verbose=False,
    loss_function='MultiClass'
)'''
parameters = {'depth'         : [3, 4,5,6,7,8,9, 10, 16],
                 'learning_rate' : [0.05, 0.1, 0.25, 0.4, 0.5, 0.7],
                  'iterations'    : [200, 500, 1000, 2000],
                 }
#model = XGBClassifier(n_estimators=10, max_depth=2, learning_rate=1)


scaler = MinMaxScaler()
ambient_columns = [i for i in interpolated_df.columns if i.startswith('ambient_')]
X = interpolated_df.drop(columns=['stress_level', 'date'])
X = scaler.fit_transform(X)
y = interpolated_df['stress_level']

# split
Grid_CBC = RandomizedSearchCV(estimator=model, param_distributions = parameters, cv = 5, n_jobs=-1, scoring='f1_weighted')
Grid_CBC.fit(X, y)

print("\n The best estimator across ALL searched params:\n",Grid_CBC.best_estimator_)
print("\n The best score across ALL searched params:\n",Grid_CBC.best_score_)
print("\n The best parameters across ALL searched params:\n",Grid_CBC.best_params_)

def kfold_cross_validation(model, X, y, n_folds=5):
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='f1_weighted')
    return scores

model.set_params(**Grid_CBC.best_params_)

scores = kfold_cross_validation(model, X, y)
print(scores)
print(scores.mean())



 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x7fd420424e50>

 The best score across ALL searched params:
 0.31023121661144437

 The best parameters across ALL searched params:
 {'learning_rate': 0.4, 'iterations': 500, 'depth': 10}
[0.4317924  0.49247907 0.46241696 0.40523518 0.47503663]
0.4533920482175534


In [24]:
interpolated_df

Unnamed: 0,date,stress_level,user_id,ambient_temperature_mean,ambient_temperature_max,ambient_temperature_min,ambient_humidity_mean,ambient_humidity_max,ambient_humidity_min,ambient_precipitation,ambient_cloudcover,extraversion,agreeableness,neuroticism,openness,conscientiousness,social_level,sleep_duration,weekday
0,2013-03-27,1.0,4,0.466667,7.2,-6.1,64.125000,75.0,46.0,0.0,27.791667,1.0,4.0,15.0,17.0,0.0,3.0,6.0,2
1,2013-03-28,1.0,4,3.450000,8.0,0.9,76.333333,95.0,47.0,1.5,84.541667,1.0,4.0,15.0,17.0,0.0,3.0,6.0,3
2,2013-03-29,2.0,4,3.354167,8.6,-1.6,75.833333,95.0,55.0,1.3,27.250000,1.0,4.0,15.0,17.0,0.0,3.0,6.0,4
3,2013-03-30,1.0,4,3.141667,9.0,-2.6,69.750000,95.0,30.0,0.0,19.041667,1.0,4.0,15.0,17.0,0.0,3.0,7.0,5
4,2013-03-31,1.0,4,4.675000,10.9,-1.7,49.125000,65.0,35.0,0.0,35.291667,1.0,4.0,15.0,17.0,0.0,3.0,7.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037,2013-05-20,3.0,58,15.695833,22.6,11.0,85.833333,100.0,63.0,1.9,99.916667,-1.0,15.0,13.0,24.0,8.0,1.0,6.0,0
1038,2013-05-21,2.0,58,18.033333,24.4,13.9,87.875000,97.0,67.0,5.5,59.125000,-1.0,15.0,13.0,24.0,8.0,1.0,7.0,1
1039,2013-05-22,1.0,58,14.208333,24.5,8.5,87.708333,99.0,63.0,6.2,96.000000,-1.0,15.0,13.0,24.0,8.0,1.0,9.0,2
1040,2013-05-23,3.0,58,18.450000,24.7,13.7,88.083333,99.0,68.0,1.9,93.666667,-1.0,15.0,13.0,24.0,8.0,1.0,9.0,3
