In [4]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from utils.StudentLifeDataLoader import StudentLifeDataLoader
from functools import reduce


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:

studentlife = StudentLifeDataLoader('studentlife')


In [6]:
a = studentlife.get_stress_data()

In [8]:
a['stress_level'].value_counts()

stress_level
2    373
3    223
1    214
Name: count, dtype: int64

In [9]:
c = studentlife.get_social_data()

In [10]:
d = studentlife.get_sleep_data()

In [12]:
df_merged = reduce(lambda left,right: pd.merge(left,right,on=['user_id', 'date'],
                                            how='outer'), [a, c, d])

In [13]:
df_merged

Unnamed: 0,stress_level,user_id,date,social_level,sleep_duration
0,1.0,4,2013-03-27,3.0,6.0
1,1.0,4,2013-03-28,,6.0
2,2.0,4,2013-03-29,3.0,6.0
3,1.0,4,2013-03-30,3.0,7.0
4,1.0,4,2013-03-31,3.0,7.0
...,...,...,...,...,...
1088,,58,2013-05-27,1.0,7.0
1089,1.0,58,2013-05-28,2.0,9.0
1090,,58,2013-05-29,2.0,5.0
1091,,58,2013-05-30,1.0,7.0


In [14]:
df = df_merged.copy()

In [15]:
# Convert the 'date' column to datetime if it's not already in datetime format
df['date'] = pd.to_datetime(df['date'])

df.set_index('date', inplace=True)
# Group by 'user_id'
grouped = df.groupby('user_id')

In [16]:
# Define a function to filter each group
def filter_group(group):
    # Find the first and last non-null stress_level index
    first_non_null = group['stress_level'].first_valid_index()
    last_non_null = group['stress_level'].last_valid_index()
    
    # If either first_non_null or last_non_null is not found, return an empty DataFrame
    if first_non_null is None or last_non_null is None:
        return pd.DataFrame(columns=group.columns)
    
    # Slice the group based on first and last non-null index
    return group.loc[first_non_null:last_non_null]

In [24]:
def interpolate_social_level(group, column):
    group[column] = group[column].interpolate(method='nearest')
    return group

In [17]:
# Apply the filtering function to each group and concatenate the results
filtered_df = pd.concat([filter_group(group) for _, group in grouped])

# Reset the index if needed
filtered_df = filtered_df.reset_index()

In [18]:
filtered_df.drop_duplicates()

Unnamed: 0,date,stress_level,user_id,social_level,sleep_duration
0,2013-03-27,1.0,4,3.0,6.0
1,2013-03-28,1.0,4,,6.0
2,2013-03-29,2.0,4,3.0,6.0
3,2013-03-30,1.0,4,3.0,7.0
4,2013-03-31,1.0,4,3.0,7.0
...,...,...,...,...,...
1040,2013-05-23,3.0,58,,
1041,2013-05-25,,58,1.0,9.0
1042,2013-05-26,,58,2.0,
1043,2013-05-27,,58,1.0,7.0


In [32]:
# interpolate the social level by nearest for each user_id
# Group by 'user_id'
grouped = filtered_df.groupby('user_id')

# Apply the interpolation function to each group and concatenate the results
interpolated_df = grouped.apply(lambda x: interpolate_social_level(x, 'social_level'))

# Reset the index if needed
interpolated_df = interpolated_df.reset_index(drop=True)

grouped = interpolated_df.groupby('user_id')

# Apply the interpolation function to each group and concatenate the results
interpolated_df = grouped.apply(lambda x: interpolate_social_level(x, 'sleep_duration'))

# Reset the index if needed
interpolated_df = interpolated_df.reset_index(drop=True)

  interpolated_df = grouped.apply(lambda x: interpolate_social_level(x, 'social_level'))
  interpolated_df = grouped.apply(lambda x: interpolate_social_level(x, 'sleep_duration'))


In [33]:
interpolated_df

Unnamed: 0,date,stress_level,user_id,social_level,sleep_duration
0,2013-03-27,1.0,4,3.0,6.0
1,2013-03-28,1.0,4,3.0,6.0
2,2013-03-29,2.0,4,3.0,6.0
3,2013-03-30,1.0,4,3.0,7.0
4,2013-03-31,1.0,4,3.0,7.0
...,...,...,...,...,...
1040,2013-05-23,3.0,58,1.0,9.0
1041,2013-05-25,,58,1.0,9.0
1042,2013-05-26,,58,2.0,9.0
1043,2013-05-27,,58,1.0,7.0


In [38]:
interpolated_df = interpolated_df.dropna()

In [79]:
# create new column with weekday
interpolated_df['weekday'] = interpolated_df['date'].dt.dayofweek

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interpolated_df['weekday'] = interpolated_df['date'].dt.dayofweek


In [None]:
# stress level from 1 2 3 to 0 1 2
interpolated_df['stress_level'] = interpolated_df['stress_level'].apply(lambda x: 0 if x == 1 else 1 if x == 2 else 2 if x == 3 else 3)

In [96]:
# machine learning classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

model = CatBoostClassifier(
    iterations=2000,
    random_seed=63,
    learning_rate=0.5,
    early_stopping_rounds=20,
    verbose=False,
    loss_function='MultiClass'
)

model = XGBClassifier(n_estimators=10, max_depth=2, learning_rate=1)

# kfold cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


def kfold_cross_validation(model, X, y, n_folds=5):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')
    return scores


scaler = StandardScaler()
X = interpolated_df.drop(columns=['stress_level', 'date'])
X = scaler.fit_transform(X)
y = interpolated_df['stress_level']

scores = kfold_cross_validation(model, X, y)
print(scores)
print(scores.mean())


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/sergio/sb/resources/.virtualenvs/venv-multitask-stress-aiae/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/sergio/sb/resources/.virtualenvs/venv-multitask-stress-aiae/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/home/sergio/sb/resources/.virtualenvs/venv-multitask-stress-aiae/lib/python3.10/site-packages/xgboost/sklearn.py", line 1471, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [1. 2. 3.]


In [74]:
X = interpolated_df.drop(columns=['stress_level', 'date'])

# scale features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
y = interpolated_df['stress_level']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.33      0.48      0.39        29
         2.0       0.57      0.57      0.57        75
         3.0       0.41      0.29      0.34        42

    accuracy                           0.47       146
   macro avg       0.44      0.45      0.44       146
weighted avg       0.48      0.47      0.47       146

