In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv('../data/processed/studentlife_2014.csv')

In [3]:
dataset.fillna(0, inplace=True)

In [4]:
dataset

Unnamed: 0,user_id,date,stress_level,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,...,individual_minutes_running,individual_minutes_unknown,environmental_minutes_silence,environmental_minutes_voice,environmental_minutes_noise,environmental_minutes_unknown,organizational_work_hours,deadlines,days_until_next_deadline,weekday
0,4,2013-04-02,1,2.0,1.0,2.0,1.0,1.0,0.0,0.0,...,28.0,4.0,518.0,195.0,176.0,0.0,4.0,0.0,6.0,1
1,4,2013-03-27,0,2.0,2.0,2.0,2.0,3.0,1.0,0.0,...,19.0,5.0,352.0,179.0,277.0,0.0,5.0,0.0,12.0,2
2,4,2013-04-03,2,2.0,1.0,2.0,1.0,1.0,0.0,0.0,...,23.0,2.0,387.0,300.0,269.0,0.0,3.0,0.0,5.0,2
3,4,2013-03-28,0,2.0,2.0,2.0,3.0,4.0,1.0,0.0,...,29.0,3.0,410.0,268.0,255.0,0.0,3.0,0.0,11.0,3
4,4,2013-03-29,1,2.0,2.0,2.0,3.0,4.0,2.0,0.0,...,42.0,10.0,368.0,293.0,288.0,0.0,3.0,0.0,10.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,59,2013-05-21,1,3.0,3.0,4.0,4.0,4.0,3.0,0.0,...,28.0,11.0,468.0,189.0,783.0,0.0,3.0,0.0,3.0,1
644,59,2013-05-22,1,3.0,3.0,3.0,4.0,4.0,2.0,0.0,...,14.0,16.0,462.0,124.0,849.0,0.0,1.0,0.0,2.0,2
645,59,2013-05-23,1,3.0,3.0,4.0,4.0,4.0,3.0,0.0,...,7.0,5.0,203.0,47.0,370.0,0.0,2.0,0.0,1.0,3
646,59,2013-05-24,2,3.0,3.0,3.0,4.0,4.0,4.0,0.0,...,12.0,24.0,399.0,178.0,836.0,0.0,2.0,1.0,5.0,4


## Basic model

In [5]:
# habría que hacer un kfold, hay muchas formas:
# (i) entrenar con unos usuarios y evaluar con otros
# (ii) entrenar con unas fechas y evaluar con otras
# (iii) hacer el kfold con todos los datos
# me parece más interesante la (i)
np.random.seed(24091993)
users = dataset['user_id'].unique()

# create kfold splits by user
kfold = []
for i in range(4):
    train_users, test_users = train_test_split(users, test_size=0.25)
    kfold.append((train_users, test_users))

# train and evaluate
results = []
for i, (train_users, test_users) in enumerate(kfold):
    print(f'Kfold {i+1}')
    train = dataset[dataset['user_id'].isin(train_users)]
    test = dataset[dataset['user_id'].isin(test_users)]

    X_train = train.drop(columns=['user_id', 'stress_level', 'date'])
    y_train = train['stress_level']

    X_test = test.drop(columns=['user_id', 'stress_level', 'date'])
    y_test = test['stress_level']

    model = XGBClassifier()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')

    results.append((accuracy, f1, recall, precision))

results = pd.DataFrame(results, columns=['accuracy', 'f1', 'recall', 'precision'])
results.describe()

Kfold 1
Kfold 2
Kfold 3
Kfold 4


Unnamed: 0,accuracy,f1,recall,precision
count,4.0,4.0,4.0,4.0
mean,0.428927,0.418408,0.428927,0.426376
std,0.044477,0.042089,0.044477,0.053351
min,0.385965,0.373721,0.385965,0.368577
25%,0.400413,0.402313,0.400413,0.400918
50%,0.420969,0.412281,0.420969,0.419995
75%,0.449483,0.428375,0.449483,0.445454
max,0.487805,0.475347,0.487805,0.496938


In [6]:
# XGBoostClassifier evaluated by accuracy, f1_score, recall, precision
X = dataset.drop(['stress_level', 'date'], axis=1)
y = dataset['stress_level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f1)

0.47156593406593406


## Time Series Model

In [7]:
# step 3 - specify the classifier
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier




In [8]:
dataset_timeseries = dataset.copy().set_index('user_id')
dataset_timeseries.set_index(pd.DatetimeIndex(dataset_timeseries['date']), inplace=True, append=True, drop=True)
dataset_timeseries.sort_index(inplace=True)
X = dataset_timeseries.drop(['stress_level', 'date'], axis=1)
y = dataset_timeseries[['stress_level']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
X_train.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,environmental_cloudcover,individual_sleep_duration,organizational_social_interaction,...,individual_minutes_running,individual_minutes_unknown,environmental_minutes_silence,environmental_minutes_voice,environmental_minutes_noise,environmental_minutes_unknown,organizational_work_hours,deadlines,days_until_next_deadline,weekday
user_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
4,2013-03-27,2.0,2.0,2.0,2.0,3.0,1.0,0.0,1.0,6.0,3.0,...,19.0,5.0,352.0,179.0,277.0,0.0,5.0,0.0,12.0,2
4,2013-03-28,2.0,2.0,2.0,3.0,4.0,1.0,0.0,4.0,6.0,3.0,...,29.0,3.0,410.0,268.0,255.0,0.0,3.0,0.0,11.0,3
4,2013-04-02,2.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,8.0,3.0,...,28.0,4.0,518.0,195.0,176.0,0.0,4.0,0.0,6.0,1
4,2013-04-03,2.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,8.0,3.0,...,23.0,2.0,387.0,300.0,269.0,0.0,3.0,0.0,5.0,2
4,2013-04-04,2.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,7.0,3.0,...,25.0,2.0,468.0,160.0,284.0,0.0,3.0,0.0,4.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,2013-05-16,3.0,3.0,3.0,2.0,4.0,0.0,0.0,0.0,5.0,5.0,...,21.0,38.0,582.0,295.0,526.0,0.0,3.0,2.0,8.0,3
59,2013-05-20,3.0,3.0,3.0,3.0,4.0,2.0,0.0,4.0,12.0,5.0,...,24.0,59.0,988.0,161.0,258.0,0.0,1.0,0.0,4.0,0
59,2013-05-23,3.0,3.0,4.0,4.0,4.0,3.0,0.0,4.0,2.0,4.0,...,7.0,5.0,203.0,47.0,370.0,0.0,2.0,0.0,1.0,3
59,2013-05-24,3.0,3.0,3.0,4.0,4.0,4.0,0.0,4.0,8.0,4.0,...,12.0,24.0,399.0,178.0,836.0,0.0,2.0,1.0,5.0,4


In [10]:
# example 1 - 3-NN with simple dynamic time warping distance (requires numba)
classifier = KNeighborsTimeSeriesClassifier(n_neighbors=3)
classifier.fit(X_train.sort_index(), y_train.sort_index())

ValueError: Mismatch in number of cases. Number in X = 24 nos in y = 518

In [132]:
X_train.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,environmental_temperature_mean,environmental_temperature_max,environmental_temperature_min,environmental_humidity_mean,environmental_humidity_max,environmental_humidity_min,environmental_precipitation,environmental_cloudcover,individual_sleep_duration,organizational_social_interaction,...,individual_minutes_running,individual_minutes_unknown,environmental_minutes_silence,environmental_minutes_voice,environmental_minutes_noise,environmental_minutes_unknown,organizational_work_hours,deadlines,days_until_next_deadline,weekday
user_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
4,2013-03-27,2.0,2.0,2.0,2.0,3.0,1.0,0.0,1.0,6.0,3.0,...,19.0,5.0,352.0,179.0,277.0,0.0,5.0,0.0,12.0,2
4,2013-03-28,2.0,2.0,2.0,3.0,4.0,1.0,0.0,4.0,6.0,3.0,...,29.0,3.0,410.0,268.0,255.0,0.0,3.0,0.0,11.0,3
4,2013-04-02,2.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,8.0,3.0,...,28.0,4.0,518.0,195.0,176.0,0.0,4.0,0.0,6.0,1
4,2013-04-03,2.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,8.0,3.0,...,23.0,2.0,387.0,300.0,269.0,0.0,3.0,0.0,5.0,2
4,2013-04-04,2.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,7.0,3.0,...,25.0,2.0,468.0,160.0,284.0,0.0,3.0,0.0,4.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,2013-05-16,3.0,3.0,3.0,2.0,4.0,0.0,0.0,0.0,5.0,5.0,...,21.0,38.0,582.0,295.0,526.0,0.0,3.0,2.0,8.0,3
59,2013-05-20,3.0,3.0,3.0,3.0,4.0,2.0,0.0,4.0,12.0,5.0,...,24.0,59.0,988.0,161.0,258.0,0.0,1.0,0.0,4.0,0
59,2013-05-23,3.0,3.0,4.0,4.0,4.0,3.0,0.0,4.0,2.0,4.0,...,7.0,5.0,203.0,47.0,370.0,0.0,2.0,0.0,1.0,3
59,2013-05-24,3.0,3.0,3.0,4.0,4.0,4.0,0.0,4.0,8.0,4.0,...,12.0,24.0,399.0,178.0,836.0,0.0,2.0,1.0,5.0,4


In [161]:
from pyts.classification import TimeSeriesForest

clf = TimeSeriesForest(random_state=43)
clf.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [162]:
y_pred = clf.predict(X_test)

In [163]:
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f1)

0.39161044875330586
