In [1]:
import setup_jwlab
import pandas as pd
import numpy as np

from jwlab.ml_prep import load_ml_df, y_to_binary
from sklearn.model_selection import cross_val_score

In [2]:
df = load_ml_df("~/projects/def-jwerker/kjslakov/data/ml_df_sktime.pkl")
df_2 = load_ml_df("~/projects/def-jwerker/kjslakov/data/ml_df_readys.pkl")

In [3]:
print(df.shape)
print(df_2.shape)

(2248, 61)
(2248, 18002)


In [5]:
# y = y_to_binary(df.label.values.flatten())
y = df_2.participant.values.flatten()
y[y <= 11] = 0
y[y > 11] = 1
df = df.drop(columns=["label"], axis=1)

In [6]:
np.mean(y == 0)

0.36966192170818507

In [7]:
np.mean(y == 1)

0.630338078291815

In [8]:
from scipy.stats import entropy, kurtosis, skew
funcs = [np.mean, np.min, np.max, np.var, skew, kurtosis]
dfs_applied = [df.applymap(f) for f in funcs]

In [9]:
from functools import reduce

suffix = 0
def join_dfs(a, b):
    global suffix
    suffix = suffix + 1
    return a.join(b, rsuffix="_%d" % suffix)

df_concat = reduce(join_dfs, dfs_applied)

In [10]:
df_concat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50_5,51_5,52_5,53_5,54_5,55_5,56_5,57_5,58_5,59_5
0,-151.23244,22.157523,26.303145,44.579699,199.476753,56.594964,117.655501,49.145668,88.434821,58.744395,...,-0.485558,-0.939972,-0.620084,0.483815,0.184526,-0.438635,0.006767,-0.355159,0.333178,-0.062767
1,-27.893581,3.691948,50.399791,86.667209,-238.189343,81.695979,59.823272,76.898736,54.264736,-91.232829,...,-0.517229,-0.646067,-0.570152,-0.114087,-1.402864,-0.423634,-0.739819,-0.999389,-0.566632,-0.314265
2,-106.118602,89.092548,77.907323,-21.185311,-95.925419,15.472448,-42.93492,87.031385,-37.913738,-124.838052,...,-1.10587,0.537901,-1.356271,-0.874381,0.198879,0.260676,-1.091921,0.224772,-0.592836,-1.299025
3,50.789833,-17.497619,-22.559299,-17.253944,-6.673186,-4.100224,-7.758851,1.822259,20.539736,-8.713552,...,-1.092328,0.992546,-1.279781,-1.191468,-0.101664,1.055147,-1.200286,-0.259127,-0.496652,-0.972953
4,26.498732,-6.528111,7.153571,24.768155,-16.449073,-2.005957,32.413792,-27.353653,14.684888,-14.464744,...,0.458793,-0.957491,0.834448,-0.123365,0.130202,-0.629557,-0.723317,-1.146189,-0.225354,-0.874202


In [11]:
df_normalized = (df_concat - df_concat.mean()) / df_concat.std()
df_normalized.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50_5,51_5,52_5,53_5,54_5,55_5,56_5,57_5,58_5,59_5
0,-0.200222,0.118339,0.109668,0.078047,0.500283,0.188545,0.163598,0.053868,0.134007,0.046864,...,-0.089577,-0.565041,-0.187967,0.713228,0.220743,-0.083677,0.329732,0.003936,0.612799,0.208482
1,-0.018864,-0.006806,0.241084,0.134228,-0.59123,0.269561,0.091026,0.102379,0.087271,-0.0401,...,-0.115001,-0.323018,-0.148939,0.226491,-0.55619,-0.068544,-0.363142,-0.545593,-0.246371,-0.005476
2,-0.133886,0.571974,0.391102,-0.009741,-0.236433,0.055818,-0.037922,0.120091,-0.038804,-0.059586,...,-0.587535,0.651946,-0.763392,-0.392447,0.227768,0.621771,-0.689913,0.498617,-0.271392,-0.843243
3,0.096833,-0.150413,-0.156814,-0.004494,-0.013843,-0.007355,0.00622,-0.028851,0.041144,0.007749,...,-0.576664,1.026333,-0.703606,-0.650579,0.08067,1.423213,-0.790483,0.085851,-0.179552,-0.565844
4,0.061115,-0.07607,0.005231,0.051601,-0.038223,-0.000596,0.056631,-0.079848,0.033137,0.004414,...,0.668506,-0.579468,0.948937,0.218937,0.194155,-0.276274,-0.347828,-0.670813,0.079493,-0.481833


In [12]:
from sklearn.svm import LinearSVC
model = LinearSVC(max_iter=1000, C=1e-3)

scores = cross_val_score(model, df_normalized, y, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.58980044 0.59333333 0.54342984 0.62360802 0.59465479]
Accuracy: 0.59 (+/- 0.05)


In [13]:
from sklearn.svm import SVC
model = SVC(gamma='scale')

scores = cross_val_score(model, df_concat, y, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.62971175 0.63111111 0.63028953 0.63028953 0.63028953]
Accuracy: 0.63 (+/- 0.00)


In [14]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(model, df_concat, y, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.70953437 0.63111111 0.61247216 0.56124722 0.67706013]
Accuracy: 0.64 (+/- 0.10)
