In [1]:
import setup_jwlab
import pandas as pd
import numpy as np

from jwlab.ml_prep import load_ml_df, y_to_binary
from sklearn.model_selection import cross_val_score

In [2]:
# Load the pre-built ML dataframe through the project helper `load_ml_df`.
# NOTE(review): the path points into one user's home/project directory, and the file
# appears to be a pickle (.pkl) — only load it from a trusted location, and consider
# reading the location from a configurable data directory instead.
df = load_ml_df("~/projects/def-jwerker/kjslakov/data/ml_df_sktime.pkl")

In [5]:
# Build the target vector from the `label` column, then keep only the feature columns
# in `df`. `y_to_binary` comes from jwlab.ml_prep; presumably it collapses the raw
# labels into two classes — confirm against its definition.
# NOTE: this cell rebinds `df` without its `label` column, so running it a second time
# fails on the label lookup; reload `df` before re-running.
y = y_to_binary(df["label"].values.flatten())
df = df.drop(columns="label")

In [6]:
# Fraction of samples whose binary label equals 0. Always predicting the more frequent
# class scores max(p, 1 - p) accuracy, which is the bar the classifiers below must beat.
np.mean(np.equal(y, 0))

0.5800711743772242

In [7]:
from scipy.stats import entropy, kurtosis, skew
# Summary statistics used as features. Each one is applied element-wise to `df`, giving
# one frame per statistic with the same index and columns as `df`.
# Assumes every cell of `df` holds a whole series (sktime-style nested frame, judging by
# the pickle name) so that each statistic reduces it to a scalar — TODO confirm.
funcs = [np.mean, np.min, np.max, np.var, skew, kurtosis]
dfs_applied = list(map(df.applymap, funcs))

In [8]:
from functools import reduce

# Running counter of joins performed so far; `join_dfs` increments it on every call.
suffix = 0


def join_dfs(a, b):
    """Join ``b`` onto ``a`` by index, tagging clashing columns of ``b``.

    Each call increments the module-level ``suffix`` counter and passes
    ``"_<counter>"`` as ``rsuffix``, so columns of ``b`` whose names already
    exist in ``a`` are renamed with that tag while ``a``'s columns are kept
    unchanged.

    Parameters
    ----------
    a : object exposing ``join`` (a pandas DataFrame in this notebook)
        Left-hand frame, typically the result accumulated so far.
    b : same kind of object as ``a``
        Right-hand frame to append.

    Returns
    -------
    The result of ``a.join(b, rsuffix=...)``.
    """
    global suffix
    suffix += 1
    return a.join(b, rsuffix="_" + str(suffix))

# Fold the per-statistic frames into one wide frame, left to right. Every call to
# `join_dfs` increments the global `suffix`, so the columns contributed by the k-th
# joined frame (k = 1, 2, ...) get the suffix "_k" wherever their names collide with
# columns already present; the first frame's columns keep their original names.
df_concat = reduce(join_dfs, dfs_applied)

In [9]:
# Standardise every feature column: subtract its mean, then divide by its standard
# deviation (pandas' `std`, i.e. the sample standard deviation).
# NOTE(review): the statistics are computed over all rows, including rows that later
# end up in validation folds and hold-out splits, so the scaling has seen the held-out
# data. A column with zero variance would also turn into NaN here.
df_normalized = df_concat.sub(df_concat.mean()).div(df_concat.std())
df_normalized.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50_5,51_5,52_5,53_5,54_5,55_5,56_5,57_5,58_5,59_5
0,-0.200222,0.118339,0.109668,0.078047,0.500283,0.188545,0.163598,0.053868,0.134007,0.046864,...,-0.089577,-0.565041,-0.187967,0.713228,0.220743,-0.083677,0.329732,0.003936,0.612799,0.208482
1,-0.018864,-0.006806,0.241084,0.134228,-0.59123,0.269561,0.091026,0.102379,0.087271,-0.0401,...,-0.115001,-0.323018,-0.148939,0.226491,-0.55619,-0.068544,-0.363142,-0.545593,-0.246371,-0.005476
2,-0.133886,0.571974,0.391102,-0.009741,-0.236433,0.055818,-0.037922,0.120091,-0.038804,-0.059586,...,-0.587535,0.651946,-0.763392,-0.392447,0.227768,0.621771,-0.689913,0.498617,-0.271392,-0.843243
3,0.096833,-0.150413,-0.156814,-0.004494,-0.013843,-0.007355,0.00622,-0.028851,0.041144,0.007749,...,-0.576664,1.026333,-0.703606,-0.650579,0.08067,1.423213,-0.790483,0.085851,-0.179552,-0.565844
4,0.061115,-0.07607,0.005231,0.051601,-0.038223,-0.000596,0.056631,-0.079848,0.033137,0.004414,...,0.668506,-0.579468,0.948937,0.218937,0.194155,-0.276274,-0.347828,-0.670813,0.079493,-0.481833


In [10]:
# Sanity check: number of labels, then (rows, columns) of the feature matrix; the label
# count should equal the row count.
print(len(y), df_normalized.shape, sep="\n")

2248
(2248, 360)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Linear SVM; C is the inverse regularisation strength, so 1e-3 regularises heavily.
# random_state pins the solver's internal shuffling so repeated runs give the same fit.
model = LinearSVC(max_iter=1000, C=1e-3, random_state=0)

# 5-fold cross-validated accuracy on the standardised summary features.
scores = cross_val_score(model, df_normalized, y, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Single 80/20 hold-out split to compare train and test error. The split is seeded:
# without random_state a different partition is drawn on every run, so the errors
# printed below (and anything computed later from y_test) could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    df_normalized, y, test_size=0.2, random_state=0
)
model.fit(X_train, y_train)
print("train error: %0.2f" % np.mean(model.predict(X_train) != y_train))
print("test error: %0.2f" % np.mean(model.predict(X_test) != y_test))

[0.56       0.54       0.56888889 0.54666667 0.55803571]
Accuracy: 0.55 (+/- 0.02)
train error: 0.37
test error: 0.45


In [12]:
# Share of class 0 in the current hold-out labels `y_test` — the reference point for
# judging the test error printed for that split.
np.mean(np.equal(y_test, 0))

0.5844444444444444

In [9]:
# NOTE: disabled experiment — grid search over SVC hyper-parameters (RBF and linear
# kernels). The whole cell is commented out and is not executed.
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV
# parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                     'C': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]},
#                    {'kernel': ['linear'], 'C': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}]
# svc = SVC()
# model = GridSearchCV(svc, parameters)
# model.fit(df_normalized, y)

# model.cv_results_

In [10]:
# model.best_score_

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. The forest is built with random bootstrap samples and
# random feature subsets, so random_state is fixed to make the scores reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=0)

# 5-fold cross-validated accuracy on the standardised summary features.
scores = cross_val_score(model, df_normalized, y, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.59555556 0.55555556 0.56666667 0.56888889 0.53571429]
Accuracy: 0.56 (+/- 0.04)


In [15]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out evaluation of whatever estimator `model` currently refers to (the
# RandomForestClassifier from the preceding cell when the notebook is run top to bottom).
# The split is seeded: without random_state a different partition is drawn on every
# run and the printed errors cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    df_normalized, y, test_size=0.2, random_state=0
)
model.fit(X_train, y_train)
print("train error: %0.2f" % np.mean(model.predict(X_train) != y_train))
print("test error: %0.2f" % np.mean(model.predict(X_test) != y_test))

train error: 0.00
test error: 0.44


In [16]:
from sklearn.svm import SVC

# Kernel SVM with scikit-learn's default kernel and the data-dependent "scale" gamma.
model = SVC(gamma="scale")

# 5-fold cross-validated accuracy on the standardised summary features; the spread is
# reported as two standard deviations of the fold scores.
scores = cross_val_score(estimator=model, X=df_normalized, y=y, cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[0.58       0.56       0.58       0.58       0.58258929]
Accuracy: 0.58 (+/- 0.02)


In [17]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out evaluation of whatever estimator `model` currently refers to (the SVC
# from the preceding cell when the notebook is run top to bottom).
# The split is seeded: without random_state a different partition is drawn on every
# run and the printed errors cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    df_normalized, y, test_size=0.2, random_state=0
)
model.fit(X_train, y_train)
print("train error: %0.2f" % np.mean(model.predict(X_train) != y_train))
print("test error: %0.2f" % np.mean(model.predict(X_test) != y_test))

train error: 0.39
test error: 0.42
