In [1]:
import setup_jwlab
import pandas as pd
import numpy as np

from jwlab.ml_prep import load_ml_df, y_to_binary
from sklearn.model_selection import cross_val_score

In [2]:
# Load the pickled ML dataframe through the project helper `load_ml_df`.
# NOTE(review): the path starts with "~"; presumably the helper expands it - confirm.
df = load_ml_df("~/projects/def-jwerker/kjslakov/data/ml_df_sktime.pkl")

In [3]:
# Build the target vector from the `label` column, then rebind `df` to a copy
# that holds only the remaining (feature) columns.
# NOTE: re-running this cell on its own fails, because `label` is gone after
# the first run; re-run the loading cell first.
y = y_to_binary(
    df["label"].values.flatten()
)
df = df.drop(columns="label")

In [4]:
# Class-balance check: fraction of targets equal to 0. A value near 0.5 means
# the two classes are about equally frequent, so chance accuracy is about 50%.
np.mean(y==0)

0.4992343032159265

In [5]:
from scipy.stats import entropy, kurtosis, skew
# Summary statistics computed independently on every cell of `df`; each one
# yields a frame with the same index/columns as `df`.
# NOTE(review): this assumes every cell holds a sequence of numbers (e.g. one
# time series per cell) - confirm against load_ml_df.
funcs = [np.mean, np.min, np.max, np.var, skew, kurtosis]

# `DataFrame.applymap` was renamed to `DataFrame.map` in pandas 2.1 and the old
# name is deprecated there. Look the method up on the class (not the instance)
# so a column that happens to be called "map" cannot shadow the check, and
# fall back to `applymap` on older pandas.
_elementwise = "map" if hasattr(pd.DataFrame, "map") else "applymap"
dfs_applied = [getattr(df, _elementwise)(f) for f in funcs]

In [6]:
from functools import reduce

# Running counter used by join_dfs to tag the columns of each joined frame.
suffix = 0


def join_dfs(a, b):
    """Join `b` onto `a` by index and return the combined frame.

    Every call bumps the module-level `suffix` counter by one. Columns of `b`
    whose names already exist in `a` are renamed with the trailing tag
    ``_<suffix>``; the columns of `a` keep their names.
    """
    global suffix
    suffix += 1
    right_tag = "_{:d}".format(suffix)
    return a.join(b, rsuffix=right_tag)

# Fold the per-statistic frames into one wide feature table: the first frame
# keeps its column labels, each later frame gets the next "_<n>" suffix from
# join_dfs.
df_concat = reduce(join_dfs, dfs_applied)

# The suffixed labels are strings, while the first frame keeps whatever label
# type it started with (e.g. integers). Newer scikit-learn releases refuse
# DataFrame input whose column labels mix string and non-string types, so make
# every label a string. This is a no-op when they already are.
df_concat.columns = df_concat.columns.astype(str)

In [7]:
# Z-score every feature column: subtract the column mean and divide by the
# column standard deviation (pandas' default sample std).
# NOTE(review): the statistics are computed over all rows, including rows that
# are later held out by cross-validation; a column with zero spread would turn
# into NaN here.
df_normalized = (
    df_concat
    .sub(df_concat.mean(), axis="columns")
    .div(df_concat.std(), axis="columns")
)
df_normalized.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50_5,51_5,52_5,53_5,54_5,55_5,56_5,57_5,58_5,59_5
0,0.18791,-0.0296,0.869662,1.717514,-0.162854,1.066368,1.434642,0.355487,1.334462,0.273382,...,0.085211,0.890953,-0.089563,-1.027253,0.638892,2.414758,0.714436,1.789928,-0.346868,0.777242
1,-0.900831,-0.332462,-0.002406,0.214909,-1.067826,-0.258041,0.413386,-0.671108,0.272186,-1.122412,...,-0.193416,0.278766,-0.035308,-0.036481,0.287076,-0.925892,-0.851055,-0.258952,0.05296,-1.020129
2,0.460155,-0.950344,-1.348063,-1.486009,-0.636273,-0.900603,-1.872286,0.204289,-1.651713,0.01235,...,-0.863809,-1.073993,0.402742,-1.517739,-0.073876,-0.756683,1.981996,-0.810542,-0.222696,0.973361
3,0.766241,-0.277888,0.08196,0.93985,0.46948,0.720015,1.029198,0.207837,0.509955,0.618532,...,0.513532,0.079215,0.577426,0.666278,-0.675101,-1.202393,-1.29577,-1.157052,-0.379513,-0.754641
4,-0.57054,0.168296,0.932536,0.950528,-0.759816,0.565641,0.681278,0.255447,0.960692,-0.259308,...,0.241122,-0.669293,-0.062295,-0.868215,-0.01087,0.16581,-0.428153,1.55487,-0.057441,-0.220786


In [10]:
from sklearn.svm import LinearSVC
# Baseline: linear SVM scored with 5-fold cross-validation on the z-scored
# features. LinearSVC's solver draws random numbers, so the seed is pinned to
# make the reported scores reproducible from one run to the next.
model = LinearSVC(max_iter=10000, random_state=0)

scores = cross_val_score(model, df_normalized, y, cv=5)
print(scores)
# Report the mean fold accuracy with a +/- two-standard-deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.5530303  0.45038168 0.44615385 0.52307692 0.52307692]
Accuracy: 0.50 (+/- 0.09)


In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Regularisation strengths shared by both kernel families (1e-6 ... 1e3).
c_grid = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]

# Two sub-grids: RBF kernels (gamma x C) and linear kernels (C only).
parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': c_grid},
    {'kernel': ['linear'], 'C': c_grid},
]

# Exhaustive search with GridSearchCV's default cross-validation and scoring.
svc = SVC()
model = GridSearchCV(svc, parameters)
model.fit(df_normalized, y)

model.cv_results_



{'mean_fit_time': array([0.11929933, 0.09420101, 0.09144155, 0.09134722, 0.09143051,
        0.0913465 , 0.0915726 , 0.09141437, 0.09163316, 0.09148709,
        0.09158587, 0.09148153, 0.09036692, 0.09150974, 0.09322222,
        0.08893108, 0.09939321, 0.08581781, 0.0993909 , 0.12149779,
        0.08381414, 0.08380095, 0.08381915, 0.0828774 , 0.07942168,
        0.10366424, 0.36618002, 0.34570511, 0.34692152, 0.36155677]),
 'std_fit_time': array([0.00828089, 0.00389099, 0.0002808 , 0.00027937, 0.0002884 ,
        0.0002802 , 0.00018641, 0.00023365, 0.00017324, 0.0001888 ,
        0.00018912, 0.0001743 , 0.00082584, 0.0001885 , 0.00050926,
        0.00030564, 0.0001821 , 0.00162275, 0.0001741 , 0.00187048,
        0.00016051, 0.0001745 , 0.00016943, 0.00029843, 0.00184345,
        0.00646905, 0.03511097, 0.03608276, 0.0365813 , 0.04614094]),
 'mean_score_time': array([0.05417212, 0.0435497 , 0.04370952, 0.04356488, 0.04370761,
        0.04356631, 0.04372056, 0.04355812, 0.04391233, 0.04

In [17]:
# Mean cross-validated score of the best parameter combination found by the
# grid search. In the recorded output it is about 0.52, i.e. close to the
# roughly 50% class prior shown by the class-balance check earlier.
model.best_score_

0.5237366003062787