# Training Model: SVM

## 1. Importing libraries ...

In [1]:
import pandas as pd
import numpy as np
import setup_jwlab
from jwlab.constants import cleaned_data_filepath
from jwlab.ml_prep import prep_ml
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

## 2. Importing Participants..

In [2]:

participants = ["904", "905", "906", "909", "910", "912", "908", "913", "914", "916", "917", "919", "920", "921", "923", "924","927", "928", "929", "930", "932"]
#participants = [ "909", "910", "912", "908", "913", "914", "916", "917", "919", "920", "921", "923", "924","927", "928", "929", "930", "932"]
#9m with >40 trials
#participants = [ "909", "912", "908", "913", "914", "916", "917", "919", "920", "921", "924","927", "930"]

#12m all
#participants = ["105", "109", "111", "112", "115", "116", "117", "119", "121", "122", "120", "124"]
#12m with >40 trials
#participants = ["109", "111", "112", "115", "124"]

#participants = [ "909", "910", "912", "908", "913", "914", "916", "917", "919", "920", "921", "923", "924","927", "928", "929", "930", "932",
#               "109", "111", "112", "115", "116", "117", "119", "121", "122", "120", "124"]



## 3. Set up averaging

In [3]:
X, y, good_trial_count = prep_ml(cleaned_data_filepath, participants, downsample_num=1000, averaging="no_averaging")
Xt, yt, good_trial_count_t = prep_ml(cleaned_data_filepath, participants, downsample_num=1000, averaging="average_trials")
Xa, ya, good_trial_count_a = prep_ml(cleaned_data_filepath, participants, downsample_num=1000, averaging="average_trials_and_participants")


loaded
loaded
loaded


In [4]:
(n, d) = X[0][0].shape
assert n == y[0][0].shape[0]


In [5]:
unique, counts = np.unique(y[0][0], return_counts=True)
dict(zip(unique, counts))

{0.0: 443, 1.0: 455}

In [6]:
(n, d) = Xt[0][0].shape
assert n == yt[0][0].shape[0]


n

288

In [7]:
d

60000

In [8]:
unique, counts = np.unique(yt[0][0], return_counts=True)
dict(zip(unique, counts))

{0.0: 144, 1.0: 144}

In [9]:
(n, d) = Xa[0][0].shape
assert n == ya[0][0].shape[0]
n

16

In [10]:
d

60000

In [11]:
unique, counts = np.unique(ya[0][0], return_counts=True)
dict(zip(unique, counts))

{0.0: 8, 1.0: 8}

## 4. Setting up the SVM model ...

In [4]:
model = LinearSVC(C=1e-9, max_iter=5000)
#model = SVC(gamma=.001, kernel = 'rbf', C = 1e-6)

## 5. Training and testing the model ...

### 5.1. Train on raw, test on raw (validation)

In [5]:
# from sklearn import preprocessing

# scaler = preprocessing.StandardScaler().fit(X)
# X,Xp = scaler.transform(X), scaler.transform(Xp)
model.fit(X[0][0], y[0][0])
np.mean(model.predict(X[0][0]) != y[0][0])

0.34210526315789475

### 5.2. Train on raw, test on avg by trial (word repetition) 

In [6]:
# from sklearn import preprocessing

# scaler = preprocessing.StandardScaler().fit(X)
# X,Xp = scaler.transform(X), scaler.transform(Xp)
model.fit(X[0][0], y[0][0])
np.mean(model.predict(Xt[0][0]) != yt[0][0])

0.5

### 5.3. Train on raw, test on avg by word and ps 

In [7]:
# from sklearn import preprocessing

# scaler = preprocessing.StandardScaler().fit(X)
# X,Xp = scaler.transform(X), scaler.transform(Xp)
model.fit(X[0][0], y[0][0])
np.mean(model.predict(Xa[0][0]) != ya[0][0])

0.5

# 6 Subset analysis 

## 6.1 Generating random subsets of the chosen participant list

In [3]:
participants_train, participants_test = train_test_split(participants,test_size=0.2)
print(len(participants_train), len(participants_test))

16 5


### 6.2 Create train and test sets

In [4]:
#X_train, y_train, good_trial_count_train = prep_ml(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="no_averaging")
#X_test, y_test, good_trial_count_test = prep_ml(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="no_averaging")

#X_train, y_train, good_trial_count_train = prep_ml(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials")
#X_test, y_test, good_trial_count_test = prep_ml(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials")

X_train, y_train, good_trial_count_train = prep_ml(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials_and_participants")
X_test, y_test, good_trial_count_test = prep_ml(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials_and_participants")

loaded
loaded


## 6.3 Radial Classification

In [5]:
model = SVC(gamma=.001, kernel = 'rbf', C = 100)

model.fit(X_train[0][0], y_train[0][0])
np.mean(model.predict(X_test[0][0]) != y_test[0][0])

0.375

## 6.4 Alternate accuracy measurements

In [6]:
from sklearn.model_selection import train_test_split

y_pred = model.predict(X_test[0][0])

from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
print('Accuracy Score : ' + str(accuracy_score(y_test[0][0],y_pred)))
print('Precision Score : ' + str(precision_score(y_test[0][0],y_pred)))
print('Recall Score : ' + str(recall_score(y_test[0][0],y_pred)))
print('F1 Score : ' + str(f1_score(y_test[0][0],y_pred)))

from sklearn.metrics import confusion_matrix
print('Confusion Matrix : \n' + str(confusion_matrix(y_test[0][0],y_pred)))

Accuracy Score : 0.625
Precision Score : 0.6666666666666666
Recall Score : 0.5
F1 Score : 0.5714285714285715
Confusion Matrix : 
[[6 2]
 [4 4]]


### 7 Optimization

In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
parameters = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}
svc = SVC()
model = GridSearchCV(svc, parameters, verbose=True)
model.fit(X_train[0][0], y_train[0][0])

model.cv_results_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    1.0s finished


{'mean_fit_time': array([0.01533707, 0.01125042, 0.01127108, 0.01142406, 0.01188405,
        0.01396902, 0.01315212, 0.01238441, 0.01135008, 0.01142303,
        0.01096161, 0.01115004, 0.01114011, 0.01089986, 0.00914105,
        0.01125018, 0.00860969, 0.01042938, 0.00923697, 0.01035364]),
 'std_fit_time': array([0.00349347, 0.00118876, 0.00177156, 0.00149933, 0.00156353,
        0.00181708, 0.00165377, 0.00121899, 0.00165258, 0.00172106,
        0.0015802 , 0.00156902, 0.00127248, 0.00148752, 0.00043475,
        0.00183623, 0.00085888, 0.00061779, 0.00085441, 0.00061386]),
 'mean_score_time': array([0.00413736, 0.00404056, 0.00403063, 0.00452662, 0.00621271,
        0.00438857, 0.00462866, 0.0044059 , 0.0044593 , 0.00399057,
        0.00404533, 0.00432277, 0.00398231, 0.00401743, 0.0033133 ,
        0.00418552, 0.00271765, 0.00340962, 0.00280865, 0.00311271]),
 'std_score_time': array([0.00038008, 0.00044562, 0.0004793 , 0.00010416, 0.00167595,
        0.00062238, 0.00057281, 0.000315

In [6]:
model.best_score_

0.75

In [7]:
model.best_estimator_

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)