# Training Model: SVM

## 1. Importing libraries ...

In [1]:
import pandas as pd
import numpy as np
import setup_jwlab
from jwlab.constants import cleaned_data_filepath
from jwlab.ml_prep import prep_ml, prep_ml_first20
from jwlab.ml_prep_multigroup import prep_ml_multigroup
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

## 2. Importing Participants..

In [2]:


participants = ["904", "905", "906", "909", "910", "912", "908", "913", "914", "916", "917", "919", "920", "921", "923", "924","927", "928", "929", "930", "932"]

#9m with >40 trials
#participants = [ "909", "912", "908", "913", "914", "916", "917", "919", "920", "921", "924","927", "930"]

#12m all
#participants = ["105", "107", "109", "111", "112", "115", "116", "117", "119", "121", "122", "120", "124"]

#12m with >40 trials
#participants = ["109", "111", "112", "115", "124"]

#all participants
#participants = [ "909", "910", "912", "908", "913", "914", "916", "917", "919", "920", "921", "923", "924","927", "928", "929", "930", "932",
#               "109", "111", "112", "115", "116", "117", "119", "121", "122", "120", "124"]



## 3. Set up averaging

In [3]:
X, y, good_trial_count = prep_ml(cleaned_data_filepath, participants, downsample_num=1000, averaging="no_averaging")
#Xt, yt, good_trial_count_t = prep_ml(cleaned_data_filepath, participants, downsample_num=1000, averaging="average_trials")
#Xa, ya, good_trial_count_a = prep_ml(cleaned_data_filepath, participants, downsample_num=1000, averaging="average_trials_and_participants")


loaded


In [4]:
good_trial_count

[array([0.45]), [cell
  1    15
  2    13
  3    12
  4    13
  dtype: int64]]

## 3. b) Different groups

In [None]:
X, y, good_trial_count = prep_ml_multigroup(cleaned_data_filepath, participants, downsample_num=1000, averaging="no_averaging")
Xt, yt, good_trial_count_t = prep_ml_multigroup(cleaned_data_filepath, participants, downsample_num=1000, averaging="average_trials")
Xa, ya, good_trial_count_a = prep_ml_multigroup(cleaned_data_filepath, participants, downsample_num=1000, averaging="average_trials_and_participants")


In [None]:
#good_trial_count

In [None]:
(n, d) = X[0][0].shape
assert n == y[0][0].shape[0]


In [None]:
d

In [None]:
unique, counts = np.unique(y[0][0], return_counts=True)
dict(zip(unique, counts))

In [None]:
(n, d) = Xt[0][0].shape
assert n == yt[0][0].shape[0]


n

In [None]:
d

In [None]:
unique, counts = np.unique(yt[0][0], return_counts=True)
dict(zip(unique, counts))

In [None]:
(n, d) = Xa[0][0].shape
assert n == ya[0][0].shape[0]
n

In [None]:
d

In [None]:
unique, counts = np.unique(ya[0][0], return_counts=True)
dict(zip(unique, counts))

## 3. c) First 20 trials


In [None]:
X, y, good_trial_count = prep_ml_first20(cleaned_data_filepath, participants, downsample_num=1000, averaging="no_averaging")
Xt, yt, good_trial_count_t = prep_ml_first20(cleaned_data_filepath, participants, downsample_num=1000, averaging="average_trials")
Xa, ya, good_trial_count_a = prep_ml_first20(cleaned_data_filepath, participants, downsample_num=1000, averaging="average_trials_and_participants")



In [None]:
(n, d) = X[0][0].shape
assert n == y[0][0].shape[0]
n

In [None]:
d

## 4. Setting up the SVM model ...

In [None]:
model = LinearSVC(C=1e-9, max_iter=5000)
#model = SVC(gamma=.001, kernel = 'rbf', C = 1e-6)

## 5. Training and testing the model ...

### 5.1. Train on raw, test on raw (validation)

In [None]:
# from sklearn import preprocessing

# scaler = preprocessing.StandardScaler().fit(X)
# X,Xp = scaler.transform(X), scaler.transform(Xp)
model.fit(X[0][0], y[0][0])
np.mean(model.predict(X[0][0]) != y[0][0])

### 5.2. Train on raw, test on avg by trial (word repetition) 

In [None]:
# from sklearn import preprocessing

# scaler = preprocessing.StandardScaler().fit(X)
# X,Xp = scaler.transform(X), scaler.transform(Xp)
model.fit(X[0][0], y[0][0])
np.mean(model.predict(Xt[0][0]) != yt[0][0])

### 5.3. Train on raw, test on avg by word and ps 

In [None]:
# from sklearn import preprocessing

# scaler = preprocessing.StandardScaler().fit(X)
# X,Xp = scaler.transform(X), scaler.transform(Xp)
model.fit(X[0][0], y[0][0])
np.mean(model.predict(Xa[0][0]) != ya[0][0])

# 6 Subset analysis 

## 6.1 Generating random subsets of the chosen participant list

In [None]:
participants_train, participants_test = train_test_split(participants,test_size=0.2)
print(len(participants_train), len(participants_test))

### 6.2.1 Create train and test sets: animates/inanimates

In [None]:
#X_train, y_train, good_trial_count_train = prep_ml(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="no_averaging")
#X_test, y_test, good_trial_count_test = prep_ml(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="no_averaging")

X_train, y_train, good_trial_count_train = prep_ml(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials")
X_test, y_test, good_trial_count_test = prep_ml(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials")

#X_train, y_train, good_trial_count_train = prep_ml(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials_and_participants")
#X_test, y_test, good_trial_count_test = prep_ml(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials_and_participants")

### 6.2.2 Create train and test sets - multi group

In [None]:
#X_train, y_train, good_trial_count_train = prep_ml_multigroup(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="no_averaging")
#X_test, y_test, good_trial_count_test = prep_ml_multigroup(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="no_averaging")

#X_train, y_train, good_trial_count_train = prep_ml_multigroup(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials")
#X_test, y_test, good_trial_count_test = prep_ml_multigroup(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials")

X_train, y_train, good_trial_count_train = prep_ml_multigroup(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials_and_participants")
X_test, y_test, good_trial_count_test = prep_ml_multigroup(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials_and_participants")

### 6.2.3 Create train and test sets - first 20 trials

In [None]:
#X_train, y_train, good_trial_count_train = prep_ml_first20(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="no_averaging")
#X_test, y_test, good_trial_count_test = prep_ml_first20(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="no_averaging")

#X_train, y_train, good_trial_count_train = prep_ml_first20(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials")
#X_test, y_test, good_trial_count_test = prep_ml_first20(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials")

X_train, y_train, good_trial_count_train = prep_ml_first20(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials_and_participants")
X_test, y_test, good_trial_count_test = prep_ml_first20(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials_and_participants")

In [None]:
(n, d) = X_train[0][0].shape
assert n == y_test[0][0].shape[0]
n

In [None]:
X_train[0][0].shape

### 6.2.4 Classification

In [None]:
#model = SVC(gamma=.001, kernel = 'rbf', C=1e-6)
model = SVC(kernel = 'rbf')
model.fit(X_train[0][0], y_train[0][0])
np.mean(model.predict(X_test[0][0]) != y_test[0][0])

## 6.3 Monte Carlo Classification

In [None]:

errorScores = []

# r iterations of a 5 fold
for r in range(2):
    kgp = np.array_split(participants, 5)

    for i in range(5):
        participants_test = kgp[i]
        participants_train = np.concatenate((kgp[(i+1)%5], kgp[(i+2)%5],kgp[(i+3)%5],kgp[(i+4)%5] ), axis=0)
        
        #X_train, y_train, good_trial_count_train = prep_ml(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="no_averaging")
        #X_test, y_test, good_trial_count_test = prep_ml(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="no_averaging")

        X_train, y_train, good_trial_count_train = prep_ml(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials")
        X_test, y_test, good_trial_count_test = prep_ml(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials")


        #X_train, y_train, good_trial_count_train = prep_ml(cleaned_data_filepath, participants_train, downsample_num=1000, averaging="average_trials_and_participants")
        #X_test, y_test, good_trial_count_test = prep_ml(cleaned_data_filepath, participants_test, downsample_num=1000, averaging="average_trials_and_participants")

    
        #model = SVC(gamma=.001, kernel = 'rbf', C=1e-06)
        model = LinearSVC(C=1e-9, max_iter=5000)
        model.fit(X_train[0][0], y_train[0][0])
        errorScore = np.mean(model.predict(X_test[0][0]) != y_test[0][0])
        errorScores.append(errorScore)
    
errorScores



errorScores

In [None]:
np.mean(errorScores)

In [None]:
np.std(errorScores)

## 6.4 Alternate accuracy measurements

In [None]:
from sklearn.model_selection import train_test_split

y_pred = model.predict(X_test[0][0])

from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
print('Accuracy Score : ' + str(accuracy_score(y_test[0][0],y_pred)))
print('Precision Score : ' + str(precision_score(y_test[0][0],y_pred)))
print('Recall Score : ' + str(recall_score(y_test[0][0],y_pred)))
print('F1 Score : ' + str(f1_score(y_test[0][0],y_pred)))

from sklearn.metrics import confusion_matrix
print('Confusion Matrix : \n' + str(confusion_matrix(y_test[0][0],y_pred)))

### 7 Optimization

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
parameters = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}
svc = SVC()
model = GridSearchCV(svc, parameters, verbose=True)
model.fit(X_train[0][0], y_train[0][0])

model.cv_results_

In [None]:
model.best_score_

In [None]:
model.best_estimator_

# Cross Validation (5 fold)

In [4]:

model = LinearSVC(max_iter=5000, C=1e-9)
#model = SVC(gamma=.001, kernel = 'rbf', C = 1e-6)


In [None]:
scores = cross_val_score(model, X[0][0], y[0][0], cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [6]:
scores = cross_val_score(model, Xt[0][0], yt[0][0], cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


[0.53571429 0.49107143 0.44642857]
Accuracy: 0.49 (+/- 0.07)


In [None]:
scores = cross_val_score(model, Xa[0][0], ya[0][0], cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Repeated N-Folds

In [4]:
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

testScores = []

X = X[0][0]
y = y[0][0]

# X = Xt[0][0]
# y = yt[0][0]

rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2652124)
for train_index, test_index in rkf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = LinearSVC(C=1e-9, max_iter=5000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    testScore = accuracy_score(y_test,y_pred)
    testScores.append(testScore)


testScores

ValueError: Found input variables with inconsistent numbers of samples: [1042, 833]

In [5]:
print(X.shape)

(1042, 60000)


In [6]:
print(X_train.shape)

(1042, 833)


In [6]:
print(np.mean(testScores))
print(np.std(testScores))

0.4537971905179982
0.04343181289395874


In [7]:
X.shape

(336, 60000)

In [8]:
X_train.shape

(269, 60000)