In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from sklearn import preprocessing

In [3]:
filepath = '/Users/rabarry/Documents/Data Science Project/Flicker_SVM_cond.csv'
flick = pd.read_csv(filepath)

In [4]:
flick.describe()

Unnamed: 0,part,CB,AGEgrp,Cond,T5,O1,T6,O2
count,236.0,236.0,236.0,236.0,236.0,236.0,236.0,236.0
mean,216.364407,1.423729,0.398305,0.5,0.137462,0.260106,0.166236,0.280719
std,92.930417,0.495199,0.490589,0.501063,0.217415,0.331852,0.239162,0.330535
min,100.0,1.0,0.0,0.0,-0.293793,-0.602527,-0.308235,-0.28566
25%,136.0,1.0,0.0,0.0,-0.005333,0.040931,-0.00772,0.049511
50%,176.0,1.0,0.0,0.5,0.097704,0.173076,0.118794,0.17329
75%,322.0,2.0,1.0,1.0,0.230907,0.421064,0.313945,0.449729
max,351.0,2.0,1.0,1.0,1.069666,2.00684,1.037423,1.93158


# Now try classification methods

We'll split the data into training and test sets first.

In [5]:
from sklearn.model_selection import train_test_split

Split into babies and adults and run separately for the ages

In [10]:
Type1 = flick['AGEgrp']==1 #Adults
Type2 = flick['AGEgrp']==0 #Infants
Adult = flick[Type1]
Infant = flick[Type2]

In [16]:
Adult.shape , Infant.shape

((94, 11), (142, 11))

Specify which columns are the features

In [17]:
Adult_X = Adult[['T5', 'O1', 'O2', 'T6']]
Infant_X = Infant[['T5', 'O1', 'O2', 'T6']]

In [18]:
Adult_X.shape, Infant_X.shape

((94, 4), (142, 4))

In [19]:
Adult_X.head()

Unnamed: 0,T5,O1,O2,T6
71,-0.010387,-0.048322,0.157702,0.408852
72,0.048279,0.133431,0.151073,0.068142
73,0.094795,0.062732,0.128669,0.174961
74,0.240412,0.194844,0.166047,0.301236
75,0.114125,0.043617,0.047323,0.113662


Specify which column is your group ID

In [20]:
Adult_y = Adult['Cond']
Infant_y = Infant['Cond']

In [24]:
Infant_y.head()

0    1
1    1
2    1
3    1
4    1
Name: Cond, dtype: int64

In [25]:
#Get rid of the row nums
Infant_y = np.ravel(Infant_y)
Adult_y = np.ravel(Adult_y)

In [26]:
Cond = ['face', 'object']

In [27]:
feature_names = ['T5', 'O1', 'O2', 'T6']

In [125]:
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(Adult_X, Adult_y, test_size=0.20, random_state=0)
X_train_I, X_test_I, y_train_I, y_test_I = train_test_split(Infant_X, Infant_y, test_size=0.20, random_state=0)

In [126]:
X_train_I.shape, y_train_I.shape

((113, 4), (113,))

Standardize the X data

In [127]:
scaler_A = preprocessing.StandardScaler().fit(X_train_A) 
scaler_I = preprocessing.StandardScaler().fit(X_train_I) 

In [128]:
scaler_A.mean_ , scaler_A.scale_ , scaler_I.mean_ , scaler_I.scale_                                       

(array([0.23892847, 0.31969993, 0.35133382, 0.2994626 ]),
 array([0.24946736, 0.36134885, 0.38877099, 0.23224176]),
 array([0.07384875, 0.21470711, 0.23915866, 0.08734896]),
 array([0.173304  , 0.30144057, 0.29452711, 0.19914373]))

In [129]:
X_train_scaled_A = scaler_A.transform(X_train_A) 
X_train_scaled_I = scaler_I.transform(X_train_I) 

In [130]:
X_test_scaled_A = scaler_A.transform(X_test_A)
X_test_scaled_I = scaler_I.transform(X_test_I)

# try K nearest neighbors

In [131]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model_A = model.fit(X_train_scaled_A, y_train_A)
model_I = model.fit(X_train_scaled_I, y_train_I)

In [132]:
# check the accuracy on the training set
model_A.score(X_train_scaled_A, y_train_A), model_I.score(X_train_scaled_I, y_train_I)

(0.38666666666666666, 0.6460176991150443)

In [133]:
# predict class labels for the test set
predicted_A = model_A.predict(X_test_scaled_A)
predicted_I = model_I.predict(X_test_scaled_I)
print (predicted_A, predicted_I)

[0 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 0 1] [1 1 0 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 1 0]


In [134]:
print(y_test_A, y_test_I)

[1 1 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 1 0] [1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 1 1 1 1 0 0 1]


In [135]:
# generate evaluation metrics
from sklearn import metrics
print (metrics.accuracy_score(y_test_A, predicted_A), metrics.accuracy_score(y_test_I, predicted_I))

0.3684210526315789 0.5172413793103449


In [136]:
#print (metrics.confusion_matrix(y_test_A, predicted_A))
print (metrics.confusion_matrix(y_test_I, predicted_I))

[[8 6]
 [8 7]]


# Try k folds cross-validation

In [137]:
#K folds cross-validation
from sklearn import model_selection
model = KNeighborsClassifier()
kfold = model_selection.KFold(n_splits=6, random_state=12323, shuffle=True) # note shuffle is true so that samples are randomly assigned to the folds.

In [138]:
Adult_X_scaled = scaler_A.transform(Adult_X)
Infant_X_scaled = scaler_I.transform(Infant_X)

In [139]:
results_A = model_selection.cross_val_score(model, Adult_X_scaled, Adult_y, cv=kfold)
results_I = model_selection.cross_val_score(model, Infant_X_scaled, Infant_y, cv=kfold)
results_A , results_I

(array([0.625     , 0.625     , 0.8125    , 0.625     , 0.6       ,
        0.73333333]),
 array([0.5       , 0.54166667, 0.58333333, 0.41666667, 0.39130435,
        0.47826087]))

In [140]:
print("Adult Accuracy: %.3f%% (%.3f%%)" % (results_A.mean()*100.0, results_A.std()*100.0) )
print("Infant Accuracy: %.3f%% (%.3f%%)" % (results_I.mean()*100.0, results_I.std()*100.0) )

Adult Accuracy: 67.014% (7.669%)
Infant Accuracy: 48.521% (6.658%)


# Try leave one out cross validation

In [141]:
#leave one out cross validation (LOOCV)
model = KNeighborsClassifier()
loocv = model_selection.LeaveOneOut()
results_A = model_selection.cross_val_score(model, Adult_X_scaled, Adult_y, cv=loocv)
results_I = model_selection.cross_val_score(model, Infant_X_scaled, Infant_y, cv=loocv)
print("Adult Accuracy: %.3f%% (%.3f%%)" % (results_A.mean()*100.0, results_A.std()*100.0))
print("Infant Accuracy: %.3f%% (%.3f%%)" % (results_I.mean()*100.0, results_I.std()*100.0))

Adult Accuracy: 62.766% (48.343%)
Infant Accuracy: 47.887% (49.955%)


# Try SVM
Start with a linear kernel

In [160]:
#linear SVM 'liner', polynomial = 'poly', 'rbf', 'sigmoid'
from sklearn.svm import SVC
clf_A = SVC(kernel = 'sigmoid')
clf_I = SVC(kernel = 'sigmoid')
clf_A.fit(X_train_scaled_A, y_train_A) 
clf_I.fit(X_train_scaled_I, y_train_I) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [161]:
y_pred_A = clf_A.predict(X_test_scaled_A)
y_pred_I = clf_I.predict(X_test_scaled_I)

In [162]:
print(y_pred_A, y_pred_I)

[1 0 1 1 1 0 1 1 1 0 1 1 1 0 0 0 0 1 1] [1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 1 1 0 1 0 1 0 0]


In [163]:
print(y_test_A, y_test_I)

[1 1 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 1 0] [1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 1 1 1 1 0 0 1]


In [164]:
#evaluate the SVM
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test_A, y_pred_A))
print(classification_report(y_test_A, y_pred_A))

[[5 6]
 [2 6]]
              precision    recall  f1-score   support

           0       0.71      0.45      0.56        11
           1       0.50      0.75      0.60         8

    accuracy                           0.58        19
   macro avg       0.61      0.60      0.58        19
weighted avg       0.62      0.58      0.57        19



In [165]:
print(confusion_matrix(y_test_I, y_pred_I))
print(classification_report(y_test_I, y_pred_I))

[[ 6  8]
 [10  5]]
              precision    recall  f1-score   support

           0       0.38      0.43      0.40        14
           1       0.38      0.33      0.36        15

    accuracy                           0.38        29
   macro avg       0.38      0.38      0.38        29
weighted avg       0.38      0.38      0.38        29

