# HIML TakeHome3

In [30]:
import scipy.io as sio
import numpy as np
from os import getcwd
from os.path import join 
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.ar_model import AutoReg
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import neighbors

In [31]:
# Locate the .mat file relative to the current working directory and load it.
fname = join(getcwd(), 'data', 'data3SS2009.mat')
mat_contents = sio.loadmat(fname)

# 3-D array, unpacked below as (samples, channels, cases).
dataset = mat_contents['dataset']
N, Chno, Nc = dataset.shape  # N: samples, Chno: channels, Nc: cases

# One label per case, flattened to a 1-D vector of length Nc.
Y = np.reshape(mat_contents['labels'], Nc)

# Show the sizes as a small table (last expression is the cell output).
pd.DataFrame(np.array([N, Chno, Nc, len(Y)]))

Unnamed: 0,0
0,8192
1,5
2,850
3,850


# AutoRegression + PCA + scaling



In [32]:
# Feature extraction: autoregressive (AR) model parameters per channel and case.
AR_ORDER = 29  # number of lags fitted to each time series

# Read the sizes from the array itself instead of hard-coding 850 cases and
# 4 channels, so the cell keeps working if the dataset changes.
n_channels, n_cases = dataset.shape[1], dataset.shape[2]

# Channel 0 is skipped, exactly as in the original analysis.
# NOTE(review): presumably channel 0 is the excitation/input signal - confirm.
# Each row of X is one case: the fitted AR parameters of channel 1, then
# channel 2, ... concatenated in channel order. Building the rows in one pass
# avoids repeatedly re-allocating the matrix with np.append.
X = np.array([
    np.concatenate([
        AutoReg(dataset[:, ch, case], lags=AR_ORDER, old_names=False).fit().params
        for ch in range(1, n_channels)
    ])
    for case in range(n_cases)
])
assert X.shape[0] == n_cases, "expected one feature row per case"

# PCA with whitening; a float n_components asks scikit-learn to keep as many
# components as needed to explain that fraction (95%) of the variance.
# NOTE(review): PCA and the scaler are fitted on ALL cases, before the
# train/test split done in a later cell, so test-set statistics leak into the
# features. Consider fitting both on the training split only.
pca = PCA(random_state=0, whiten=True, n_components=.95)
X_pca = pca.fit_transform(X)

# Rescale every principal component to the range [-1, 1].
Xpca_scaled = MinMaxScaler(feature_range=(-1,1)).fit_transform(X_pca)

In [33]:
# Hold out 25% of the cases for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    Xpca_scaled,
    Y,
    test_size=0.25,
    random_state=42,
)

# Softmax classification

In [34]:
# Multinomial (softmax) logistic regression trained with the L-BFGS solver.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument - check the installed version before upgrading.
softmax_reg = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    max_iter=200,
)
softmax_reg.fit(X_train, Y_train)

# Mean accuracy on the held-out test set (displayed as the cell output).
softmax_reg.score(X_test, Y_test)

0.9107981220657277

Pelo classificador linear, a acurácia em relação ao test-set foi de aprox. 91%

# SVM classification

Varia-se o tipo de kernel entre rbf (Radial Basis Function), sigmoid e polinomial.


In [35]:
# Sweep the SVM kernel type and record the test-set accuracy of each one.
kernels = ['rbf', 'sigmoid', 'poly']
scores_SVM = []
for kernel in kernels:
    clf = svm.SVC(kernel=kernel, random_state=0)
    clf.fit(X_train, Y_train)
    accuracy = clf.score(X_test, Y_test)
    scores_SVM.append(accuracy)
print(scores_SVM)

[0.9248826291079812, 0.784037558685446, 0.9107981220657277]


Nota-se que, com o kernel rbf, a acurácia do classificador em relação ao test-set foi de 92.5%.

Para o caso polinomial, varia-se o grau da função de 2 a 5.

In [36]:
# With the polynomial kernel, sweep the degree of the polynomial.
degs = [2, 3, 4, 5]
scores_SVM_poly = []
for degree in degs:
    clf = svm.SVC(kernel='poly', degree=degree, random_state=0)
    clf.fit(X_train, Y_train)
    accuracy = clf.score(X_test, Y_test)
    scores_SVM_poly.append(accuracy)
print(scores_SVM_poly)

[0.9154929577464789, 0.9107981220657277, 0.9154929577464789, 0.9014084507042254]


Com grau 2 e 4, obtemos os melhores resultados, ~91.5%

Para o caso do rbf, varia-se o gamma, que determina a influência de um exemplo de treinamento.

In [38]:
# With the RBF kernel, sweep gamma (the kernel coefficient).
gammas = [0.1, 1, 10]
scores_SVM_gam = []
for gamma in gammas:
    clf = svm.SVC(kernel='rbf', gamma=gamma, random_state=0)
    clf.fit(X_train, Y_train)
    accuracy = clf.score(X_test, Y_test)
    scores_SVM_gam.append(accuracy)
print(scores_SVM_gam)

[0.8403755868544601, 0.92018779342723, 0.9295774647887324]


Com gamma = 10, obtemos uma acurácia de ~93%

# kNN classification

Explora-se o número de vizinhos do classificador kNN

In [25]:
# Sweep the number of neighbours (5 to 15 inclusive) of the kNN classifier.
Nneig = list(range(5, 16))
scores_kNN = []
for n_neighbors in Nneig:
    clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_train, Y_train)
    accuracy = clf.score(X_test, Y_test)
    scores_kNN.append(accuracy)
print(scores_kNN)

[0.9295774647887324, 0.9389671361502347, 0.9342723004694836, 0.92018779342723, 0.92018779342723, 0.9248826291079812, 0.9248826291079812, 0.9154929577464789, 0.9154929577464789, 0.9154929577464789, 0.9154929577464789]


Com 6 vizinhos, obtém-se a melhor acurácia, 93.9% (com 7 vizinhos, 93.4%).

A seguir, varia-se o parâmetro da métrica Minkowski. Quando p =1, usa-se a manhattan_distance, p = 2, euclidean_distance , e p qualquer, minkowski_distance.

In [29]:
# Sweep the power parameter p of the Minkowski metric, with 6 neighbours fixed.
ps = [1, 2, 3, 4]
scores_kNN_ps = []
for p in ps:
    clf = neighbors.KNeighborsClassifier(n_neighbors=6, p=p)
    clf.fit(X_train, Y_train)
    accuracy = clf.score(X_test, Y_test)
    scores_kNN_ps.append(accuracy)
print(scores_kNN_ps)

[0.9248826291079812, 0.9389671361502347, 0.92018779342723, 0.9154929577464789]


Para p = 2, obtém-se uma acurácia de 93.9% (a melhor entre todos os testes realizados).