In [1]:
import numpy as np
from scipy import stats
import scipy.io as sio
from os import getcwd
from os.path import join 
import pandas as pd
from scipy.stats.stats import mode
from sklearn.model_selection import (train_test_split, RepeatedKFold, RandomizedSearchCV)
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from statsmodels.tsa.ar_model import AutoReg
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression 
from sklearn import svm
from sklearn import neighbors

In [2]:
fname = join(getcwd(),'data','data3SS2009.mat')

mat_contents = sio.loadmat(fname)

dataset = mat_contents['dataset']

N, Chno, Nc = dataset.shape
# N: number of samples
# Chno: number of channels
# Nc: number of cases

Y = mat_contents['labels'].reshape(Nc)

# print sizes 
pd.DataFrame(np.array([N, Chno, Nc,len(Y)]))

Unnamed: 0,0
0,8192
1,5
2,850
3,850


In [3]:
# AutoRegression
X_ = np.empty((0, 850), float)
for i in range(4):
    chdata = dataset[:,i+1,:]
    ch_corr = []
    for j in range(np.shape(chdata)[1]):
        res = AutoReg(chdata[:,j],lags=29,old_names=False).fit()
        ch_corr.append(res.params)
    
    X_ = np.append(X_, np.array(ch_corr).transpose() , axis=0)
X = np.transpose(X_)

# np.shape(X)

# pca
pca = PCA(random_state=0, whiten=True, n_components=.95)
X_pca = pca.fit_transform(X)
Xpca_scaled = MinMaxScaler(feature_range=(-1,1)).fit_transform(X_pca)

X_train, X_test, Y_train, Y_test = train_test_split(Xpca_scaled,Y,test_size=0.4,random_state=42)  

# Models and hyperparameters dictionaries

In [4]:
models = []
models.append(("LREG",LogisticRegression()))
models.append(("SVC",svm.SVC()))
models.append(("KNN",neighbors.KNeighborsClassifier()))

param_grid = []

param_grid.append(
    [
        {
            "LREG__C": stats.loguniform(0.1, 100),
            "LREG__solver": ["auto","lbfgs", "liblinear","saga"],
            "LREG__multi_class": ["auto","ovr", "multinomial"],
        }
    ]
)
param_grid.append(
    [
        {
            "SVC__C": stats.loguniform(0.1, 1000),
            "SVC__kernel": ["linear", "poly","rbf","sigmoid"],
            "SVC__degree": stats.randint(2,5),
            "SVC__gamma": stats.loguniform(0.0001,1),
        }
    ]
)
param_grid.append(
    [
        {
            "KNN__n_neighbors": stats.randint(2,100),
            "KNN__weights": ["uniform", "distance"],
        }
    ]
)


# Hyperparameter search

Realizando a busca aleatória com validação cruzada, guardando os resultados de yh para os classificadores padrão e para os classificadores 'otimizados'


In [7]:
rkf = RepeatedKFold(n_splits=5,n_repeats=50,random_state=42)
scores = np.zeros((np.shape(models)[0],1))
scores_std = np.zeros((np.shape(models)[0],1))

Res_cols = ['y','yh_LREG','yh_LREG_Opt','yh_SVC','yh_SVC_Opt','yh_KNN','yh_KNN_Opt']
Results =  pd.DataFrame([], columns=Res_cols)
Results['y'] = Y_test

BestParams = []

j = 1
for i in range(len(models)):
    clf=Pipeline([models[i]])
    pgrid = param_grid[i]

    randSearch = RandomizedSearchCV(clf, verbose=0,scoring="accuracy",param_distributions=pgrid,n_iter=100,n_jobs=2,cv=rkf,refit=True)
    
    randSearch.fit(X_train,Y_train)

    clf.fit(X_train,Y_train)
    
    scores[i] = randSearch.score(X_test,Y_test)
    yh = randSearch.predict(X_test)

    scores_std[i] = clf.score(X_test,Y_test)
    yh_std = clf.predict(X_test)

    Results[Res_cols[j]] = yh_std
    Results[Res_cols[j+1]] = yh
    
    j = j+2

    BestParams.append(randSearch.best_params_.copy())



    




        nan        nan        nan        nan        nan        nan
 0.8254902  0.86447059        nan 0.90921569        nan 0.87882353
 0.90717647 0.73392157 0.76352941 0.86321569 0.90690196 0.68345098
        nan        nan 0.86556863 0.90666667 0.73960784 0.85992157
 0.75505882 0.78929412 0.80203922        nan 0.89047059        nan
        nan        nan 0.67588235        nan 0.70113725 0.87729412
 0.8787451  0.90098039        nan 0.76839216        nan 0.84564706
 0.87372549 0.87960784        nan 0.79541176 0.86537255 0.90909804
 0.89592157 0.67047059 0.84133333        nan 0.8607451  0.88309804
        nan 0.87898039 0.89913725 0.86054902        nan 0.87811765
 0.78207843 0.90098039        nan 0.82439216 0.87882353 0.8825098
        nan 0.87556863 0.75631373        nan 0.7472549         nan
 0.86733333 0.87592157 0.88101961        nan 0.87329412 0.88290196
        nan 0.87560784 0.76556863 0.75580392        nan 0.86796078
 0.73278431        nan 0.89796078        nan 0.88619608 0.76529

A seguir é impressa a tabela com os resultados dos classificadores, sendo as colunas '_Opt' representantes dos modelos com os parâmetros otimizados pela random search. 

In [8]:
print(Results)

      y  yh_LREG  yh_LREG_Opt  yh_SVC  yh_SVC_Opt  yh_KNN  yh_KNN_Opt
0    11       11           11      11          11      11          11
1     8        8            8       8           8       8           8
2     3        3            3       3           3       3           3
3    14       14           14      14          14      14          14
4     1        1            1       1           1       1           1
..   ..      ...          ...     ...         ...     ...         ...
335  16       15           15      15          15      15          15
336  16       15           16      15          16      16          15
337  11       11           11      11          11      11          11
338  11       11           11      11          11      11          11
339   2        1            2       2           2       2           2

[340 rows x 7 columns]


A seguir, são apresentados os parâmetros ótimos de cada classificador para este problema. Para a logistic regression, por exemplo, os parâmetros multi_class e solver foram iguais aos do default, e o C (default = 1.0) foi de 63.
Para o SVC, o kernel selecionado foi o mesmo do classificador padrão, rbf
Para o kNN, o número de vizinhos selecionado foi de 4, sendo o valor default = 5. 

In [9]:
BestParams

[{'LREG__C': 63.81725254593122,
  'LREG__multi_class': 'auto',
  'LREG__solver': 'lbfgs'},
 {'SVC__C': 379.00637673798536,
  'SVC__degree': 2,
  'SVC__gamma': 0.5551113100958404,
  'SVC__kernel': 'rbf'},
 {'KNN__n_neighbors': 4, 'KNN__weights': 'uniform'}]

In [11]:
ScoresTableHeaders = ['Method','score_Opt','score_Std']

ScoresTable =  pd.DataFrame([], columns=ScoresTableHeaders)

Methods = [['Linear Reg'],['SVC'],['kNN']]
ScoresTable[ScoresTableHeaders[0]] = Methods
ScoresTable[ScoresTableHeaders[1]] = scores
ScoresTable[ScoresTableHeaders[2]] = scores_std
ScoresTable

Unnamed: 0,Method,score_Opt,score_Std
0,[Linear Reg],0.944118,0.858824
1,[SVC],0.952941,0.911765
2,[kNN],0.911765,0.938235


Como visto, a pontuação dos modelos otimizados foram superiores aos dos modelos padrão, com excessão do kNN, cujo modelo default usa N = 5, e o modelo otimizado pela random search possui N = 4
