In [1]:
import numpy as np
from scipy import stats
import scipy.io as sio
from os import getcwd
from os.path import join 
import pandas as pd
from scipy.stats.stats import mode
from sklearn.model_selection import (train_test_split, RepeatedKFold, RandomizedSearchCV)
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from statsmodels.tsa.ar_model import AutoReg
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression 
from sklearn import svm
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [2]:
fname = join(getcwd(),'data','data3SS2009.mat')

mat_contents = sio.loadmat(fname)

dataset = mat_contents['dataset']

N, Chno, Nc = dataset.shape
# N: number of samples
# Chno: number of channels
# Nc: number of cases

Y = mat_contents['labels'].reshape(Nc)

# print sizes 
pd.DataFrame(np.array([N, Chno, Nc,len(Y)]))

Unnamed: 0,0
0,8192
1,5
2,850
3,850


In [3]:
# AutoRegression
X_ = np.empty((0, 850), float)
for i in range(4):
    chdata = dataset[:,i+1,:]
    ch_corr = []
    for j in range(np.shape(chdata)[1]):
        res = AutoReg(chdata[:,j],lags=29,old_names=False).fit()
        ch_corr.append(res.params)
    
    X_ = np.append(X_, np.array(ch_corr).transpose() , axis=0)
X = np.transpose(X_)

# np.shape(X)

# pca
pca = PCA(random_state=0, whiten=True, n_components=.95)
X_pca = pca.fit_transform(X)
Xpca_scaled = MinMaxScaler(feature_range=(-1,1)).fit_transform(X_pca)

X_train, X_test, Y_train, Y_test = train_test_split(Xpca_scaled,Y,test_size=0.4,random_state=42)  

# Models and hyperparameters dictionaries

In [4]:
models = []
models.append(("LREG",LogisticRegression()))
models.append(("SVC",svm.SVC()))
models.append(("KNN",neighbors.KNeighborsClassifier()))
models.append(("DTC",DecisionTreeClassifier()))
models.append(("RFC",RandomForestClassifier()))

param_grid = []

param_grid.append(
    [
        {
            "LREG__C": stats.loguniform(0.1, 100),
            "LREG__solver": ["auto","lbfgs", "liblinear","saga"],
            "LREG__multi_class": ["auto","ovr", "multinomial"],
        }
    ]
)
param_grid.append(
    [
        {
            "SVC__C": stats.loguniform(0.1, 1000),
            "SVC__kernel": ["linear", "poly","rbf","sigmoid"],
            "SVC__degree": stats.randint(2,5),
            "SVC__gamma": stats.loguniform(0.0001,1),
        }
    ]
)
param_grid.append(
    [
        {
            "KNN__n_neighbors": stats.randint(2,100),
            "KNN__weights": ["uniform", "distance"],
        }
    ]
)

param_grid.append(
    [
        {
            "DTC__criterion": ["gini","entropy"],
            "DTC__splitter": ["best", "random"],
            "DTC__max_features": stats.uniform(),
            "DTC__min_samples_split": stats.randint(2,6),
        }
    ]
)

param_grid.append(
    [
        {
            "RFC__n_estimators": stats.randint(2,200),
            "RFC__criterion": ["gini", "entropy"],
            "RFC__max_features": stats.uniform(),
            "RFC__max_depth": stats.randint(2,100),
        }
    ]
)

# Hyperparameter search

Realizando a busca aleatória com validação cruzada, guardando os resultados de yh para os classificadores padrão e para os classificadores 'otimizados'


In [6]:
rkf = RepeatedKFold(n_splits=5,n_repeats=50,random_state=42)
scores = np.zeros((np.shape(models)[0],1))
scores_std = np.zeros((np.shape(models)[0],1))

Res_cols = ['y','yh_LREG','yh_LREG_Opt','yh_SVC','yh_SVC_Opt','yh_KNN','yh_KNN_Opt','y_DTC','yh_DTC_Opt','yh_RFC','yh_RFC_Opt']
Results =  pd.DataFrame([], columns=Res_cols)
Results['y'] = Y_test

BestParams = []

j = 1
for i in range(len(models)):
    clf=Pipeline([models[i]])
    pgrid = param_grid[i]

    randSearch = RandomizedSearchCV(clf, verbose=0,scoring="accuracy",param_distributions=pgrid,n_iter=100,n_jobs=2,cv=rkf,refit=True)
    
    randSearch.fit(X_train,Y_train)

    clf.fit(X_train,Y_train)
    
    scores[i] = randSearch.score(X_test,Y_test)
    yh = randSearch.predict(X_test)

    scores_std[i] = clf.score(X_test,Y_test)
    yh_std = clf.predict(X_test)

    Results[Res_cols[j]] = yh_std
    Results[Res_cols[j+1]] = yh
    
    j = j+2

    BestParams.append(randSearch.best_params_.copy())



    




 0.78980392 0.83019608        nan        nan        nan 0.88066667
        nan        nan 0.85247059 0.88309804        nan 0.8274902
        nan 0.90094118        nan 0.87568627 0.86960784        nan
        nan        nan 0.7227451  0.88              nan        nan
 0.90321569 0.86968627 0.86039216        nan        nan 0.86890196
        nan 0.87376471        nan 0.75701961 0.86317647        nan
 0.78164706 0.83380392 0.78270588 0.90270588 0.7252549  0.84541176
 0.86313725 0.85341176 0.90913725        nan 0.7854902  0.76733333
 0.79337255        nan 0.85952941 0.89227451 0.85560784 0.8992549
        nan 0.88388235 0.90372549        nan 0.78701961 0.87094118
 0.8912549  0.9032549  0.70427451 0.87807843        nan 0.88294118
 0.90698039 0.68639216 0.67435294 0.88411765        nan 0.73658824
 0.72321569 0.90666667        nan 0.84741176 0.90917647        nan
        nan 0.89694118 0.8554902         nan 0.7992549  0.85627451
 0.90078431        nan 0.88576471 0.884      0.76043137        n

A seguir é impressa a tabela com os resultados dos classificadores, sendo as colunas '_Opt' representantes dos modelos com os parâmetros otimizados pela random search. 

In [7]:
print(Results)

      y  yh_LREG  yh_LREG_Opt  yh_SVC  yh_SVC_Opt  yh_KNN  yh_KNN_Opt  y_DTC  \
0    11       11           11      11          11      11          11     11   
1     8        8            8       8           8       8           8      8   
2     3        3            3       3           3       3           3      3   
3    14       14           14      14          14      14          14     14   
4     1        1            1       1           1       1           1      1   
..   ..      ...          ...     ...         ...     ...         ...    ...   
335  16       15           15      15          15      15          15     16   
336  16       15           16      15          16      16          15     16   
337  11       11           11      11          11      11          11     11   
338  11       11           11      11          11      11          11     11   
339   2        1            2       2           2       2           2      2   

     yh_DTC_Opt  yh_RFC  yh_RFC_Opt  
0

A seguir, são apresentados os parâmetros ótimos de cada classificador para este problema. Para a logistic regression, por exemplo, os parâmetros multi_class e solver foram iguais aos do default, e o C (default = 1.0) foi de 63. <br>
Para o SVC, o kernel selecionado foi o mesmo do classificador padrão, rbf  <br>
Para o kNN, o número de vizinhos selecionado foi de 4, sendo o valor default = 5.  <br>
Para o decision tree classifier, o número mínimo de amostrar para a separação dos nós é de 4 <br>
Para o random forest classifier, o número de estimadores ótimo foi de 191 <br>

In [8]:
BestParams

[{'LREG__C': 56.27135342945326,
  'LREG__multi_class': 'auto',
  'LREG__solver': 'lbfgs'},
 {'SVC__C': 390.61839350657897,
  'SVC__degree': 2,
  'SVC__gamma': 0.2967998882031199,
  'SVC__kernel': 'rbf'},
 {'KNN__n_neighbors': 4, 'KNN__weights': 'uniform'},
 {'DTC__criterion': 'entropy',
  'DTC__max_features': 0.9744587727411228,
  'DTC__min_samples_split': 4,
  'DTC__splitter': 'best'},
 {'RFC__criterion': 'gini',
  'RFC__max_depth': 50,
  'RFC__max_features': 0.5544000835685161,
  'RFC__n_estimators': 191}]

In [10]:
ScoresTableHeaders = ['Method','score_Opt','score_Std']

ScoresTable =  pd.DataFrame([], columns=ScoresTableHeaders)

Methods = [['Linear Reg'],['SVC'],['kNN'],['DTC'],['RFC']]
ScoresTable[ScoresTableHeaders[0]] = Methods
ScoresTable[ScoresTableHeaders[1]] = scores
ScoresTable[ScoresTableHeaders[2]] = scores_std
ScoresTable

Unnamed: 0,Method,score_Opt,score_Std
0,[Linear Reg],0.944118,0.858824
1,[SVC],0.95,0.911765
2,[kNN],0.911765,0.938235
3,[DTC],0.888235,0.876471
4,[RFC],0.929412,0.926471


Como visto, a pontuação dos modelos otimizados foram superiores aos dos modelos padrão, com excessão do kNN, cujo modelo default usa N = 5, e o modelo otimizado pela random search possui N = 4
