In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
import os
import pickle as pck

from sklearn.feature_extraction import image
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV

In [2]:
def evaluar_metodos(X, y, cv):
    '''
    Grid-search SVC, LogisticRegression and RandomForestClassifier.

    X and y are handed unchanged to ``GridSearchCV.fit`` and ``cv`` is
    forwarded as the ``cv`` argument of ``GridSearchCV``.

    Returns a dictionary keyed by 'SVC', 'LGR' and 'RTF'. Each value is a
    list laid out as:
    0 : best cross-validated score (``best_score_``)
    1 : best parameter combination (``best_params_``)
    2 : time spent refitting the best model (``refit_time_``)
    '''
    # One entry per candidate: (estimator to tune, hyper-parameter grid).
    # NOTE(review): 'random_state' is searched like a hyper-parameter for the
    # random forest, so the seed itself gets tuned -- confirm this is intended.
    candidatos = {
        'SVC': (SVC(gamma = 'scale'),
                {'kernel' : ('poly', 'rbf', 'sigmoid'),
                 'C' : [1, 10, 100]}),
        'LGR': (LogisticRegression(max_iter = 300),
                {'solver' : ('newton-cg', 'liblinear'),
                 'C' : [1, 10, 100]}),
        'RTF': (RandomForestClassifier(),
                {'n_estimators' : [100, 200, 300],
                 'max_depth' : [2, 4, 8, 10],
                 'random_state' :[2, 4, 8, 10]}),
    }

    resultados = {}
    for nombre, (estimador, rejilla) in candidatos.items():
        busqueda = GridSearchCV(estimador, rejilla, cv = cv)
        busqueda.fit(X, y)
        resultados[nombre] = [busqueda.best_score_,
                              busqueda.best_params_,
                              busqueda.refit_time_]

    return resultados

In [3]:
# Load the pickled training features. The '' placeholder stays in place when
# the pickle turns out to be empty or truncated (EOFError is only printed).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
arch_feature = ''
with open('features/features-train.pck', 'rb') as pc_file:
    try:
        arch_feature = pck.load(pc_file)
    except EOFError as eof_error:
        print(eof_error)

In [4]:
# Load the pickled training labels. The '' placeholder stays in place when
# the pickle turns out to be empty or truncated (EOFError is only printed).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
arch_label = ''
with open('features/label-train.pck', 'rb') as pc_file:
    try:
        arch_label = pck.load(pc_file)
    except EOFError as eof_error:
        print(eof_error)

In [5]:
# Distinct label values, plus an integer encoding of every label.
targetNames = np.unique(arch_label)
le = LabelEncoder()
target = le.fit_transform(arch_label)


# Rescale each feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed in the next cell, so held-out rows influence the scaling -- confirm
# that this is intended.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(arch_feature)
rescaled_features = scaler.transform(arch_feature)

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(rescaled_features),
    np.array(target),
    test_size = 0.2,
    random_state = 10,
)

# Report the shape of each partition, one per line.
print(
    "Train data  : {}".format(X_train.shape),
    "Test data   : {}".format(X_test.shape),
    "Train etiquetas: {}".format(y_train.shape),
    "Test etiquetas : {}".format(y_test.shape),
    sep = "\n",
)

Train data  : (948, 532)
Test data   : (238, 532)
Train etiquetas: (948,)
Test etiquetas : (238,)


In [9]:
def evaluar_modelos(X, y, cv):
    '''
    Grid-search five classifiers, each wrapped in OneVsRestClassifier.

    The candidates are SVC, LogisticRegression, RandomForestClassifier,
    DecisionTreeClassifier and GaussianNB. X and y are handed unchanged to
    ``GridSearchCV.fit`` and ``cv`` is forwarded as the ``cv`` argument of
    ``GridSearchCV``.

    Returns a dictionary keyed by 'SVC', 'LGR', 'RTF', 'CART' and 'NBG'.
    Each value is a list laid out as:
    0 : best cross-validated score (``best_score_``)
    1 : best parameter combination (``best_params_``)
    2 : time spent refitting the best model (``refit_time_``)
    '''
    import inspect  # local import keeps this cell self-contained

    valores_modelos = {}

    # Grid keys carry the 'estimator__' prefix because every model is wrapped
    # in OneVsRestClassifier, which exposes the inner model as `estimator`.
    # NOTE(review): 'estimator__random_state' is searched like a
    # hyper-parameter, so the seed itself gets tuned -- confirm this is intended.
    parametros = {'SVC' : {'estimator__kernel' :['poly', 'rbf', 'sigmoid'],
                          'estimator__C' : [1, 10, 100]},
                  'LGR' : {'estimator__solver' : ['newton-cg', 'liblinear'],
                          'estimator__C' : [1, 10, 100]},
                  'RTF' : {'estimator__n_estimators' : [100, 200, 300],
                         'estimator__max_depth' : [2, 4, 8, 10],
                         'estimator__random_state' :[2, 4, 8, 10]},
                  'CART': {'estimator__max_depth': [1, 3, 5, 7],
                           'estimator__max_features': [10, 20, 30, 40, 50]},
                  'NBG' : {}
                 }

    modelos = {'SVC' : OneVsRestClassifier(SVC(gamma = 'scale')),
               'LGR' : OneVsRestClassifier(LogisticRegression(max_iter = 300)),
               'RTF' : OneVsRestClassifier(RandomForestClassifier()),
               'CART': OneVsRestClassifier(DecisionTreeClassifier(random_state = 9)),
               'NBG' : OneVsRestClassifier(GaussianNB())
              }

    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
    # passing it raises TypeError. Forward iid=False only when the installed
    # GridSearchCV still declares the parameter, so older releases keep the
    # exact scoring behaviour this notebook was written against.
    opciones_busqueda = {'cv': cv}
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        opciones_busqueda['iid'] = False

    for m in modelos:
        clf = GridSearchCV(modelos[m], parametros[m], **opciones_busqueda)
        clf.fit(X, y)
        valores_modelos[m] = [clf.best_score_, clf.best_params_, clf.refit_time_]

    return valores_modelos

In [None]:
def evaluar_modelos_test(X, y, cv):
    '''
    Grid-search three classifiers, each wrapped in OneVsRestClassifier.

    The candidates are KNeighborsClassifier, DecisionTreeClassifier and
    GaussianNB. X and y are handed unchanged to ``GridSearchCV.fit`` and
    ``cv`` is forwarded as the ``cv`` argument of ``GridSearchCV``.

    Returns a dictionary keyed by 'KNN', 'CART' and 'NBG'. Each value is a
    list laid out as:
    0 : best cross-validated score (``best_score_``)
    1 : best parameter combination (``best_params_``)
    2 : time spent refitting the best model (``refit_time_``)
    '''
    import inspect  # local import keeps this cell self-contained

    valores_modelos = {}

    # Grid keys carry the 'estimator__' prefix because every model is wrapped
    # in OneVsRestClassifier, which exposes the inner model as `estimator`.
    parametros = {
                  'KNN' : {'estimator__n_neighbors' : [3, 5, 11, 19],
                           'estimator__weights' : ['uniform', 'distance']},
                  'CART': {'estimator__max_depth': [1, 3, 5, 7],
                           'estimator__max_features': [10, 20, 30, 40, 50]},
                  'NBG' : {}
                 }

    modelos = {'KNN' : OneVsRestClassifier(KNeighborsClassifier()),
               'CART': OneVsRestClassifier(DecisionTreeClassifier(random_state = 9)),
               'NBG' : OneVsRestClassifier(GaussianNB())
              }

    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
    # passing it raises TypeError. Forward iid=False only when the installed
    # GridSearchCV still declares the parameter, so older releases keep the
    # exact scoring behaviour this notebook was written against.
    opciones_busqueda = {'cv': cv}
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        opciones_busqueda['iid'] = False

    for m in modelos:
        clf = GridSearchCV(modelos[m], parametros[m], **opciones_busqueda)
        clf.fit(X, y)
        valores_modelos[m] = [clf.best_score_, clf.best_params_, clf.refit_time_]

    return valores_modelos

In [10]:
# Tune every candidate on the training split; cv is forwarded to GridSearchCV.
evaluar_modelos(X_train, y_train, cv = 10)

{'SVC': [0.8819774169466218,
  {'estimator__C': 10, 'estimator__kernel': 'rbf'},
  0.2947821617126465],
 'LGR': [0.8639375699888019,
  {'estimator__C': 10, 'estimator__solver': 'newton-cg'},
  0.052110910415649414],
 'RTF': [0.8862105729749906,
  {'estimator__max_depth': 10,
   'estimator__n_estimators': 100,
   'estimator__random_state': 10},
  0.2893836498260498],
 'CART': [0.8049883351997013,
  {'estimator__max_depth': 7, 'estimator__max_features': 20},
  0.004080533981323242],
 'NBG': [0.7552682904068682, {}, 0.004555702209472656]}