## Entrenamiento de Modelos con Grid Search y Validación Cruzada
## Construcción de Pipeline

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import datasets, tree, metrics

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

### Carga de Datos

In [2]:
#Ejemplo con Datos de Calidad de Vinos
TopPerformers_Cuenta = pd.read_csv('./winequality-red.csv')

TopPerformers_Cuenta = TopPerformers_Cuenta.rename(columns={'fixed acidity': 'Factor A', 'volatile acidity':'Factor B','citric acid':'Factor C',
                                                           'residual sugar':'Factor D','chlorides':'Factor E','free sulfur dioxide':'Factor F',
                                                           'total sulfur dioxide':'Factor G','density':'Factor H','pH':'Factor I','sulphates':'Factor J',
                                                           'alcohol':'Factor K','quality':'Cuenta'})

TopPerformers_Cuenta.head()

Unnamed: 0,Factor A,Factor B,Factor C,Factor D,Factor E,Factor F,Factor G,Factor H,Factor I,Factor J,Factor K,Cuenta
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


### Normalización y Estandarización

In [3]:
X = TopPerformers_Cuenta[TopPerformers_Cuenta.columns[0:11]].values
y = TopPerformers_Cuenta['Cuenta'].values

scaler = StandardScaler()
Xstan = scaler.fit_transform(X)
dataStanX = pd.DataFrame(data = Xstan, columns = TopPerformers_Cuenta.columns[0:11])
dataStanX

Unnamed: 0,Factor A,Factor B,Factor C,Factor D,Factor E,Factor F,Factor G,Factor H,Factor I,Factor J,Factor K
0,-0.528360,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246
1,-0.298547,1.967442,-1.391472,0.043416,0.223875,0.872638,0.624363,0.028261,-0.719933,0.128950,-0.584777
2,-0.298547,1.297065,-1.186070,-0.169427,0.096353,-0.083669,0.229047,0.134264,-0.331177,-0.048089,-0.584777
3,1.654856,-1.384443,1.484154,-0.453218,-0.264960,0.107592,0.411500,0.664277,-0.979104,-0.461180,-0.584777
4,-0.528360,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246
...,...,...,...,...,...,...,...,...,...,...,...
1594,-1.217796,0.403229,-0.980669,-0.382271,0.053845,1.542054,-0.075043,-0.978765,0.899886,-0.461180,0.072294
1595,-1.390155,0.123905,-0.877968,-0.240375,-0.541259,2.211469,0.137820,-0.862162,1.353436,0.601055,0.729364
1596,-1.160343,-0.099554,-0.723916,-0.169427,-0.243707,1.255161,-0.196679,-0.533554,0.705508,0.542042,0.541630
1597,-1.390155,0.654620,-0.775267,-0.382271,-0.264960,1.542054,-0.075043,-0.676657,1.677400,0.305990,-0.209308


### Train-Test Split para Validación Cruzada

In [4]:
X_train, X_test, y_train, y_test = train_test_split(dataStanX, y, test_size = 0.3, random_state = 10)

### Grid Search de Hiperparametros
Se define un grid de parámetros para cada modelo de clasificación, con el objetivo de encontrar la combinación óptima en entrenamiento.
Se utiliza validación cruzada con CV = 10

### Regresión Logística

In [30]:
clf_LR = LogisticRegression()
clf_LR.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [31]:
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"], "solver":["lbfgs","newton-cg"],"max_iter":[1000]} # l1 lasso l2 ridge
logreg_cv=GridSearchCV(clf_LR,grid,cv=10)

In [32]:
logreg_cv.fit(X_train,y_train)

Traceback (most recent call last):
  File "C:\Users\Dell G5 15\.conda\envs\datascience\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Dell G5 15\.conda\envs\datascience\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Dell G5 15\.conda\envs\datascience\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\Dell G5 15\.conda\envs\datascience\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Dell G5 15\.conda\envs\datascience\lib\site-packages\sklearn\linear_model\_logistic.py",

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'max_iter': [1000], 'penalty': ['l1', 'l2'],
                         'solver': ['lbfgs', 'newton-cg']})

In [33]:
print("tuned hyperparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

tuned hyperparameters :(best parameters)  {'C': 1.0, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}
accuracy : 0.615693371943372


### Support Vector Machines

In [34]:
clf_SVC = SVC(probability=True)
clf_SVC.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [35]:
grid_SVC={"C":np.logspace(-3,3,7), "kernel":["linear","poly","rbf"],"degree":[1,2,3],"max_iter":[1000]}
logreg_cvSVC=GridSearchCV(clf_SVC,grid_SVC,cv=10)

In [36]:
logreg_cvSVC.fit(X_train,y_train)



GridSearchCV(cv=10, estimator=SVC(probability=True),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'degree': [1, 2, 3],
                         'kernel': ['linear', 'poly', 'rbf'],
                         'max_iter': [1000]})

In [37]:
print("tuned hyperparameters :(best parameters) ",logreg_cvSVC.best_params_)
print("accuracy :",logreg_cvSVC.best_score_)

tuned hyperparameters :(best parameters)  {'C': 10.0, 'degree': 1, 'kernel': 'rbf', 'max_iter': 1000}
accuracy : 0.6210907335907336


### Decision Trees

In [38]:
clf_DT = DecisionTreeClassifier() 
clf_DT.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [45]:
#grid_DT={"max_leaf_nodes": list(range(2, 100)), "min_samples_split": [2, 3, 4]}
grid_DT={"max_depth": list(range(5, 25))}
logreg_cvDT=GridSearchCV(clf_DT,grid_DT,cv=10)

In [46]:
logreg_cvDT.fit(X_train,y_train)



GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                       16, 17, 18, 19, 20, 21, 22, 23, 24]})

In [47]:
print("tuned hyperparameters :(best parameters) ",logreg_cvDT.best_params_)
print("accuracy :",logreg_cvDT.best_score_)

tuned hyperparameters :(best parameters)  {'max_depth': 15}
accuracy : 0.6166344916344917


### Random Forests

In [48]:
clf_RF = DecisionTreeClassifier() 
clf_RF.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [49]:
grid_RF={"max_depth": list(range(5, 25))}
logreg_cvRF=GridSearchCV(clf_RF,grid_RF,cv=10)

In [50]:
logreg_cvRF.fit(X_train,y_train)



GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                       16, 17, 18, 19, 20, 21, 22, 23, 24]})

In [51]:
print("tuned hyperparameters :(best parameters) ",logreg_cvDT.best_params_)
print("accuracy :",logreg_cvRF.best_score_)

tuned hyperparameters :(best parameters)  {'max_depth': 15}
accuracy : 0.6184362934362934


### KNN

In [52]:
clf_KNN = KNeighborsClassifier()
clf_KNN.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [75]:
grid_KNN={"n_neighbors": list(range(5, 50)),"leaf_size": list(np.linspace(5,100,20))}
logreg_cvKNN=GridSearchCV(clf_KNN,grid_KNN,cv=10)

In [76]:
logreg_cvKNN.fit(X_train,y_train)



GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': [5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0,
                                       40.0, 45.0, 50.0, 55.0, 60.0, 65.0, 70.0,
                                       75.0, 80.0, 85.0, 90.0, 95.0, 100.0],
                         'n_neighbors': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                         16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                         26, 27, 28, 29, 30, 31, 32, 33, 34, ...]})

In [77]:
print("tuned hyperparameters :(best parameters) ",logreg_cvKNN.best_params_)
print("accuracy :",logreg_cvKNN.best_score_)

tuned hyperparameters :(best parameters)  {'leaf_size': 5.0, 'n_neighbors': 24}
accuracy : 0.6014559202059202


### MLP

In [78]:
clf_MLP = MLPClassifier()
clf_MLP.get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 200,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [104]:
grid_MLP={"solver": ["adam","lbfgs","sgd"],"alpha": np.logspace(-5,0,3),"hidden_layer_sizes":[(32,8,4)],"max_iter":[1000]}
logreg_cvMLP=GridSearchCV(clf_MLP,grid_MLP,cv=10)

In [105]:
logreg_cvMLP.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

GridSearchCV(cv=10, estimator=MLPClassifier(),
             param_grid={'alpha': array([1.00000000e-05, 3.16227766e-03, 1.00000000e+00]),
                         'hidden_layer_sizes': [(32, 8, 4)], 'max_iter': [1000],
                         'solver': ['adam', 'lbfgs', 'sgd']})

### Pipeline de Modelos

In [5]:
clf_LR = LogisticRegression(max_iter=1000, multi_class = 'multinomial', solver = 'lbfgs')
clf_SVC = SVC(kernel='linear',C=0.1,probability=True)
clf_DT = DecisionTreeClassifier(max_depth = 10) 
clf_RF = RandomForestClassifier(max_depth = 10, n_estimators=15)
clf_KNN = KNeighborsClassifier(n_neighbors=25)
clf_MLP = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(32,8,4), random_state=1, max_iter= 1000)

Votacion = VotingClassifier(estimators=[('LR', clf_LR),('SV',clf_SVC),('DT',clf_DT),('RF',clf_RF),('KNN',clf_KNN),('MLP',clf_MLP)], voting='soft')

pipeclf = Pipeline([('SCALE', StandardScaler()),
                     ('Votacion', Votacion)])


In [6]:
print('Start training')
pipeclf.fit(X_train, y_train)

Start training




Pipeline(steps=[('SCALE', StandardScaler()),
                ('Votacion',
                 VotingClassifier(estimators=[('LR',
                                               LogisticRegression(max_iter=1000,
                                                                  multi_class='multinomial')),
                                              ('SV',
                                               SVC(C=0.1, kernel='linear',
                                                   probability=True)),
                                              ('DT',
                                               DecisionTreeClassifier(max_depth=10)),
                                              ('RF',
                                               RandomForestClassifier(max_depth=10,
                                                                      n_estimators=15)),
                                              ('KNN',
                                               KNeighborsClassifier(n_neighbors

In [7]:
preds_test = pipeclf.predict(X_test)
score_test = accuracy_score(y_test, preds_test)
print("Precision Test {0}".format(score_test))

preds_train = pipeclf.predict(X_train)
score_train = accuracy_score(y_train, preds_train)
print("Precision Train {0}".format(score_train))

Precision Test 0.63125
Precision Train 0.870420017873101


In [8]:
proba = pipeclf.predict_proba(X_test)
preds = pipeclf.predict(X_test)

In [9]:
preds

array([6, 5, 6, 6, 5, 6, 6, 6, 6, 5, 7, 5, 5, 5, 6, 4, 5, 6, 6, 5, 5, 6,
       6, 5, 5, 6, 5, 6, 6, 5, 5, 5, 6, 6, 6, 5, 7, 6, 6, 5, 6, 6, 5, 6,
       5, 6, 6, 6, 6, 6, 6, 7, 6, 7, 6, 6, 6, 5, 5, 5, 6, 5, 6, 5, 5, 5,
       6, 6, 5, 7, 6, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 5, 6, 5, 6, 6, 5, 5,
       5, 5, 7, 6, 7, 6, 5, 6, 5, 5, 6, 6, 7, 5, 5, 6, 6, 6, 5, 6, 6, 6,
       5, 6, 7, 5, 5, 5, 5, 5, 6, 6, 6, 5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 5,
       5, 5, 6, 6, 6, 5, 5, 6, 6, 6, 5, 6, 6, 6, 6, 5, 5, 6, 5, 7, 6, 7,
       6, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 7, 7, 7, 6, 6, 5, 6, 6, 6, 5, 5,
       6, 5, 5, 6, 6, 6, 6, 5, 5, 5, 6, 7, 5, 6, 5, 6, 6, 6, 6, 6, 5, 5,
       7, 6, 6, 5, 6, 5, 5, 5, 6, 5, 5, 7, 6, 5, 6, 5, 6, 6, 5, 5, 5, 5,
       5, 6, 5, 6, 6, 5, 6, 6, 5, 5, 5, 6, 5, 6, 5, 6, 5, 5, 5, 5, 5, 5,
       6, 7, 6, 5, 5, 5, 6, 6, 5, 6, 5, 5, 6, 5, 7, 7, 7, 6, 5, 5, 6, 6,
       5, 6, 5, 6, 6, 6, 5, 7, 6, 6, 6, 5, 5, 5, 6, 5, 7, 6, 6, 7, 5, 5,
       7, 6, 5, 6, 6, 5, 6, 5, 5, 6, 5, 6, 5, 6, 6,

In [11]:
proba*100

array([[1.28400205e-01, 8.92522686e-01, 2.56660182e+01, 6.08551392e+01,
        1.22577782e+01, 2.00141513e-01],
       [3.30301337e-01, 1.66828603e+00, 8.06004003e+01, 1.65306168e+01,
        7.96780233e-01, 7.36152985e-02],
       [1.54762903e-01, 6.47520145e-01, 2.74195628e+01, 4.81312711e+01,
        2.34760831e+01, 1.70799971e-01],
       ...,
       [8.13760656e-02, 4.26319675e-01, 7.62834700e+01, 2.27426804e+01,
        4.30657459e-01, 3.54963466e-02],
       [4.65494070e-01, 1.62586885e+00, 7.19862630e+01, 2.25289699e+01,
        3.30232712e+00, 9.10771569e-02],
       [5.99295391e-01, 3.86478460e+00, 5.22000631e+01, 4.05229786e+01,
        2.69940196e+00, 1.13476292e-01]])