# Parameter Tunning AdaBoost

In [19]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import SCORERS
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import mean_squared_error

from imblearn.over_sampling import SMOTE

import xgboost as xgb

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
labels_f = pd.read_csv('../../data/labels_f_filtrado.csv')
datos = pd.read_csv('../../data/datos_filtrado.csv')

In [3]:
atributos = labels_f.columns.tolist()

In [4]:
atributos.remove('person')
atributos.remove('label')

In [5]:
y = labels_f['label'].ravel()
X = labels_f.loc[:,atributos]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, random_state= 123)

In [7]:
def aplicarRus(X_train_p, y_train_p):
    rus= RandomUnderSampler(return_indices=True)
    #id_rus son los índices
    X_rus_train, y_rus_train, id_rus = rus.fit_sample(X_train_p, y_train_p)
    return(X_rus_train, y_rus_train)

In [8]:
X_train, y_train = aplicarRus(X_train,y_train)

In [12]:
modelo = AdaBoostClassifier(n_estimators =  500, learning_rate = 0.01)

In [10]:
param_grid = {#"base_estimator__criterion" : ["gini", "entropy"],
              "learning_rate" :   [0.1, 0.01, 0.001, 0.05],
              "n_estimators": [int(x) for x in np.linspace(start = 200, stop = 1600, num = 10)]
             }


#DTC = DecisionTreeClassifier(random_state = 11, max_features = "auto", class_weight = "balanced",max_depth = None)

ABC = AdaBoostClassifier()#base_estimator = DTC)

# run grid search
clf = GridSearchCV(ABC, param_grid=param_grid, scoring = 'roc_auc')


#clf = GridSearchCV(estimator=model, param_grid=random_grid, scoring = 'roc_auc')
clf.fit(X_train, y_train)

print(clf.best_score_)
print(clf.best_estimator_)

preds = clf.predict(X_test)
preds_prob = clf.predict_proba(X_test)[:,1]
train_accuracy = accuracy_score(y_train, clf.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
area_debajo_de_curva = roc_auc_score(y_test, preds_prob)
matriz_de_confusion = confusion_matrix(y_test, preds)
print('train acurracy: ')
print(train_accuracy)
print('test acurracy: ')
print(test_accuracy)
print('Matriz de confusión: ')
print(matriz_de_confusion)
print('Área bajo la curva: ')
print(area_debajo_de_curva)
print()




0.8601944251134096
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.01, n_estimators=977, random_state=None)
train acurracy: 
0.8019187358916479
test acurracy: 
0.790422245108136
Matriz de confusión: 
[[1467  381]
 [  26   68]]
Área bajo la curva: 
0.8413696232845169



In [20]:
def aplicarSmote(X_train_p, y_train_p, X_test):
    smote = SMOTE(ratio='minority')
    X_smt_train, y_smt_train = smote.fit_sample(X_train_p, y_train_p)
    #X_smt_train = pd.DataFrame(X_smt_train)
    #X_smt_train.columns = X_test.columns
    return (X_smt_train, y_smt_train)

In [21]:
def aplicarRus(X_train_p, y_train_p,X_test_p):
    rus= RandomUnderSampler(return_indices=True)
    #id_rus son los índices
    X_rus_train, y_rus_train, id_rus = rus.fit_sample(X_train_p, y_train_p)
    #X_rus_train = pd.DataFrame(X_rus_train)
    #X_rus_train.columns = X_test_p.columns
    return(X_rus_train, y_rus_train)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, random_state= 123)

In [23]:
X_rus_train, y_rus_train = aplicarRus(X_train,y_train,X_test)
X_smt_train, y_smt_train = aplicarSmote(X_train,y_train,X_test)

In [24]:
def aplicarModelo(modelo,X_train_p, X_test_p, y_train_p, y_test_p, early = False):
    if(early):
        modelo.fit(X_train_p, y_train_p, early_stopping_rounds = 5, eval_set=[(X_test_p, y_test_p)])
    else: 
        modelo.fit(X_train_p, y_train_p)
    
    preds = modelo.predict(X_test_p)
    preds_prob = modelo.predict_proba(X_test_p)[:,1]
    train_accuracy = accuracy_score(y_train_p, modelo.predict(X_train_p))
    test_accuracy = accuracy_score(y_test_p, preds)
    area_debajo_de_curva = roc_auc_score(y_test_p, preds_prob)
    matriz_de_confusion = confusion_matrix(y_test_p, preds)
    print('train acurracy: ')
    print(train_accuracy)
    print('test acurracy: ')
    print(test_accuracy)
    print('Matriz de confusión: ')
    print(matriz_de_confusion)
    print('Área bajo la curva: ')
    print(area_debajo_de_curva)
    print()

In [26]:
modelo = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.01, n_estimators=977, random_state=None)
print('APLICO SMOTE')
aplicarModelo(modelo,X_smt_train, X_test, y_smt_train, y_test, False)
print('APLICO RUS')
aplicarModelo(modelo,X_rus_train, X_test, y_rus_train, y_test, False)

APLICO SMOTE
train acurracy: 
0.9035029542988062
test acurracy: 
0.870236869207003
Matriz de confusión: 
[[1640  208]
 [  44   50]]
Área bajo la curva: 
0.8387877636547849

APLICO RUS
train acurracy: 
0.7990970654627539
test acurracy: 
0.800205973223481
Matriz de confusión: 
[[1487  361]
 [  27   67]]
Área bajo la curva: 
0.8442249240121581

