In [18]:
#Ignorar warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np #operaciones matriciales y con vectores
import pandas as pd #tratamiento de datos
import matplotlib.pyplot as plt #gráficos
from sklearn import metrics, tree
from sklearn.model_selection import cross_val_score, cross_validate #método para evaluar varios particionamientos de C-V
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import RFECV
from scipy.stats import uniform
from scipy.stats import randint
from sklearn.preprocessing import StandardScaler
import math
import seaborn as sns

#Bagging
from sklearn.ensemble import BaggingClassifier

#Random Forest
from sklearn.ensemble import RandomForestClassifier

#Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import zero_one_loss

In [4]:
data = pd.read_csv('data_set_cleaned.csv', header = 0, names=None)
data.head()

Unnamed: 0,Case1_Control0,Edad,Genero,Cycle Time,Stance Percent,Stance Time,Step Length,Step Number,Step Time,Step Cadence,...,Step Timel,Step Cadencel,Stride Numberl,Stride Lengthl,Swing Percentl,Swing Timel,Distancel,Duration Timel,Speedl,Accelerationl
0,Positivo,79,M,0.803979,0.644524,1.327075,0.796338,2,0.335486,58.28075,...,0.312041,88.428185,1,0.185463,0.459888,0.624083,1.890646,1.357033,1.39322,1.026666
1,Positivo,79,M,0.826989,0.633677,1.326004,0.823259,2,0.352115,57.346196,...,0.429035,59.169468,1,1.198056,0.423096,0.85807,1.923441,2.028073,0.948408,0.46764
2,Positivo,79,M,0.789131,0.610778,1.219237,0.739892,2,0.359121,60.114133,...,0.405821,56.550877,1,1.059078,0.382493,0.811643,1.974094,2.121983,0.930306,0.438414
3,Positivo,68,M,1.193026,0.536687,1.263887,0.741654,2,0.476885,50.955888,...,0.435792,50.936203,1,1.085094,0.36996,0.871583,1.951583,2.355888,0.828385,0.351623
4,Positivo,68,M,1.045567,0.508093,1.061983,0.817604,2,0.446011,57.412502,...,0.358499,59.64247,1,1.189026,0.356362,0.716997,1.858186,2.011989,0.923557,0.459027


In [5]:
x = data.loc[: , ~data.columns.isin(['Case1_Control0'])]
x = pd.get_dummies(data=x)
x = x.values

y = data['Case1_Control0'].values

In [62]:
#Tunear modelo Bagging

#Parametros a tunear
param_grid = {
    'base_estimator__max_depth' : [2, 5, 10, 200],
    'base_estimator__min_samples_split' : [5,10,20],
    'n_estimators' : [10, 50, 100, 200],
    'max_samples' : [0.05, 0.2, 0.5, 0.7],
    'max_features' : [0.05, 0.2, 0.5, 0.7]
}

GS_BT = GridSearchCV(BaggingClassifier(tree.DecisionTreeClassifier()),
                     param_grid=param_grid, scoring='accuracy', cv=10)

GS_BT.fit(x, y)

print('Mejores parametros:\n', GS_BT.best_params_)
print('Mejor Exactitud:\n', GS_BT.best_score_)

Mejores parametros:
 {'base_estimator__max_depth': 10, 'base_estimator__min_samples_split': 5, 'max_features': 0.2, 'max_samples': 0.05, 'n_estimators': 100}
Mejor Exactitud:
 0.7032967032967034


In [61]:
# Random Forest

param_grid = {
    'n_estimators' : [10, 20, 40, 100],
    'max_depth' : [8, 10],
    'min_samples_split' : [10, 20, 25],
    'min_samples_leaf' : [1, 5, 8, 10],
    'min_impurity_decrease' : [0.0, 0.1]
}

GS_RF = GridSearchCV(RandomForestClassifier(criterion='entropy', max_features=('auto'),
                                            bootstrap=True, oob_score=True,
                                            random_state=1234, n_jobs=2, verbose=0),
                     param_grid=param_grid, scoring='accuracy', cv=10)

GS_RF.fit(x,y)

print('Mejores parametros:\n', GS_RF.best_params_)
print('Mejor Exactitud:\n', GS_RF.best_score_)

Mejores parametros:
 {'max_depth': 8, 'min_impurity_decrease': 0.1, 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 100}
Mejor Exactitud:
 0.6758241758241759


In [17]:
param_grid = {
    'n_estimators' : [randint.rvs(10,120,1)],
    'max_depth' : [randint.rvs(5,15,1)],
    'min_samples_split' : [randint.rvs(10,25,1)],
    'min_samples_leaf' : [randint.rvs(1,20,1)],
    'min_impurity_decrease' : uniform(0.0,0.1)
}

RS_RF = RandomizedSearchCV(RandomForestClassifier(criterion='entropy', max_features=('auto'),
                                            bootstrap=True, oob_score=True,
                                            random_state=1234, n_jobs=1, verbose=0), 
                           param_distributions=param_grid, scoring='accuracy', cv=10, random_state=1234)  

RS_RF.fit(x,y)

print('Mejores parametros:\n', RS_RF.best_params_)
print('Mejor Exactitud:\n', RS_RF.best_score_)

Mejores parametros:
 {'max_depth': 12, 'min_impurity_decrease': 0.06221087710398319, 'min_samples_leaf': 13, 'min_samples_split': 16, 'n_estimators': 59}
Mejor Exactitud:
 0.6758241758241759


In [60]:
# Boosting

param_grid ={
    'n_estimators' : [10, 20, 100, 400],
    'learning_rate' : [0.05, 0.1, 0.2, 0.5,0.7,0.8]
}

GS_Boo = GridSearchCV(AdaBoostClassifier(base_estimator=None, algorithm='SAMME'), 
                      param_grid=param_grid, scoring='accuracy', cv=10)
GS_Boo.fit(x,y)

print('Mejores parametros:\n', GS_Boo.best_params_)
print('Mejor Exactitud:\n', GS_Boo.best_score_)

Mejores parametros:
 {'learning_rate': 0.05, 'n_estimators': 20}
Mejor Exactitud:
 0.7142857142857143


## Recursive Feature Elimination (RFE)

In [32]:
ada = AdaBoostClassifier(n_estimators=20, learning_rate=0.05, algorithm='SAMME', base_estimator=None)

rfecv_ada = RFECV(estimator=ada, step=1, cv=10, scoring='accuracy')

rfecv_ada.fit(x,y)

print("Caracteristicas seleccionadas:",data.columns[rfecv.support_])
print("Mejor Exactitud:", np.max(rfecv.grid_scores_))

Caracteristicas seleccionadas: Index(['Cycle Timel', 'Step Timel'], dtype='object')
Mejor Exactitud: 0.7266081871345029
