In [87]:
def read_split(path:str,size:float):
  data = pd.read_csv(path)
  X = data.drop(columns=['survival_status'])
  y = data.survival_status
  return train_test_split(X,y, test_size=size, random_state=123)

In [102]:
import pandas as pd
import os
import glob
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from  xgboost import XGBClassifier as xgb
from sklearn.model_selection import GridSearchCV

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import TomekLinks 
from sklearn.metrics import balanced_accuracy_score

In [14]:
paths = glob.glob('./*.csv')

In [11]:
!mkdir data

In [15]:
for idx, path in enumerate(paths):
   os.rename(path,f'./data/data{idx}.csv')

In [17]:
paths = glob.glob('./data/*.csv')

In [84]:
models = []
models.append(('Tree',DecisionTreeClassifier()))
models.append(('RF',RandomForestClassifier()))
models.append(('KNN',KNeighborsClassifier()))
models.append(('XGB',xgb()))

In [85]:
parameters = []
parameters.append(('Tree',{'min_samples_leaf':[20],
                           'random_state':[123]}))

parameters.append(('RF',{'n_estimators':[500],
                         'min_samples_leaf':[20],
                         'random_state':[123]}))

parameters.append(('KNN',{'n_neighbors':[5]}))

parameters.append(('XGB',{'nthread':[4],
                          'learning_rate': [.03, 0.05, .07],
                          'max_depth': [3],
                          'min_child_weight': [4],
                          'silent': [1],
                          'subsample': [0.7],
                          'colsample_bytree': [0.7],
                          'n_estimators': [50,10,100,30],
                          'random_state':[123]}))
parameters = dict(parameters)

In [86]:
dict_models = dict(models)

In [100]:
for path in paths:
  X_train, X_test, y_train, y_test = read_split(path,0.2)
  ros = SMOTE(random_state=123)
  x_ros, y_ros = ros.fit_resample(X_train,y_train)

  results = []
  names = []
  for name, model in models:
    model_grid = GridSearchCV(model,
                 parameters[name],
                 cv = 5,
                 n_jobs = 5,
                 verbose=True,
                 scoring='balanced_accuracy')
    model_grid.fit(x_ros, y_ros)
    print(model_grid.best_params_)
    # print(model_grid.best_score_)
    model.parameters = model_grid.best_params_
    cv_results = cross_val_score(model,x_ros, y_ros, cv=5,scoring='balanced_accuracy')
    results.append(cv_results)
    names.append(name)
    msg= '%s: %f (%f)' %(name, cv_results.mean(), cv_results.std())
    print(msg)
    model.fit(x_ros,y_ros)
    print(balanced_accuracy_score(y_test, model.predict(X_test)))
    print('******************')
  # alg = str(input('Ingrese el Algoritmo deseado: '))
  # print(dict_models[alg])


Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'min_samples_leaf': 20, 'random_state': 123}
Tree: 0.691190 (0.035657)
0.6404255319148936
******************
Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'min_samples_leaf': 20, 'n_estimators': 500, 'random_state': 123}
RF: 0.742619 (0.059970)
0.6631205673758864
******************
Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'n_neighbors': 5}
KNN: 0.725397 (0.032569)
0.6524822695035462
******************
Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 4, 'random_state': 123, 'silent': 1, 'subsample': 0.7}
XGB: 0.702381 (0.019089)
0.6524822695035462
******************


In [101]:
parameters = []
parameters.append(('Tree',{'min_samples_leaf':[20],
                           'random_state':[123]}))

parameters.append(('RF',{'n_estimators':[500],
                         'min_samples_leaf':[20],
                         'random_state':[123]}))

parameters.append(('KNN',{'n_neighbors':[5]}))

parameters.append(('XGB',{'nthread':[4],
                          'learning_rate': [.03, 0.05, .07],
                          'max_depth': [3],
                          'min_child_weight': [4],
                          'silent': [1],
                          'subsample': [0.7],
                          'colsample_bytree': [0.7],
                          'n_estimators': [50,10,100,30],
                          'random_state':[123]}))
parameters = dict(parameters)

In [104]:
for path in paths:
  X_train, X_test, y_train, y_test = read_split(path,0.2)
  ros = TomekLinks()
  x_ros, y_ros = ros.fit_resample(X_train,y_train)

  results = []
  names = []
  for name, model in models:
    model_grid = GridSearchCV(model,
                 parameters[name],
                 cv = 5,
                 n_jobs = 5,
                 verbose=True,
                 scoring='balanced_accuracy')
    model_grid.fit(x_ros, y_ros)
    print(model_grid.best_params_)
    # print(model_grid.best_score_)
    model.parameters = model_grid.best_params_
    cv_results = cross_val_score(model,x_ros, y_ros, cv=5,scoring='balanced_accuracy')
    results.append(cv_results)
    names.append(name)
    msg= '%s: %f (%f)' %(name, cv_results.mean(), cv_results.std())
    print(msg)
    model.fit(x_ros,y_ros)
    print(balanced_accuracy_score(y_test, model.predict(X_test)))
    print('******************')
  # alg = str(input('Ingrese el Algoritmo deseado: '))
  # print(dict_models[alg])

Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'min_samples_leaf': 20, 'random_state': 123}
Tree: 0.551854 (0.085401)
0.649645390070922
******************
Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'min_samples_leaf': 20, 'n_estimators': 500, 'random_state': 123}
RF: 0.635329 (0.033628)
0.5283687943262412
******************
Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'n_neighbors': 5}
KNN: 0.606350 (0.030793)
0.6709219858156028
******************
Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 4, 'random_state': 123, 'silent': 1, 'subsample': 0.7}
XGB: 0.613799 (0.023877)
0.5950354609929078
******************
