# Traning Models

In [110]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from yellowbrick.classifier import ConfusionMatrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import xgboost as xgb 

import warnings
warnings.filterwarnings('ignore')

## Models Group Alpha -> X_data

In [111]:
# Import data training and testing
X_train = np.loadtxt('../data/processed/X_data/X_train.csv', delimiter=',') 
X_train_feature = np.loadtxt('../data/processed/X_data/X_train_feature.csv', delimiter=',') 
y_train = np.loadtxt('../data/processed/y_train.csv', delimiter=',')

print(f' Shape X_train:{X_train.shape}\n', 
      f'Shape X_train_feature:{X_train_feature.shape}\n',
      f'Shape y_train: {y_train.shape}')

 Shape X_train:(455, 30)
 Shape X_train_feature:(455, 9)
 Shape y_train: (455,)


### Selecting model
Objeticve to this topic are create 5 default models of classifications and using the cross validation to verify the model that have a best AUC and standard deviations for this training data. 

In [112]:
models = [RandomForestClassifier, 
          MultinomialNB, 
          LogisticRegression,
          KNeighborsClassifier,
          xgb.XGBClassifier,
          SVC]

In [113]:
# Function to apply cross-validation: 
def evaluate_models(models, X_train, y_train):
    for model in models:
        cls = model()
        kfold = KFold(n_splits=10, random_state=42, shuffle=True)
        s = cross_val_score(cls, X_train, y_train, scoring='roc_auc', cv=kfold)
        print(f"{model.__name__:22} AUC: "
              f"{s.mean():.3f} STD: {s.std():.2f}")


In [114]:
# Apply cross-validationt to X_train: 
evaluate_models(models, X_train, y_train)

RandomForestClassifier AUC: 0.992 STD: 0.01
MultinomialNB          AUC: 0.936 STD: 0.04
LogisticRegression     AUC: 0.994 STD: 0.01
KNeighborsClassifier   AUC: 0.988 STD: 0.02
XGBClassifier          AUC: 0.994 STD: 0.01
SVC                    AUC: 0.995 STD: 0.01


In [115]:
# Apply cross-validationt to X_train: 
evaluate_models(models, X_train_feature, y_train)

RandomForestClassifier AUC: 0.984 STD: 0.02
MultinomialNB          AUC: 0.852 STD: 0.07
LogisticRegression     AUC: 0.986 STD: 0.02
KNeighborsClassifier   AUC: 0.976 STD: 0.02
XGBClassifier          AUC: 0.982 STD: 0.02
SVC                    AUC: 0.986 STD: 0.02


To this both cases with features selections and all features, I'll select the RandonForest, SVC and XGBC to construct and optmize new models. 

* I'll use GridSearch to construct severals models and select the best params. 

### Random Forest

In [116]:
# Creating a function to train: 
def train_random_forest(X_train, y_train, model_name):
    params = {
        'n_estimators': [100, 200, 300],
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2']
    }

    clf = RandomForestClassifier(random_state=40)
    clf.fit(X_train, y_train)

    cv = GridSearchCV(clf, params, n_jobs=-1, scoring='roc_auc').fit(X_train, y_train)
    print(cv.best_params_)
    model_alpha_rf = RandomForestClassifier(**cv.best_params_)

    model_alpha_rf.fit(X_train, y_train)

    # Save model
    joblib.dump(model_alpha_rf, '../models/'+model_name+'.pkl')
    print('Model Saved: ../models/'+model_name+'.pkl')


In [117]:
# Training with all features X_train: 
train_random_forest(X_train, y_train,'model_alpha_rf')

{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 200}
Model Saved: ../models/model_alpha_rf.pkl


In [118]:
# Training with features selected X_train_features:
train_random_forest(X_train_feature, y_train,'model_alpha_rf_feature')

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 100}
Model Saved: ../models/model_alpha_rf_feature.pkl


### XGBoost Classifier

In [119]:
# Creating a function to train: 
def train_xgboostclf(X_train, y_train, model_name):
    params = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.1, 0.01, 0.001],
        'max_depth': [3, 5, 7],
        'subsample': [0.5, 0.75, 1.0]
        }

    clf = xgb.XGBClassifier()
    clf.fit(X_train, y_train)

    cv = GridSearchCV(clf, params, n_jobs=-1, scoring='roc_auc').fit(X_train, y_train)
    print(cv.best_params_)
    model = xgb.XGBClassifier(**cv.best_params_)

    model.fit(X_train, y_train)

    # Save model
    joblib.dump(model, '../models/'+model_name+'.pkl')
    print('Model Saved: ../models/'+model_name+'.pkl')

In [120]:
# Training with all features X_train: 
train_xgboostclf(X_train, y_train,'model_alpha_xgb')

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.5}
Model Saved: ../models/model_alpha_xgb.pkl


In [121]:
# Training with all features X_train_feature: 
train_xgboostclf(X_train_feature, y_train,'model_alpha_xgb_feature')

{'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.5}
Model Saved: ../models/model_alpha_xgb_feature.pkl


### SCV Classifier

In [122]:
# Creating a function to train: 
def train_SCV(X_train, y_train, model_name):
    params = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'poly', 'rbf'],
        'degree': [2, 3, 4],
        }

    clf = SVC()
    clf.fit(X_train, y_train)

    cv = GridSearchCV(clf, params, n_jobs=-1, scoring='roc_auc').fit(X_train, y_train)
    print(cv.best_params_)
    model = xgb.XGBClassifier(**cv.best_params_)

    model.fit(X_train, y_train)

    # Save model
    joblib.dump(model, '../models/'+model_name+'.pkl')
    print('Model Saved: ../models/'+model_name+'.pkl')

In [123]:
# Training with all features X_train: 
train_SCV(X_train, y_train,'model_alpha_svc')

{'C': 10, 'degree': 2, 'kernel': 'rbf'}
Parameters: { "C", "degree", "kernel" } are not used.

Model Saved: ../models/model_alpha_svc.pkl


In [124]:
# Training with selected features X_train_feature: 
train_SCV(X_train, y_train,'model_alpha_svc_feature')

{'C': 10, 'degree': 2, 'kernel': 'rbf'}
Parameters: { "C", "degree", "kernel" } are not used.

Model Saved: ../models/model_alpha_svc_feature.pkl


## Models Group Beta -> X_data2

In [125]:
# Import data training and testing
X_train2 = np.loadtxt('../data/processed/X_data2/X_train2.csv', delimiter=',') 
X_train2_feature = np.loadtxt('../data/processed/X_data2/X_train2_features.csv', delimiter=',') 
y_train = np.loadtxt('../data/processed/y_train.csv', delimiter=',')

print(f' Shape X_train:{X_train2.shape}\n', 
      f'Shape X_train_feature:{X_train2_feature.shape}\n',
      f'Shape y_train: {y_train.shape}')

 Shape X_train:(455, 24)
 Shape X_train_feature:(455, 7)
 Shape y_train: (455,)


In [126]:
# to X_data2 all features
evaluate_models(models,X_train2, y_train)

RandomForestClassifier AUC: 0.994 STD: 0.01
MultinomialNB          AUC: 0.962 STD: 0.03
LogisticRegression     AUC: 0.988 STD: 0.02
KNeighborsClassifier   AUC: 0.979 STD: 0.03
XGBClassifier          AUC: 0.995 STD: 0.01
SVC                    AUC: 0.994 STD: 0.01


In [127]:
# to X_data2 with seletec features
evaluate_models(models,X_train2_feature, y_train)

RandomForestClassifier AUC: 0.991 STD: 0.01
MultinomialNB          AUC: 0.736 STD: 0.10
LogisticRegression     AUC: 0.973 STD: 0.04
KNeighborsClassifier   AUC: 0.967 STD: 0.04
XGBClassifier          AUC: 0.988 STD: 0.01
SVC                    AUC: 0.985 STD: 0.02


### RandomForest

In [128]:
# To all features
train_random_forest(X_train2,y_train,'model_beta_rf')

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 100}
Model Saved: ../models/model_beta_rf.pkl


In [129]:
# To seletec features
train_random_forest(X_train2_feature,y_train,'model_beta_rf_features')

{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 200}
Model Saved: ../models/model_beta_rf_features.pkl


### XGBOOSTCV

In [130]:
# To all features
train_xgboostclf(X_train2,y_train,'model_beta_xgb')

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.5}
Model Saved: ../models/model_beta_xgb.pkl


In [131]:
# To select features
train_xgboostclf(X_train2_feature,y_train,'model_beta_xgb_feature')

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
Model Saved: ../models/model_beta_xgb_feature.pkl


### SCV 

In [132]:
# To all features
train_SCV(X_train2, y_train, 'model_beta_svc')

{'C': 10, 'degree': 2, 'kernel': 'rbf'}
Parameters: { "C", "degree", "kernel" } are not used.

Model Saved: ../models/model_beta_svc.pkl


In [133]:
# To selected features
train_SCV(X_train2_feature, y_train, 'model_beta_svc_feature')

{'C': 10, 'degree': 2, 'kernel': 'rbf'}
Parameters: { "C", "degree", "kernel" } are not used.

Model Saved: ../models/model_beta_svc_feature.pkl
