## Model Exploration and Selection

In [1]:
import pandas as pd

In [2]:
# Read the transformed training data from the local Data/ directory.
# (An earlier variant read './train.csv' from the working directory instead.)
train = pd.read_csv('./Data/train.csv')

In [3]:
# Quick sanity check: display (rows, columns) of the loaded frame.
train.shape

(22175, 31)

In [4]:
# Separate the label column from the features.
# `target` is kept as a one-column DataFrame; later cells flatten it with
# `.values.ravel()` before handing it to scikit-learn.
target = train[['status']].copy()
inputs = train.drop(columns=['status']).copy()

In [5]:
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.metrics import classification_report

In [6]:
#Loading the classifiers
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [7]:
def display_scores(scores):
    """Print the individual scores, then their mean and standard deviation.

    `scores` must expose `.mean()` and `.std()` (e.g. the array returned by
    `cross_val_score`). Nothing is returned; the summary goes to stdout.
    """
    summary = (
        ('Scores ', scores),
        ('Mean Score', scores.mean()),
        ('Std', scores.std()),
    )
    for label, value in summary:
        print(label, value)

### Model Pre-evaluation

**1. MLP Classifier**

In [8]:
# Baseline MLP, scored by accuracy over 10 stratified folds.
# random_state fixes the network's weight initialisation and batch shuffling,
# so repeated runs of this cell produce the same scores.
mlp_cls = MLPClassifier(max_iter=500, random_state=42)
mlp_scores = cross_val_score(
    mlp_cls,
    inputs,
    target.values.ravel(),  # flatten the one-column frame to a 1-D label array
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
)

display_scores(mlp_scores)

Scores  [0.88638413 0.88818756 0.88548242 0.89269612 0.88322813 0.88543076
 0.8917456  0.88858818 0.88182228 0.88452864]
Mean Score 0.8868093830239566
Std 0.003329024271020675


**2. XGB Classifier**

In [9]:
# Baseline XGBoost with otherwise-default settings, scored by accuracy over
# 10 stratified folds.
xgb_cls = XGBClassifier(use_label_encoder=False)
xgb_scores = cross_val_score(
    estimator=xgb_cls,
    X=inputs,
    y=target.values.ravel(),
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
)

display_scores(xgb_scores)



  "because it will generate extra copies and increase " +




  "because it will generate extra copies and increase " +




  "because it will generate extra copies and increase " +




  "because it will generate extra copies and increase " +




  "because it will generate extra copies and increase " +




  "because it will generate extra copies and increase " +




  "because it will generate extra copies and increase " +




  "because it will generate extra copies and increase " +




  "because it will generate extra copies and increase " +


Scores  [0.88548242 0.88999098 0.89134355 0.8908927  0.88683499 0.88543076
 0.8940009  0.8917456  0.89039242 0.89309878]
Mean Score 0.8899213105712761
Std 0.0028749747081191442


  "because it will generate extra copies and increase " +


**3. LGBM Classifier**

In [10]:
# Baseline LightGBM with default settings, scored by accuracy over
# 10 stratified folds.
lgbm_cls = LGBMClassifier()
lgbm_scores = cross_val_score(
    estimator=lgbm_cls,
    X=inputs,
    y=target.values.ravel(),
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
)

display_scores(lgbm_scores)

Scores  [0.88683499 0.89359784 0.89179441 0.89404869 0.88908927 0.88994136
 0.8917456  0.89445196 0.89129454 0.89264772]
Mean Score 0.8915446384666726
Std 0.0022605464752303815


**4. Random Forest Classifier**

In [11]:
# Baseline random forest, scored by accuracy over 10 stratified folds.
# random_state fixes the bootstrap samples and per-split feature draws,
# so repeated runs of this cell produce the same scores.
rf_cls = RandomForestClassifier(random_state=42)
rf_scores = cross_val_score(
    rf_cls,
    inputs,
    target.values.ravel(),  # flatten the one-column frame to a 1-D label array
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
)

display_scores(rf_scores)

Scores  [0.88142471 0.88232642 0.874211   0.88367899 0.88683499 0.87009472
 0.87956698 0.88317546 0.87505638 0.87956698]
Mean Score 0.8795936636849527
Std 0.004822529330241948


**Observations**

LGBM performs best (mean accuracy ≈ 0.8915), but not by far: XGB (≈ 0.8899), MLP (≈ 0.8868) and Random Forest (≈ 0.8796) are close behind. To improve performance, this notebook will tune the hyperparameters of each model and then build a voting classifier.


### Fine Tuning the Models

In [12]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search spaces, one dict per model. Each key is a parameter
# name set on the estimator; the grid search tries every combination of values.

rf_param_grid = {
    'n_estimators': [100, 150, 250],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 3, 10],
    'min_samples_leaf': [1, 3, 10],
    'max_features': ['sqrt', 'log2'],
}

lgb_param_grid = {
    'boosting_type': ['gbrt', 'dart'],
    'objective': ['multiclass'],
    'learning_rate': [0.025, 0.05, 0.1],
    'n_estimators': [100, 150, 250],
    # `metric` only chooses what LightGBM reports on evaluation sets. No
    # evaluation set is passed during the search, so it cannot change the
    # fitted model or its accuracy. Searching several values would multiply
    # the number of fits for identical results, hence a single value.
    'metric': ['multi_logloss'],
    'bagging_fraction': [0.5, 0.75],
    'feature_fraction': [0.8],
    'bagging_freq': [25, 40],
}

mlp_param_grid = {
    'hidden_layer_sizes': [(100,), (150,)],
    'activation': ['relu'],
    'alpha': [0.0001, 0.001],
    'max_iter': [1000, 1500],
    'learning_rate_init': [0.01, 0.03, 0.05],
}

xgb_param_grid = {
    'booster': ['dart'],
    'eta': [0.01, 0.05, 0.1],
    'objective': ['multi:softprob'],
    'eval_metric': ['mlogloss'],
}

# Keep this list and `classifier` in the same order: they are paired by
# position, and later cells assume Random Forest, LGBM, MLP, XGBoost.
classifier_param = [rf_param_grid, lgb_param_grid, mlp_param_grid, xgb_param_grid]

# Classifiers
# The random forest and the MLP are stochastic; a fixed random_state makes the
# tuned scores and the saved models reproducible across runs.
classifier = [
    RandomForestClassifier(random_state=42),
    LGBMClassifier(),
    MLPClassifier(random_state=42),
    XGBClassifier(use_label_encoder=False),
]

In [13]:
# Grid-search each model over its own parameter grid (accuracy, 5 stratified
# folds) and record the best score and best estimator in model order.
cv_result = []
best_estimators = []

for estimator, grid in zip(classifier, classifier_param):
    clf = GridSearchCV(
        estimator,
        param_grid=grid,
        scoring='accuracy',
        n_jobs=-1,
        cv=StratifiedKFold(n_splits=5),
    )
    clf.fit(inputs, target.values.ravel())
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)



In [14]:
# Labels are listed in the same order the models were searched, so each label
# lines up with its entry in `cv_result`.
model_names = ['Random Forest', 'LGBM', 'MLP', 'XGBoost']
result = pd.DataFrame({'Mean Score': cv_result, 'Model': model_names})

In [15]:
# Name the tuned estimators; positions follow the order of `classifier`
# (Random Forest, LGBM, MLP, XGBoost).
rf, lgbm, mlp, xgb = best_estimators[:4]

In [16]:
import joblib, os
path = './'
filenames = ['rf.joblib', 'lgbm.joblib', 'mlp.joblib', 'xgb.joblib']
models = [rf, lgbm, mlp, xgb]

# Persist every tuned model under `path`, one joblib file per model.
for model, filename in zip(models, filenames):
    joblib.dump(model, os.path.join(path, filename))

In [17]:
# Save the per-model best cross-validation scores to result.csv in the working
# directory. No `index` argument is passed, so pandas' default index handling applies.
result.to_csv('./result.csv')