MODELLING

In [3]:
import pandas as pd
from google.colab import drive
import pickle

drive.mount('/content/drive')
google_drive_path = '/content/drive/MyDrive/Colab Notebooks/Artificial Intelligence Project/'

with open(google_drive_path + 'final_dataset.pkl', 'rb') as f:
  final_dataset = pickle.load(f)

final_dataset.head(10)

Mounted at /content/drive


Unnamed: 0,ADMISSION_TYPE_EMERGENCY,ADMISSION_TYPE_URGENT,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_HMO REFERRAL/SICK,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,ADMISSION_LOCATION_TRANSFER,ADMISSION_LOCATION_UNKNOWN,INSURANCE_Medicaid,INSURANCE_Medicare,INSURANCE_Private,...,PROC_225752,PROC_225789,PROC_225792,PROC_225794,PROC_225966,PROC_227194,PROC_227712,PROC_228128,PROC_228129,TARGET
0,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,11
1,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,10
2,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,10
3,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,9
4,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,11
5,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,11
6,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,10
7,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,10
8,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,10
9,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,10


In [6]:
from sklearn.model_selection import train_test_split

# separate features from target
X = final_dataset.iloc[:, :-1]
y = final_dataset.iloc[:, -1].astype(int)

# split into trainining-validation and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10, stratify=y
)

In [10]:
!pip install imbalanced-learn

# apply smote for effective resampling of target classs
from imblearn.over_sampling import SMOTENC

bool_cols = final_dataset.select_dtypes(include=bool).columns
bool_indexes = [final_dataset.columns.get_loc(col) for col in bool_cols]

smote = SMOTENC(categorical_features=bool_indexes, random_state=10)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)



MODEL 1: Random Forester Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# create model
model1 = RandomForestClassifier(random_state=10)

# list combination of hyperparameters
params1 = {
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt'],
    'min_samples_split': [10],
    'min_samples_leaf': [5],
    'n_jobs': [-1]
}

# instantiate GridSearch to enable hyperparameter combination tuning
gridsearch1 = GridSearchCV(estimator=model1, param_grid=params1,
                          cv=5, scoring='accuracy', verbose=1)
# fit model
gridsearch1.fit(X_resampled, y_resampled)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [17]:
# SAVE MODEL AND PARAMS
import pickle

# store params1
best_params1 = gridsearch1.best_params_
print("The best parameters found are: ", best_params1)
best_model1 = gridsearch1.best_estimator_

params1_pickle_path = 'model1_bestparams.pkl'
model1_pickle_path = 'model1.pkl'

# save params1
with open(google_drive_path + params1_pickle_path, 'wb') as f:
  pickle.dump(best_params1, f)

# save model1
with open(google_drive_path + model1_pickle_path, 'wb') as f:
  pickle.dump(best_model1, f)

The best parameters found are:  {'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 150, 'n_jobs': -1}


In [4]:
# LOAD MODEL AND PARAMS
import pickle

params1_pickle_path = 'model1_bestparams.pkl'
model1_pickle_path = 'model1.pkl'

# load params1
with open(google_drive_path + params1_pickle_path, 'rb') as f:
  params1 = pickle.load(f)

# load model1
with open(google_drive_path + model1_pickle_path, 'rb') as f:
  final_model1 = pickle.load(f)

In [7]:
# predict values using best params
y_pred1 = final_model1.predict(X_test)

In [23]:
# predict scores
y_score1 = final_model1.predict_proba(X_test)

In [24]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

print("Classification Report for Model 1")
print(classification_report(y_test, y_pred1))

print("The derived AUROC score for this model:")
print(roc_auc_score(y_test, y_score1, multi_class='ovr', average='macro'))

Classification Report for Model 1
              precision    recall  f1-score   support

           0       0.85      0.76      0.80       116
           1       0.61      0.73      0.66        59
           2       0.80      0.76      0.78       367
           3       0.60      0.64      0.62       202
           4       0.84      0.78      0.81       188
           5       0.43      0.53      0.48        60
           6       0.81      0.80      0.80       150
           7       0.50      0.51      0.50        59
           8       0.76      0.69      0.72       513
           9       0.67      0.73      0.70       430
          10       0.76      0.73      0.75      3139
          11       0.61      0.65      0.63      2029

    accuracy                           0.70      7312
   macro avg       0.68      0.69      0.69      7312
weighted avg       0.71      0.70      0.71      7312

The derived AUROC score for this model:
0.9731525959136342


MODEL 2: XGBoost Classifier

In [None]:
import xgboost as xgb

#  instantiate XGB model
model2 = xgb.XGBClassifier(random_state=10)

#  detail hyperparameters for tuning combination
params2 = {
    'max_depth': [6, 9],
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    # 'min_child_weight': [1, 5],
    # 'subsample': [0.7, 0.9],
    # 'colsample_bytree': [0.7, 0.9],
    'n_jobs': [-1]
}

# instantiate gridsearch for hyperparameter tunin
gridsearch2 = GridSearchCV(estimator=model2, param_grid=params2,
                          cv=3, scoring='accuracy', verbose=1)

# fit model
gridsearch2.fit(X_train, y_train)
print("The best parameters found are: ", gridsearch2.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [34]:
# SAVE MODEL AND PARAMS
import pickle

# store params1
best_params2 = gridsearch2.best_params_
print("The best parameters found are: ", best_params2)
best_model2 = gridsearch2.best_estimator_

params2_pickle_path = 'model2_bestparams.pkl'
model2_pickle_path = 'model2.pkl'

# save params1
with open(google_drive_path + params2_pickle_path, 'wb') as f:
  pickle.dump(best_params2, f)

# save model1
with open(google_drive_path + model2_pickle_path, 'wb') as f:
  pickle.dump(best_model2, f)

The best parameters found are:  {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'n_jobs': -1}


In [10]:
# LOAD MODEL AND PARAMS

import pickle

params2_pickle_path = 'model2_bestparams.pkl'
model2_pickle_path = 'model2.pkl'

# load params1
with open(google_drive_path + params2_pickle_path, 'rb') as f:
  params2 = pickle.load(f)

# load model1
with open(google_drive_path + model2_pickle_path, 'rb') as f:
  final_model2 = pickle.load(f)

In [11]:
#  predict values using best params
y_pred2 = final_model2.predict(X_test)

In [19]:
#  predict scores using best params
y_score2 = final_model2.predict_proba(X_test)

In [20]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

print("Classification Report for Model 2")
print(classification_report(y_test, y_pred2))

print("The derived AUROC score for this model:")
print(roc_auc_score(y_test, y_score2, multi_class='ovr', average='macro'))

Classification Report for Model 2
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       116
           1       0.63      0.56      0.59        59
           2       0.77      0.83      0.80       367
           3       0.65      0.55      0.59       202
           4       0.82      0.92      0.86       188
           5       0.58      0.35      0.44        60
           6       0.80      0.91      0.85       150
           7       0.63      0.41      0.49        59
           8       0.77      0.77      0.77       513
           9       0.72      0.72      0.72       430
          10       0.77      0.86      0.81      3139
          11       0.73      0.61      0.67      2029

    accuracy                           0.76      7312
   macro avg       0.72      0.69      0.70      7312
weighted avg       0.75      0.76      0.75      7312

The derived AUROC score for this model:
0.9794409363485449


In [14]:
print(y_pred2)

[ 6 10 10 ...  8  4 11]
