# Import packages

In [1]:
# load data
from submodules.load_data import load_data

# data manipulation
import numpy as np
import pandas as pd

# data splitting
from sklearn.model_selection import train_test_split

# data preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# model
from sklearn.ensemble import RandomForestClassifier

# hyperparameter tuning
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV

# k-fold cross validation
from sklearn.model_selection import cross_validate

# saving models
import joblib

# performance
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt


# Load the data

Load semi-colon separated data from disk

In [2]:
data = load_data()

# Create a Test Dataset

In [3]:
X_train, X_test, y_train, y_test = \
    train_test_split(data.drop("isSepsis", axis=1),
    data["isSepsis"], test_size=0.15,
    random_state=42, stratify=data["isSepsis"])

# Drop the non-biological attributes from the training data

In [5]:
data_num = X_train.drop(["Age",
                         "Unit1",
                         "Unit2",
                         "HospAdmTime",
                         "ICULOS",
                         "Gender"
                         ], axis=1)

# Transform the numeric training data

In [6]:
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(data_num)
N = num_imputer.transform(data_num)

# Transformation pipeline

In [7]:
num_pipeline = Pipeline([
                        ('imputer', SimpleImputer(strategy='median')),
                        ('std_scaler', StandardScaler()),
                        ])

num_prepared = num_pipeline.fit_transform(data_num)

# Full data pipeline

In [8]:
num_attribs = list(data_num)
# construct the transformer
full_pipeline = ColumnTransformer([
    # transform number columns with num_pipeline defined earlier
    ("num", num_pipeline, num_attribs)
])

# only run the pipeline on the training as the test data will be applied during the evaluation stage with the final model
X_train = full_pipeline.fit_transform(X_train)

# Train a Random Forest Classifier

In [10]:
model = RandomForestClassifier()
cv_model = cross_validate(model,
                       X_train,
                       y_train,
                       n_jobs=-1,
                       cv=3,
                       scoring="f1",
                       return_train_score=True)
cv_model

{'fit_time': array([3.64704895, 4.11862206, 4.18677402]),
 'score_time': array([0.1201489 , 0.13489914, 0.13860822]),
 'test_score': array([0.54166667, 0.57429048, 0.53461876]),
 'train_score': array([0.99933378, 0.99966678, 0.99966678])}

In [11]:
# serialize the model
joblib.dump(model, "models/tune/rfc_model.pkl")

['models/tune/rfc_model.pkl']

In [13]:
# load the model from disk
rfc_model = joblib.load("models/tune/rfc_model.pkl")

# Fine tune the Random Forest Classifier with Randomize Search Cross Validation

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html?highlight=randomizedsearchcv#sklearn.model_selection.RandomizedSearchCV
https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [14]:
print("[INFO] setting hyperparameters...")
# number of trees in the forest
n_estimators = [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
# max number of features considered for splitting a node
max_features = ["auto","sqrt","log2"]
# max number of levels in each decision tree
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
# min number of data points placed in a node before the node is split
min_samples_split = [2, 5, 10]
# min number of data points allowed in a leaf node
min_samples_leaf = [1, 2, 4]
# method for sampling data points (with or without replacement)
bootstrap = [True, False]
grid = dict(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, bootstrap=bootstrap)

[INFO] setting hyperparameters...


In [None]:
# initialize a cross-validation fold and perform a randomized-search
# to tune the hyperparameters
print("[INFO] grid searching over the hyperparameters...")
cvFold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
randomSearch = RandomizedSearchCV(estimator=model, n_jobs=-1,
	cv=cvFold, param_distributions=grid,
	scoring="f1", return_train_score=True)
searchResults = randomSearch.fit(X_train, y_train)

[INFO] grid searching over the hyperparameters...


In [13]:
# print the best score
searchResults.best_score_

0.5721142535592545

In [14]:
# print the best combination of parameters
searchResults.best_params_

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 80,
 'bootstrap': False}

In [15]:
# print the best estimator directly
searchResults.best_estimator_


RandomForestClassifier(bootstrap=False, max_depth=80, max_features='log2',
                       min_samples_split=5, n_estimators=1000)

In [None]:
# display the importance scores next to the corresponding attributes
# from this you can drop less useful features
feature_importances = searchResults.best_estimator_.feature_importances_
sorted(zip(feature_importances, num_attribs), reverse=True)

In [None]:
# fit the data with the best model parameters
rfc_model = RandomForestClassifier(**searchResults.best_params_)
rfc_model.fit(X_train, y_train)

In [None]:
# confusion matrix
fig, ax = plt.subplots(figsize=(6, 6))
plot_confusion_matrix(rfc_model, X_test, y_test, cmap="Blues", ax=ax)
plt.savefig("reports/confusionMatrix.png", dpi=400)

In [None]:
# serialize the model
joblib.dump(rfc_model, "models/final/rfc_model.pkl")


