# Evaluate the Random Forest Classifier

# Import packages

In [None]:
# load data
from submodules.load_data import load_data

# data manipulation
import numpy as np
import pandas as pd

# data splitting
from sklearn.model_selection import train_test_split

# data preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# model
from sklearn.ensemble import RandomForestClassifier

# hyperparameter tuning
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV

# k-fold cross validation
from sklearn.model_selection import cross_validate

# saving models
import joblib

# performance
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

# Load the data

Load the semicolon-separated data from disk.

In [None]:
# Read the full dataset through the project helper imported from
# submodules.load_data. Later cells call data.drop(..., axis=1) and index
# data["isSepsis"], so the result is presumably a pandas DataFrame that
# contains an "isSepsis" label column -- confirm against load_data itself.
data = load_data()

# Drop the non-biological attributes from the data (before the train/test split)

In [None]:
# Remove the administrative / demographic columns so that only the remaining
# measurements are carried forward as features.
#
# DataFrame.drop returns a *new* frame and leaves the original untouched unless
# inplace=True is passed, so the result has to be bound back to `data`;
# otherwise these columns silently stay in the dataset and end up in the
# train/test split.
# NOTE(review): confirm that the persisted model was trained without these
# columns -- if it was fitted with them, its expected feature count will differ.
data = data.drop(
    columns=[
        "Age",
        "Unit1",
        "Unit2",
        "HospAdmTime",
        "ICULOS",
        "Gender",
    ]
)

# Create a Test Dataset

In [None]:
# Hold out 15% of the rows as the test set. The split is stratified on the
# "isSepsis" label so both partitions keep the same class proportions, and the
# seed is fixed so the partition is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="isSepsis"),
    data["isSepsis"],
    test_size=0.15,
    stratify=data["isSepsis"],
    random_state=42,
)

# Transform the numeric training data

In [None]:
# `data_num` is referenced by this cell and the following ones but is never
# assigned anywhere earlier in this notebook, which makes a top-to-bottom run
# fail with NameError. Derive it from the training split only, so no statistics
# are learned from the held-out rows.
# NOTE(review): assumes "numeric training data" means every numeric column of
# X_train -- confirm this matches the columns used when the model was trained.
data_num = X_train.select_dtypes(include="number")

# Median imputation for missing values. fit_transform is equivalent to the
# separate fit + transform calls and leaves `num_imputer` fitted.
num_imputer = SimpleImputer(strategy="median")
N = num_imputer.fit_transform(data_num)

# Transformation pipeline

In [None]:
# Numeric preprocessing chain: fill missing values with the column median,
# then standardise each column with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
    ]
)

# Fit the chain on the numeric data and keep the transformed array.
num_prepared = num_pipeline.fit_transform(data_num)

# Full data pipeline

In [None]:
# Labels of the columns that go through the numeric pipeline.
num_attribs = list(data_num)

# Column-wise transformer: route the numeric columns through num_pipeline.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
    ]
)

# Fit on the training split only; the test split is transformed later with
# these already-fitted parameters when the final model is evaluated.
X_train = full_pipeline.fit_transform(X_train)

In [None]:
# Apply the preprocessing that was fitted on X_train to the held-out split.
# Use transform(), NOT fit_transform(): refitting here would let the test rows
# influence the imputation/scaling parameters.
# Note: this rebinds X_test to the transformed output, so the cell is not safe
# to run twice in the same session.
X_test = full_pipeline.transform(X_test)

In [None]:
# Load the previously persisted classifier from disk (path is relative to the
# current working directory).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
rfc_model = joblib.load("models/final/rfc_model.pkl")

In [None]:
# Predict a label for every row of the preprocessed test split; the result is
# compared against y_test in the metric cells that follow.
rfc_predictions = rfc_model.predict(X_test)

In [None]:
# Share of test predictions that match the true labels. With its default
# settings accuracy_score returns a fraction in [0, 1], not a percentage.
# Left as a bare expression so the notebook displays it as the cell output.
accuracy_score(y_test, rfc_predictions)

In [None]:
# Print recall first, then F1, one value per line (both use scikit-learn's
# default binary settings).
# Author's reading of the recorded run: we can successfully identify 6 out of
# 10 patients that will develop sepsis in the next 6 days (i.e. recall of
# roughly 0.6) -- re-check against the value printed below.
print(
    recall_score(y_test, rfc_predictions),
    f1_score(y_test, rfc_predictions),
    sep="\n",
)

In [None]:
# Persist the evaluated model to disk.
# The estimator loaded earlier in this notebook is bound to `rfc_model`;
# `model_rfc` is never defined here, so dumping it raised NameError.
# NOTE(review): the target path is kept as written, which differs from the
# file the model was loaded from ("models/final/rfc_model.pkl") -- confirm
# that writing a second copy under this name is intended.
joblib.dump(rfc_model, "models/final/model_rfc.pkl")
# reference to load the model
#final_rfc_loaded = joblib.load("models/final/model_rfc.pkl")