# Data Science Notebook

Build, train and serialize the model

# Import packages

In [1]:
# load data
from submodules.load_data import load_data

# data manipulation
import pandas as pd

# data splitting
from sklearn.model_selection import train_test_split

# data preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# model
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# k-fold cross validation
from sklearn.model_selection import cross_validate

# saving models
import joblib

# performance
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

# Load the data

Load semi-colon separated data from disk

In [2]:
data = load_data()

# Create a Test Dataset
> uses scikit-learn

Performing this early minimizes generalization and bias you may inadvertently apply to your system.
Simply put, a test set of data involves: picking ~20% of the instances randomly and setting them aside.

Some considerations for sampling methods that generate the test set:
1. you don't want your model to see the entire dataset
1. you want to be able to fetch new data for training
1. you want to maintain the same percentage of training data against the entire dataset
1. you want a representative training dataset (~7% septic positive)

https://realpython.com/train-test-split-python-data/

In [3]:
# sets 15% of the data aside for testing, sets the random number generate to it always generates the same shuffled indicies
# x = 2 dimensional array with inputs
# X_train is the training part of the first sequence (x)
# X_test is the test part of the first sequence (x)
# y = 1 dimensional array with outputs
# y_train is the labeled training part of the second sequence
# y_test is the labeled test part of the second sequence
# axis Whether to drop labels from the index (0 or ‘index’) or columns (1 or ‘columns’)
# test_size is the amount of the total dataset to set aside for testing = 10%
# random state fixes the randomization so you get the same results each time
# Shuffle before the data is split, it is shuffled
# stratified splitting keeps the proportion of y values trhough the train and test sets
X_train, X_test, y_train, y_test = \
    train_test_split(data.drop("isSepsis", axis=1),
    data["isSepsis"], test_size=0.15,
    random_state=42, stratify=data["isSepsis"])

# Prepare data for Machine Learning
Instead of preparing data manually, write functions to:
1. reproduce transformations easily on any dataset (e.g., data refresh)
1. builds a library of functions to reuse in future projects
1. use functions in live stream to transform new data before inferencing

## Data cleaning
1. split numerical attributes
    1. transform current and future null values
    1. drop under-represented attributes (<7k)
    1. impute median for missing attributes (>7k)

### Copy Numeric Training Data, drop demographics data

In [4]:
# data_num = X_train.copy()
# .drop also creates a copy without the categorical attributes
# drop non-biological indicators
data_num = X_train.drop(["Age",
                         "Unit1",
                         "Unit2",
                         "HospAdmTime",
                         "ICULOS",
                         "Gender"
                         ], axis=1)
data_num.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Phosphate,Potassium,Bilirubin_total,TroponinI,Hct,Hgb,PTT,WBC,Fibrinogen,Platelets
17166,90.0,100.0,37.3,108.5,79.5,67.5,10.0,,0.0,23.0,...,,3.7,,,33.7,11.6,34.2,24.0,,157.0
8962,91.0,,,137.0,82.0,,21.0,,,27.0,...,3.8,4.3,0.7,,40.4,13.9,38.6,8.3,,209.0
10453,,,,,,,,,,25.0,...,4.2,5.1,0.5,,31.4,9.6,,7.2,,585.0
19087,66.0,100.0,37.6,124.0,78.0,58.0,16.0,31.0,,,...,,4.1,,,26.9,9.0,,14.2,,179.0
5328,103.0,100.0,,108.0,82.0,68.0,,,,25.0,...,2.4,3.5,,,23.7,8.4,30.5,11.2,,167.0


### Transform missing values from numeric data

In [5]:
# create simpleimputer instance
# replace attributes missing values with median of the attribute
num_imputer = SimpleImputer(strategy="median")

# fit applies the imputer to ALL numeric data in case new data includes null values
# when system goes live
# results are stored in a imputer.statistics_ value
num_imputer.fit(data_num)
num_imputer.statistics_

array([8.20e+01, 9.80e+01, 3.68e+01, 1.20e+02, 8.00e+01, 6.20e+01,
       1.80e+01, 3.30e+01, 0.00e+00, 2.40e+01, 5.00e-01, 7.39e+00,
       4.10e+01, 9.70e+01, 3.20e+01, 1.60e+01, 7.20e+01, 8.40e+00,
       1.06e+02, 9.00e-01, 2.80e-01, 1.22e+02, 1.80e+00, 2.00e+00,
       3.40e+00, 4.00e+00, 8.00e-01, 1.40e-01, 3.18e+01, 1.07e+01,
       3.07e+01, 1.01e+01, 2.51e+02, 1.93e+02])

In [6]:
# apply the trained imputer to transform the training set replacing the
# missing values with learn medians
N = num_imputer.transform(data_num)
# result above is plain NumPy array with transformed features
# put back to a pandas DataFrame
num_tr = pd.DataFrame(N, columns=data_num.columns, index=data_num.index)
num_tr.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Phosphate,Potassium,Bilirubin_total,TroponinI,Hct,Hgb,PTT,WBC,Fibrinogen,Platelets
17166,90.0,100.0,37.3,108.5,79.5,67.5,10.0,33.0,0.0,23.0,...,3.4,3.7,0.8,0.14,33.7,11.6,34.2,24.0,251.0,157.0
8962,91.0,98.0,36.8,137.0,82.0,62.0,21.0,33.0,0.0,27.0,...,3.8,4.3,0.7,0.14,40.4,13.9,38.6,8.3,251.0,209.0
10453,82.0,98.0,36.8,120.0,80.0,62.0,18.0,33.0,0.0,25.0,...,4.2,5.1,0.5,0.14,31.4,9.6,30.7,7.2,251.0,585.0
19087,66.0,100.0,37.6,124.0,78.0,58.0,16.0,31.0,0.0,24.0,...,3.4,4.1,0.8,0.14,26.9,9.0,30.7,14.2,251.0,179.0
5328,103.0,100.0,36.8,108.0,82.0,68.0,18.0,33.0,0.0,25.0,...,2.4,3.5,0.8,0.14,23.7,8.4,30.5,11.2,251.0,167.0


## Feature Scaling
1. ML algorithms don't work well when numeric attributes have very different scales
    (e.g. HR max 184,  pH max 7.67)
1. Scaling target values is not necessary
1. Apply
    1. normalization (MinMaxScaler) bounds the values to a specific range (e.g. 0-1)
    1. standardization (StandardScaler) less affected by outliers does not bound to range

### Transformation Pipeline

1. Common to apply many transformation steps in a specific order

In [10]:
num_pipeline = Pipeline([
                        ('imputer', SimpleImputer(strategy='median')),
                        ('std_scaler', StandardScaler()),
                        ])

num_prepared = num_pipeline.fit_transform(data_num)

## Full Data Pipeline

Single transformer to handle numeric and categorical columns using ColumnTransformer.

In [12]:
# get a list of numeric column names
num_attribs = list(data_num)

In [13]:
# construct the transformer
full_pipeline = ColumnTransformer([
    # transform number columns with num_pipeline defined earlier
    ("num", num_pipeline, num_attribs)
])

# only run the pipeline on the training as the test data will be applied during the evaluation stage with the final model
X_train_prepared = full_pipeline.fit_transform(X_train)

# Model Selection

![image](images/scikitlearn-choose-right-estimator.png)

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

## Classifier Comparison

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html?highlight=svm%20svc

1. [Linear Support Vector Machine "SVM" Support Vector Classifier "SVC"](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
1. [Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html)
1. [K-Neighbors Classifier](https://scikit-learn.org/stable/modules/neighbors.html#nearest-neighbors-classification) implements learning based on the  nearest neighbors of each query point, where  is an integer value specified by the user.
1. [Random Forest Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier) A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
1. [Logistic Regression](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression) a linear model for classification
1. [Stochastic Gradient Descent "SGD" Classifier](https://scikit-learn.org/stable/modules/sgd.html#classification)
1. [Neural Network Multi-Layer Perceptron Classifier](https://scikit-learn.org/stable/modules/neural_networks_supervised.html#multi-layer-perceptron) a supervised learning algorithm that learns a function  by training on a dataset
1. [XGBoost Classifier](https://xgboost.readthedocs.io/en/latest/python/python_api.html?highlight=xgbclassifier#xgboost.XGBClassifier)

## Scoring
https://arifromadhan19.medium.com/part-1-regression-and-classification-model-evaluation-bc7f6ab3b4dd
https://scikit-learn.org/stable/modules/cross_validation.html#
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

Scoring metrics for Classification models
1. Classification
1. Accuracy
1. Precision
1. Recall
1. Specificity
1. F1 Score

Best practice to save every model you experiment with so you can come back easily to any model.
Save both the hyperparameters and trained parameter, as well as the cross-validation scores and predictions.
This will allow you to easily compare scores across model types. Use Pickle or joblib libraries.

## Linear SVM SVC
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

In [14]:
# select the SVM
lin_svm = svm.SVC()
# fit the model to the data
lin_svm.fit(X_train_prepared, y_train)
# configure the cross validation
cv_lin_svm = cross_validate(lin_svm, # estimator to fit
                            X_train_prepared, # data to fit
                            y_train, # target variable isSepsis
                            n_jobs=-1, # use all the processors in parallel
                            verbose=1, # verbosity level
                            cv=3, # splitting strategy to compute the score N consecutive times with different splits
                            scoring="f1", # for binary targets
                            return_train_score=True)

# display the scoring
cv_lin_svm

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   46.7s finished


{'fit_time': array([11.9992919 , 12.64642811, 13.73776317]),
 'score_time': array([10.42156386, 10.00926685,  9.93839383]),
 'test_score': array([0.10377358, 0.10613208, 0.11254396]),
 'train_score': array([0.20697413, 0.25655022, 0.2140056 ])}

In [15]:
# save the model
joblib.dump(lin_svm, "models/experiment/lin_svm.pkl")
# reference to load the model
#lin_svm_loaded = joblib.load("model/experiment/lin_svm.pkl")

['models/lin_svm.pkl']

## Naive Bayes
https://scikit-learn.org/stable/modules/naive_bayes.html

In [16]:
gnb = GaussianNB()
gnb.fit(X_train_prepared, y_train)
cv_n_bayes = cross_validate(gnb, # estimator to fit
                            X_train_prepared, # data to fit
                            y_train, # target variable isSepsis
                            n_jobs=-1, # use all the processors in parallel
                            verbose=1, # verbosity level
                            cv=3, # splitting strategy to compute the score N consecutive times with different splits
                            scoring="f1", # for binary targets
                            return_train_score=True)

# display the scoring
cv_n_bayes

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.9s finished


{'fit_time': array([0.04836202, 0.03323007, 0.03448606]),
 'score_time': array([0.00887394, 0.00911283, 0.00915408]),
 'test_score': array([0.28099174, 0.23349181, 0.27734171]),
 'train_score': array([0.27586207, 0.27238606, 0.26259542])}

In [17]:
# save the model
joblib.dump(gnb, "models/experiment/gnb.pkl")
# reference to load the model
#gnb_loaded = joblib.load("model/experiment/gnb.pkl")

['models/gnb.pkl']

## K Nearest Neighbor Classification
https://scikit-learn.org/stable/modules/neighbors.html#nearest-neighbors-classification

In [18]:
knn = KNeighborsClassifier()
knn.fit(X_train_prepared, y_train)
cv_knn = cross_validate(knn,
                        X_train_prepared,
                        y_train,
                        n_jobs=-1,
                        verbose=1,
                        cv=3,
                        scoring="f1",
                        return_train_score=True)
cv_knn

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   22.5s finished


{'fit_time': array([0.01612735, 0.01470995, 0.01478386]),
 'score_time': array([8.79333568, 9.93717027, 9.89686728]),
 'test_score': array([0.35692771, 0.40208488, 0.38897638]),
 'train_score': array([0.52214452, 0.51745068, 0.50679084])}

In [19]:
# save the model
joblib.dump(knn, "models/experiment/knn.pkl")
# reference to load the model
#knn_loaded = joblib.load("model/experiment/knn.pkl")


['models/knn.pkl']

## Random Forest Classifier

In [20]:
rf = RandomForestClassifier()
cv_rf = cross_validate(rf,
                       X_train_prepared,
                       y_train,
                       n_jobs=-1,
                       cv=3,
                       scoring="f1",
                       return_train_score=True)
cv_rf

{'fit_time': array([3.65996075, 3.70988512, 3.73199224]),
 'score_time': array([0.13547111, 0.13466406, 0.13094187]),
 'test_score': array([0.56818182, 0.54530478, 0.5728    ]),
 'train_score': array([0.99905571, 0.99937028, 0.99937028])}

In [21]:
# save the model
joblib.dump(rf, "models/experiment/rf.pkl")
# reference to load the model
#rf_loaded = joblib.load("model/experiment/rf.pkl")

['models/rf.pkl']

## Logistic Regression
- [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logisticregression#sklearn.linear_model.LogisticRegression)

In [22]:
# set a variable to Logistic regression with verbosity
log_reg = LogisticRegression()
log_reg.fit(X_train_prepared, y_train)
cv_log_reg = cross_validate(log_reg,
                            X_train_prepared, # attributes
                            y_train, # labels isSepsis
                            n_jobs=-1, # use all the processors in parallel
                            verbose=1, # verbosity level
                            cv=5, # splitting strategy to compute the score N consecutive times with different splits
                            scoring="f1", # for binary targets
                            return_train_score=True) # computationally expensive, whether to include training scores on parameters impact
cv_log_reg

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished


{'fit_time': array([0.11095405, 0.14168811, 0.11711001, 0.12011695, 0.15819907]),
 'score_time': array([0.00465989, 0.00610089, 0.004462  , 0.00369787, 0.0052979 ]),
 'test_score': array([0.02424242, 0.02414487, 0.03225806, 0.04048583, 0.02469136]),
 'train_score': array([0.03245436, 0.0364557 , 0.03444782, 0.03144016, 0.03134479])}

In [23]:
# save the model
joblib.dump(log_reg, "models/experiment/log_reg.pkl")
# reference to load the model
#log_reg_loaded = joblib.load("model/experiment/log_reg.pkl")

['models/log_reg.pkl']

## SGDClassifier

advantages
- efficient

disadvantages
- sensitive to feature scaling

In [29]:
sgd_clf = SGDClassifier(loss="log", # logistic regression
                        penalty="elasticnet",
                        shuffle=True,
                        learning_rate='optimal')
sgd_clf.fit(X_train_prepared, y_train)
cv_sgd_clf = cross_validate(sgd_clf,
                            X_train_prepared, # attributes
                            y_train, # labels isSepsis
                            n_jobs=-1, # use all the processors in parallel
                            verbose=1, # verbosity level
                            cv=5, # splitting strategy to compute the score N consecutive times with different splits
                            scoring="f1", # for binary targets
                            return_train_score=True) # computationally expensive, whether to include training scores on parameters impact

cv_sgd_clf

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished


{'fit_time': array([0.63595223, 0.73067617, 0.70459986, 0.58786821, 0.32503796]),
 'score_time': array([0.00507975, 0.00321102, 0.00374603, 0.00774407, 0.0035522 ]),
 'test_score': array([0.00819672, 0.02811245, 0.05882353, 0.01629328, 0.02061856]),
 'train_score': array([0.01427843, 0.03923541, 0.05135802, 0.02135231, 0.02755102])}

In [30]:
# serialize the model
joblib.dump(sgd_clf, "models/experiment/sgd_clf.pkl")
# reference load the model
#sgd_clf_loaded = joblib.load("model/experiment/sgd_clf.pkl")

['models/sgd_clf.pkl']

## MLP Classifier

In [31]:
nn = MLPClassifier(solver='sgd', # For small datasets, ‘lbfgs’ can converge faster and perform better.
                   activation='relu',
                   max_iter=5000, # The solver iterates until convergence
                   hidden_layer_sizes=(50,50,50,50), # The ith element represents the number of neurons in the ith hidden layer
                   verbose=0,
                   learning_rate="adaptive") #  keeps the learning rate constant to ‘learning_rate_init’ as long as training loss keeps decreasing
cv_nn = cross_validate(nn,
                       X_train_prepared,
                       y_train,
                       cv=3,
                       scoring="f1",
                       return_train_score=True,
                       verbose=1)
cv_nn

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 10.2min finished


{'fit_time': array([179.28927207, 224.22065091, 208.5081861 ]),
 'score_time': array([0.02683997, 0.02259612, 0.025383  ]),
 'test_score': array([0.39190898, 0.40836408, 0.38754765]),
 'train_score': array([0.96990814, 0.97395172, 0.97684745])}

In [27]:
# save the model
joblib.dump(nn, "models/experiment/nn.pkl")
# reference to load the model
#nn_loaded = joblib.load("model/experiment/nn.pkl")

['models/nn.pkl']

## XGBoost Classifier

Learning task parameter = https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

In [73]:
xgboost = XGBClassifier(use_label_encoder=False, # removes user warning error
                        booster="gbtree",
                        eval_metric='mlogloss')
cv_xgboost = cross_validate(xgboost,
                            X_train_prepared,
                            y_train,
                            cv=3,
                            scoring="f1",
                            return_train_score=True,
                            verbose=0)
cv_xgboost

{'fit_time': array([2.43767905, 1.3382349 , 1.28723097]),
 'score_time': array([0.01427674, 0.01267815, 0.01388526]),
 'test_score': array([0.54941634, 0.55221519, 0.58578053]),
 'train_score': array([0.88031652, 0.88624339, 0.86704432])}

In [29]:
# save the model
joblib.dump(xgboost, "models/experiment/xgb.pkl")
# reference to load the model
#xgboost_loaded = joblib.load("models/experiment/xgboost.pkl")

['models/xgb.pkl']

## Fine-tune the model
Instead of messing with hyperparameters manually, GridSearchCV can be instructed to search hyperparameters
and uses cross validation to evaluate the possible combinations.

During this sampling cycle, you may go back to your pipeline and:
1. drop uninformative features
1. add extra features
1. clean up outliers

### Grid Search on Random Forest Classifier

[sklearn.ensemble.RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=random%20forest%20classifier#sklearn.ensemble.RandomForestClassifier)

In [30]:
rfc_param_grid = [
    {'n_estimators': [5, 10, 15, 20], # The number of trees in the forest.
     'max_features': [10, 15, 20, 30]}, # The number of features to consider when looking for the best split
    {'bootstrap': [False], # If False, the whole dataset is used to build each tree
     'n_estimators': [3,10],
     'max_features': [2, 3, 4]},
]

rfc = RandomForestClassifier()

# the grid search will explore 4 x 4 combinations of Random Forest Classifier combinations of
# n_estimators and max_features hyperparameters values
# Then will try 2 x 3 combinations of hyperparameter values in the second dictionary
# with the bootstrap set to false
# overall, gridsearch will explore 16 + 6 = 22 combinations of RFC hyperparameters and train each modele 3 times
rfc_grid_search = GridSearchCV(rfc,
                           rfc_param_grid,
                           cv=3,
                           scoring='f1',
                           return_train_score=True)

rfc_grid_search.fit(X_train_prepared, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid=[{'max_features': [10, 15, 20, 30],
                          'n_estimators': [3, 10, 30, 100]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='f1')

In [31]:
# print the best score
rfc_grid_search.best_score_

0.5524979689752308

In [32]:
# print the best combination of parameters
rfc_grid_search.best_params_

{'max_features': 15, 'n_estimators': 100}

In [33]:
# print the best estimator directly
rfc_grid_search.best_estimator_

RandomForestClassifier(max_features=15)

In [34]:
# display the importance scores next to the corresponding attributes
# from this you can drop less useful features
feature_importances = rfc_grid_search.best_estimator_.feature_importances_
sorted(zip(feature_importances, num_attribs), reverse=True)

[(0.10080489989104702, 'Temp'),
 (0.07748042225765787, 'WBC'),
 (0.07641134866426043, 'BUN'),
 (0.061636673640003296, 'HR'),
 (0.057731118320587686, 'Hgb'),
 (0.05487352507261766, 'Resp'),
 (0.0511490511755732, 'Hct'),
 (0.05073491288456084, 'SBP'),
 (0.04809497355457121, 'Creatinine'),
 (0.047273093695356164, 'DBP'),
 (0.047027014105542865, 'MAP'),
 (0.032694465524768526, 'Platelets'),
 (0.031886427038485037, 'O2Sat'),
 (0.031523365985405505, 'Glucose'),
 (0.028310601425176867, 'Calcium'),
 (0.021407485312965217, 'Potassium'),
 (0.01849600464681473, 'Chloride'),
 (0.01768530092800006, 'PaCO2'),
 (0.017316501799187912, 'Phosphate'),
 (0.016840235648914022, 'pH'),
 (0.01572838364471813, 'HCO3'),
 (0.015127202730327798, 'Magnesium'),
 (0.013709013529501124, 'PTT'),
 (0.013706304936230885, 'BaseExcess'),
 (0.01245806056581834, 'Lactate'),
 (0.011807303338982859, 'Bilirubin_total'),
 (0.00878999623077239, 'AST'),
 (0.008657667790379896, 'Alkalinephos'),
 (0.005467750135777415, 'Gender'),
 

### Grid Search on XGBoost
[xgboost.XGBClassifier](https://xgboost.readthedocs.io/en/latest/python/python_api.html?highlight=xgbclassifier#xgboost.XGBClassifier)

In [35]:
xgb_param_grid = {"n_estimators": [150, 200],
                  "max_delta_step": [0.1],
                  "subsample": [None, 0.5, 1],
                  "reg_lambda": [1, 1.1],
                  "alpha": [0, 0.1]}


xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

xgb_grid_search = GridSearchCV(xgb,
                           xgb_param_grid,
                           cv=3,
                           scoring='f1',
                           return_train_score=True)

xgb_grid_search.fit(X_train_prepared, y_train)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     eval_metric='mlogloss', gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,...
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, use

In [36]:
# print the best score
xgb_grid_search.best_score_

0.5563996653758968

In [37]:
# print the best combination of parameters
xgb_grid_search.best_params_

{'alpha': 0.1,
 'max_delta_step': 0.1,
 'n_estimators': 200,
 'reg_lambda': 1,
 'subsample': None}

In [38]:
# print the best estimator directly
xgb_grid_search.best_estimator_

XGBClassifier(alpha=0.1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0.1, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0.100000001,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [39]:
feature_importances = xgb_grid_search.best_estimator_.feature_importances_
sorted(zip(feature_importances, num_attribs), reverse=True)

[(0.17511103, 'BUN'),
 (0.16019359, 'Hgb'),
 (0.1014349, 'WBC'),
 (0.06369768, 'Hct'),
 (0.05132545, 'Platelets'),
 (0.035577442, 'Creatinine'),
 (0.029773762, 'Resp'),
 (0.026058856, 'Temp'),
 (0.024878712, 'Gender'),
 (0.022663962, 'DBP'),
 (0.02184942, 'HCO3'),
 (0.02081274, 'HR'),
 (0.019774554, 'SBP'),
 (0.018967059, 'BaseExcess'),
 (0.018740606, 'MAP'),
 (0.018679056, 'Potassium'),
 (0.017870182, 'pH'),
 (0.017260818, 'PaCO2'),
 (0.01718195, 'Magnesium'),
 (0.015856504, 'Calcium'),
 (0.015337686, 'Chloride'),
 (0.014651926, 'PTT'),
 (0.014223329, 'O2Sat'),
 (0.014109537, 'Alkalinephos'),
 (0.013986062, 'Glucose'),
 (0.013553107, 'Bilirubin_total'),
 (0.012698767, 'Phosphate'),
 (0.012648032, 'Lactate'),
 (0.01108327, 'AST'),
 (0.0, 'Gender')]

# Evaluate RFC model on the Test set

After tweaking the model, you can evaluate the final model on the test set.
1. get the predictors and labels from your test set
1. run the transformation pipeline with transform() < don't fit the test set
1. run the prediction

In [40]:
# transform, DON'T fit the final data
X_test_ready = full_pipeline.transform(X_test)

In [41]:
final_rfc = rfc_grid_search.best_estimator_

rfc_predictions = final_rfc.predict(X_test_ready)

In [42]:
# view the percent of the predictions that were correct
accuracy_score(y_test, rfc_predictions)

0.9476728174056733

In [43]:
# we can successfully identify 6 out of 10 patients that will develop sepsis in the next 6 days
print(recall_score(y_test, rfc_predictions))
print(f1_score(y_test, rfc_predictions))

0.42641509433962266
0.5432692307692307


In [44]:
# save the model
joblib.dump(final_rfc, "models/final_rfc.pkl")
# reference to load the model
#final_rfc_loaded = joblib.load("models/final_rfc.pkl")

['models/final_rfc.pkl']

# Evaluate XGB model on the Test set

In [45]:
final_xgb = xgb_grid_search.best_estimator_

xgb_predictions = final_xgb.predict(X_test_ready)

In [46]:
# view the accuracy
accuracy_score(y_test, xgb_predictions)

0.9493252547507574

In [47]:
# we can successfully identify 6 out of 10 patients that will develop sepsis in the next 6 days
print(recall_score(y_test, xgb_predictions))
print(f1_score(y_test, xgb_predictions))

0.4528301886792453
0.5660377358490566


In [48]:
# save the model
joblib.dump(final_xgb, "models/final_xgb.pkl")
# reference to load the model
#final_xgb_loaded = joblib.load("models/final_xgb.pkl")

['models/final_xgb.pkl']

# Launch, Monitor, and Maintain

1. Use joblib to save the train model inclulding full pre-processing and prediction pipeline
1. Load the trained model to production
1. Call the predict() method to make predictions

## Serving
1. Load the model in a web app that will call the predict() method
1. Wrap the model in a dedicated web service that a web app queries with REST API
    1. makes it easy to upgrade without interrupting the primary web app
    1. makes it easy to scale web services and load balance the requests from the web app across the web services
    1. enables the web app to use any language, not just Python

## Monitor
1. Write monitor code the check live performance at regular intervals and trigger alerts when it drops
    1. Could be steep drop if an infrastructure components stops
    1. or, a gentle decay as the world changes resulting in model rot