In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score, fbeta_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
import xgboost as xgb


In [2]:
# Load the dataset from ../data/90_day_mort.csv and separate the label
# column ('target') from the feature columns.
df = pd.read_csv('../data/90_day_mort.csv')
# Copy only the label column rather than duplicating the whole frame first.
y = df['target'].copy()
# DataFrame.drop returns a new frame, so no explicit copy() is needed and
# `df` itself is left untouched.
X = df.drop(columns=['target'])

**MULTIPLE MODEL GRID SEARCH CV**

In [None]:
# Column organisation: which columns are treated as categorical and which
# as continuous by the preprocessing step.

# Categorical columns (one-hot encoded downstream).
names_cat_feats = [
    'admission_type',
    'admit_provider_id',
    'admission_location',
    'insurance',
    'language',
    'marital_status',
    'race',
    'gender',
    'first_careunit',
    'last_careunit',
]

# Continuous columns (standard-scaled downstream).
# NOTE(review): 'H', 'I' and 'L' are kept exactly as they appear in the
# data; their clinical meaning is not visible from this notebook — confirm.
names_cont_feats = [
    'anchor_age',
    'los',
    'Absolute Basophil Count',
    'Absolute Eosinophil Count',
    'Absolute Lymphocyte Count',
    'Absolute Monocyte Count',
    'Absolute Neutrophil Count',
    'Anion Gap',
    'Base Excess',
    'Bicarbonate',
    'Calculated Total CO2',
    'Creatinine',
    'H',
    'Hematocrit',
    'Hemoglobin',
    'I',
    'INR(PT)',
    'Immature Granulocytes',
    'L',
    'Lactate',
    'PT',
    'PTT',
    'Platelet Count',
    'RDW',
    'Red Blood Cells',
    'SIRI',
    'Urea Nitrogen',
    'pO2',
]

In [None]:
# Preprocessing transformers.
#
# Two column groups are handled separately and then recombined by a
# ColumnTransformer: categorical columns are constant-imputed and one-hot
# encoded, continuous columns are standard-scaled.

onehot_ftrs = names_cat_feats
std_ftrs = names_cont_feats

# Categorical branch: missing entries become the literal category 'missing',
# then every category is expanded into a dense 0/1 column. Categories not
# seen during fit are ignored at transform time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
]
one_hot_transformer = Pipeline(steps=categorical_steps)

# Continuous branch: standardisation only; no imputation happens here.
continuous_steps = [('scaler', StandardScaler())]
std_transformer = Pipeline(steps=continuous_steps)

# Route each column group to its branch. Columns listed in neither group
# are not passed to any transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('std', std_transformer, std_ftrs),
        ('ohot', one_hot_transformer, onehot_ftrs),
    ]
)

# Stand-alone objects built from the pieces above.
clf = Pipeline(steps=[('preprocessor', preprocessor)])
final_scaler = StandardScaler()


In [None]:
# Parameter grid.
#
# `models_and_params` maps a display name to the estimator instance and the
# hyper-parameter grid searched for it. Grid keys are prefixed with the
# lower-cased estimator class name followed by a double underscore.

random_state = 42
max_iter = 1000000


def _svc_entry(kernel):
    """Build the estimator/grid entry shared by both SVC variants."""
    return {
        'model': SVC(kernel=kernel, random_state=random_state),
        'params': {
            'svc__C': np.logspace(-5, 3, 9),
            'svc__class_weight': ['balanced', None],
        },
    }


models_and_params = {}

# L2-penalised logistic regression.
models_and_params['Ridge'] = {
    'model': LogisticRegression(penalty='l2', random_state=random_state, max_iter=max_iter),
    'params': {
        'logisticregression__C': np.logspace(-8, 3, 12),
        'logisticregression__class_weight': ['balanced', None],
    },
}

# k-nearest neighbours; p=1 is Manhattan distance, p=2 is Euclidean distance.
models_and_params['KNN'] = {
    'model': KNeighborsClassifier(),
    'params': {
        'kneighborsclassifier__n_neighbors': [3, 5, 7, 10, 15, 30, 50, 70, 100],
        'kneighborsclassifier__weights': ['uniform', 'distance'],
        'kneighborsclassifier__p': [1, 2],
    },
}

# Support vector classifiers with linear and RBF kernels (same grid).
models_and_params['SVC Linear'] = _svc_entry('linear')
models_and_params['SVC RBF'] = _svc_entry('rbf')

# Gradient-boosted trees; NaN is declared as the missing-value marker.
models_and_params['XGB'] = {
    'model': XGBClassifier(learning_rate=0.03, n_estimators=1000, missing=np.nan, subsample=0.66),
    'params': {
        'xgbclassifier__max_depth': [1, 3, 10, 30, 100],  # Depth of the tree
        'xgbclassifier__colsample_bytree': [0.1, 0.25, 0.5, 0.75, 1.0],  # Fraction of features used for fitting trees
        'xgbclassifier__scale_pos_weight': [0.025, 0.05, 0.1, 0.25, 0.5, 1, 5, 10],
    },
}

In [None]:
''' 
Creating pipeline
'''

def MLpipe_StratKFold(X, y, preprocessor, ML_algo, param_grid, n_runs=5):
    """Tune and evaluate one model over several random hold-out splits.

    For each run ``i`` in ``range(n_runs)`` the data is split 80/20 with
    ``random_state=42*i``, a pipeline ending in ``ML_algo`` is tuned with
    ``GridSearchCV`` (F1 scoring, ``cv=None`` i.e. scikit-learn's default
    splitter) on the 80% part, and the refit best pipeline is scored on the
    held-out 20%.

    Note: despite the function name, the hold-out split itself is a plain
    ``train_test_split`` without ``stratify``.

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` and the pipeline.
    y : labels. Assumed to be non-negative integer class labels of a binary
        problem (required by ``np.bincount`` and the default F1/precision/
        recall settings) — TODO confirm against the data.
    preprocessor : transformer used as the first pipeline step; it must be a
        ``ColumnTransformer`` because it is later looked up under the
        auto-generated step name ``'columntransformer'``.
    ML_algo : estimator instance used as the final pipeline step.
    param_grid : grid forwarded to ``GridSearchCV``.
    n_runs : number of random splits to evaluate (default 5, the value that
        was previously hard-coded).

    Returns
    -------
    dict
        ``results[i]`` for every run index holds the raw and preprocessed
        test features, ``y_test``, ``y_pred``, ``test_score`` (accuracy),
        ``f_1_score``, ``recall``, ``precision``, ``best_model``,
        ``best_params`` and ``relative_improvement`` (accuracy gain relative
        to the mean majority-class baseline). ``results['baseline_accuracy']``
        holds the mean and std of the majority-class accuracy over all runs.
    """
    results = {}
    baseline_accuracy = []

    # BUG FIX: the original test was `ML_algo != 'XGB'`, which compares an
    # estimator object with a string and is therefore always True, so the
    # XGB-specific branch (no imputer) was never taken. Check the type instead.
    # The XGB model in this notebook is configured with missing=np.nan, so it
    # is fed the un-imputed data as that branch intended.
    skip_imputation = isinstance(ML_algo, XGBClassifier)

    for i in range(n_runs):
        seed = 42 * i

        # Hold-out split for this run.
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, train_size=0.8, random_state=seed)

        # Baseline: always predict the most frequent class of the test labels.
        majority_class = np.bincount(y_test).argmax()
        y_baseline = [majority_class] * len(y_test)
        baseline_accuracy.append(accuracy_score(y_test, y_baseline))

        if skip_imputation:
            pipe = make_pipeline(preprocessor, StandardScaler(), ML_algo)
        else:
            # Other models get their missing values filled in before the
            # final scaling step.
            iterative_imputer = IterativeImputer(max_iter=10, random_state=seed)
            pipe = make_pipeline(preprocessor, iterative_imputer, StandardScaler(), ML_algo)

        # Hyper-parameter search on the non-test part, selecting on F1.
        grid = GridSearchCV(pipe, param_grid=param_grid, scoring=make_scorer(f1_score),
                            cv=None, return_train_score=True, n_jobs=-1, verbose=False)
        grid.fit(X_other, y_other)

        best_pipe = grid.best_estimator_
        y_pred = best_pipe.predict(X_test)

        # make_pipeline names each step after its lower-cased class name, so
        # the fitted preprocessor is available as 'columntransformer'.
        transformer = best_pipe['columntransformer']
        X_test_prep = transformer.transform(X_test)
        X_test_prep_df = pd.DataFrame(X_test_prep, columns=transformer.get_feature_names_out())

        results[i] = {
            'X_test [not preprocessed]': X_test,
            'X_test [preprocessed]': X_test_prep_df,
            'y_test': y_test,
            'y_pred': y_pred,
            'test_score': accuracy_score(y_test, y_pred),
            'f_1_score': fbeta_score(y_test, y_pred, beta=1),
            'recall': recall_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'best_model': best_pipe,
            'best_params': grid.best_params_
        }

    results['baseline_accuracy'] = {
        'baseline_avg': np.mean(baseline_accuracy),
        'baseline_std': np.std(baseline_accuracy)
    }

    # Relative improvement is measured against the baseline averaged over
    # all runs, not against each run's own baseline.
    baseline = results['baseline_accuracy']['baseline_avg']

    for i in range(n_runs):
        results[i]['relative_improvement'] = (results[i]['test_score'] - baseline)/baseline

    return results

In [None]:
# Performing grid search.
#
# Every model in `models` is tuned and evaluated with MLpipe_StratKFold, a
# per-run report is printed, and the collected results are pickled at the end.
models = ['XGB', 'KNN', 'Ridge', 'SVC Linear', 'SVC RBF']

model_results = {}

for model in models:
    spec = models_and_params[model]
    ML_algo, params = spec['model'], spec['params']

    print(f"Results for {model}")
    results = MLpipe_StratKFold(X, y, preprocessor=preprocessor, ML_algo=ML_algo, param_grid=params)

    # One report per run index; the lines are assembled first and emitted
    # with a single print call.
    scores = []
    for i in range(5):
        report_lines = [
            f"Results for Random State {i}",
            f"  Test Score: {results[i]['test_score']}",
            f"  f1 Score: {results[i]['f_1_score']}",
            f"    precision: {results[i]['precision']}",
            f"    recall: {results[i]['recall']}",
            f"  Baseline: {results['baseline_accuracy']['baseline_avg']}, Relative Accuracy: {results[i]['relative_improvement']}",
            f"  Baseline Standard Deviation: {results['baseline_accuracy']['baseline_std']}",
            f"  Best Params: {results[i]['best_params']}",
        ]
        print("\n".join(report_lines))
        scores.append(results[i]['test_score'])

    # Spread of the hold-out accuracy across the runs.
    print(f"Mean of Test Scores: {np.mean(scores)}")
    print(f"Standard Deviation of Test Scores: {np.std(scores)}")
    print("=========")

    model_results[model] = results

# Persist the whole results dictionary (fitted pipelines included) as a pickle.
with open('model_results.pkl', mode='wb') as f:
    pickle.dump(model_results, f)

In [None]:
# Baseline F1 score: the F-beta score (beta=1) obtained by predicting class 1
# for every test sample, computed on 80/20 splits seeded with 42*i.
f1 = []
for i in range(5):
    split = train_test_split(X, y, train_size=0.8, random_state=42 * i)
    X_other, X_test, y_other, y_test = split

    # Constant prediction: all ones, same shape as the test labels.
    y_0 = np.ones_like(y_test)
    f1.append(fbeta_score(y_test, y_0, beta=1))

baseline_f1 = np.mean(f1)
print(f" {baseline_f1} +/- {np.std(f1)}")

In [None]:
# Read the pickled results dictionary back from model_results.pkl.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('model_results.pkl', mode='rb') as f:
    model_results = pickle.load(f)

In [None]:
# Saving models and data.
#
# For every model and run index, write the raw test features, the
# preprocessed test features and the test labels as CSV files, and pickle
# the last step of the best pipeline.
for model in ['Ridge', 'KNN', 'SVC Linear','SVC RBF', 'XGB']:
    for i in range(5):
        run = model_results[model][i]

        # Test sets as CSV (row index is not written).
        X_test_tbs = run['X_test [not preprocessed]']
        X_test_prep_tbs = run['X_test [preprocessed]']
        y_test_tbs = run['y_test'].reset_index(drop=True)

        X_test_tbs.to_csv(f'{model}_X_test_{i}.csv', index=False)
        X_test_prep_tbs.to_csv(f'{model}_X_test_prep_{i}.csv', index=False)

        # Labels go into a single-column frame named 'y_test'.
        y_test_tbs_df = pd.DataFrame({'y_test': y_test_tbs.values})
        y_test_tbs_df.to_csv(f'{model}_y_test_{i}.csv', index=False)

        # Only the final pipeline step (the fitted estimator) is pickled,
        # not the preprocessing steps in front of it.
        mymodel = run['best_model'][-1]
        with open(f'{model}_{i}_trained', mode='wb') as f:
            pickle.dump(mymodel, f)

**RESULTS**

In [None]:
# Loading data: read the pickled results dictionary from model_results.pkl.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The path is a plain string literal (the former f-string had no
# placeholders) and the block body uses the standard 4-space indent.
with open('model_results.pkl', 'rb') as file:
    model_results = pickle.load(file)

In [None]:
# Average and standard deviation across the five runs for all models.
#
# NOTE: `model_test_scores` aggregates each run's 'relative_improvement'
# (accuracy gain relative to the baseline), not the raw 'test_score'.
# `model_f1_scores` aggregates 'f_1_score'.
model_test_scores = {}
model_f1_scores = {}

for model in ['Ridge', 'KNN', 'SVC Linear', 'SVC RBF', 'XGB']:
    mymodel = model_results[model]

    test_scores = [mymodel[i]['relative_improvement'] for i in range(5)]
    f1_scores = [mymodel[i]['f_1_score'] for i in range(5)]

    model_test_scores[model] = {'average': np.mean(test_scores), 'std': np.std(test_scores)}
    model_f1_scores[model] = {'average': np.mean(f1_scores), 'std': np.std(f1_scores)}

In [None]:
# Print the per-model average and std of both summaries.
# `model_test_scores` is built from each run's 'relative_improvement', so it
# is labelled as such rather than as plain accuracy.
print(f"relative accuracy improvement over baseline {model_test_scores}")
print(f"f1 scores {model_f1_scores}")