In [136]:
"""
Combined code for AMLLS project
"""
# Import
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import cross_val_score ,StratifiedGroupKFold , train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, make_scorer, balanced_accuracy_score
from sklearn.impute import KNNImputer
from sklearn.multiclass import OneVsRestClassifier
import random
import lightgbm as lgb
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn import svm
from catboost import CatBoostClassifier

# import string
# import pickle
# import copy
# import glob
# import re
# from imblearn.over_sampling import SMOTENC
# Our classes and functions
import Functions_ML_Project
import Class_ML_Project

%load_ext autoreload
%autoreload 2
%aimport

# ---------------------------------------------------------------------------
# Feature-name configuration shared by the preprocessing steps below
# ---------------------------------------------------------------------------

# Categorical column names.
# NOTE(review): the original note said the non-informative medications were
# removed from this list, yet several names here also appear in
# IRRELEVANT_FEATURES -- confirm which statement is intended.
CATEGORICAL = [
    'age',
    'race',
    'gender',
    'medical_specialty',
    'max_glu_serum',
    'A1Cresult',
    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
    'change',
    'diabetesMed',
    'admission_type_descriptor',
    'discharge_disposition_descriptor',
    'admission_source_descriptor',
]

# Numerical column names.
NUMERICAL = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_diagnoses',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
]

# Columns handed to the feature-removal step of the pipeline.
IRRELEVANT_FEATURES = [
    "payer_code",
    'diag_1',
    'diag_2',
    'diag_3',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'tolbutamide',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-pioglitazone',
    'admission_source_descriptor',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'patient_nbr',
]

# Column groups passed to the custom one-hot encoder.
OHE_regular_cols = [
    'race',
    'gender',
    'medical_specialty',
    'insulin',
    'diabetesMed',
    'admission_type_descriptor',
    'discharge_disposition_descriptor',
]
OHE_4_to_2_cols = [
    'metformin',
    'glimepiride',
    'glipizide',
    'glyburide',
    'pioglitazone',
    'rosiglitazone',
]
diagnoses_cols = [
    'diag_1_cat',
    'diag_2_cat',
    'diag_3_cat',
]

# Note age feature will change to numerical later in the code!


# Baseline classifiers for the initial comparison, keyed by the label used
# when reporting scores. CatBoost is told to treat 'age' as categorical.
# NOTE(review): `multi_class` (scikit-learn) and `use_label_encoder` (XGBoost)
# look deprecated in recent releases -- confirm against the pinned versions.
models_defualt = {
    'Logisitic': LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=10000,
    ),
    'XGBOOST': XGBClassifier(
        use_label_encoder=False,
        random_state=42,
        enable_categorical=True,
    ),
    'Tree': DecisionTreeClassifier(random_state=42),
    'LGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_seed=42, cat_features=['age']),
    'SVM': svm.SVC(kernel='linear', random_state=42, probability=True),
}

# Seed Python's built-in RNG so later `random` calls are repeatable.
random.seed(42)


# Load the raw encounters file; the helper hands back the train and test frames.
db_train_df, db_test_df = Functions_ML_Project.clean_data_and_return_train_and_test(
    'diabetic_data.csv'
)

# Feature pre-processing: build the ID lookup structures from the mapping file.
id_names, mapping_dict = Functions_ML_Project.preform_ids_maping('IDS_mapping.csv')

# Apply the ID mapping to the training frame, then run feature engineering on
# the mapped result.
# NOTE(review): the earlier comment described the mapping as "disease code ->
# disease name"; what the mapping actually covers is defined in
# Functions_ML_Project -- confirm there.
training_df_new = Functions_ML_Project.feature_engineering(
    Functions_ML_Project.apply_mapping(db_train_df, id_names, mapping_dict)
)


# Preprocessing pipeline. Steps run in the order listed: drop the unused
# columns, fill 'race' with the constant 'Other', fill 'medical_specialty' with
# its most frequent value, label-encode 'age', transform the numerical columns,
# and finally apply the custom one-hot encoder.
pipeline = Pipeline(steps=[
    (
        'feature_remover',
        Class_ML_Project.FeatureRemover(features_to_remove=IRRELEVANT_FEATURES),
    ),
    (
        'imputer_race',
        Class_ML_Project.DataFrameImputer(
            strategy='constant', fill_value='Other', columns=['race']
        ),
    ),
    (
        'imputer_medical',
        Class_ML_Project.DataFrameImputer(
            strategy='most_frequent', columns=['medical_specialty']
        ),
    ),
    (
        'age_encoder',
        Class_ML_Project.MultiColumnLabelEncoder(columns=['age']),
    ),
    (
        'numerical_scaler',
        Class_ML_Project.NumericalTransformer(columns=NUMERICAL),
    ),
    (
        'OHE',
        Class_ML_Project.CustomOHEncoder(
            OHE_regular_cols=OHE_regular_cols,
            OHE_4_to_2_cols=OHE_4_to_2_cols,
            change_col='change',
            diag_cols=diagnoses_cols,
        ),
    ),
])

# Fit the pipeline on the training frame, transform it, then let the helper
# prune the sparse one-hot columns.
training_clean_imputed = Functions_ML_Project.remove_sparse_OHE(
    pipeline.fit_transform(training_df_new),
    OHE_regular_cols,
)

# Names the remover step was configured to drop.
removed_column_names = pipeline.named_steps['feature_remover'].features_to_remove

# Report the resulting frame.
print(
    "DataFrame head after feature selection and imputation:",
    training_clean_imputed.head(),
    sep="\n",
)
print(
    "DataFrame shape after feature selection and imputation:",
    training_clean_imputed.shape,
    sep="\n",
)

# Keep only the rows labelled '<30' or 'NO'; every other `readmitted` value
# (i.e. readmission after more than 30 days) is discarded.
training_clean_imputed = training_clean_imputed.loc[
    training_clean_imputed['readmitted'].isin(['<30', 'NO'])
]



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Modules to reload:
all-except-skipped

Modules to skip:

              patient_nbr             race  gender      age weight  \
encounter_id                                                         
2278392           8222157        Caucasian  Female   [0-10)      ?   
149190           55629189        Caucasian  Female  [10-20)      ?   
64410            86047875  AfricanAmerican  Female  [20-30)      ?   
500364           82442376        Caucasian    Male  [30-40)      ?   
16680            42519267        Caucasian    Male  [40-50)      ?   

              admission_type_id  discharge_disposition_id  \
encounter_id                                                
2278392                       6                        25   
149190                        1                         1   
64410                         1                         1   
500364                        1                         1  

  diag_count = whole_data_df[diagnosis_cols_list].apply(lambda x: x.str.contains(pattern)).sum(axis=1)




 38024 encounters out of 101766 total encounters are of diabetic patients 
 i.e. ~ 37.36% 
6836 encounters are patient repeated ones, ~17.98%

 How many pateints with their repeated encounters 
 count
1     26849
2      3091
3       750
4       242
5       114
6        55
7        31
8        14
10       10
11        9
9         7
15        4
12        2
14        2
17        2
28        1
13        1
18        1
19        1
22        1
38        1
Name: count, dtype: int64 

### Label ratios ###
____________________________
 readmitted
NO     0.547102
>30    0.343362
<30    0.109536
Name: count, dtype: float64 
____________________________

            Complete dataset  Train dataset (80%)  Test dataset (20%)
readmitted                                                           
NO                  0.547102             0.547107            0.547080
>30                 0.343362             0.343360            0.343372
<30                 0.109536             0.109533            0.10954

In [226]:
# Columns removed by the GAN-specific pipeline's feature-removal step.
IRRELEVANT_FEATURES_for_GAN = [
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'tolbutamide',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-pioglitazone',
    'admission_source_descriptor',
]

# Preprocessing pipeline for the GAN experiments. Same step sequence as the
# main pipeline, but the remover step is configured with
# IRRELEVANT_FEATURES_for_GAN.
pipeline_GAN = Pipeline(steps=[
    (
        'feature_remover',
        Class_ML_Project.FeatureRemover(features_to_remove=IRRELEVANT_FEATURES_for_GAN),
    ),
    (
        'imputer_race',
        Class_ML_Project.DataFrameImputer(
            strategy='constant', fill_value='Other', columns=['race']
        ),
    ),
    (
        'imputer_medical',
        Class_ML_Project.DataFrameImputer(
            strategy='most_frequent', columns=['medical_specialty']
        ),
    ),
    (
        'age_encoder',
        Class_ML_Project.MultiColumnLabelEncoder(columns=['age']),
    ),
    (
        'numerical_scaler',
        Class_ML_Project.NumericalTransformer(columns=NUMERICAL),
    ),
    (
        'OHE',
        Class_ML_Project.CustomOHEncoder(
            OHE_regular_cols=OHE_regular_cols,
            OHE_4_to_2_cols=OHE_4_to_2_cols,
            change_col='change',
            diag_cols=diagnoses_cols,
        ),
    ),
])

# Fold table read from disk; it is passed through to GAN_data_preprocessing.
id_fold = pd.read_csv('id_fold.csv')

# Discover the synthesized CTGAN files in the working directory.
#  * sorted(): os.listdir() returns entries in arbitrary order, but the fold
#    number below is derived from each file's position, so the order must be
#    reproducible between runs and machines.
#  * The results CSV written by this notebook also contains 'CTGAN' in its
#    name; without the explicit exclusion a re-run would feed the results file
#    back in as if it were synthetic training data.
GAN_synthesized_data = sorted(
    f for f in os.listdir()
    if 'CTGAN' in f
    and f != 'default_models_CTGAN_dataframe.csv'
    and os.path.isfile(f)
)

# One dict of {model name: score} per fold. The frame is built once after the
# loop instead of concatenating onto an empty DataFrame on every iteration.
GAN_fold_results = []
for fold, syn_data in enumerate(GAN_synthesized_data):
    # Folds are numbered from 1 on the helper's side, hence fold + 1.
    GAN_train_fold, GAN_test_fold = Functions_ML_Project.GAN_data_preprocessing(
        syn_data,
        training_clean_imputed,
        IRRELEVANT_FEATURES_for_GAN,
        id_fold,
        OHE_regular_cols,
        fold + 1,
        'max',
        pipeline_GAN,
    )

    model_results = Functions_ML_Project.run_models_with_GAN(
        GAN_train_fold, GAN_test_fold, models=models_defualt
    )
    GAN_fold_results.append(model_results)

# One row per fold, one column per model.
results_GAN_df = pd.DataFrame(GAN_fold_results)

removed 56 sparse OHE columns

_____________
 Logisitic 
_____________



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_train.readmitted = LabelEncoder().fit_transform(subset_train.readmitted)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_test.readmitted = LabelEncoder().fit_transform(subset_test.readmitted)



 Score.................................. = 0.578


_____________
 XGBOOST 
_____________


 Score.................................. = 0.581


_____________
 Tree 
_____________


 Score.................................. = 0.570


_____________
 LGBM 
_____________


 Score.................................. = 0.566


_____________
 CatBoost 
_____________

Learning rate set to 0.041727
0:	learn: 0.6254238	total: 38.1ms	remaining: 38.1s
1:	learn: 0.5805206	total: 61.4ms	remaining: 30.6s
2:	learn: 0.5468677	total: 77.8ms	remaining: 25.9s
3:	learn: 0.5162777	total: 104ms	remaining: 25.9s
4:	learn: 0.4713874	total: 131ms	remaining: 26s
5:	learn: 0.4490336	total: 147ms	remaining: 24.4s
6:	learn: 0.4258628	total: 157ms	remaining: 22.3s
7:	learn: 0.4045625	total: 168ms	remaining: 20.8s
8:	learn: 0.3872968	total: 179ms	remaining: 19.7s
9:	learn: 0.3684443	total: 189ms	remaining: 18.8s
10:	learn: 0.3578986	total: 199ms	remaining: 17.9s
11:	learn: 0.3426809	total: 210ms	remaining: 17.3s
12:	lear

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_train.readmitted = LabelEncoder().fit_transform(subset_train.readmitted)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_test.readmitted = LabelEncoder().fit_transform(subset_test.readmitted)



 Score.................................. = 0.573


_____________
 XGBOOST 
_____________


 Score.................................. = 0.565


_____________
 Tree 
_____________


 Score.................................. = 0.571


_____________
 LGBM 
_____________


 Score.................................. = 0.563


_____________
 CatBoost 
_____________

Learning rate set to 0.041729
0:	learn: 0.6400234	total: 34.6ms	remaining: 34.6s
1:	learn: 0.5944373	total: 65.3ms	remaining: 32.6s
2:	learn: 0.5572228	total: 92ms	remaining: 30.6s
3:	learn: 0.5192530	total: 119ms	remaining: 29.5s
4:	learn: 0.4731466	total: 138ms	remaining: 27.4s
5:	learn: 0.4496964	total: 149ms	remaining: 24.6s
6:	learn: 0.4240210	total: 159ms	remaining: 22.5s
7:	learn: 0.3975274	total: 168ms	remaining: 20.9s
8:	learn: 0.3742677	total: 179ms	remaining: 19.7s
9:	learn: 0.3632697	total: 188ms	remaining: 18.6s
10:	learn: 0.3526927	total: 198ms	remaining: 17.8s
11:	learn: 0.3372547	total: 208ms	remaining: 17.1s
12:	lear

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_train.readmitted = LabelEncoder().fit_transform(subset_train.readmitted)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_test.readmitted = LabelEncoder().fit_transform(subset_test.readmitted)



 Score.................................. = 0.587


_____________
 XGBOOST 
_____________


 Score.................................. = 0.597


_____________
 Tree 
_____________


 Score.................................. = 0.587


_____________
 LGBM 
_____________


 Score.................................. = 0.587


_____________
 CatBoost 
_____________

Learning rate set to 0.041727
0:	learn: 0.6251129	total: 27.9ms	remaining: 27.9s
1:	learn: 0.5903488	total: 53.8ms	remaining: 26.8s
2:	learn: 0.5443773	total: 79.4ms	remaining: 26.4s
3:	learn: 0.4980203	total: 106ms	remaining: 26.3s
4:	learn: 0.4597334	total: 132ms	remaining: 26.3s
5:	learn: 0.4277876	total: 143ms	remaining: 23.7s
6:	learn: 0.4128314	total: 154ms	remaining: 21.9s
7:	learn: 0.3991796	total: 164ms	remaining: 20.4s
8:	learn: 0.3768884	total: 174ms	remaining: 19.1s
9:	learn: 0.3674203	total: 183ms	remaining: 18.1s
10:	learn: 0.3529721	total: 192ms	remaining: 17.3s
11:	learn: 0.3394691	total: 202ms	remaining: 16.6s
12:	le

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_train.readmitted = LabelEncoder().fit_transform(subset_train.readmitted)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_test.readmitted = LabelEncoder().fit_transform(subset_test.readmitted)



 Score.................................. = 0.555


_____________
 XGBOOST 
_____________


 Score.................................. = 0.585


_____________
 Tree 
_____________


 Score.................................. = 0.580


_____________
 LGBM 
_____________


 Score.................................. = 0.572


_____________
 CatBoost 
_____________

Learning rate set to 0.041729
0:	learn: 0.6499227	total: 20.7ms	remaining: 20.6s
1:	learn: 0.6070034	total: 47.3ms	remaining: 23.6s
2:	learn: 0.5445302	total: 68.7ms	remaining: 22.8s
3:	learn: 0.4947795	total: 81.1ms	remaining: 20.2s
4:	learn: 0.4733600	total: 99.3ms	remaining: 19.8s
5:	learn: 0.4374408	total: 120ms	remaining: 19.9s
6:	learn: 0.4088259	total: 129ms	remaining: 18.3s
7:	learn: 0.3960754	total: 138ms	remaining: 17.2s
8:	learn: 0.3834678	total: 148ms	remaining: 16.3s
9:	learn: 0.3726263	total: 157ms	remaining: 15.6s
10:	learn: 0.3557953	total: 166ms	remaining: 14.9s
11:	learn: 0.3415030	total: 176ms	remaining: 14.5s
12:	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_train.readmitted = LabelEncoder().fit_transform(subset_train.readmitted)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_test.readmitted = LabelEncoder().fit_transform(subset_test.readmitted)



 Score.................................. = 0.572


_____________
 XGBOOST 
_____________


 Score.................................. = 0.578


_____________
 Tree 
_____________


 Score.................................. = 0.568


_____________
 LGBM 
_____________


 Score.................................. = 0.566


_____________
 CatBoost 
_____________

Learning rate set to 0.041729
0:	learn: 0.6427410	total: 41.8ms	remaining: 41.7s
1:	learn: 0.5989008	total: 70.3ms	remaining: 35.1s
2:	learn: 0.5614499	total: 85.8ms	remaining: 28.5s
3:	learn: 0.5066682	total: 114ms	remaining: 28.4s
4:	learn: 0.4743071	total: 126ms	remaining: 25.2s
5:	learn: 0.4472574	total: 140ms	remaining: 23.2s
6:	learn: 0.4222297	total: 150ms	remaining: 21.2s
7:	learn: 0.3943140	total: 159ms	remaining: 19.7s
8:	learn: 0.3734264	total: 168ms	remaining: 18.5s
9:	learn: 0.3628323	total: 177ms	remaining: 17.5s
10:	learn: 0.3519023	total: 186ms	remaining: 16.7s
11:	learn: 0.3371542	total: 195ms	remaining: 16s
12:	lear

In [227]:
# NOTE: a bare expression is only rendered by the notebook when it is the last
# statement of a cell, so this line produces no output here.
results_GAN_df
# Persist the per-fold scores without the row index.
# NOTE(review): the output file name contains 'CTGAN'; any directory scan that
# selects input files by that substring needs to exclude this file.
results_GAN_df.to_csv('default_models_CTGAN_dataframe.csv', index=False)

Unnamed: 0,Logisitic,XGBOOST,Tree,LGBM,CatBoost,RandomForestClassifier
0,0.578311,0.581415,0.570202,0.565848,0.570036,0.549799
1,0.572516,0.564913,0.570566,0.563469,0.560906,0.543556
2,0.586777,0.59661,0.587167,0.587493,0.590925,0.575944
3,0.554991,0.584735,0.579917,0.571569,0.58537,0.560615
4,0.572116,0.578271,0.567799,0.566006,0.568107,0.558816


In [183]:
# Re-create the baseline classifiers (fresh, unfitted instances), keyed by the
# label used when reporting scores. CatBoost is told to treat 'age' as
# categorical.
models_defualt = {
    'Logisitic': LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=10000,
    ),
    'XGBOOST': XGBClassifier(
        use_label_encoder=False,
        random_state=42,
        enable_categorical=True,
    ),
    'Tree': DecisionTreeClassifier(random_state=42),
    'LGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_seed=42, cat_features=['age']),
    'SVM': svm.SVC(kernel='linear', random_state=42, probability=True),
}

In [189]:
# Display the `results` mapping (model name -> score, as shown in the output).
# NOTE(review): no module-level assignment to `results` is visible in this
# notebook export -- presumably it was bound in a cell that is not shown; confirm.
results

{'Logisitic': 0.5667188040261476,
 'XGBOOST': 0.5748048592563552,
 'Tree': 0.576349921998154,
 'LGBM': 0.5673497613660805,
 'CatBoost': 0.5700267357112325,
 'RandomForestClassifier': 0.5529534520921647}

In [188]:
def run_models_with_GAN(train_df: pd.DataFrame, test_df: pd.DataFrame,
                        models: dict, balance_threshold: float = 0.3) -> dict:
    """Fit every model on the training fold and score it on the test fold.

    A random-forest variant is evaluated in addition to the supplied models:
    ``RandomForestClassifier`` when the training labels are balanced,
    ``BalancedRandomForestClassifier`` otherwise. The labels count as balanced
    when every class has more than ``balance_threshold`` times as many samples
    as the most frequent class.

    Parameters
    ----------
    train_df, test_df:
        Frames holding the features plus a ``'readmitted'`` label column.
    models:
        Mapping of display name -> estimator exposing ``fit`` / ``predict``.
        The mapping itself is not modified; the estimator objects it holds are
        fitted in place.
    balance_threshold:
        Minimum class-count ratio (relative to the largest class) for the
        training labels to be considered balanced.

    Returns
    -------
    dict
        Display name -> balanced accuracy on the test fold.
    """
    X_train = train_df.drop('readmitted', axis=1)
    y_train = train_df['readmitted']

    X_test = test_df.drop('readmitted', axis=1)
    y_test = test_df['readmitted']

    # Class balance is judged on the training labels of *this* call. The
    # previous code read an undefined local `y`, which only worked when a
    # notebook global of that name happened to exist.
    _, counts = np.unique(y_train, return_counts=True)
    ratios = counts / np.max(counts)
    is_balanced = bool((ratios > balance_threshold).all())

    # Work on a shallow copy so the caller's dict does not accumulate forest
    # entries across calls (e.g. both variants after folds with different
    # class balance).
    candidates = dict(models)
    if is_balanced:
        candidates['RandomForestClassifier'] = RandomForestClassifier(random_state=42)
    else:
        candidates['BalancedRandomForestClassifier'] = BalancedRandomForestClassifier(random_state=42)

    results = {}
    # Iterate over the models that were passed in, not the notebook-level global.
    for name, model in candidates.items():
        print("\n_____________\n",name,"\n_____________\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        tmp_score = balanced_accuracy_score(y_test, y_pred)
        print('\n Score.................................. = %.3f\n' % tmp_score)
        results[name] = tmp_score
    return results

# Evaluate the default models on the train/test fold currently bound to
# GAN_train_fold / GAN_test_fold; as the cell's last expression, the returned
# score mapping is displayed by the notebook.
run_models_with_GAN(
    train_df=GAN_train_fold,
    test_df=GAN_test_fold,
    models=models_defualt,
)




_____________
 Logisitic 
_____________


 Score.................................. = 0.567


_____________
 XGBOOST 
_____________


 Score.................................. = 0.575


_____________
 Tree 
_____________


 Score.................................. = 0.576


_____________
 LGBM 
_____________


 Score.................................. = 0.567


_____________
 CatBoost 
_____________

Learning rate set to 0.041727
0:	learn: 0.6250821	total: 36.1ms	remaining: 36s
1:	learn: 0.5896625	total: 46.3ms	remaining: 23.1s
2:	learn: 0.5335367	total: 76.4ms	remaining: 25.4s
3:	learn: 0.5080935	total: 101ms	remaining: 25.2s
4:	learn: 0.4731665	total: 126ms	remaining: 25.1s
5:	learn: 0.4386240	total: 149ms	remaining: 24.7s
6:	learn: 0.4227538	total: 159ms	remaining: 22.5s
7:	learn: 0.3960590	total: 168ms	remaining: 20.8s
8:	learn: 0.3833778	total: 177ms	remaining: 19.5s
9:	learn: 0.3646010	total: 186ms	remaining: 18.4s
10:	learn: 0.3552011	total: 195ms	remaining: 17.5s
11:	learn: 0.3412

{'Logisitic': 0.5667188040261476,
 'XGBOOST': 0.5748048592563552,
 'Tree': 0.576349921998154,
 'LGBM': 0.5673497613660805,
 'CatBoost': 0.5700267357112325,
 'RandomForestClassifier': 0.5529534520921647}