In [1]:
# References
# https://www.kaggle.com/hkubra/predicting-the-readmission-of-diabetic-patient-s#7.-Hyperparameter-Tuning
# https://www.kaggle.com/iabhishekofficial/prediction-on-hospital-readmission?select=diabetic_data.csv
# https://www.kaggle.com/alibaris/eda-vis-on-diabetes-data
# https://aws.amazon.com/getting-started/hands-on/build-train-deploy-machine-learning-model-sagemaker/

import os
import boto3
import re
import sagemaker
#Loading libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score,recall_score,mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.metrics import classification_report
import sklearn.metrics as metrics
from sklearn.metrics import precision_recall_fscore_support

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# SageMaker context shared by the rest of the notebook:
#   region - region name of the default boto3 session
#   sess   - SageMaker session object
#   role   - execution role returned by sagemaker.get_execution_role()
region = boto3.Session().region_name
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

In [3]:
# Load the diabetes dataset from the local data directory into `df`.
df = pd.read_csv("./data/diabetic_data.csv")

In [7]:
# Binarise the target: 1 where readmitted == '<30', 0 for every other value.
# NOTE: this overwrites the column, so re-running the cell on the already
# binarised frame yields all zeros.
df['readmitted'] = df['readmitted'].eq('<30').astype('int64')

In [8]:
# Treat the literal '?' as a missing-value marker: swap it for NaN in every
# column (df is modified in place).
df.replace({'?': np.nan}, inplace=True)

In [12]:
# Treat 'Unknown/Invalid' gender as missing, then drop the rows without a gender.
# The column is reassigned instead of calling `df.gender.replace(..., inplace=True)`:
# that chained in-place call operates on a temporary Series under pandas
# Copy-on-Write and would leave `df` (and therefore the dropna below) unaffected.
df['gender'] = df['gender'].replace('Unknown/Invalid', np.nan)
df.dropna(subset=['gender'], inplace=True)

# Encode the diabetesMed flag numerically: 'Yes' -> 1, 'No' -> 0.
df['diabetesMed'] = df['diabetesMed'].replace({'Yes': 1, 'No': 0})

In [None]:
# A1Cresult and max_glu_serum
# Collapse both lab-result columns to integer codes:
#   1   -> elevated reading ('>7' / '>8' for A1C, '>200' / '>300' for glucose)
#   0   -> 'Norm'
#   -99 -> 'None' (presumably: test not performed)
# pandas >= 2.0 parses the literal 'None' as NaN while reading the CSV, so the
# string replacement alone would leave NaN behind; fillna(-99) applies the same
# sentinel in that case and is a no-op when 'None' was kept as a string.
# astype('int64') pins the dtype; it raises if an undocumented category remains.
df['A1Cresult'] = (
    df['A1Cresult']
    .replace({'>7': 1, '>8': 1, 'Norm': 0, 'None': -99})
    .fillna(-99)
    .astype('int64')
)
df['max_glu_serum'] = (
    df['max_glu_serum']
    .replace({'>200': 1, '>300': 1, 'Norm': 0, 'None': -99})
    .fillna(-99)
    .astype('int64')
)

df.head()

### Train and Test Splitting

In [None]:
# Restrict the frame to the 12 predictor columns plus the `readmitted` target.
df = df.loc[:, [
    'gender', 'age', 'time_in_hospital', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses',
    'max_glu_serum', 'A1Cresult', 'insulin', 'diabetesMed',
    'readmitted',
]]
# Predictors (X) are everything except the target; Y is the target column.
X = df.drop(columns=['readmitted'])
Y = df['readmitted']
df.head()

In [24]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible. train_test_split comes from the notebook's import cell, so it
# is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42
)

In [None]:
# Work on explicit copies so the column assignments below modify independent
# frames rather than objects derived from X (avoids SettingWithCopy issues).
X_train = X_train.copy()
X_test = X_test.copy()

# Encode the age column as ordinal integers. The encoder is fitted on the
# training split only and then applied to the test split.
# NOTE(review): OrdinalEncoder orders categories by sorted value; presumably the
# age values are bracket strings whose sorted order matches age order - verify.
ordinal_enc = OrdinalEncoder()
X_train['age'] = ordinal_enc.fit_transform(X_train['age'].values.reshape(-1, 1)).ravel()
X_test['age'] = ordinal_enc.transform(X_test['age'].values.reshape(-1, 1)).ravel()

# gender and insulin are still string-valued at this point; the CSV that is
# written for training must be fully numeric, so encode them as well.
# A separate encoder is used so `ordinal_enc` stays fitted on age alone.
nominal_cols = ['gender', 'insulin']
nominal_enc = OrdinalEncoder()
X_train[nominal_cols] = nominal_enc.fit_transform(X_train[nominal_cols].values)
X_test[nominal_cols] = nominal_enc.transform(X_test[nominal_cols].values)

In [27]:
# S3 bucket for saving code and model artifacts.
# Reuses the SageMaker session created earlier in the notebook (`sess`)
# instead of constructing a second Session just for this lookup.
bucket = sess.default_bucket()

In [28]:
train_file = "train.csv"
test_file = "test.csv"
prefix = "sagemaker/admission-risk-prediction"  # place to upload training files within the bucket

# Write both splits with the label as the first column.
# The training file has no header row; the test file keeps its header row.
pd.concat([y_train, X_train], axis=1).to_csv(train_file, index=False, header=False)
pd.concat([y_test, X_test], axis=1).to_csv(test_file, index=False, header=True)

# S3 object keys always use '/' as the separator, so join them explicitly
# rather than with os.path.join (which follows the local OS convention).
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('/'.join([prefix, 'train', train_file])).upload_file(train_file)
s3_bucket.Object('/'.join([prefix, 'test', test_file])).upload_file(test_file)

# Training channel: points at the train/ prefix uploaded above, read as CSV.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/train'.format(bucket, prefix),
    content_type='csv',
)

In [None]:
# Look up the XGBoost container image URI for the current region.
# NOTE(review): the "latest" tag presumably resolves to a legacy XGBoost image;
# consider pinning an explicit version - confirm against the SageMaker docs
# (the `silent` hyperparameter below may need to change with it).
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "latest")

print(
    "Success - the MySageMakerInstance is in the "
    + region
    + " region. You will use the "
    + xgboost_container
    + " container for your SageMaker endpoint."
)

# Estimator that runs the container on a single ml.m4.xlarge instance and
# writes model artifacts under <bucket>/<prefix>/output.
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(
    xgboost_container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sess,
)

# Binary classifier hyperparameters, 100 boosting rounds.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)

In [None]:
# Launch the training job, feeding the uploaded CSV as the 'train' channel.
xgb.fit(inputs={'train': s3_input_train})

In [None]:
# Deploy the trained model behind an endpoint on one ml.m4.xlarge instance;
# the returned predictor object is kept in `xgb_predictor`.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)