# Assignment 2: Neural Network (One Hidden Layer) with Optimizer

<h2> <b> <u> Dataset background:</u></b> </h2>
<ul>
    <li>Data: Diabetic Encounters (1-14 days/each) from 130 Hospitals for 10 years (1999-2008) </li>
    <li>Goal: Predict if a diabetic patient will be readmitted to a hospital (less than 30 days, after 30 days, or never)</li>
    <li>Target Feature: readmitted </li>
    <li> <a href = "https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008">Dataset Source</a></li>
</ul>



In [1307]:
## import all required libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

#display all columns of dataframe (None removes the column limit)
# use the documented top-level pd.set_option rather than reaching it through pd.pandas
pd.set_option('display.max_columns', None)

In [1308]:
#import dataset
dataset_url = "https://raw.githubusercontent.com/ronakHegde98/CS-4372-Computational-Methods-for-Data-Scientists/master/diabetic_data.csv"
# keep_default_na=False keeps every field exactly as written in the file.
# The literal string "None" is a real category in max_glu_serum / A1Cresult (both are
# mapped from "None" further down), but recent pandas releases list "None" among the
# default NA markers and would silently read it as NaN.
# Missing data in this file is marked with '?', which later cells handle explicitly.
df = pd.read_csv(dataset_url, keep_default_na=False)

print(f"Initial Dataset Shape: {df.shape}")
df.sample(5)

Initial Dataset Shape: (101766, 50)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
9394,41024826,94703481,Caucasian,Female,[70-80),?,1,5,7,6,?,Family/GeneralPractice,65,0,9,0,0,0,969,292.0,276,8,,>7,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,>30
67489,189575298,90680004,AfricanAmerican,Female,[70-80),?,1,1,7,3,MC,?,49,1,14,0,6,0,558,403.0,585,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
76305,230199294,79466571,Caucasian,Male,[50-60),?,1,1,7,3,BC,?,12,5,12,0,0,0,410,414.0,272,6,,>8,No,No,No,No,No,No,Steady,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,Ch,Yes,<30
9326,40807878,31320,Caucasian,Male,[50-60),?,2,1,1,2,?,Cardiology,47,0,9,0,0,2,482,491.0,403,9,,Norm,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,>30
35984,111004542,9731943,Caucasian,Female,[50-60),?,1,1,6,4,MC,Family/GeneralPractice,56,0,19,0,0,3,964,250.02,788,5,,,Steady,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [1309]:
## count the patients that appear in more than one encounter row
encounters_per_patient = df['patient_nbr'].value_counts()
repeat_patients = (encounters_per_patient > 1).sum()
print(f"There are {repeat_patients} patients with multiple records")

There are 16773 patients with multiple records


In [1310]:
# Columns stored with the generic object dtype are treated as categorical.
# Compare against the builtin `object`: the `np.object` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where accessing it raises AttributeError.
categorical_cols = [col for col in df.columns if df[col].dtype == object]
print(f"There are {len(categorical_cols)} categorical columns and {len(df.columns)-len(categorical_cols)} numerical columns")

There are 37 categorical columns and 13 numerical columns


<h2> Handling Missing Values </h2>

In [1311]:
## count the '?' placeholders: summing over axis 0 gives one total per column,
## and summing those totals gives the count for the whole frame
missing_count = df.eq('?').sum(axis=0).sum()
missing_pct = np.round(missing_count / df.size * 100, 2)
print(f"There are {missing_count} '?' values in our dataset which is approx {missing_pct}% of our entire dataset")

There are 192849 '?' values in our dataset which is approx 3.79% of our entire dataset


In [1312]:
## turn every '?' placeholder into a real missing value (np.nan)
df = df.replace(to_replace="?", value=np.nan)

In [1313]:
## list each column that contains at least one missing value, with its count
print("Columns with missing data")
null_counts = df.isna().sum()
missing_cols = null_counts[null_counts > 0].index.tolist()
for col in missing_cols:
    print(f" {col}: {null_counts[col]}")

Columns with missing data
 race: 2273
 weight: 98569
 payer_code: 40256
 medical_specialty: 49949
 diag_1: 21
 diag_2: 358
 diag_3: 1423


In [1314]:
## keep only the rows whose gender is not Unknown/Invalid
df = df[df['gender'] != "Unknown/Invalid"]

## columns removed outright: the three with many missing values,
## plus encounter_id and discharge_disposition_id
dropped_columns = [
    'weight', 'payer_code', 'medical_specialty',
    'encounter_id', 'discharge_disposition_id',
]

## categorical columns with little to no variability:
## a single value accounts for more than 94.8% of the non-missing entries
dropped_columns += [
    col for col in categorical_cols
    if df[col].value_counts(normalize=True).max() > 0.948
]

## drop the collected columns, then every row that still has a missing value
df = df.drop(columns=dropped_columns).dropna()

<h2> Some Patients have multiple records </h2>

In [1315]:
## keep a single row per patient: the encounter with the largest time_in_hospital
longest_stay_idx = df.groupby("patient_nbr", sort=False)['time_in_hospital'].idxmax()
df = df.loc[longest_stay_idx].drop(columns='patient_nbr')

In [1316]:
## binarize the target: 'NO' becomes 0, every other label becomes 1
df['readmitted'] = df['readmitted'].ne('NO').astype(int)

In [1317]:
## replace each ten-year age bracket with its midpoint, e.g. "[70-80)" -> 75
new_ages = {f"[{low}-{low + 10})": low + 5 for low in range(0, 100, 10)}

df['age'] = df['age'].map(new_ages)

In [1318]:
## numeric code for each max_glu_serum label; labels missing from the dict map to NaN
max_glu_serums = dict(zip(["None", "Norm", ">200", ">300"], [0, 100, 200, 300]))
df = df.assign(max_glu_serum=df['max_glu_serum'].map(max_glu_serums))

In [1319]:
## numeric code for each A1Cresult label; labels missing from the dict map to NaN
A1CResult_map = dict(zip(["None", "Norm", ">7", ">8"], [0, 5, 7, 8]))
df = df.assign(A1Cresult=df['A1Cresult'].map(A1CResult_map))

In [1320]:
#encode the two binary columns as -1 (value is 'No') or 1 (any other value)
for flag_col in ('change', 'diabetesMed'):
    df[flag_col] = np.where(df[flag_col].eq('No'), -1, 1)

In [1322]:
## numeric code for each label found in the medication columns listed in `drugs`
drug_codes = dict(No=-20, Down=-10, Steady=0, Up=10)
drugs = ['metformin', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin']
df = df.assign(**{drug: df[drug].map(drug_codes) for drug in drugs})

In [1323]:
## mapping diagnosis categories according to paper (else 800 plus features)
diagnosis_cols = ['diag_1', 'diag_2', 'diag_3']


def _categorize_diagnosis(codes):
    """Collapse raw diagnosis codes into nine coarse category labels.

    Parameters
    ----------
    codes : pd.Series
        Diagnosis codes stored as strings (e.g. "250.02", "414", or codes
        that start with "V" or "E").

    Returns
    -------
    pd.Series
        Object-dtype Series on the same index holding one of: Circulatory,
        Respiratory, Digestive, Genitourinary, Injury, Musculoskeletal,
        Neoplasms, Diabetes, Other.

    Raises
    ------
    ValueError
        If a code is neither numeric, V/E-prefixed, nor contains "250".
    """
    # Any code containing "250" is treated as diabetes. V/E-prefixed codes get
    # a sentinel that lies outside every numeric range, so they end up "Other".
    # na=False makes a missing code fall through to "Other" instead of
    # producing an unusable mask.
    is_diabetes = codes.str.contains("250", na=False)
    is_v_or_e = codes.str.startswith("V", na=False) | codes.str.startswith("E", na=False)
    numeric = codes.mask(is_v_or_e, "-999").mask(is_diabetes, "250").astype(float)

    # The ranges are mutually exclusive. Circulatory stops just below 460 so
    # that 460 is Respiratory, which is also what the previous overlapping,
    # overwrite-in-order assignments produced.
    rules = [
        (((numeric >= 390) & (numeric < 460)) | (numeric == 785), 'Circulatory'),
        (((numeric >= 460) & (numeric <= 519)) | (numeric == 786), 'Respiratory'),
        (((numeric >= 520) & (numeric <= 579)) | (numeric == 787), 'Digestive'),
        (((numeric >= 580) & (numeric <= 629)) | (numeric == 788), 'Genitourinary'),
        ((numeric >= 800) & (numeric <= 999), 'Injury'),
        ((numeric >= 710) & (numeric <= 739), 'Musculoskeletal'),
        ((numeric >= 140) & (numeric <= 239), 'Neoplasms'),
        (numeric == 250, 'Diabetes'),
    ]
    conditions = [mask.to_numpy() for mask, _ in rules]
    labels = [label for _, label in rules]

    # np.select builds the label array in one pass. This avoids writing strings
    # into a float NaN scratch column and the chained fillna(inplace=True),
    # both of which newer pandas versions warn about or no longer honour.
    categories = np.select(conditions, labels, default="Other")
    return pd.Series(categories.astype(object), index=codes.index)


for col in diagnosis_cols:
    df[col] = _categorize_diagnosis(df[col])
    

In [1324]:
def _label_by_rules(rules, index, default="Unknown"):
    """Return an object-dtype Series of labels chosen by the first matching rule.

    Parameters
    ----------
    rules : list of (boolean pd.Series, str)
        Each pair is a row mask and the label given to the rows it selects.
    index : pd.Index
        Index for the returned Series (the index the masks were built on).
    default : str
        Label for rows that no rule selects.
    """
    conditions = [mask.to_numpy() for mask, _ in rules]
    labels = [label for _, label in rules]
    # np.select builds the labels directly. No float NaN scratch column is
    # filled with strings, and no chained fillna(inplace=True) is needed;
    # newer pandas versions warn about or no longer honour both.
    chosen = np.select(conditions, labels, default=default)
    return pd.Series(chosen.astype(object), index=index)


## admission_source_id: collapse the numeric ids into named groups
col = 'admission_source_id'
ids = df[col]
df[col] = _label_by_rules(
    [
        (ids.between(4, 6) | ids.isin([10, 18, 22]) | ids.between(25, 26), "Transfer_Source"),
        (ids.between(1, 3), "Referral_Source"),
        (ids.between(11, 14) | ids.between(23, 24), "Birth_Source"),
        (ids == 7, "Emergency_Source"),
        (ids.isin([8, 19]), "Other"),
    ],
    index=df.index,
)


##mapping admission type_id
col = 'admission_type_id'
ids = df[col]
df[col] = _label_by_rules(
    [
        (ids == 1, 'Emergency_Type'),
        (ids == 2, 'Urgent_Type'),
        (ids == 3, 'Elective_Type'),
        (ids == 7, 'Trauma_Type'),
        (ids == 4, 'Newborn_Type'),
    ],
    index=df.index,
)


In [1325]:
def one_hot_encoder(df, cols):
    """One-hot encode the given columns and return the result as a new frame.

    Each column in ``cols`` is removed and its indicator columns are appended
    at the end, in the order the columns are listed. The frame passed in is
    not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column named in ``cols``.
    cols : iterable of str
        Columns to encode. Indicators are named ``<col>_<category>``, except
        for columns whose name contains "admission", which use the bare
        category as the indicator name.

    Returns
    -------
    pd.DataFrame
        The encoded frame. If a bare indicator name is already taken by
        another column (e.g. two admission columns that both contain the
        category "Unknown"), the new indicator is named ``<col>_<category>``
        so that column labels stay unique.
    """
    encoded = df
    for col in cols:
        if "admission" in col:
            dummies = pd.get_dummies(encoded[col], drop_first=False)
        else:
            dummies = pd.get_dummies(encoded[col], prefix=col, drop_first=False)

        remaining = encoded.drop(columns=[col])

        # Duplicate labels would make later column selections ambiguous,
        # so only the indicators that clash with an existing column are prefixed.
        taken = set(remaining.columns)
        clashes = {
            label: f"{col}_{label}" for label in dummies.columns if label in taken
        }
        if clashes:
            dummies = dummies.rename(columns=clashes)

        encoded = pd.concat([remaining, dummies], axis=1)
    return encoded

In [1326]:
#one-hot encode every remaining object-dtype column, then lower-case all column names
categorical_columns = df.columns[df.dtypes == object].tolist()
df = one_hot_encoder(df, categorical_columns)
df.columns = [str.lower(name) for name in df.columns]

#separate the target from the features and hold out 20% of the rows for testing
target_variable = 'readmitted'
X_features = df.drop(columns=[target_variable])
Y_feature = df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    Y_feature,
    test_size=0.2,
    random_state=42,
)

In [1327]:
# normalize of numerical columns
# The scaler learns each column's min/max from the training split only.
mm_scaler = MinMaxScaler()
X_train = pd.DataFrame(mm_scaler.fit_transform(X_train), columns = X_train.columns)
# Apply the training-set min/max to the test split with transform(), not fit_transform():
# refitting on the test rows would put the two splits on different scales and leak
# test-set statistics. Test values may therefore fall slightly outside [0, 1].
X_test = pd.DataFrame(mm_scaler.transform(X_test), columns = X_test.columns)
# NOTE(review): both frames get a fresh 0..n-1 index here while y_train / y_test keep the
# original row labels — confirm that later code pairs them by position, not by label.