# Assignment 2: Neural Network (One Hidden Layer) with Optimizer

<h2> <b> <u> Dataset background:</u></b> </h2>
<ul>
    <li>Data: Diabetic Encounters (1-14 days/each) from 130 Hospitals for 10 years (1999-2008) </li>
    <li>Goal: Predict if a diabetic patient will be readmitted to a hospital (less than 30 days, after 30 days, or never)</li>
    <li>Target Feature: readmitted </li>
    <li> <a href = "https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008">Dataset Source</li>
</ul>



In [6]:
## import all required libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

#display all columns of dataframe
pd.pandas.set_option('display.max_columns', None) 

In [7]:
#import dataset 
dataset_url = "https://raw.githubusercontent.com/ronakHegde98/CS-4372-Computational-Methods-for-Data-Scientists/master/data/diabetic_data.csv"
df = pd.read_csv(dataset_url)

print(f"Initial Dataset Shape: {df.shape}")
df.sample(5)

Initial Dataset Shape: (101766, 50)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
11522,47603976,53229249,Caucasian,Male,[70-80),?,1,1,7,2,?,InternalMedicine,48,0,10,0,0,1,780.0,780,332,8,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,>30
26691,87798972,27645615,AfricanAmerican,Male,[40-50),?,5,1,17,1,CP,Family/GeneralPractice,18,0,12,0,0,0,250.2,530,278,3,>300,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO
50457,151582878,99473202,Asian,Male,[70-80),?,1,6,7,5,CM,?,51,0,1,0,0,0,805.0,250,E884,4,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
4889,26569620,17707707,Caucasian,Male,[80-90),?,1,11,7,9,?,Family/GeneralPractice,67,6,26,0,0,1,428.0,518,585,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
2959,18860946,53593200,Caucasian,Male,[80-90),[50-75),6,3,17,10,?,?,48,1,20,0,0,0,507.0,427,593,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,<30


In [8]:
## check if patients have multiple records
print(f"There are {np.sum(df['patient_nbr'].value_counts() > 1)} patients with multiple records")

There are 16773 patients with multiple records


In [9]:
categorical_cols = [col for col in df.columns if df[col].dtype == np.dtype(np.object)]
print(f"There are {len(categorical_cols)} categorical columns and {len(df.columns)-len(categorical_cols)} numerical columns")

There are 37 categorical columns and 13 numerical columns


<h2> Handling Missing Values </h2>

In [10]:
## sum all missing values for each row of df (axis 0 is row)
missing_count = np.sum(np.sum(np.equal(df, '?'), axis=0))
print(f"There are {missing_count} '?' values in our dataset which is approx {np.round((missing_count/(np.multiply(df.shape[0], df.shape[1])))*100,2)}% of our entire dataset")

There are 192849 '?' values in our dataset which is approx 3.79% of our entire dataset


In [11]:
## convert ?'s into np.nan
df.replace("?", np.nan, inplace=True)

In [12]:
print("Columns with missing data")
missing_cols = df.columns[df.isnull().any()].tolist()
for col in missing_cols:
    print(' ' + col + ': ' + str(df[col].isna().sum()))

Columns with missing data
 race: 2273
 weight: 98569
 payer_code: 40256
 medical_specialty: 49949
 diag_1: 21
 diag_2: 358
 diag_3: 1423


In [13]:
## drop rows where gender is Unknown/Invalid
df.drop(df[df['gender'] == "Unknown/Invalid"].index, axis=0, inplace=True)

## dropping columns that have many missing values
dropped_columns = ['weight', 'payer_code', 'medical_specialty']
dropped_columns.append("encounter_id")
dropped_columns.append('discharge_disposition_id')

## dropping columns that have little to no variability
for col in categorical_cols:
    if(df[col].value_counts(normalize=True).max() > 0.948):
        dropped_columns.append(col)
        
df.drop(columns=dropped_columns, axis=1, inplace=True)
df.dropna(inplace=True)

<h2> Some Patients have multiple records </h2>

In [14]:
## one record per patient (where they had max of time_in_hospital)
df = df.loc[df.groupby("patient_nbr", sort=False)['time_in_hospital'].idxmax()]
df.drop(columns = ['patient_nbr'], inplace=True)

In [15]:
## convert our categorical variable (if readmitted -> 1 else 0)
df['readmitted'] = np.where(df['readmitted']!='NO',1,0)

In [16]:
## convert age ranges to the midpoint of the ranges
new_ages = {
    "[0-10)": 5,
    "[10-20)": 15,
    "[20-30)": 25,
    "[30-40)": 35,
    "[40-50)": 45,
    "[50-60)": 55,
    "[60-70)": 65,
    "[70-80)": 75,
    "[80-90)": 85,
    "[90-100)": 95
}

df['age'] = df['age'].map(new_ages)

In [17]:
max_glu_serums = {
    "None": 0,
    "Norm": 100,
    ">200": 200,
    ">300": 300
}
df['max_glu_serum'] = df['max_glu_serum'].map(max_glu_serums)

In [18]:
A1CResult_map = {
    "None": 0,
    "Norm": 5,
    ">7": 7,
    ">8": 8
}
df['A1Cresult'] = df['A1Cresult'].map(A1CResult_map)

In [19]:
#converting binary variables into -1 or 1
df['change'] = np.where(df['change']=='No',-1,1)
df['diabetesMed'] = np.where(df['diabetesMed']=='No',-1,1)

In [20]:
drug_codes = {
    "No": -20,
    "Down": -10, 
    "Steady": 0,
    "Up": 10    
}
drugs = ['metformin','glipizide','glyburide', 'pioglitazone', 'rosiglitazone','insulin'] 
for drug in drugs:
    df[drug] = df[drug].map(drug_codes)

In [21]:
## mapping diagnosis categories according to paper (else 800 plus features)
diagnosis_cols = ['diag_1', 'diag_2', 'diag_3']

for col in diagnosis_cols:
    df['tmp'] = np.nan
    df.loc[(df[col].str.contains("250")), col] = '250'
    df.loc[(df[col].str.startswith('V')) | (df[col].str.startswith('E')), col] = '-999' 

    df[col] = df[col].astype(float)
    
    #convert the correct ranges based on values given in paper
    df.loc[(((df[col] >=390) & (df[col]<=460)) | (df[col] == 785)), 'tmp'] = 'Circulatory'
    df.loc[(((df[col] >=460) & (df[col]<=519)) | (df[col] == 786)), 'tmp'] = 'Respiratory'
    df.loc[(((df[col] >=520) & (df[col]<=579)) | (df[col] == 787)), 'tmp'] = 'Digestive'
    df.loc[(((df[col] >=580) & (df[col]<=629)) | (df[col] == 788)), 'tmp'] = 'Genitourinary'
    df.loc[((df[col] >=800) & (df[col]<=999)), 'tmp'] = 'Injury'
    df.loc[((df[col] >=710) & (df[col]<=739)), 'tmp'] = 'Musculoskeletal'
    df.loc[((df[col] >=140) & (df[col]<=239)), 'tmp'] = 'Neoplasms'
    df.loc[(df[col] == 250), 'tmp'] = 'Diabetes'
    
    df['tmp'].fillna(value = "Other", inplace=True)
    
    df[col] = df['tmp']
    df.drop(columns=['tmp'], inplace=True)
    

In [22]:
## admission_source_id
df['tmp'] = np.nan
col = 'admission_source_id'
df.loc[((df[col].between(4,6)) | (df[col] == 10) | (df[col] == 18) | (df[col] == 22) | (df[col].between(25,26))), 'tmp'] = "Transfer_Source"
df.loc[df[col].between(1,3), 'tmp'] = "Referral_Source"
df.loc[((df[col].between(11,14))| (df[col].between(23,24))), 'tmp'] = "Birth_Source"
df.loc[df[col] == 7, 'tmp'] = "Emergency_Source"
df.loc[((df[col] == 8) | (df[col]==19)), 'tmp'] = "Other"
        
df['tmp'].fillna(value = "Unknown", inplace=True)
df[col] = df['tmp']
df.drop(columns=['tmp'], inplace=True)


##mapping admission type_id
df['tmp'] = np.nan
col = 'admission_type_id'
df.loc[df[col] == 1, 'tmp'] = 'Emergency_Type'
df.loc[df[col] == 2, 'tmp'] = 'Urgent_Type'
df.loc[df[col] == 3, 'tmp'] = 'Elective_Type'
df.loc[df[col] == 7, 'tmp'] = 'Trauma_Type'
df.loc[df[col] == 4, 'tmp'] = 'Newborn_Type'

df['tmp'].fillna(value = "Unknown", inplace=True)
df[col] = df['tmp']
df.drop(columns=['tmp'], inplace=True)


In [23]:
def one_hot_encoder(df, cols):
    """one-hot encoding function for all our categorical columns"""
    
    for col in cols:
        if("admission" in col):
            dummies = pd.get_dummies(df[col], drop_first=False)
        else:
            dummies = pd.get_dummies(df[col], prefix=col, drop_first=False)
        df = pd.concat([df, dummies], axis=1)   
        df.drop([col],axis=1, inplace=True)
    return df

In [24]:
#one-hot encoding 
categorical_columns = [col for col in df.columns if df[col].dtype == np.dtype(object)]
df = one_hot_encoder(df, categorical_columns)
df.columns = map(str.lower, df.columns)

#train-test-split
target_variable = 'readmitted'
Y_feature = df[target_variable]
X_features = df.drop(columns=[target_variable])
X_train, X_test, y_train, y_test = train_test_split(X_features,Y_feature, test_size=0.2, random_state = 42)

In [25]:
# normalize of numerical columns
mm_scaler = MinMaxScaler()
X_train = pd.DataFrame(mm_scaler.fit_transform(X_train), columns = X_train.columns) 
X_test = pd.DataFrame(mm_scaler.fit_transform(X_test), columns = X_test.columns)

In [77]:
class NeuralNet:

    def __init__(self, X_train, y_train, h=4):
        #np.random.seed(1)
        # h represents the number of neurons in the hidden layers
        self.X = X_train
        self.y = y_train

        # Find number of input and output layers from the dataset
        input_layer_size = self.X.shape[1]
        
        self.output_layer_size = 1

        # assign random weights to matrices in network
        # number of weights connecting layers = (no. of nodes in previous layer) x (no. of nodes in following layer)
        self.W_hidden = 2 * np.random.random((input_layer_size, h)) - 1
        self.Wb_hidden = 2 * np.random.random((1, h)) - 1

        self.W_output = 2 * np.random.random((h, self.output_layer_size)) - 1
        self.Wb_output = np.ones((1, self.output_layer_size))

        self.deltaOut = np.zeros((self.output_layer_size, 1))
        self.deltaHidden = np.zeros((h, 1))
        self.h = h


    def __activation(self, x, activation):
        if activation == "sigmoid":
            self.__sigmoid(self, x)
        elif activation == "tanh":
            self.__tanh(self,x)
        elif activation == "relu":
            self.__relu(self,x)
     

    def __activation_derivative(self, x, activation):
        if activation == "sigmoid":
            self.__sigmoid_derivative(self, x)
        elif activation == "tanh":
            self.__tanh_derivative(self,x)
        elif activation == "relu":
            self.__relu_derivative(self,x)

    def __sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def __tanh(self, x):
        return np.tanh(x)
    
    def __relu(self, x):
        return np.maximum(0, x)

    def __sigmoid_derivative(self, x):
        return x * (1 - x)
    
    def __tanh_derivative(self, x):
        return 1-(np.tanh(x))**2
    
    def __relu_derivative(self,x):
        return (x>0)*1


    # Below is the training function

    def train(self, activation, max_iterations=2, learning_rate=0.01):
        for iteration in range(max_iterations):
            out = self.forward_pass(activation)
            
            error = 0.5 * np.power((out - self.y), 2)
            # TODO: I have coded the sigmoid activation, you have to do the rest
            self.backward_pass(out, activation)
            
            print(self.deltaOut)
#             print(self.deltaHidden)
            break
            update_weight_output = learning_rate * np.dot(self.X_hidden.T, self.deltaOut)
            update_weight_output_b = learning_rate * np.dot(np.ones((np.size(self.X, 0), 1)).T, self.deltaOut)

            update_weight_hidden = learning_rate * np.dot(self.X.T, self.deltaHidden)
            update_weight_hidden_b = learning_rate * np.dot(np.ones((np.size(self.X, 0), 1)).T, self.deltaHidden)

            self.W_output += update_weight_output
            self.Wb_output += update_weight_output_b
            self.W_hidden += update_weight_hidden
            self.Wb_hidden += update_weight_hidden_b
            
#             print(self.Wb_hidden)
            
            print(f"Error for iteration {iteration} is {np.sum(error)}")
#         print("After " + str(max_iterations) + " iterations, the total error is " + str(np.sum(error)))
#         print("The final weight vectors are (starting from input to output layers) \n" + str(self.W_hidden))
#         print("The final weight vectors are (starting from input to output layers) \n" + str(self.W_output))

#         print("The final bias vectors are (starting from input to output layers) \n" + str(self.Wb_hidden))
#         print("The final bias vectors are (starting from input to output layers) \n" + str(self.Wb_output))

    def forward_pass(self, activation):
        # pass our inputs through our neural network
        in_hidden = np.dot(self.X, self.W_hidden) + self.Wb_hidden

        # TODO: I have coded the sigmoid activation, you have to do the rest
        if activation == "sigmoid":
            self.X_hidden = self.__sigmoid(in_hidden)
        elif activation == "tanh":
            self.X_hidden = self.__tanh(in_hidden)
        elif activation == "relu":
            self.X_hidden = self.__relu(in_hidden)

        in_output = np.dot(self.X_hidden, self.W_output) + self.Wb_output

        # output 
        if activation == "sigmoid":
            out = self.__sigmoid(in_output)
        elif activation == "tanh":
            out = self.__tanh(in_output)
        elif activation == "relu":
            out = self.__relu(in_output)
        return out

    def backward_pass(self, out, activation):
        # pass our inputs through our neural network
        self.compute_output_delta(out, activation)
        self.compute_hidden_delta(activation)
        print(self.deltaHidden)
        

    # TODO: Implement other activation functions

    def compute_output_delta(self, out, activation):
        if activation == "sigmoid":
            delta_output = (self.y - out) * (self.__sigmoid_derivative(out))
        elif activation == "tanh":
            delta_output = (self.y - out) * (self.__tanh_derivative(out))
        elif activation == "relu":
            delta_output = (self.y - out) * (self.__relu_derivative(out))

        self.deltaOut = delta_output

    def compute_hidden_delta(self, activation):
        if activation == "sigmoid":
            delta_hidden_layer = (self.deltaOut.dot(self.W_output.T)) * (self.__sigmoid_derivative(self.X_hidden))
        elif activation == "tanh":
            delta_hidden_layer = (self.deltaOut.dot(self.W_output.T)) * (self.__tanh_derivative(self.X_hidden))
        elif activation == "relu":
            delta_hidden_layer = (self.deltaOut.dot(self.W_output.T)) * (self.__relu_derivative(self.X_hidden))
            
        self.deltaHidden = delta_hidden_layer

    # TODO: Implement the predict function for applying the trained model on the  test dataset.
    # You can assume that the test dataset has the same format as the training dataset
    # You have to output the test error from this function

    def predict(self, X_test, y_test):
        pass


In [79]:
# y_train = y_train.values.reshape(y_train.shape[0],1)
# y_test = y_test.values.reshape(y_test.shape[0],1)
nn_model = NeuralNet(X_train,y_train, h=20 )
nn_model.train(activation="relu")

[[ 0.         -0.         -0.34431661 ... -0.31159388  0.
   0.30443981]
 [-0.          0.          0.07088055 ...  0.         -0.02074915
  -0.06267157]
 [-0.          0.          0.         ...  0.41093099 -0.
  -0.40149618]
 ...
 [-0.          0.          0.         ...  1.24105232 -0.
  -1.21255826]
 [-0.          0.          1.49394449 ...  0.         -0.43732833
  -1.32092432]
 [-0.75262946  0.          1.28906129 ...  1.1665531  -0.
  -1.13976952]]
[[ 0.37983413]
 [-0.07819214]
 [-0.50092645]
 ...
 [-1.51284754]
 [-1.64805038]
 [-1.42203272]]
[[ 0.         -0.         -0.34431661 ... -0.31159388  0.
   0.30443981]
 [-0.          0.          0.07088055 ...  0.         -0.02074915
  -0.06267157]
 [-0.          0.          0.         ...  0.41093099 -0.
  -0.40149618]
 ...
 [-0.          0.          0.         ...  1.24105232 -0.
  -1.21255826]
 [-0.          0.          1.49394449 ...  0.         -0.43732833
  -1.32092432]
 [-0.75262946  0.          1.28906129 ...  1.1665531  -0.


In [74]:
X_train.iloc[:,0:3]

Unnamed: 0,age,time_in_hospital,num_lab_procedures
0,0.555556,0.153846,0.366412
1,0.666667,0.153846,0.206107
2,0.666667,0.461538,0.351145
3,0.777778,0.615385,0.358779
4,0.555556,0.000000,0.320611
...,...,...,...
54898,0.777778,0.307692,0.358779
54899,0.333333,0.076923,0.167939
54900,0.666667,0.230769,0.312977
54901,0.777778,0.076923,0.351145
