# Import Statements

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import random
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA 
import math
random.seed(45)

# Train test split

In [2]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size*len(df))#Calculating the test size 

    indices = df.index.tolist()
    test_indices = random.sample(population = indices, k = test_size)#Random sampling of data

    test_df = df.loc[test_indices]#Getting the test data set
    train_df = df.drop(test_indices)#Getting the train data set
    return train_df, test_df


# Handling missing values

In [21]:
def handle_missing_values(df):
    #Columns with missing values: Bed Grade, City_Code_Patient
    
    # Method 1.Replacing with most common value
    df["Bed Grade"].fillna(df["Bed Grade"].value_counts().index[0], inplace=True)
    df["City_Code_Patient"].fillna(df["City_Code_Patient"].value_counts().index[0], inplace=True)
    


# Encoding categorical variables

In [4]:
def encode_data(df):
    cleanup_nums = {"Department":     {"gynecology": 0, "anesthesia": 1, "radiotherapy": 2, "TB & Chest disease": 3, "surgery": 4 },
                "Ward_Type": {"R": 0, "Q": 1, "S": 2, "P": 3, "T": 4, "U": 5 },
                "Type of Admission": {"Trauma": 0, "Emergency": 1, "Urgent": 2 },
                "Severity of Illness": {"Minor": 0, "Moderate": 1, "Extreme": 2},
                "Age": {"0-10": 0, "11-20": 1, "21-30": 2, "31-40": 3, "41-50": 4, "51-60": 5, "61-70": 6, "71-80": 7, "81-90": 8, "91-100": 9},
                "Stay": {"0-10": 0, "11-20": 1, "21-30": 2, "31-40": 3, "41-50": 4, "51-60": 5, "61-70": 6, "71-80": 7, "81-90": 8, "91-100": 9, "More than 100 Days": 10}, 
                "Hospital_type_code": {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4, "f": 5, "g": 6}, 
                "Hospital_region_code": {"X": 0, "Y": 1, "Z": 2}, 
                "Ward_Facility_Code": {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5}}

    df.replace(cleanup_nums, inplace=True)
    return df

# Train using Naive Bayes Classifier

In [5]:
def train(df,target, attributes):

    label_classes = df[target].unique()
    '''
    Data structure used to store probabilities : Dictionary of dictionary of one dimensional arrays.
    Element prob_dict[k][j][i] stores probability of occurence of jth value of kth attribute given that ith class is TRUE
    
    Simpler Format : prob_dict[attribute][attr_value][class]

    '''
    prob_dict = {}    #Main Dictionary 

    for attribute in attributes:
        print(attribute)
        attr_values = df[attribute].unique()
        conditional_prob_dict = {}                #Stores For all attr_value -> [ P(attr_value | class1), P(attr_value | class2), P(attr_value | class3)]
        
        for attr_value in attr_values:
            conditional_prob = {}
            for label_class in label_classes:  
                df_filtered=df[df[target]==label_class]
                denominator=len(df_filtered)
                df_filtered1=df_filtered[df_filtered[attribute]==attr_value]
                numerator = len(df_filtered1)+ 1  #Adding 1 to avoid zero proabability
                
                probability = numerator / denominator
                
                conditional_prob[label_class]=probability

            conditional_prob_dict[attr_value] = conditional_prob       #[ P(attr_value | class1), P(attr_value | class2), P(attr_value | class3)]

        prob_dict[attribute] = conditional_prob_dict

        #Calculating class probabilities
        prob_class={}
        for label_class in label_classes:
            prob_class[label_class] = (df[df[target] == label_class].shape[0] / df.shape[0])

    return prob_dict,prob_class

# Prediction and Accuracy

In [6]:
def predict(prob_dict,prob_class,attributes,label_classes,datapoint,df):
    prob_max=0
    for label_class in label_classes:
        prob=1
        for attribute in attributes:
            try:
                prob = prob * prob_dict[attribute][datapoint[attribute]][label_class]
            except KeyError:
                prob=1/len(df)
      
        prob=prob*prob_class[label_class]
      
        if prob > prob_max:
            prob_max = prob
            label = label_class

    return label

def getPrediction(prob_dict,prob_class,attributes,label_classes,test,df):
    preds=[]
    for i in range(0,len(test)):
        #print(i)
        preds.append(predict(prob_dict,prob_class,attributes,label_classes,test.iloc[i],df))
    return preds

def getAccuracy(preds,labels):
    if len(preds) != len(labels):
        print("Shape of Actual label array & predicted label array should be same.")
        exit(1)
     
    correct_count=0
    for i in range(0,len(preds)):
        #print(i)
        if(preds[i]==labels[i]):
            correct_count+=1

    return correct_count/len(preds)

In [9]:
#Helper function
def createBin(x,max_val):
    frac = x/max_val
    if frac < 0.25:
        return 0
    elif frac >=0.25 and frac < 0.5:
        return 1
    elif frac >=0.5 and frac < 0.75:
        return 2
    else:
        return 3
df=pd.read_csv("Train_B.csv")
handle_missing_values(df)

#converting continuous feature into categorical
feature="Admission_Deposit"
max_val = df[feature].max()
df[feature] = df[feature].apply(lambda x: createBin(x,max_val))
target=df.columns[-1]
label_classes=df[target].unique()
df.drop(['case_id','patientid'], axis=1, inplace=True)


train_df, test_df=train_test_split(df,0.2)
attributes=train_df.columns[:-1].tolist()

In [10]:
 df = encode_data(df)

# Removing samples with maximum outlier features

In [14]:
# Removing a row if Age and Admission deposit values lie outside the given range
print(df.shape)
mean_age=df["Age"].mean()
mean_adm=df["Admission_Deposit"].mean()
sd_age=np.sqrt(df["Age"].var())
sd_adm=np.sqrt(df["Admission_Deposit"].var())
filt=(df["Age"]<=mean_age+3*sd_age) & (df["Admission_Deposit"] <=mean_adm+3*sd_adm)
df=df.loc[filt]
df.shape

(318438, 16)


(314909, 16)

# Sequential Backward Selection

In [19]:
validation_set_size = round(0.2*len(train_df))  #Validation set for testing acc in feature selection
indices = train_df.index.tolist()
validation_set_indices = random.sample(population = indices, k = validation_set_size)#Random sampling of data
validation_set_df = train_df.loc[validation_set_indices]
train_df = train_df.drop(validation_set_indices)

attributes_new = attributes
#print(attributes)
#Training with all attributes to initialise maximum accuracy
probs,prob_class = train(train_df,target,attributes_new)
preds = getPrediction(probs,prob_class,attributes_new,label_classes,validation_set_df,df)
max_acc = getAccuracy(preds,validation_set_df[target].tolist())

#print("##########Initial acc=",max_acc)


#print("Initial set of attributes:",attributes)    
for i in range(len(attributes)):
    attribute_to_be_removed=""
    flg=0
    j=0
    for attribute in (attributes):
        train_df2=train_df.drop([attribute],axis=1)
        attributes_new=attributes[:]
        print(attributes_new)
        attributes_new.remove(attribute)
        #print("##########",i,j)
        #print("##########  Attributes_size=",len(attributes))
        #print("##########  Attributes_new_size=",len(attributes_new))
        probs,prob_class = train(train_df2,target,attributes_new)
        preds = getPrediction(probs,prob_class,attributes_new,label_classes,validation_set_df,df)
        acc = getAccuracy(preds,validation_set_df[target].tolist())
        #print("############ Acc=",acc)
        if acc>max_acc:
            max_acc=acc
            attribute_to_be_removed=attribute
            flg=1
        j=j+1
    if (flg==1):
        train_df.drop([attribute_to_be_removed],axis=1,inplace=True)
        attributes.remove(attribute_to_be_removed)
    else:
        break
print("Final set of attributes:",attributes)    
        
    

Hospital_code
Hospital_type_code
City_Code_Hospital
Hospital_region_code
Available Extra Rooms in Hospital
Department
Ward_Type
Ward_Facility_Code
Bed Grade
City_Code_Patient
Type of Admission
Severity of Illness
Visitors with Patient
Age
Admission_Deposit
['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code', 'Available Extra Rooms in Hospital', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade', 'City_Code_Patient', 'Type of Admission', 'Severity of Illness', 'Visitors with Patient', 'Age', 'Admission_Deposit']
Hospital_type_code
City_Code_Hospital
Hospital_region_code
Available Extra Rooms in Hospital
Department
Ward_Type
Ward_Facility_Code
Bed Grade
City_Code_Patient
Type of Admission
Severity of Illness
Visitors with Patient
Age
Admission_Deposit
['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code', 'Available Extra Rooms in Hospital', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade', 'City_Code_P

In [88]:
df=pd.read_csv("Train_B.csv")
handle_missing_values(df)

#converting continuous feature into categorical
feature="Admission_Deposit"
max_val = df[feature].max()
df[feature] = df[feature].apply(lambda x: createBin(x,max_val))
target=df.columns[-1]
label_classes=df[target].unique()
df.drop(['case_id','patientid'], axis=1, inplace=True)

df = df[attributes]

train_df, test_df=train_test_split(df,0.2)

print("Training started...")
attributes_now=train_df.columns[:-1].tolist()
probs,prob_class = train(train_df,target,attributes_now)
print("Training completed.")

print("Prediction started...")
preds = getPrediction(probs,prob_class,attributes_now,label_classes,test_df,df)
print("Prediction completed...")
print("Getting accuracy...")
acc = getAccuracy(preds,test_df[target].tolist())

print("Accuracy=",acc)

Training started...
Hospital_code
Available Extra Rooms in Hospital
Department
Ward_Type
Bed Grade
City_Code_Patient
Type of Admission
Visitors with Patient
Age
Training completed.
Prediction started...
Prediction completed...
Getting accuracy...
Accuracy= 0.38102939329229996


In [92]:
print("Final Set of Attributes: ", attributes)

Final Set of Attributes:  ['Hospital_code', 'Available Extra Rooms in Hospital', 'Department', 'Ward_Type', 'Bed Grade', 'City_Code_Patient', 'Type of Admission', 'Visitors with Patient', 'Age']
