In [2]:
#Importing the required libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize,MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sb    # for seeing visually how the data distribution

In [3]:
#Location of the dataset, given as a path relative to the working directory
file_name = 'out_pat_data.csv'
#Parsing the CSV into the dataframe that the following cells work on
df = pd.read_csv(filepath_or_buffer = file_name)
#Leaving the frame as the last expression so that the notebook renders a preview of it
df

Unnamed: 0.1,Unnamed: 0,X,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,1,1,66154,25312,118,68,22.730000,0,Caucasian,M,...,0,1,0,0,0,0,0,Sepsis,Cardiovascular,0
1,2,2,114252,59342,81,77,27.420000,0,Caucasian,F,...,0,1,0,0,0,0,0,Respiratory,Respiratory,0
2,3,3,119783,50777,118,25,31.950000,0,Caucasian,F,...,0,0,0,0,0,0,0,Metabolic,Metabolic,0
3,4,4,79267,46918,118,81,22.640000,1,Caucasian,F,...,0,0,0,0,0,0,0,Cardiovascular,Cardiovascular,0
4,5,5,92056,34377,33,65,27.654655,0,Caucasian,M,...,0,0,0,0,0,0,0,Trauma,Trauma,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91708,91709,91709,91592,78108,30,75,23.060250,0,Caucasian,M,...,0,1,0,0,0,0,1,Sepsis,Cardiovascular,0
91709,91710,91710,66119,13486,121,56,27.623893,0,Caucasian,F,...,0,0,0,0,0,0,0,Sepsis,Cardiovascular,0
91710,91711,91711,8981,58179,195,48,27.236914,0,Caucasian,M,...,0,1,0,0,0,0,0,Metabolic,Metabolic,0
91711,91712,91712,33776,120598,66,65,23.297481,0,Caucasian,F,...,0,0,0,0,0,0,0,Respiratory,Respiratory,0


In [4]:
#Dropping the columns 'X' and 'Unnamed: 0'
#Both are removed in a single call that returns a new frame instead of mutating df in place.
#errors='ignore' keeps the cell re-runnable: once the columns are gone, a second run is a
#no-op rather than a KeyError.
df = df.drop(columns = ['X', 'Unnamed: 0'], errors = 'ignore')

In [5]:
#Collecting the labels of every column whose dtype is int64 or float64
num_cols = (
    df
    .select_dtypes(include = ['int64', 'float64'])
    .columns
)
print('Numerical Cols=', num_cols)

Numerical Cols= Index(['encounter_id', 'patient_id', 'hospital_id', 'age', 'bmi',
       'elective_surgery', 'height', 'icu_id', 'pre_icu_los_days', 'weight',
       'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative',
       'arf_apache', 'gcs_eyes_apache', 'gcs_motor_apache',
       'gcs_unable_apache', 'gcs_verbal_apache', 'heart_rate_apache',
       'intubated_apache', 'map_apache', 'resprate_apache', 'temp_apache',
       'ventilated_apache', 'd1_diasbp_max', 'd1_diasbp_min',
       'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min',
       'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_max', 'd1_mbp_min',
       'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_resprate_max',
       'd1_resprate_min', 'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max',
       'd1_sysbp_min', 'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min',
       'd1_temp_max', 'd1_temp_min', 'h1_diasbp_max', 'h1_diasbp_min',
       'h1_heartrate_max', 'h1_heartrate_min', 'h1_respra

In [6]:
#Copying a hand-picked subset of 42 numerical columns into a new dataframe.
#The labels are grouped by family below; their order is the column order of new_df.
new_df = df.loc[:, [
    # general columns
    'age', 'bmi', 'pre_icu_los_days',
    # *_apache columns
    'apache_2_diagnosis', 'apache_3j_diagnosis',
    'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_verbal_apache',
    'heart_rate_apache', 'map_apache', 'resprate_apache', 'temp_apache',
    # d1_* columns
    'd1_diasbp_max', 'd1_diasbp_min',
    'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min',
    'd1_heartrate_max', 'd1_heartrate_min',
    'd1_mbp_max', 'd1_mbp_min',
    'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min',
    'd1_resprate_max', 'd1_resprate_min',
    'd1_spo2_max', 'd1_spo2_min',
    'd1_sysbp_max', 'd1_sysbp_min',
    'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min',
    'd1_temp_max', 'd1_temp_min',
    # h1_* columns
    'h1_diasbp_max', 'h1_diasbp_min',
    'h1_heartrate_max', 'h1_heartrate_min',
    'h1_resprate_max', 'h1_resprate_min',
    'h1_spo2_max', 'h1_spo2_min',
    'h1_sysbp_max', 'h1_sysbp_min',
]]
#(rows, columns) of the selection
new_df.shape

(91713, 42)

In [7]:
#Rescaling every selected column with min-max scaling so that all variables share a 0 to 1 scale
std = MinMaxScaler()
#First learn the per-column minimum and maximum ...
std.fit(new_df)
#... then apply the rescaling to the same data; the result is a plain array without labels
arr1 = std.transform(new_df)

In [8]:
#Storing the normalised data in another dataframe
#arr1 is the scaled version of new_df, so the column labels and the row index are taken
#straight from new_df instead of repeating the 42 names by hand; the labels can then never
#drift out of sync with the columns that were actually scaled.
df_normal = pd.DataFrame(arr1, columns = new_df.columns, index = new_df.index)
df_normal.head()

Unnamed: 0,age,bmi,pre_icu_los_days,apache_2_diagnosis,apache_3j_diagnosis,gcs_eyes_apache,gcs_motor_apache,gcs_verbal_apache,heart_rate_apache,map_apache,...,h1_diasbp_max,h1_diasbp_min,h1_heartrate_max,h1_heartrate_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_max,h1_sysbp_min
0,0.695652,0.253511,0.558739,0.057971,0.35527,0.666667,1.0,0.75,0.610687,0.0,...,0.373494,0.5,0.722772,0.727273,0.571429,0.541667,1.0,0.692308,0.444444,0.469697
1,0.826087,0.404298,0.95702,0.033816,0.143665,0.0,0.4,0.0,0.625954,0.0375,...,0.289157,0.317073,0.673267,0.646465,0.75,0.958333,0.285714,0.692308,0.15873,0.136364
2,0.072464,0.549941,0.000716,0.101449,0.497534,0.666667,1.0,1.0,0.48855,0.175,...,0.614458,0.439024,0.49505,0.424242,0.357143,0.458333,0.714286,0.307692,0.579365,0.537879
3,0.884058,0.250618,0.000716,0.492754,0.853511,1.0,1.0,1.0,0.580153,0.125,...,0.301205,0.268293,0.534653,0.606061,0.071429,0.25,1.0,0.923077,0.484127,0.401515
4,0.652174,0.411842,0.075931,0.086957,0.425333,1.0,1.0,1.0,0.167939,0.39375,...,0.746988,0.560976,0.425743,0.40404,0.392857,0.458333,1.0,1.0,0.436508,0.507576


In [None]:
#Using For Loop to get the Silhouette Score at different values of n_clusters
#NOTE(review): silhouette_score is called on the complete df_normal for every value of
#n_clusters, so this cell may take long to run - confirm the runtime is acceptable.
n = [2,3,4,5,6]
silhouette_sco = []                 # score for each entry of n, in the same order
max_score = float('-inf')           # best score seen so far (starts below any possible score)
max_n = 0                           # n_clusters value that produced max_score
for i in n:
    kmeans = KMeans(n_clusters = i, random_state = 45)
    # fit the model and label the very same rows in one step
    y_kmeans = kmeans.fit_predict(df_normal)
    val = silhouette_score(df_normal, y_kmeans)
    silhouette_sco.append(val)
    # keep track of the best number of clusters found so far
    if val > max_score:
        max_score = val
        max_n = i

print('Best n_clusters =', max_n, 'with silhouette score =', max_score)

In [None]:
#Bar chart with one bar per n_clusters value, the bar height being its Silhouette Score
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])        # [left, bottom, width, height] as fractions of the figure
ax.bar(n, silhouette_sco)
ax.set_title('Comparison of Number of Clusters on the basis of Silhoette Score')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhoette Score')
plt.show()

In [None]:
#Using KMeans for Number of Clusters value = 2 on every pair of neighbouring variables
#The variables are the columns of new_df, in order, so the pairs are
#(1st, 2nd), (2nd, 3rd), ... and every variable takes part in the clustering.
arr = list(new_df.columns)
#Defining the colour map: one colour per cluster label (0 and 1)
colors = ['#DF2020', '#81DF20']

for col_x, col_y in zip(arr[:-1], arr[1:]):
    #Clustering on the two current variables only. Labels and centroids stay in local
    #names, so df_normal keeps holding nothing but the scaled features and the other
    #cells can be re-run safely afterwards.
    pair = df_normal[[col_x, col_y]]
    kmeans = KMeans(n_clusters = 2, random_state = 45)
    labels = kmeans.fit_predict(pair)

    #Get the centroids (one row per cluster; column 0 is col_x, column 1 is col_y)
    centroids = kmeans.cluster_centers_
    cen_x = centroids[:, 0]
    cen_y = centroids[:, 1]

    #Plotting the total numbers of elements in each cluster
    count = pd.Series(labels).value_counts().sort_index()
    fig, ax = plt.subplots()
    ax.bar(count.index, count.values)
    plt.xticks(count.index, color='black', fontweight='bold', fontsize=10, horizontalalignment='right')
    ax.set_xlabel('Clusters')
    ax.set_ylabel('Counts')
    #The pair is part of the title so the many count charts can be told apart
    ax.set_title(f'Cluster Counts ({col_x} vs {col_y})')
    plt.show()
    plt.close(fig)

    #Plotting the scatter plot, every point coloured by its cluster, centroids on top
    point_colors = [colors[label] for label in labels]
    fig, ax = plt.subplots()
    ax.scatter(pair[col_x], pair[col_y], c = point_colors, alpha = 0.6, s = 10)
    ax.scatter(cen_x, cen_y, c = 'black', marker = 'X', s = 80, label = 'centroids')
    ax.set_title("scatter plot")
    ax.set_xlabel(col_x)
    ax.set_ylabel(col_y)
    ax.legend()
    #Show and release the figure inside the loop so it is rendered next to its count
    #chart and the figures do not pile up in memory
    plt.show()
    plt.close(fig)


In [None]:
#Storing the headings of the selected numerical variables for drawing histograms and checking for anomaly
#The headings come from new_df. A dedicated name is used on purpose: arr1 already holds
#the scaled array that the df_normal cell reads, and it must not be overwritten here.
hist_cols = list(new_df.columns)

for i in hist_cols:           # one histogram per selected numerical variable
    print(i)                  # printing the variable name to check that every variable is included
    sb.displot(df, x=i)       # histogram of the raw (unscaled) values to see how the data is distributed
    plt.show()                # render now, so that the figure appears right after its printed name
    

In [None]:
#Headings of the variables for which a boxplot is drawn
arr2=['encounter_id', 'patient_id', 'hospital_id', 'age', 'bmi',
       'elective_surgery', 'height', 'icu_id', 'pre_icu_los_days', 'weight',
       'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative',
       'arf_apache', 'gcs_eyes_apache', 'gcs_motor_apache',
       'gcs_unable_apache', 'gcs_verbal_apache', 'heart_rate_apache',
       'intubated_apache', 'map_apache', 'resprate_apache', 'temp_apache',
       'ventilated_apache', 'd1_diasbp_max', 'd1_diasbp_min',
       'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min',
       'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_max', 'd1_mbp_min',
       'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_resprate_max',
       'd1_resprate_min', 'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max',
       'd1_sysbp_min', 'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min',
       'd1_temp_max', 'd1_temp_min', 'h1_diasbp_max', 'h1_diasbp_min',
       'h1_heartrate_max', 'h1_heartrate_min', 'h1_resprate_max',
       'h1_resprate_min', 'h1_spo2_max', 'h1_spo2_min', 'h1_sysbp_max',
       'h1_sysbp_min', 'aids', 'cirrhosis', 'diabetes_mellitus',
       'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis', 'hospital_death']
for i in arr2:
    # plotting one boxplot per variable, each on its own figure
    fig, ax = plt.subplots()
    # missing values are dropped first so that they cannot blank out the box statistics
    ax.boxplot(df[i].dropna())
    # naming the variable on the plot; without it the figures cannot be told apart
    ax.set_title(i)
    ax.set_xticks([1])
    ax.set_xticklabels([i])
    plt.show()
    # releasing the figure so that the figures do not accumulate in memory
    plt.close(fig)


In [None]:
# Making the correlation matrix for the numeric columns of the dataframe df.
# DataFrame.corr takes the correlation *method* as its first argument, not a column name,
# so passing 'hospital_death' there raises a ValueError. The default method is used instead.
# Non-numeric columns are filtered out first because no correlation can be computed on them.
# The correlations with the outcome are available as corMat['hospital_death'].
corMat = df.select_dtypes(include = 'number').corr()

print(corMat)

# cmap must name an existing colormap (an empty string is rejected);
# 'coolwarm' is a diverging map that suits values ranging from -1 to 1.
sb.heatmap(corMat, annot = True, cmap = 'coolwarm')
    