Stage 2: Predicting Length of Stay

In [1]:
import pandas as pd
import numpy as np

In [2]:
#import admisson data
df = pd.read_csv('admission_data.csv')

In [3]:
df

Unnamed: 0,PID,AGE,GENDER,SMOKING,ALCOHOL,HTN,CAD,PRIOR CMP,HB,TLC,...,VT,PSVT,CONGENITAL,NEURO CARDIOGENIC SYNCOPE,ORTHOSTATIC,CARDIOGENIC SHOCK,SHOCK,PULMONARY EMBOLISM,CHEST INFECTION,TARGET
0,506,50,M,0,0,1,1,0,13.1,10.5,...,0,0,0,0,0,0,0,0,0,4
1,798,71,M,0,0,0,1,0,12.9,8.5,...,0,0,0,0,0,0,0,0,0,2
2,798,71,M,0,0,0,1,0,12.9,8.5,...,0,0,0,0,0,0,0,0,0,2
3,798,72,M,0,0,0,1,0,11.4,6.6,...,0,0,0,0,0,0,0,0,0,6
4,989,71,M,0,0,1,1,0,10.7,14.1,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15752,4888286,68,M,0,0,0,1,0,12.2,6.7,...,0,0,0,0,0,0,0,0,0,5
15753,4888926,65,F,0,0,1,1,0,11.3,11.3,...,0,0,0,0,0,0,0,0,0,2
15754,5711587,60,F,0,0,0,0,1,10.1,9.8,...,0,0,0,0,0,0,0,0,0,11
15755,6408503,68,F,0,0,0,1,0,9.8,17.3,...,0,0,0,0,0,0,1,0,0,11


In [4]:
#find correlation between all the variables
corr_matrix = df.corr()
#remove correlated variables above 0.80
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find features with correlation greater than 0.80
to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]
# Drop features 
df.drop(to_drop, axis=1, inplace=True)

In [5]:
#list unique values in chest infection
df['CHEST INFECTION'].unique()
#drop rows with '\\' in chest infection
df = df[df['CHEST INFECTION'] != '\\']

In [6]:
df.dropna(inplace=True)
df = df[df['PID'] != 'NILL']
df = df[(df != 'EMPTY').all(1)]
df['GENDER'] = np.where(df['GENDER'] == 'M', 1, 0)
#convert PID to int
df['PID'] = df['PID'].astype(int)
#convert HB and TLC to float 
df['HB'] = df['HB'].astype(float)
df['TLC'] = df['TLC'].astype(float)
#convert Platelets, glucose and chest infection to int
# df['PLATELETS'] = df['PLATELETS'].astype(float)
# df['GLUCOSE'] = df['GLUCOSE'].astype(float)
df['CHEST INFECTION'] = df['CHEST INFECTION'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [7]:
df.dtypes

PID                            int32
AGE                            int64
GENDER                         int32
SMOKING                        int64
ALCOHOL                        int64
HTN                            int64
CAD                            int64
PRIOR CMP                      int64
HB                           float64
TLC                          float64
PLATELETS                     object
GLUCOSE                       object
RAISED CARDIAC ENZYMES         int64
SEVERE ANAEMIA                 int64
ANAEMIA                        int64
STABLE ANGINA                  int64
ACS                            int64
STEMI                          int64
ATYPICAL CHEST PAIN            int64
HEART FAILURE                  int64
VALVULAR                       int64
CHB                            int64
SSS                            int64
AF                             int64
VT                             int64
PSVT                           int64
CONGENITAL                     int64
N

In [8]:
#list unique values and counts of PID in descending order
df['PID'].value_counts()
#find range of PID
print(df['PID'].max())
df['PID'].min()

5711587


506

In [9]:
#check if input PID already exists in DF return TARGET variable
def check_pid(pid):
    if pid in df['PID'].values:
        #return mean of all values for PID
        return df[df['PID'] == pid]['TARGET'].mean().round()
    else:
        return 0

In [10]:
#regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

#split data into train and test and drop Target and PID variable from X
X = df.drop(['TARGET'], axis=1)
y = df['TARGET']

In [11]:
#fit the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
model = LinearRegression().fit(X_train, y_train)
#predict the model and convert to int 
# y_pred = model.predict(X_test)

In [12]:
#list data types
df.dtypes

PID                            int32
AGE                            int64
GENDER                         int32
SMOKING                        int64
ALCOHOL                        int64
HTN                            int64
CAD                            int64
PRIOR CMP                      int64
HB                           float64
TLC                          float64
PLATELETS                     object
GLUCOSE                       object
RAISED CARDIAC ENZYMES         int64
SEVERE ANAEMIA                 int64
ANAEMIA                        int64
STABLE ANGINA                  int64
ACS                            int64
STEMI                          int64
ATYPICAL CHEST PAIN            int64
HEART FAILURE                  int64
VALVULAR                       int64
CHB                            int64
SSS                            int64
AF                             int64
VT                             int64
PSVT                           int64
CONGENITAL                     int64
N

In [13]:
# import tensorflow as tf
# from tensorflow.keras import backend
# from tensorflow.keras import models, regularizers, layers, optimizers, losses, metrics
# from tensorflow.keras import Sequential
# from tensorflow.keras.layers import Dense

In [14]:
# # Training the network
# modell = models.Sequential()
# cols = X_train.shape[1]
# input_shape = (cols,)
# modell.add(layers.Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001), input_shape=input_shape))
# # model.add(layers.BatchNormalization())
# modell.add(layers.Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
# modell.add(layers.Dropout(0.2))
# modell.add(layers.Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
# modell.add(layers.Dropout(0.2))
# modell.add(layers.Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
# modell.add(layers.Dropout(0.2))
# modell.add(layers.Dense(1, activation='relu'))
# print(modell.summary())

# # FIT / TRAIN model on training daa
# NumEpochs = 20
# BatchSize = 8

# X_train = np.asarray(X_train).astype('float32')
# # Compile the model and fit it on the training data
# modell.compile(optimizer=optimizers.Adam(lr=1e-4), loss='mean_absolute_error', metrics=['mae'])
# history = modell.fit(X_train, y_train, epochs=NumEpochs, batch_size=BatchSize, validation_data=(X_test, y_test))

# # Predicting on the test dataset
# results = modell.evaluate(X_test, y_test)
# print("_"*100)
# print("Test Loss and MAE")
# print("results ", results)

In [15]:
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test).astype(int).round()
y_pred.shape
#call check_pid function to check if PID already exists in DF for every row in y_pred
for i in range(len(X_test)):
    y_pred[i] = ((y_pred[i] + check_pid(X_test.iloc[i]['PID']))/2).round()

print(y_pred)
score = r2_score(y_test, y_pred)
print(score)

[8 9 6 ... 7 8 6]
0.6847353571088057


In [16]:
# #predict the model
# test = np.array([989,71,1,0,0,1,1,0,10.7,14.1,227,253,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])
# test = test.reshape(1, -1)
# pid = 989
# y_pred = model.predict(test).astype(int).round()
# ((y_pred[0] + check_pid(pid))/2).round()

In [17]:
#append X_test and y_test to df
df_test = X_test
df_test['LOS'] = y_pred

In [18]:
df_test

Unnamed: 0,PID,AGE,GENDER,SMOKING,ALCOHOL,HTN,CAD,PRIOR CMP,HB,TLC,...,VT,PSVT,CONGENITAL,NEURO CARDIOGENIC SYNCOPE,ORTHOSTATIC,CARDIOGENIC SHOCK,SHOCK,PULMONARY EMBOLISM,CHEST INFECTION,LOS
12775,602495,82,0,0,0,1,1,0,11.8,10.1,...,0,0,0,0,0,0,0,0,0,8
14935,670926,65,1,0,0,1,1,0,8.7,15.6,...,0,0,0,0,0,0,0,0,0,9
4006,280038,53,0,0,0,1,1,0,13.9,12.5,...,0,0,0,0,0,0,0,0,0,6
15095,675773,62,1,0,0,1,1,0,9.4,16.7,...,0,0,0,0,0,0,0,0,0,8
5078,315883,60,1,0,0,0,0,1,13.8,14.4,...,0,0,0,0,0,0,0,1,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6029,351289,66,0,0,0,1,0,1,12.7,9.2,...,0,0,0,0,0,0,0,0,0,6
7660,399243,45,1,0,1,0,0,1,13.8,7.0,...,0,0,0,0,0,0,0,0,0,6
9509,465403,42,1,1,1,1,1,0,16.9,12.6,...,0,0,0,0,0,0,0,0,0,7
8762,438244,60,1,0,0,1,1,0,12.6,8.0,...,0,0,0,0,0,0,0,0,0,8


In [19]:
#randomly select 43 rows from df_test
df_test = df_test.sample(n=43)

In [20]:
df_test
#save this to csv
# df_test.to_csv('test.csv')

Unnamed: 0,PID,AGE,GENDER,SMOKING,ALCOHOL,HTN,CAD,PRIOR CMP,HB,TLC,...,VT,PSVT,CONGENITAL,NEURO CARDIOGENIC SYNCOPE,ORTHOSTATIC,CARDIOGENIC SHOCK,SHOCK,PULMONARY EMBOLISM,CHEST INFECTION,LOS
2796,237678,67,1,0,0,1,0,0,11.5,8.8,...,0,0,0,0,0,0,0,0,0,6
5408,329398,41,1,0,0,1,1,0,14.1,10.2,...,0,0,0,0,0,0,0,0,0,4
5356,326472,65,1,0,0,0,0,1,10.7,8.3,...,0,0,0,0,0,0,0,0,0,6
9621,469963,60,0,0,0,0,1,0,8.1,25.0,...,0,0,0,0,0,0,0,0,0,10
8649,433970,60,1,0,0,0,1,0,15.1,6.3,...,0,0,0,0,0,0,0,0,0,4
3265,256469,65,1,0,0,1,1,0,11.7,9.1,...,0,0,0,0,0,0,0,0,0,7
4361,292597,70,0,0,0,1,1,0,13.1,5.6,...,0,0,0,0,0,0,0,0,0,6
5089,316272,47,0,0,0,0,0,0,9.3,8.1,...,0,0,0,0,0,1,0,0,0,6
10064,484180,72,1,0,0,0,1,0,13.7,7.1,...,0,0,0,0,0,0,0,0,0,2
9058,449225,73,1,0,0,0,1,0,12.5,21.18,...,0,0,0,0,0,0,0,0,0,7


In [21]:
#read isAdmitted data
import pandas as pd
df_admitted = pd.read_csv('isAdmitted.csv')
df_admitted.drop(['Unnamed: 0'], axis=1, inplace=True)
df_admitted

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Predicted,Urgency Score
0,60,1,4,130,206,0,2,132,1,2.4,2,2,7,1,98.196249
1,51,1,4,140,298,0,0,122,1,4.2,2,3,7,1,96.321198
2,46,1,4,140,311,0,0,120,1,1.8,2,2,7,1,95.592806
3,56,0,4,134,409,0,2,150,1,1.9,2,2,7,1,95.289008
4,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1,95.25888
5,54,1,4,124,266,0,2,109,1,2.2,2,1,7,1,94.663355
6,53,1,4,123,282,0,0,95,1,2.0,2,2,7,1,94.130874
7,59,1,4,110,239,0,2,142,1,1.2,2,1,7,1,94.037527
8,62,1,4,120,267,0,0,99,1,1.8,2,2,7,1,90.539153
9,57,1,4,165,289,1,2,124,0,1.0,2,3,7,1,90.371228


In [22]:
PID = df_test['PID']
PID = PID.to_numpy()

#append PID to df_admitted
df_admitted['PID'] = PID

In [23]:
df_admitted

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Predicted,Urgency Score,PID
0,60,1,4,130,206,0,2,132,1,2.4,2,2,7,1,98.196249,237678
1,51,1,4,140,298,0,0,122,1,4.2,2,3,7,1,96.321198,329398
2,46,1,4,140,311,0,0,120,1,1.8,2,2,7,1,95.592806,326472
3,56,0,4,134,409,0,2,150,1,1.9,2,2,7,1,95.289008,469963
4,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1,95.25888,433970
5,54,1,4,124,266,0,2,109,1,2.2,2,1,7,1,94.663355,256469
6,53,1,4,123,282,0,0,95,1,2.0,2,2,7,1,94.130874,292597
7,59,1,4,110,239,0,2,142,1,1.2,2,1,7,1,94.037527,316272
8,62,1,4,120,267,0,0,99,1,1.8,2,2,7,1,90.539153,484180
9,57,1,4,165,289,1,2,124,0,1.0,2,3,7,1,90.371228,449225


In [24]:
#merge df_test and df_admitted
df_test = pd.merge(df_test, df_admitted, on='PID', how='right')

In [25]:
df_test['Length of Stay'] = df_test['LOS']

In [26]:
df_test.drop(['LOS'], axis=1, inplace=True)
df_test
#save as csv
df_test.to_csv('patientDashboard.csv')

In [27]:
#make a df using PID, LOS and urgency score
df_test = df_test[['PID', 'Length of Stay', 'Urgency Score']]
df_test
#save into csv without index
df_test.to_csv('bedForecast.csv', index=False)

In [28]:

import pickle
pickle.dump(model, open('model_los.pkl','wb'))

Stage 3: Estimating number of beds

In [29]:
def sort_queue(df):
    df = df.sort_values(by=['Urgency Score'], ascending=False)
    return df

In [30]:
totalbeds = 450
filledbeds = 420

#queue of urgency score and LOS only 20 rows from df_test
new = 0
new+=20
queue = df_test[['Urgency Score', 'Length of Stay']].head(new).to_numpy()

#add another column with value 0
queue = np.insert(queue, 2, 0, axis=1)
beds = np.zeros(450)
#make 400 in beds 1
for i in range(filledbeds):
    beds[i] = 1
i+=1
days = 1
while True: #real time
    print("Day No.: ",days)
    
    #decrement LOS by 1 for every patient with a bed
    for j in range(len(queue)):
        if queue[j,2] == 1:
            queue[j,1] -= 1
            #keep max of 0 or queue[j,1]
            if queue[j,1] < 0:
                queue[j,1] = 0
            #if LOS is 0, release the bed
            if queue[j,1] == 0:
                print("Released bed is: ", j+1)
                # beds[filledbeds+j] = 0
                i-=1
                queue[j,2] = -1 #discharged

    print("No. of beds available: ",450-i)
    #check if any patient does not have a bed and find the patient with highest urgency score
    while True:
        if i < 450:
            for j in range(len(queue)):
                if queue[j,2] == 0:
                    maxi = j
                    break
            if queue[maxi,2] == 0:
                # beds[i] = 1
                queue[maxi,2] = 1
                print("Bed filled for patient: ", maxi+1, " with urgency score: ", queue[maxi,0])
                i+=1
                #end loop if all patients are discharged
                if np.count_nonzero(queue[:,2] == 1) + np.count_nonzero(queue[:,2] == -1) == len(queue):
                    break
        else:
            break
    # print(queue)
    
    #print a new line
    print()
    days+=1
    #select next 10 rows from df_test
    if (new > len(df_test)):
        queue_new = df_test[['Urgency Score', 'Length of Stay']].iloc[new:].to_numpy()
        queue_new = np.insert(queue_new, 2, 0, axis=1)
        queue = np.concatenate((queue, queue_new), axis=0)
        
    else:
        queue_new = df_test[['Urgency Score', 'Length of Stay']].iloc[new:new+10].to_numpy()
        queue_new = np.insert(queue_new, 2, 0, axis=1)
        queue = np.concatenate((queue, queue_new), axis=0)
        new+=10
    
    #check if queue[:,2] is all 1 or -1
    if np.count_nonzero(queue[:,2] == 1) + np.count_nonzero(queue[:,2] == -1) == len(queue):
        break

print(days-1)


Day No.:  1
No. of beds available:  30
Bed filled for patient:  1  with urgency score:  98.1962492038868
Bed filled for patient:  2  with urgency score:  96.32119803160218
Bed filled for patient:  3  with urgency score:  95.59280607599857
Bed filled for patient:  4  with urgency score:  95.289008039144
Bed filled for patient:  5  with urgency score:  95.258880206613
Bed filled for patient:  6  with urgency score:  94.6633547229964
Bed filled for patient:  7  with urgency score:  94.13087362618874
Bed filled for patient:  8  with urgency score:  94.0375272857432
Bed filled for patient:  9  with urgency score:  90.53915293802962
Bed filled for patient:  10  with urgency score:  90.37122817348585
Bed filled for patient:  11  with urgency score:  90.27242931335893
Bed filled for patient:  12  with urgency score:  89.93253277084598
Bed filled for patient:  13  with urgency score:  89.66081700395102
Bed filled for patient:  14  with urgency score:  88.2766000178473
Bed filled for patient:  1