# Pre-Processing and Training Data Development

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

In [3]:
# Read the cleaned dataset and preview the first rows (transposed so every column is visible)
csv_path = '../data/mission_hospital_cleaned2.csv'
df = pd.read_csv(csv_path)
df.head().transpose()

Unnamed: 0,0,1,2,3,4
AGE,58.0,59.0,82.0,46.0,60.0
GENDER,M,M,M,M,M
MARITAL STATUS,MARRIED,MARRIED,MARRIED,MARRIED,MARRIED
KEY COMPLAINTS-CODE,other- heart,CAD-DVD,CAD-TVD,CAD-DVD,CAD-DVD
BODY WEIGHT,49.2,41.0,46.6,80.0,58.0
BODY HEIGHT,160.0,155.0,164.0,173.0,175.0
HR PULSE,118,78,100,122,72
BP-HIGH,100.0,70.0,110.0,110.0,180.0
BP-LOW,80.0,50.0,80.0,80.0,100.0
RR,32,28,20,24,18


To make predictions, we only use information available at the patient's admission, and we are not going to use cost information. We cannot know the length of stay at admission, therefore we drop total length of stay, length of stay-ICU, and length of stay-ward. We also drop implant used (Y/N) and cost of implant because we don't have that information at admission. Finally, we assume all the test results and vital sign measurements were taken during admission.

In [4]:
# Remove the length-of-stay and implant columns, then preview the remaining data
columns_to_drop = [
    'TOTAL LENGTH OF STAY',
    'LENGTH OF STAY-ICU',
    'LENGTH OF STAY-WARD',
    'IMPLANT USED (Y/N)',
    'COST OF IMPLANT',
]
df = df.drop(columns=columns_to_drop)
df.head().transpose()

Unnamed: 0,0,1,2,3,4
AGE,58.0,59.0,82.0,46.0,60.0
GENDER,M,M,M,M,M
MARITAL STATUS,MARRIED,MARRIED,MARRIED,MARRIED,MARRIED
KEY COMPLAINTS-CODE,other- heart,CAD-DVD,CAD-TVD,CAD-DVD,CAD-DVD
BODY WEIGHT,49.2,41.0,46.6,80.0,58.0
BODY HEIGHT,160.0,155.0,164.0,173.0,175.0
HR PULSE,118,78,100,122,72
BP-HIGH,100.0,70.0,110.0,110.0,180.0
BP-LOW,80.0,50.0,80.0,80.0,100.0
RR,32,28,20,24,18


Next we create dummy variables for the categorical variables to make them usable for modeling.

In [5]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each
df = pd.get_dummies(data=df, drop_first=True)
df.head(n=5)

Unnamed: 0,AGE,BODY WEIGHT,BODY HEIGHT,HR PULSE,BP-HIGH,BP-LOW,RR,HB,UREA,CREATININE,...,KEY COMPLAINTS-CODE_other-tertalogy,PAST MEDICAL HISTORY CODE_diabetes2,PAST MEDICAL HISTORY CODE_hypertension1,PAST MEDICAL HISTORY CODE_hypertension2,PAST MEDICAL HISTORY CODE_hypertension3,PAST MEDICAL HISTORY CODE_other,PAST MEDICAL HISTORY CODE_unknown,MODE OF ARRIVAL_TRANSFERRED,MODE OF ARRIVAL_WALKED IN,TYPE OF ADMSN_EMERGENCY
0,58.0,49.2,160.0,118,100.0,80.0,32,11.4,33.0,0.8,...,False,False,False,False,False,False,True,False,False,True
1,59.0,41.0,155.0,78,70.0,50.0,28,11.4,95.0,1.7,...,False,False,False,False,False,False,True,False,False,True
2,82.0,46.6,164.0,100,110.0,80.0,20,11.8,15.0,0.8,...,False,True,False,False,False,False,False,False,True,False
3,46.0,80.0,173.0,122,110.0,80.0,24,11.8,74.0,1.5,...,False,False,True,False,False,False,False,False,False,True
4,60.0,58.0,175.0,72,180.0,100.0,18,10.0,48.0,1.9,...,False,True,False,False,False,False,False,False,False,True


Now we split our data into training and test sets.

In [6]:
# Separate the target from the predictors, then hold out 30% of the rows for testing
target_col = 'TOTAL COST TO HOSPITAL'
features = df.drop(columns=target_col)
target = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)

In [7]:
# Shapes of the four splits, in the order X_train, X_test, y_train, y_test
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((168, 34), (72, 34), (168,), (72,))

In the data wrangling stage we left some null values in creatinine, so we have to impute them with the mean value of creatinine. We also have to scale and standardize the data. We combine these two steps into a pipeline.

In [8]:
# Preprocessing pipeline: fill missing values with the column mean, then standardize
mean_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
pipe = make_pipeline(mean_imputer, scaler)

In [9]:
# Fit the imputer and scaler on the training split only; the test split is
# transformed later with these same fitted statistics.
pipe.fit(X_train)

In [12]:
# Apply the pipeline to both the training and the test features
X_tr_scaled, X_te_scaled = (pipe.transform(split) for split in (X_train, X_test))

In [14]:
# Wrap the transformed features back into DataFrames.
# By default pipe.transform returns plain arrays, so the feature names and the
# original row labels are restored explicitly. Without them the saved features
# would have integer column names and a fresh 0..n-1 index that no longer lines
# up with the (shuffled) index carried by y_train / y_test.
# NOTE: this assumes the pipeline keeps every input column; pandas raises a
# shape error here if a column was dropped (e.g. a feature that is entirely
# missing in the training split).
df_X_train = pd.DataFrame(X_tr_scaled, columns=X_train.columns, index=X_train.index)
df_X_test = pd.DataFrame(X_te_scaled, columns=X_test.columns, index=X_test.index)

# The targets come straight from the split, so they already carry their column
# name and row labels.
df_y_train = pd.DataFrame(y_train)
df_y_test = pd.DataFrame(y_test)

In [15]:
# Persist the four splits as pickle files under ../data/
outputs = {
    '../data/X_train.pkl': df_X_train,
    '../data/X_test.pkl': df_X_test,
    '../data/y_train.pkl': df_y_train,
    '../data/y_test.pkl': df_y_test,
}
for pkl_path, frame in outputs.items():
    frame.to_pickle(pkl_path)