# Introduction
> The recent Covid-19 pandemic has raised alarms over one of the most overlooked areas to focus on: healthcare management. While healthcare management offers various use cases for data science, patient length of stay (LOS) is one critical parameter to observe and predict if one wants to improve the efficiency of healthcare management in a hospital.
This parameter helps hospitals identify patients at high LOS risk (patients who will stay longer) at the time of admission. Once identified, patients with high LOS risk can have their treatment plan optimized to minimize LOS and lower the chance of staff/visitor infection. Also, prior knowledge of LOS can aid in logistics such as room and bed allocation planning.

In [None]:
import numpy as np
import pandas as pd

# Import Dataframes

In [None]:
# Root of the Kaggle input dataset; every CSV read below sits in its
# "healthcare" sub-folder.
files_path = '/kaggle/input/av-healthcare-analytics-ii/'
healthcare_dir = files_path + 'healthcare/'

train_df = pd.read_csv(healthcare_dir + 'train_data.csv')
test_df = pd.read_csv(healthcare_dir + 'test_data.csv')
# Labels paired with the test features are read from sample_sub.csv.
y_test_df = pd.read_csv(healthcare_dir + 'sample_sub.csv')
data_dict = pd.read_csv(healthcare_dir + 'train_data_dictionary.csv')

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Lift the column-width limit so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)
# Show up to the first 18 rows of the data dictionary.
data_dict.head(18)

In [None]:
# Print a summary of the training data's columns (dtypes, non-null counts).
train_df.info()

$\color{green}{\text{
  Some column names contain spaces, so those columns cannot be accessed with dot notation
}}$

In [None]:
# Preview the first rows of the test features.
test_df.head()

In [None]:
# Preview the first rows of the labels loaded from sample_sub.csv.
y_test_df.head()

In [None]:
# Number of training rows in each 'Stay' category.
train_df.Stay.value_counts()

In [None]:
# Number of distinct 'Stay' categories in the training data.
train_df.Stay.nunique()

$\color{green}{\text{'Stay' labels fall into 11 imbalanced classes, which will reduce model training performance}}$

In [None]:
# Number of rows in each 'Stay' category of the sample_sub.csv labels.
y_test_df.Stay.value_counts()

$\color{red}{\text{
The training data has 11 classes, while the test labels (loaded from sample_sub.csv) contain only 1 class
}}$

In [None]:
# (rows, columns) of the sample_sub.csv labels.
y_test_df.shape

$\color{green}{\text{
The test data is a large amount of data, so it will be merged with the training data and the combined set will be split again before model training
}}$

# Repair data

*   Merge all data sets in one data frame
*   Replace spaces in columns

## Merge Datasets

In [None]:
# Attach the labels to the test features, matching rows on case_id.
# NOTE(review): y_test_df is read from sample_sub.csv — confirm it holds real
# labels rather than submission placeholders before using it as ground truth.
test_df = test_df.merge(y_test_df, on='case_id', how='inner')

In [None]:
# Print a column summary of the test data after the labels were merged in.
test_df.info()

In [None]:
# Create one DataFrame holding the train rows followed by the test rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and, like append, keeps each frame's
# original index labels.
df = pd.concat([train_df, test_df])

In [None]:
# check for duplicates in case_id (True when no case_id value is repeated)
df.case_id.is_unique

## Repair Columns Names


> Replace spaces in column names with an underscore "_" so the columns can be accessed with dot notation "."



In [None]:
# Replace spaces in column names with "_" so every column can be reached with
# dot notation. regex=False makes the literal-string match explicit instead of
# relying on the pandas-version-dependent default.
df.columns = df.columns.str.replace(' ', '_', regex=False)
df.info()

# Exploratory DA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# NOTE(review): `count` is not referenced anywhere else in the visible
# notebook — candidate for removal; confirm no hidden cell uses it.
count = 1

In [None]:
# Independent copy of the combined data, used for the exploratory plots.
eda_df = df.copy()

In [None]:
# Count patients per stay category. The counts are turned into an explicit
# two-column frame so the plot does not depend on how value_counts() names its
# result (pandas 2.0 renamed it, which breaks color='Stay').
plot_df = (
    eda_df.Stay.value_counts()
    .sort_values(ascending=False)
    .rename_axis('Duration')
    .reset_index(name='Count')
)
fig = px.bar(plot_df, x='Duration', y='Count', color='Count',
             height=500, width=900, color_continuous_scale=px.colors.sequential.Sunset)
fig.update_layout(title="Count of patients in each duration category")
fig.show()



$\color{green}{\text{
35% of patients stay up to 10 days
}}$

To make the analysis easier, the stay durations will be regrouped into 3 categories.

In [None]:
# Mapping from the original stay bands to two coarser groups.
# '0-10' has no entry here.
stay_values_mapping = {
    **dict.fromkeys(('11-20', '21-30'), '11-30'),
    **dict.fromkeys(
        ('31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100',
         'More than 100 Days'),
        '30+',
    ),
}

In [None]:
# Work on a copy so eda_df keeps the original stay categories.
grouped_duration = eda_df.copy()
# Assign the result back instead of calling replace(inplace=True) on the
# attribute-accessed column: that chained in-place form is deprecated and does
# not update the DataFrame under pandas Copy-on-Write.
grouped_duration['Stay'] = grouped_duration['Stay'].replace(stay_values_mapping)

In [None]:
# Count patients per regrouped stay category. The counts are turned into an
# explicit two-column frame so the plot does not depend on how value_counts()
# names its result (pandas 2.0 renamed it, which breaks color='Stay').
plot_df = (
    grouped_duration.Stay.value_counts()
    .sort_values(ascending=False)
    .rename_axis('Duration')
    .reset_index(name='Count')
)
fig = px.bar(plot_df, x='Duration', y='Count', color='Count',
             height=400, width=600, color_continuous_scale=px.colors.sequential.Sunset)
fig.update_layout(title="Count of patients in each duration category")
fig.show()



$\color{green}{\text{
Durations are regrouped into three almost-balanced categories
}}$

In [None]:
# One-level sunburst of the Severity_of_Illness column.
fig = (
    px.sunburst(eda_df, path=['Severity_of_Illness'], height=600, width=600)
    .update_layout(title='Severity of Illness')
    .update_traces(textinfo='label+percent entry')
)
fig.show()




$\color{green}{\text{
55% of patients admitted in moderate severity of illness
}}$

In [None]:
# One-level sunburst of the Type_of_Admission column.
fig = (
    px.sunburst(eda_df, path=['Type_of_Admission'], height=600, width=600)
    .update_layout(title='Type of Admission')
    .update_traces(textinfo='label+percent entry')
)
fig.show()



$\color{green}{\text{
48% of patients in trauma admission
}}$

In [None]:
# Percentage of patients per department. The result is turned into an explicit
# two-column frame so the plot does not depend on how value_counts() names its
# result (pandas 2.0 renamed it, which breaks color='Department').
plot_df = (
    (eda_df.Department.value_counts(normalize=True).sort_values(ascending=False) * 100)
    .rename_axis('Department')
    .reset_index(name='Percentage')
)
fig = px.bar(plot_df, x='Department', y='Percentage', color='Percentage',
             height=500, width=900, color_continuous_scale=px.colors.sequential.Sunset)
fig.update_layout(title="Percentage of patients admitted in each department")
fig.show()



$\color{green}{\text{
78% of patients admitted in gynecology department
}}$

In [None]:
# Two-level sunburst: regrouped stay duration, then severity of illness.
fig = (
    px.sunburst(grouped_duration, path=['Stay', 'Severity_of_Illness'],
                height=600, width=600)
    .update_layout(title='Severity of Illness vs Duration')
    .update_traces(textinfo='label+percent parent')
)
fig.show()



In [None]:
# Two-level sunburst: type of admission, then severity of illness.
fig = (
    px.sunburst(eda_df, path=['Type_of_Admission', 'Severity_of_Illness'],
                height=600, width=600)
    .update_layout(title='Severity of Illness vs Type of Admission')
    .update_traces(textinfo='label+percent parent')
)
fig.show()



$\color{green}{\text{
Severity of illness is distributed across stay durations and types of admission almost exactly as it is distributed over the whole data set
}}$

In [None]:
# Count patients per age band, ordered by band. eda_df is used (like the other
# EDA cells) so the plot keeps the original age labels even if df is modified
# later in the notebook. The counts are turned into an explicit two-column frame
# so the plot does not depend on how value_counts() names its result
# (pandas 2.0 renamed it, which breaks color='Age').
plot_df = (
    eda_df.Age.value_counts()
    .sort_index()
    .rename_axis('Age')
    .reset_index(name='Count')
)
fig = px.bar(plot_df, x='Age', y='Count', color='Count',
             height=400, width=800, color_continuous_scale=px.colors.sequential.Sunset)
fig.update_layout(title="Count of patients in each age category")
fig.show()



$\color{green}{\text{
most of admitted patients are between 31 and 50 years old
}}$

In [None]:
# Collapse the ten age bands into three broad groups.
age_mapping = {
    '0-10': '0-30',
    '11-20': '0-30',
    '21-30': '0-30',
    '31-40': '31-60',
    '41-50': '31-60',
    '51-60': '31-60',
    '61-70': '61-100',
    '71-80': '61-100',
    '81-90': '61-100',
    '91-100': '61-100',
}

# Start from the frame whose stay durations are already regrouped.
grouped_age_duration = grouped_duration.copy()
# Assign the result back instead of calling replace(inplace=True) on the
# attribute-accessed column: that chained in-place form is deprecated and does
# not update the DataFrame under pandas Copy-on-Write.
grouped_age_duration['Age'] = grouped_age_duration['Age'].replace(age_mapping)

In [None]:
# Two-level sunburst: regrouped stay duration, then regrouped age band.
fig = (
    px.sunburst(grouped_age_duration, path=['Stay', 'Age'], height=600, width=600)
    .update_layout(title='Age vs Stay Duration')
    .update_traces(textinfo='label+percent parent')
)
fig.show()



1. $\color{green}{\text{
The percentage of patients between 61 and 100 years old increases slightly for stay durations of more than 30 days
}}$

1. $\color{green}{\text{
percentage of patients between 0-30 years old decreases by the same amount
}}$

1. $\color{green}{\text{
percentage of patients between 31-60 years is constant over all stay duration categories
}}$

In [None]:
# Mean number of visitors per department. eda_df is used (like the other EDA
# cells) so the department labels stay readable even if df is modified later
# in the notebook.
plot_df = eda_df.groupby('Department')['Visitors_with_Patient'].mean()
fig = px.bar(plot_df, x=plot_df.index, y=plot_df, color='Visitors_with_Patient',
             labels={'y': 'Visitors Average'}, height=400, width=700, color_continuous_scale=px.colors.sequential.Sunset)
fig.update_layout(title='Visitors average in each department')
fig.show()



$\color{green}{\text{
  Patients in all departments have about 3 visitors on average
}}$

In [None]:
# Mean number of available extra rooms per department.
plot_df = eda_df.groupby('Department')['Available_Extra_Rooms_in_Hospital'].mean()
fig = (
    px.bar(plot_df, x=plot_df.index, y=plot_df,
           color='Available_Extra_Rooms_in_Hospital',
           labels={'y': 'Extra Rooms Average'}, height=400, width=800,
           color_continuous_scale=px.colors.sequential.Sunset)
    .update_layout(title='Extra rooms average in each department')
)
fig.show()



$\color{green}{\text{
Each department has on average 3 extra rooms at admission time
}}$

In [None]:
# Summary statistics of the admission deposit.
eda_df.Admission_Deposit.describe()

In [None]:
# Most frequent admission deposit (first entry of the returned modes).
eda_df.Admission_Deposit.mode()[0]

In [None]:
# Percent-normalised histogram of the admission deposit.
x = eda_df['Admission_Deposit']
fig = (
    px.histogram(x=x, histnorm='percent', template='ggplot2', height=400, width=700,
                 labels={'x': 'Admission_Deposit'})
    .update_layout(title='Admission Deposit Histogram')
)
fig.show()



$\color{green}{\text{
most admission deposits are between 3k and 6k
}}$

In [None]:
# Summary statistics of the number of visitors with each patient.
eda_df.Visitors_with_Patient.describe()

In [None]:
# Most frequent number of visitors (first entry of the returned modes).
eda_df.Visitors_with_Patient.mode()[0]

In [None]:
# Percent-normalised histogram of the number of visitors per patient.
x = eda_df['Visitors_with_Patient']
fig = (
    px.histogram(x=x, histnorm='percent', template='ggplot2', height=400, width=700,
                 labels={'x': 'Visitors_with_Patient'})
    .update_layout(title='Visitors Histogram')
)
fig.show()



$\color{green}{\text{
most number of visitors between 2 and 4
}}$

In [None]:
# Summary statistics of the available extra rooms.
eda_df.Available_Extra_Rooms_in_Hospital.describe()

In [None]:
# Most frequent number of available extra rooms (first entry of the returned modes).
eda_df.Available_Extra_Rooms_in_Hospital.mode()[0]

In [None]:
# Percent-normalised histogram of the available extra rooms.
x = eda_df['Available_Extra_Rooms_in_Hospital']
fig = (
    px.histogram(x=x, histnorm='percent', template='ggplot2', height=400, width=700,
                 labels={'x': 'Available_Extra_Rooms_in_Hospital'})
    .update_layout(title='Extra Rooms Histogram')
)
fig.show()



$\color{green}{\text{
most number of extra rooms between 2 and 4
}}$

# Data Preprocessing

> Missing values, label encoding



## Regroup 'Stay' labeles

In [None]:
# Distinct 'Stay' labels present in the training data.
set(train_df.Stay.values)

In [None]:
# Rows per 'Stay' category in the combined data, before the regrouping below.
df.Stay.value_counts()

In [None]:
# Mapping from the original stay bands to two coarser groups.
# '0-10' has no entry here.
stay_values_mapping = {
    **dict.fromkeys(('11-20', '21-30'), '11-30'),
    **dict.fromkeys(
        ('31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100',
         'More than 100 Days'),
        '30+',
    ),
}

In [None]:
# Regroup the 'Stay' labels of the modelling data. The result is assigned back
# instead of calling replace(inplace=True) on the attribute-accessed column:
# that chained in-place form is deprecated and does not update the DataFrame
# under pandas Copy-on-Write.
df['Stay'] = df['Stay'].replace(stay_values_mapping)

## Missing Values

In [None]:
# check missing values: number of nulls in each column
df.isnull().sum()

In [None]:
# missing value heat map (drawn from the boolean null mask of df, colour bar hidden)
sns.heatmap(df.isnull(), cbar=False)



In [None]:
# Percentage of rows whose City_Code_Patient is missing.
(df.City_Code_Patient.isnull().sum() / df.shape[0]) * 100

$\color{green}{\text{
Missing values make up 1.4% of the data, so the affected rows will be removed
}}$

In [None]:
# Number of rows that contain at least one missing value.
df.shape[0] - df.dropna().shape[0]

In [None]:
# Fraction of rows that remain after dropping rows with missing values.
df.dropna().shape[0] / df.shape[0]

In [None]:
# drop every row that has a missing value in any column
df = df.dropna()
# confirm that no nulls remain
df.isna().sum()

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Names of all object-dtype columns (this still includes the 'Stay' target).
object_cols = list(df.select_dtypes(include='object').columns)
# Everything that is not object-dtype counts as numerical.
num_cols = df.drop(columns=object_cols).columns
# The target is handled separately, so take it out of the list to encode.
object_cols.remove('Stay')

In [None]:
# Label-encode each object column in place, keeping the fitted encoder per
# column in `les` so the integer codes can be mapped back to their classes.
les = {}
for col in object_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].values)
    les[col] = encoder
    print("{}: {} \n".format(col, encoder.classes_))

## Correlation Matrix

In [None]:
fig, ax = plt.subplots(figsize=(12, 12))
# 'Stay' was excluded from label encoding and is still a string column, so the
# correlation is restricted to numeric columns explicitly: DataFrame.corr() no
# longer drops non-numeric columns silently (pandas >= 2.0 raises instead).
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(ax=ax, data=numeric_corr, cmap="YlGnBu", annot=True, cbar=False)



# Scaling

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
# Summary statistics of df, transposed so each original column becomes a row.
df.describe().T

In [None]:
# Features: every column except the target and the two identifier columns.
x = df.drop(columns=['Stay', 'case_id', 'patientid'])
# Target: the regrouped stay duration.
y = df['Stay']

In [None]:
# Scale every feature column with RobustScaler, writing the result back into x.
# NOTE(review): the scaler is fit on all rows before the train/test split that
# follows, so held-out rows influence the scaling statistics — consider fitting
# on the training split only.
x[x.columns] = RobustScaler().fit_transform(x[x.columns].values)

# Train-Test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows, stratified on the target.
# random_state is fixed (the same seed the classifiers use) so the split, and
# every score computed from it, is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=42
)

# Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from time import time

In [None]:
# Candidate classifiers keyed by display name; insertion order is also the
# order used for the score and timing lists below.
# NOTE(review): y_train holds string categories; some xgboost releases reject
# non-integer class labels — confirm against the installed version.
models = {
    'RandomForsetClassifier': RandomForestClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'XGBClassifier': XGBClassifier(random_state=42),
    'MLPClassifier': MLPClassifier(random_state=42),
}
keys = list(models.keys())
values = list(models.values())

accuracy_scores = []
train_times = []

for name, model in models.items():
    # Time only the fit call; the duration is stored in minutes.
    started = time()
    model.fit(x_train, y_train)
    minutes = (time() - started) / 60

    y_pred = model.predict(x_test)
    score = accuracy_score(y_test, y_pred)

    accuracy_scores.append(score)
    train_times.append(minutes)
    print(name)
    print(score)

In [None]:
# accuracy_score returns fractions in [0, 1], but the axis is labelled
# 'Scores %', so convert to percentages before plotting.
accuracy_percentages = [score * 100 for score in accuracy_scores]
fig = px.bar(accuracy_percentages, x=keys, y=accuracy_percentages, color=accuracy_percentages,
             labels={'x': '', 'y':'Scores %', 'color':''}, height=400, width=500, color_continuous_scale=px.colors.sequential.Sunset)
fig.update_layout(title='Classifiers Accuracy Scores')
fig.show()



In [None]:
# Bar chart of the fit time (in minutes) recorded for each classifier.
fig = (
    px.bar(train_times, x=keys, y=train_times, color=train_times,
           labels={'x': '', 'y':'Train Times (min)', 'color':''},
           height=400, width=500,
           color_continuous_scale=px.colors.sequential.Sunset)
    .update_layout(title='Classifiers Trainig Times')
)
fig.show()



In [None]:
# Hyper-parameter candidates for XGBoost.
# The previous grid sampled 'max_features', which is a scikit-learn tree/forest
# parameter that XGBClassifier does not use, so it only varied max_depth.
# 'colsample_bytree' is XGBoost's own control for the fraction of features
# available to each tree.
# NOTE(review): the fractions below are a starting grid — tune as needed.
params = {'max_depth': [7, 8], 'colsample_bytree': [0.5, 0.6, 0.7, 0.8]}

xgb = XGBClassifier(random_state=42)
# random_state makes the sampled parameter candidates reproducible between runs.
grid = RandomizedSearchCV(xgb, cv=3, verbose=3, param_distributions=params,
                          n_iter=5, random_state=42)
grid.fit(x_train, y_train)

In [None]:
# Cross-validated score of the best parameter setting found by the search.
grid.best_score_

In [None]:
# Best estimator found by the search.
grid.best_estimator_

In [None]:
# Score the tuned model on the held-out split.
best_model = grid.best_estimator_
y_pred = best_model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Per-class classification metrics for the tuned model on the held-out split.
print(classification_report(y_test,y_pred))

In [None]:
def f_importances(coef, names, top=-1, title='feature importances for dt'):
    """Plot a horizontal bar chart of the largest feature importances.

    Args:
        coef: Iterable of importance values, one per feature.
        names: Iterable of feature names, aligned with ``coef``.
        top: Number of highest-importance features to draw. Any negative
            value (``-1`` by default) or a value larger than the number of
            features draws all of them.
        title: Chart title; the default is the previously hard-coded title.

    Returns:
        None. The chart is drawn with matplotlib; nothing is drawn when no
        features are given.
    """
    # Pair each importance with its name and rank from most to least important.
    ranked = sorted(zip(coef, names), reverse=True)

    # Nothing to plot: return instead of failing while unpacking an empty list.
    if not ranked:
        return

    # Clamp `top` so the bar positions and the values always have equal length.
    if top < 0 or top > len(ranked):
        top = len(ranked)

    shown = ranked[:top]
    importances = [importance for importance, _ in shown]
    labels = [label for _, label in shown]
    positions = range(len(shown))

    plt.barh(positions, importances, align='center')
    plt.yticks(positions, labels)
    plt.title(title)
    plt.show()

In [None]:
# Plot the seven most important features of the tuned model, labelled with the
# training column names.
features_names = x_train.columns
importances = abs(grid.best_estimator_.feature_importances_)
f_importances(importances, features_names, top=7)

# Conclusion
> It seems that the significant and busiest units of the hospital, where the MAJORITY of patients stay for care, are missing. The dataset is therefore incomplete, so building a model to predict LOS from it will lead to poor training performance.