# HealthCare Analytics: Predicting the Length of Stay (LOS) of a patient 
- Objective: Predict the Length of Stay for each patient on a case-by-case basis so that hospitals can use this information for optimal resource allocation and better functioning.
- The length of stay is divided into 11 different classes ranging from 0-10 days to more than 100 days.

## Load Libraries & Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Use seaborn's "whitegrid" theme for subsequent plots.
sns.set_style("whitegrid")
# Then apply matplotlib's "fivethirtyeight" style sheet.
# NOTE(review): this runs after the seaborn theme and may override parts of it — confirm the intended look.
plt.style.use("fivethirtyeight")

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Locations of the competition files inside the Kaggle input directory.
train_path = '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv'
test_path = '/kaggle/input/av-healthcare-analytics-ii/healthcare/test_data.csv'
dictionary_path = '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data_dictionary.csv'
sample_path = '/kaggle/input/av-healthcare-analytics-ii/healthcare/sample_sub.csv'

# Load each CSV into its own DataFrame.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
dictionary = pd.read_csv(dictionary_path)
sample = pd.read_csv(sample_path)

## Data Dictionary

In [None]:
# Display the data dictionary loaded from train_data_dictionary.csv.
dictionary

#### Hypothesis: 
The following features may be relevant to a patient's Length of Stay:
- Department: Relates to the type of disease
- Type of Admission: Relates to the reason for the patient's admission to the hospital
- Severity of Illness: Relates to the curability of the disease
- Age: Relates to the curability of the disease

# 1. Exploratory Data Analysis
* What question(s) are you trying to solve?
* What kind of data do we have and how do we treat different types?
* What's missing from the data and how do you deal with it?
* Where are the outliers and why should you care about them?
* How can you add, change or remove features to get more out of your data?

In [None]:
# Print the DataFrame.info() summary of the training frame.
train.info()

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

In [None]:
# Count how many training rows carry each distinct `Stay` value.
train.Stay.value_counts()

In [None]:
# Preview the first five rows of the training frame.
# NOTE(review): identical to the preview cell two cells earlier — possibly redundant.
train.head(5)

In [None]:
# Bar chart of the number of training rows per `Stay` value (the target).
plt.figure(figsize=(10, 7))
stay_counts = train['Stay'].value_counts()
stay_counts.plot(kind="bar", color=['Salmon'])

Insight: Most patients stay in hospital for **21-30 days**, followed by **11-20 days** and **31-40 days**.

In [None]:
# Print the distinct values of every training column, one section per column.
divider = '======================================'
for column in train.columns:
    print('Unique Values for {}'.format(column))
    print(train[column].unique())
    print(divider)
    print()

**Insights**:
1. Categorical features which need to be encoded:
- `Hospital_region_code`
- `Department`
- `Ward_Type`
- `Type of Admission`
- `Severity of Illness`

2. Bins (range) which need to be encoded:
- `Age`
- `Stay` (Target)


In [None]:
# Count the missing (NaN) values in each column of the training frame.
train.isna().sum()

The `Bed Grade` and `City_Code_Patient` features contain null values that would need to be filled.
- But do we need these features at all?

# 2. Data Processing & Feature engineering

In [None]:
# Columns removed from both frames before any further processing.
unused_columns = ['Hospital_region_code', 'Bed Grade', 'patientid', 'City_Code_Patient']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [None]:
# Keep the train and test frames side by side in a list (index 0 = train,
# index 1 = test) so the same processing step can be looped over both.
# The frames are not concatenated; the list holds references to each one.
combined = [train, test]

### 2a Encoding categorical features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit ONE encoder on the Department values of train and test together and
# apply it to both frames, so a given department always maps to the same
# integer. Fitting a separate encoder per frame only yields matching codes
# when both frames happen to contain exactly the same set of labels.
label = LabelEncoder()
label.fit(pd.concat([dataset['Department'] for dataset in combined], ignore_index=True))
for dataset in combined:
    dataset['Department'] = label.transform(dataset['Department'])

In [None]:
# List the distinct encoded Department values in the test frame (combined[1]).
combined[1].Department.unique()

In [None]:
# Label-encode the remaining categorical columns (hospital type, ward
# facility, ward type, admission type and illness severity).
# For each column a single encoder is fitted on the train and test values
# together and then applied to both frames, so train and test share one
# label -> integer mapping even if one frame lacks some labels.
# NOTE(review): LabelEncoder numbers labels in sorted order, which need not
# match any natural ordering (e.g. of severity) — consider an explicit mapping.
categorical_columns = [
    'Hospital_type_code',
    'Ward_Facility_Code',
    'Ward_Type',
    'Type of Admission',
    'Severity of Illness',
]
for column in categorical_columns:
    label = LabelEncoder()
    label.fit(pd.concat([dataset[column] for dataset in combined], ignore_index=True))
    for dataset in combined:
        dataset[column] = label.transform(dataset[column])

In [None]:
# Display the training frame (combined[0]) after label encoding.
combined[0]

### 2b Binning features

In [None]:
# Display the test frame (combined[1]).
combined[1]

In [None]:
# Histogram of the training frame's Age column, before the bands are mapped
# to integer codes in the cells below.
combined[0].Age.hist()

In [None]:
# List the distinct Age values in the training frame.
combined[0].Age.unique()

In [None]:
# Age bands in ascending order; each band's code is its position in the list.
age_bands = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
age_dict = {band: code for code, band in enumerate(age_bands)}

In [None]:
# Convert each Age band label to its ordinal code from `age_dict`.
# `Series.map` builds a fresh column whose dtype is inferred from the mapped
# values, whereas `replace` on an object column only ends up numeric through
# pandas' deprecated implicit downcasting. Values with no entry in `age_dict`
# (including codes left by an earlier run of this cell) pass through
# unchanged, exactly as they did with `replace`.
for dataset in combined:
    dataset['Age'] = dataset['Age'].map(lambda band: age_dict.get(band, band))

In [None]:
# List the distinct Stay values in the training frame.
combined[0].Stay.unique()

In [None]:
# Stay bands in ascending order; each band's code is its position in the list.
stay_bands = [
    '0-10', '11-20', '21-30', '31-40', '41-50', '51-60',
    '61-70', '71-80', '81-90', '91-100', 'More than 100 Days',
]
stay_dict = {band: code for code, band in enumerate(stay_bands)}

In [None]:
# Convert the training target's Stay band labels to their ordinal codes.
# `Series.map` yields a column whose dtype is inferred from the mapped values;
# `replace` on an object column depends on pandas' deprecated implicit
# downcasting, and an object-dtype target is not a reliable input for the
# classifiers fitted later. Values with no entry in `stay_dict` pass through
# unchanged, so re-running this cell is harmless.
combined[0]['Stay'] = combined[0]['Stay'].map(lambda band: stay_dict.get(band, band))

In [None]:
# Histogram of the training frame's Age column after mapping bands to codes.
combined[0].Age.hist()

In [None]:
# Report the shape of the training frame, then of the test frame.
for frame in combined:
    print(frame.shape)

In [None]:
# Print the DataFrame.info() summary of the test frame (combined[1]).
combined[1].info()

### 2c. Scaling numerical data

In [None]:
# Columns that are standardised by the scaler further down.
columns_list = [
    'Type of Admission',
    'Available Extra Rooms in Hospital',
    'Visitors with Patient',
    'Admission_Deposit',
]

In [None]:
# Number of columns selected for scaling.
len(columns_list)

In [None]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

# combined is [train, test]. Learn the mean and standard deviation from the
# training frame only, then apply those same statistics to the test frame.
# Re-fitting the scaler on the test frame would standardise the two frames
# differently, so a model fitted on train would see test features on a
# different scale.
combined[0][columns_list] = ss.fit_transform(combined[0][columns_list].values)
combined[1][columns_list] = ss.transform(combined[1][columns_list].values)


In [None]:
# Display the training frame (combined[0]) after scaling.
combined[0]

In [None]:
# Annotated heatmap of the pairwise column correlations in the training frame.
correlations = combined[0].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(correlations, annot=True, cmap='coolwarm')

# 3. Data Modelling

In [None]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Bind the processed frames back to their own names (combined is [train, test]).
train, test = combined

In [None]:
# Display the sample submission frame loaded from sample_sub.csv.
sample

In [None]:
# Training features: every column except the row identifier and the target.
X_train = train.drop(columns=['case_id', 'Stay'])
# Training target: the Stay column.
Y_train = train['Stay']
# Test features: every column except the row identifier.
X_test = test.drop(columns='case_id').copy()
# Show the three shapes as a quick sanity check.
X_train.shape, Y_train.shape, X_test.shape

In [None]:
# Shape of the sample submission frame.
sample.shape

In [None]:
# Column labels of the test feature frame.
X_test.columns

In [None]:
# Display the training target series.
Y_train

In [None]:
# KNN
from sklearn.model_selection import cross_val_score

knn = KNeighborsClassifier(n_neighbors=3)

# Estimate accuracy (in percent) with 3-fold cross-validation, i.e. on rows
# the model was NOT fitted on. Scoring on the training rows themselves is
# optimistic and makes the model comparison further down misleading.
# cross_val_score works on clones, so `knn` itself is still unfitted here.
acc_knn = round(
    cross_val_score(knn, X_train, Y_train, cv=3, scoring='accuracy').mean() * 100, 2
)

# Fit on the full training set and predict the test set.
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn

In [None]:
# Decision Tree
from sklearn.model_selection import cross_val_score

decision_tree = DecisionTreeClassifier()

# Estimate accuracy (in percent) with 3-fold cross-validation, i.e. on rows
# the model was NOT fitted on. An unpruned tree can fit its training rows
# almost perfectly, so accuracy measured on those rows says little about how
# it will do on new patients.
# cross_val_score works on clones, so `decision_tree` is still unfitted here.
acc_decision_tree = round(
    cross_val_score(decision_tree, X_train, Y_train, cv=3, scoring='accuracy').mean() * 100, 2
)

# Fit on the full training set and predict the test set.
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree

In [None]:
# Random Forest
from sklearn.model_selection import cross_val_score

random_forest = RandomForestClassifier(n_estimators=100)

# Estimate accuracy (in percent) with 3-fold cross-validation, i.e. on rows
# the model was NOT fitted on; accuracy on the training rows is optimistic.
# This fits three extra forests, so the cell takes longer than a single fit.
# cross_val_score works on clones, so `random_forest` is still unfitted here.
acc_random_forest = round(
    cross_val_score(random_forest, X_train, Y_train, cv=3, scoring='accuracy').mean() * 100, 2
)

# Fit on the full training set and predict the test set.
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
acc_random_forest

In [None]:
# Bar chart comparing the accuracy score recorded for each model.
model_scores = {'KNN': acc_knn, 'DT': acc_decision_tree, 'RF': acc_random_forest}
sns.barplot(x=list(model_scores.keys()), y=list(model_scores.values()))

# 4. Submission

In [None]:
# Display the sample submission frame to check the expected output layout.
sample

In [None]:
# Pair each test case id with its predicted Stay code.
# NOTE(review): Y_pred is reassigned by every model cell, so this uses the
# predictions of whichever model cell ran last.
submission_columns = {"case_id": test["case_id"], "Stay": Y_pred}
submission = pd.DataFrame(submission_columns)

In [None]:
# Translate the predicted integer codes back to their Stay band labels.
# The inverse of `stay_dict` is passed as one code -> label mapping, the
# documented dict form of `replace`, rather than as two separate dict views
# that have to be kept in step.
stay_labels = {code: band for band, code in stay_dict.items()}
submission['Stay'] = submission['Stay'].replace(stay_labels)

In [None]:
# Write the submission frame to submission.csv, leaving out the index column.
submission.to_csv('submission.csv', index = False)

In [None]:
# Display the sample submission frame again for comparison.
sample