In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory, then print one per line.
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSVs from the competition input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv',
        '/kaggle/input/av-healthcare-analytics-ii/healthcare/test_data.csv',
    )
)

In [None]:
# Preview the first rows of the training frame as a sanity check on the load.
train.head()

In [None]:
# Preview the first rows of the test frame as a sanity check on the load.
test.head()

In [None]:
#renaming some of the features in train
train.rename(columns={'Available Extra Rooms in Hospital':'Extra_Rooms','Type of Admission':'Admission_Type',
                     'Severity of Illness':'Illness_severity','Visitors with Patient':'Visitors'},inplace=True)

In [None]:
#renaming some of the features in test
test.rename(columns={'Available Extra Rooms in Hospital':'Extra_Rooms','Type of Admission':'Admission_Type',
                     'Severity of Illness':'Illness_severity','Visitors with Patient':'Visitors'},inplace=True)

In [None]:
# Summarise the training frame's columns, dtypes and non-null counts.
train.info()

* Many columns in the dataset are of object type.
* Target variable('Stay') is of object type
* We have to convert object columns into numerical form for fitting of the model.


In [None]:
# Descriptive statistics for every column; include='all' covers the object columns as well as the numeric ones.
train.describe(include='all')

* Numeric columns 'case_id','Extra_Rooms','patientid','City_Code_Patient','Visitors','Admission_Deposit' show a large gap between the 75th percentile and the maximum value, which suggests the presence of outliers (for identifier-like columns such as 'case_id' and 'patientid', a wide range is expected rather than a sign of outliers).
* Object or discrete columns have many repeated values of unique labels which have to be numerically encoded. 
* Since the target variable has 11 unique values (classes), this is a multi-class classification problem.

In [None]:
# Object-typed columns (features plus the 'Stay' target) that will be cast to the category dtype
columns = [
    'Hospital_type_code',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Admission_Type',
    'Illness_severity',
    'Age',
    'Stay',
]

In [None]:
# Cast every listed column to the category dtype in train; do the same in test for
# all columns except 'Stay' (the target, which is skipped for test).
for i in columns:
    train[i]=train[i].astype('category')
    if i!='Stay':
        # Cast test's OWN values. Assigning train[i] here would overwrite the test
        # features with index-aligned values taken from the training frame.
        test[i]=test[i].astype('category')

In [None]:
# Re-inspect the dtypes to confirm the columns listed in `columns` are now categorical.
train.info()

In [None]:
# Count missing values per column in the train dataset (the test dataset is checked in a later cell).
train.isnull().sum()

Column 'Bed Grade' has 113 missing values and 'City_Code_Patient' has 4532 missing values in the train dataset.

In [None]:
# Count missing values per column in the test dataset.
test.isnull().sum()

Column 'Bed Grade' has 35 missing values and 'City_Code_Patient' has 2157 missing values in the test dataset.

In [None]:
# Replace the missing 'Bed Grade' entries in train with grade 1.0 (test is handled in the next cell).
missing_grade = train['Bed Grade'].isna()
train.loc[missing_grade, 'Bed Grade'] = 1.0

In [None]:
# Replace the missing 'Bed Grade' entries in test with the same constant (1.0) used for train.
missing_grade_test = test['Bed Grade'].isna()
test.loc[missing_grade_test, 'Bed Grade'] = 1.0

In [None]:
# Remove 'City_Code_Patient' from both frames, modifying each in place.
for frame in (train, test):
    frame.drop(columns=['City_Code_Patient'], inplace=True)

City_Code_Patient has many missing values and is not considered relevant for this analysis, so the column is dropped from both the train and test sets.

In [None]:
# Re-count missing values in train after the 'Bed Grade' fill and the column drop.
train.isnull().sum()

In [None]:
# Re-count missing values in test after the 'Bed Grade' fill and the column drop.
test.isnull().sum()

## EXPLORATORY DATA ANALYSIS

In [None]:
#importing libraries for visualization
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
# Initialise Plotly's offline notebook mode before any figures are drawn.
# NOTE(review): connected=True presumably loads plotly.js from the network rather than embedding it — confirm.
plotly.offline.init_notebook_mode(connected = True)

In [None]:
# Total available extra rooms per department. 'sum' adds up the Extra_Rooms values;
# 'count' would only report how many records each department has, ignoring the room numbers.
train.groupby('Department')['Extra_Rooms'].agg('sum')

* 'Gynecology' Department has the highest number of extra rooms allotted.
* 'Surgery' Department has the minimum number of extra rooms allotted.

In [None]:
# Total available extra rooms per bed grade. 'sum' adds up the Extra_Rooms values;
# 'count' would only report how many records carry each grade.
train.groupby('Bed Grade')['Extra_Rooms'].agg('sum')

Most of the extra rooms have beds of grade 2 followed by grade 3.

In [None]:
# Total available extra rooms per admission type. 'sum' adds up the Extra_Rooms values;
# 'count' would only report how many records each admission type has.
train.groupby('Admission_Type')['Extra_Rooms'].agg('sum')

Most of the rooms are allotted for trauma situations.

In [None]:
# Total available extra rooms per illness severity. 'sum' adds up the Extra_Rooms values;
# 'count' would only report how many records each severity level has.
train.groupby('Illness_severity')['Extra_Rooms'].agg('sum')

Most of the rooms are allotted for patients having moderate severity.

In [None]:
# Average bed grade within each department.
train.groupby('Department')['Bed Grade'].mean()

* 'Anaesthesia' Department has mostly beds above grade 3.
* 'Surgery' Department has mostly beds around grade 2.

In [None]:
# Average bed grade within each admission type.
train.groupby('Admission_Type')['Bed Grade'].mean()

Patients admitted as emergencies get, on average, lower bed grades than those admitted for trauma or urgent cases.

In [None]:
# Average bed grade within each illness-severity level.
train.groupby('Illness_severity')['Bed Grade'].mean()

Those patients who have minor disease are allotted rooms having bed around grade 3.

In [None]:
# Pie chart of Extra_Rooms values grouped by department.
px.pie(
    data_frame=train,
    names='Department',
    values='Extra_Rooms',
    title='Distribution of Extra Rooms in Departments',
)

Gynaecology department has most number of rooms whereas Surgery has least number of rooms.

In [None]:
# Pie chart of Extra_Rooms values grouped by bed grade.
px.pie(
    data_frame=train,
    names='Bed Grade',
    values='Extra_Rooms',
    title='Distribution of Bed in extra rooms',
)

Most of the extra rooms contain bed grade of 2 followed by grade 3.

In [None]:
# Size each slice by the number of records in the age band. Using values='patientid'
# would size slices by the sum of an identifier column, which says nothing about
# how many cases fall in each band.
age_counts=train['Age'].value_counts().rename_axis('Age').reset_index(name='Cases')
px.pie(age_counts,values='Cases',names='Age',title='Distribution of Age in Patients')

The patients admitted to the hospital are mostly middle-aged, i.e. in the 31-40 and 41-50 age groups.

In [None]:
# Size each slice by the number of records in the stay-length class. Using
# values='patientid' would size slices by the sum of an identifier column instead of
# by how many cases fall in each class.
stay_counts=train['Stay'].value_counts().rename_axis('Stay').reset_index(name='Cases')
px.pie(stay_counts,values='Cases',names='Stay',title='Distribution of Stay Length of Patients')

Most of the patients stay in hospital for about 11-20 and 21-30 days.

## VARIABLE ENCODING

In [None]:
# List the remaining column names before choosing how to encode each one.
train.columns

In [None]:
# Columns to be label-encoded (integer codes) rather than one-hot encoded:
# 'Age' is a feature, 'Stay' is the target.
cat_columns=['Age','Stay']

In [None]:
# Label-encode each column in cat_columns. The encoder is refitted on train for every
# column; test is transformed with that same fit, except for 'Stay', which is skipped.
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
for col in cat_columns:
    train[col] = l.fit_transform(train[col])
    if col == 'Stay':
        continue
    test[col] = l.transform(test[col])

In [None]:
#generating one hot features of remaining categorical features
train=pd.get_dummies(train)
# Encode test, then force it onto train's feature columns (everything except the 'Stay'
# target). Dummy columns missing from test are added as all-zero, columns unknown to
# train are dropped, and the column order matches train.
test=pd.get_dummies(test).reindex(columns=train.columns.drop('Stay'),fill_value=0)

## MODEL BUILDING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Separate the encoded frame into the feature matrix X and the target vector Y ('Stay').
# NOTE(review): identifier-like columns such as case_id remain in X — confirm that is intended.
X = train.copy()
Y = X.pop('Stay')

In [None]:
# Preview the feature matrix after encoding.
X.head()

In [None]:
# Hold out 20% of the rows as a validation set; random_state=0 fixes the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Standardise the features: fit the scaler on the training split only, then apply that
# same fit to both the training and validation splits.
sc = StandardScaler().fit(X_train)
X_train, X_valid = sc.transform(X_train), sc.transform(X_valid)

In [None]:
from catboost import Pool, CatBoostClassifier

In [None]:
# Validation pool handed to CatBoost as the evaluation set.
eval_dataset = Pool(X_valid, label=y_valid)

# CatBoost classifier configuration: 500 iterations, learning rate 0.3, accuracy as eval metric.
catboost_params = dict(iterations=500, learning_rate=0.3, eval_metric='Accuracy')
model = CatBoostClassifier(**catboost_params)

In [None]:
# Train on the scaled training split, with the validation pool supplied as eval_set.
model.fit(X_train,y_train,eval_set=eval_dataset)

In [None]:
# Report the best metric values recorded during training (summarised in the notes below).
model.get_best_score()

Training accuracy-47.17%

Validation accuracy-42.27%

In [None]:
# The model was fitted on StandardScaler-transformed features, so the test frame must go
# through the same pipeline before prediction: align it to the training feature columns
# (dummy columns absent from test become 0), then apply the already-fitted scaler.
test_features=test.reindex(columns=X.columns,fill_value=0)
test_dataset=Pool(sc.transform(test_features))

In [None]:
# Predict a class for every row of the test pool.
# NOTE(review): predictions are presumably the integer codes produced by the LabelEncoder
# for 'Stay'; map them back (e.g. with the encoder's inverse_transform) before submitting — confirm.
y_pred=model.predict(test_dataset)

In [None]:
# Display the predicted labels for the test set.
y_pred

This notebook will be updated shortly.

**If you like this notebook do upvote it.**

Feedback and Suggestions are always appreciated.

Do check out my other notebooks at https://www.kaggle.com/tmchls