In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the competition CSV files inside the Kaggle input directory.
train_csv = '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv'
test_csv = '/kaggle/input/av-healthcare-analytics-ii/healthcare/test_data.csv'

# Load the labelled training rows and the test rows into separate frames.
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# **List of Feature Columns**

* **Case_Id** : Storing Unique ids with respect to cases
* **Hospital_Code** : Unique Code for Hospitals in which particular case is being investigated
* **Hospital_type_code** : Unique Code for Type of Hospitals
* **Hospital_region_code** : Region of Hospitals
* **Available Extra Rooms in Hospital** : No. of extra rooms available
* **Department** : Department overlooking the case
* **Ward_Type** : Unique code for ward type
* **Ward_Facility_Code** : Unique code for ward facility
* **Bed Grade** : Condition for Bed in Hospital
* **Patient_Id** : Patient id undergoing current case
* **City_Code_Patient** : City code for the patient
* **Type_of_Admission** : Admission type registered by hospital
* **Severity of illness** : severity recorded at time of admission
* **Visitors with Patient** : No. of visitors with patient
* **Age** : Age of the patient
* **Admission_Deposit** : Deposit at admission time
* **Stay** : Target Variable Stay which depicts no. of days the patient stayed at hospital


**Using Test Data we need to predict the no. of days patient will stay at hospital**

# **Data type of Feature Columns**

In [None]:
# List the pandas dtype of every column in the training frame.
print(df_train.dtypes)

**Inference** :

* Case_Id and Patient_Id can be removed from dataset
* Bed Grade and City_Code_Patient are stored as float, but they can be treated as int
* All the object type features need to be label encoded later for training models

In [None]:
# Identifier columns that are removed from both frames.
id_columns = ['case_id', 'patientid']

df_train = df_train.drop(columns=id_columns)
# Keep the test case ids before they are dropped; the final submission needs them.
y_submit_1 = df_test['case_id'].values
df_test = df_test.drop(columns=id_columns)

# **Calculating no. of empty data**

In [None]:
# Summary statistics (pandas describe) for the training frame.
df_train.describe()

In [None]:
# Summary statistics (pandas describe) for the test frame.
df_test.describe()

In [None]:
# Number of missing (NaN) values in each column of the training frame.
print(df_train.isna().sum())

In [None]:
# Number of missing (NaN) values in each column of the test frame.
print(df_test.isna().sum())

In [None]:
# Show the distinct values (NaN included) of the two columns inspected for gaps.
for column in ('Bed Grade', 'City_Code_Patient'):
    print(df_train[column].unique())

Replacing NaN values with fixed integer stand-ins for the column means (Bed Grade → 2, City_Code_Patient → 7)

In [None]:
# Constant used to fill the gaps in each column; the same value is applied
# to the training and the test frame.
fill_values = {'Bed Grade': 2, 'City_Code_Patient': 7}

for frame in (df_train, df_test):
    for column, value in fill_values.items():
        frame[column] = frame[column].fillna(value)

In [None]:
# Re-count missing values per column, training frame first, then test frame.
for frame in (df_train, df_test):
    print(frame.isna().sum())

# **Univariate Analysis**

* Hospital Code

In [None]:
import plotly.express as px

# Histogram of Hospital_code over the training rows.
fig = px.histogram(data_frame=df_train, x="Hospital_code")
fig.show()

In [None]:
import plotly.express as px

# Histogram of Hospital_code over the test rows.
fig = px.histogram(data_frame=df_test, x="Hospital_code")
fig.show()

Inference: Hospital code 26 has the highest count in both the train and the test data

* Hospital Type Code

In [None]:
import plotly.express as px

# Training-set counts per hospital type, bars ordered from most to least frequent.
fig = px.histogram(data_frame=df_train, x="Hospital_type_code")
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [None]:
import plotly.express as px

# Test-set counts per hospital type, bars ordered from most to least frequent.
fig = px.histogram(data_frame=df_test, x="Hospital_type_code")
fig.update_xaxes(categoryorder="total descending")
fig.show()

* Hospital Region Distribution

In [None]:
import plotly.express as px

# Hospital region counts, first for the training frame and then for the test
# frame, with bars ordered from most to least frequent.
for frame in (df_train, df_test):
    fig = px.histogram(frame, x="Hospital_region_code")
    fig.update_xaxes(categoryorder="total descending")
    fig.show()

* Department Distribution

In [None]:
import plotly.express as px

# Department counts, first for the training frame and then for the test frame,
# with bars ordered from most to least frequent.
for frame in (df_train, df_test):
    fig = px.histogram(frame, x="Department")
    fig.update_xaxes(categoryorder="total descending")
    fig.show()

* Ward Type

In [None]:
import plotly.express as px

# Ward type counts, first for the training frame and then for the test frame,
# with bars ordered from most to least frequent.
for frame in (df_train, df_test):
    fig = px.histogram(frame, x="Ward_Type")
    fig.update_xaxes(categoryorder="total descending")
    fig.show()

* Type of Admission

In [None]:
import plotly.express as px

# Admission type counts, first for the training frame and then for the test
# frame. The frequency ordering of the x-axis is applied exactly once per figure.
for frame in (df_train, df_test):
    fig = px.histogram(frame, x="Type of Admission")
    fig.update_xaxes(categoryorder="total descending")
    fig.show()

* Severity of Illness

In [None]:
import plotly.express as px

# Severity of illness counts, first for the training frame and then for the
# test frame, with bars ordered from most to least frequent.
for frame in (df_train, df_test):
    fig = px.histogram(frame, x="Severity of Illness")
    fig.update_xaxes(categoryorder="total descending")
    fig.show()

* Age

In [None]:
import plotly.express as px

# Age counts, first for the training frame and then for the test frame.
# Bars are ordered by frequency, not by age.
for frame in (df_train, df_test):
    fig = px.histogram(frame, x="Age")
    fig.update_xaxes(categoryorder="total descending")
    fig.show()

* Stay (Target Variable)

In [None]:
import plotly.express as px

# Counts of the target column Stay in the training frame, most frequent first.
fig = px.histogram(data_frame=df_train, x="Stay")
fig.update_xaxes(categoryorder="total descending")
fig.show()

# **Bivariate analysis of target with feature columns**

Let us visualize the features against the Target variable

In [None]:
import seaborn as sns

sns.set(style="white")

# Stay labels in the order the bars should be drawn.
stay_order = [
    '0-10', '11-20', '21-30', '31-40', '41-50', '51-60',
    '61-70', '71-80', '81-90', '91-100', 'More than 100 Days',
]

# One bar per Stay label, bar length taken from Admission_Deposit.
g = sns.barplot(data=df_train, x="Admission_Deposit", y="Stay", order=stay_order)


**Inference**: Higher admission deposits appear to go with shorter stays, so patients who pay more are probably discharged earlier. \
Signs of corruption? Not at all good.

In [None]:
import seaborn as sns

sns.set(style="white")

# Stay labels in the order the bars should be drawn.
stay_order = [
    '0-10', '11-20', '21-30', '31-40', '41-50', '51-60',
    '61-70', '71-80', '81-90', '91-100', 'More than 100 Days',
]

# One bar per Stay label, bar length taken from Visitors with Patient.
g = sns.barplot(data=df_train, x="Visitors with Patient", y="Stay", order=stay_order)


**Inference:** Understandably, more visitors indicate a more serious case, a situation in which the patient needs loved ones nearby for motivation. The plot shows this trend clearly.

In [None]:
# Distinct labels of the target column Stay.
df_train["Stay"].unique()

In [None]:
# Distinct labels of the Age column.
df_train["Age"].unique()

In [None]:
# Group the training rows by (Age, Stay) and count the non-null entries of every
# other column; reset_index turns Age and Stay back into ordinary columns.
age_stay_groups = df_train.groupby(['Age', 'Stay'])
y_val = age_stay_groups.count().reset_index()

In [None]:
# y_val holds per-(Age, Stay) counts of non-null values in each column; the
# Hospital_code count is copied into a `count` column to serve as the group size.
y_val['count']=y_val['Hospital_code']

In [None]:
import plotly.express as px

# Bubble chart of Age against Stay; each marker is sized by the `count`
# column of y_val.
fig = px.scatter(y_val, x="Age", y="Stay", size="count")
fig.show()

**Inference**: Most of the data falls on patients aged 31-40 whose stay lasted 21-30 days