# Objective  
The goal of this kernel is to explore the data visually, identify which features are most useful, and discover what new features we can engineer before building a predictive model.

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import LabelEncoder

First, let's load the data and take a look at the features we are given.

In [None]:
# Read the train and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv',
        '/kaggle/input/av-healthcare-analytics-ii/healthcare/test_data.csv',
    )
)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

We can drop case_id because it isn't relevant to predicting the target variable 'Stay'

In [None]:
# Remove the row identifier from both frames.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises a KeyError
# because 'case_id' is already gone.
train = train.drop(columns='case_id', errors='ignore')
test = test.drop(columns='case_id', errors='ignore')

Using .info(), we see that there is a good mix of numerical and categorical features in the train and test sets. There are also some null values in the 'Bed Grade' and 'City_Code_Patient' columns that will need to be dealt with.

In [None]:
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray "None"
# after each summary.
train.info()
test.info()

We can use boxplots to explore the range of values that the 'Admission_Deposit' and 'Age' columns take on and to identify any outliers that may be present.

In [None]:
# Side-by-side box plots of 'Admission_Deposit' for the train and test sets.
deposit_boxes = [
    go.Box(x=train['Admission_Deposit'], marker_color="blue", name="Train"),
    go.Box(x=test['Admission_Deposit'], marker_color="red", name="Test"),
]
fig = go.Figure(data=deposit_boxes)
fig.update_layout(title="Distributions of Admission Deposit")
fig.show()


In [None]:
# Side-by-side box plots of 'Age' for the train and test sets.
# NOTE(review): if 'Age' holds binned text labels rather than numbers, the box
# plot treats them as categories — confirm the column's dtype.
age_boxes = [
    go.Box(x=train['Age'], marker_color="blue", name="Train"),
    go.Box(x=test['Age'], marker_color="red", name="Test"),
]
fig = go.Figure(data=age_boxes)
fig.update_layout(title="Distributions of Age")
fig.show()


We can use .corr() to check whether any of the features is directly correlated with our target variable 'Stay'. From the output below, we see that the most correlated feature is 'Visitors with Patient', with a correlation of ~0.54. A scatterplot shows that there is a very slight positive correlation.

In [None]:
# Correlation of every feature with the target 'Stay'.
# Text columns are only label-encoded in place further down the notebook, so
# on a fresh top-to-bottom run any column that is still text (presumably
# including 'Stay' — confirm against the train.info() output) cannot take part
# in .corr(). Encode those columns on a temporary copy so this cell works
# regardless of execution order and leaves `train` untouched.
train_encoded = train.apply(
    lambda col: pd.Series(LabelEncoder().fit_transform(col), index=col.index)
    if col.dtype == 'object'
    else col
)
train_encoded.corr()['Stay']

In [None]:
# Scatter plot of visitor count against the length-of-stay target.
fig = px.scatter(
    train,
    x=train['Visitors with Patient'],
    y=train['Stay'],
).update_layout(
    title='Number of Visitors vs. Duration of Stay',
    xaxis_title="Visitors",
    yaxis_title="Duration",
)
fig.show()

In [None]:
# Replace every text column with integer codes.
# Each shared column gets its own encoder, fitted on the train and test values
# together, so a category maps to the same integer in both frames. Fitting
# train and test separately can assign different codes to the same category
# whenever the two frames do not contain exactly the same set of values.
for column in train.columns:
    if train[column].dtype == 'object':
        le = LabelEncoder()
        if column in test.columns and test[column].dtype == 'object':
            le.fit(pd.concat([train[column], test[column]]))
            test[column] = le.transform(test[column])
        else:
            # Column exists only in train (e.g. the target) or is not text in test.
            le.fit(train[column])
        train[column] = le.transform(train[column])

# Any text column that exists only in test is encoded on its own.
for column in test.columns:
    if test[column].dtype == 'object':
        le = LabelEncoder()
        test[column] = le.fit_transform(test[column])


In [None]:
# Heatmap of the pairwise correlations between the columns of `train`.
corr_matrix = train.corr()
sns.heatmap(data=corr_matrix)