In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import statistics

warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# Number of missing values per column before imputation.
train.isna().sum()

In [None]:
# Impute 'Bed Grade' and 'City_Code_Patient' with their most frequent value.
# Series.mode() skips NaN by default, and assigning the result back avoids calling
# fillna(inplace=True) on a column selection, which can act on a temporary copy
# and leave `train` unchanged under newer pandas versions.
for col in ['Bed Grade', 'City_Code_Patient']:
    train[col] = train[col].fillna(train[col].mode().iloc[0])

In [None]:
# Re-check the per-column missing-value counts after imputation.
train.isna().sum()

**The identifier columns `case_id` and `patientid` are of no use for modelling, so they are dropped**

In [None]:
# Drop the identifier columns. Reassigning (instead of inplace=True) together with
# errors='ignore' lets this cell be re-run without a KeyError once the columns are gone.
train = train.drop(columns=['case_id', 'patientid'], errors='ignore')

**Dividing the columns into categorical and numerical for EDA**

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical. Column order is preserved in both lists.
is_object = {name: train[name].dtypes == 'object' for name in train.columns}
cat_cols = [name for name, flag in is_object.items() if flag]
num_cols = [name for name, flag in is_object.items() if not flag]

print(cat_cols)
print(num_cols)

# Exploratory Data Analysis

In [None]:
# Plot the value counts of every categorical column, one subplot per column.
# The grid is sized from len(cat_cols) so the cell keeps working if the list changes,
# and the data is passed as `x=` because seaborn no longer accepts it positionally.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(cat_cols) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(15, 20), squeeze=False)
for ax, col in zip(axes.ravel(), cat_cols):
    sns.countplot(x=train[col], ax=ax)
# Hide any panel left over when the number of columns is odd.
for ax in axes.ravel()[len(cat_cols):]:
    ax.set_visible(False)
plt.show()

In [None]:
# Frequency of each raw 'Stay' label.
train['Stay'].value_counts()

In [None]:
# Shorten the open-ended label. The result is assigned back because an in-place
# replace on a column selection can act on a temporary copy under newer pandas.
train['Stay'] = train['Stay'].replace('More than 100 Days', '>100')

In [None]:
# Frequency of each 'Stay' label after renaming the open-ended bucket.
train['Stay'].value_counts()

# Dividing the Labels
**Here, I divide the stay duration into 3 categories:**

- 0: 0-20 days
- 1: 21-50 days
- 2: 51-100+ days

You might ask why the bins are of unequal width: this is to balance the number of patients in each category.

Plus, the model will not give good prediction accuracy if there are 11 classes to predict. It might have given good results with a proportionately larger number of rows (for example, 1,000,000), but with the amount of data we have, we can get a decent accuracy with 3 labels to predict.

In [None]:
# Collapse the eleven stay ranges into three ordinal classes.
stay_classes = {
    0: ['0-10', '11-20'],
    1: ['21-30', '31-40', '41-50'],
    2: ['51-60', '61-70', '71-80', '81-90', '91-100', '>100'],
}
stay_to_class = {label: cls for cls, labels in stay_classes.items() for label in labels}
train['Stay'] = train['Stay'].replace(stay_to_class)

In [None]:
# Class balance after binning 'Stay' into three classes.
train['Stay'].value_counts()

# More EDA

In [None]:
# Plot the distribution of every numerical column, one subplot per column.
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws the
# equivalent histogram + density curve. The grid is sized from len(num_cols).
n_plot_cols = 2
n_plot_rows = max(1, -(-len(num_cols) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(15, 20), squeeze=False)
for ax, col in zip(axes.ravel(), num_cols):
    sns.histplot(x=train[col], kde=True, stat='density', ax=ax)
# Hide any panel left over when the number of columns is odd.
for ax in axes.ravel()[len(num_cols):]:
    ax.set_visible(False)

plt.show()
   

**I found that some columns in the numerical category were actually categorical columns, so I moved them to the categorical list.**

# Encoding Categorical Columns

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# These numeric-typed columns are treated as categorical from here on. The membership
# check keeps cat_cols free of duplicates when the cell is re-run.
for col in ['Bed Grade', 'City_Code_Hospital', 'City_Code_Patient']:
    if col not in cat_cols:
        cat_cols.append(col)

# The single encoder is re-fitted for every column, so after the loop `le` only
# holds the classes of the last column processed.
for col in cat_cols:
    train[col] = le.fit_transform(train[col])
    

In [None]:
# Inspect the label-encoded categorical columns.
train[cat_cols]

In [None]:
# Frequency of each encoded hospital city code.
train['City_Code_Hospital'].value_counts()

In [None]:
# Frequency of each encoded patient city code.
train['City_Code_Patient'].value_counts()

In [None]:
# Remove the columns that are now handled as categorical from the numerical list.
# Filtering (rather than list.remove) keeps the cell re-runnable: remove() raises
# ValueError once a name is no longer in the list.
reclassified_cols = {'Bed Grade', 'City_Code_Hospital', 'City_Code_Patient'}
num_cols = [col for col in num_cols if col not in reclassified_cols]
num_cols

In [None]:
# Pairwise correlation of all columns, annotated with the coefficient in each cell.
correlations = train.corr()
plt.figure(figsize=(12, 12))
sns.heatmap(correlations, annot=True, cmap='coolwarm')

# Scaling Numerical Columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numerical columns; the fitted scaler stays available as `ss`.
# NOTE(review): the scaler is fitted before the train/test split, so the rows that
# later form the hold-out set contribute to the fitted mean and variance.
ss = StandardScaler()
numeric_values = train[num_cols].values
ss.fit(numeric_values)
train[num_cols] = ss.transform(numeric_values)


In [None]:
# Inspect the fully preprocessed training frame.
train

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
target_col = 'Stay'
X = train.drop(columns=[target_col])
y = train[target_col]

In [None]:
# Feature matrix (every column except 'Stay').
X

In [None]:
# Target vector (the binned 'Stay' class).
y

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Building our Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate classifiers keyed by display name. Stochastic models get a fixed seed so the
# comparison is reproducible, and CatBoost's per-iteration log is silenced.
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'CatBoostClassifier': CatBoostClassifier(random_state=42, verbose=0),
    'XGBClassifier': XGBClassifier(random_state=42),
}

# Fit each model on the training split and record its accuracy on the hold-out split.
# accuracy_scores follows the insertion order of `models`.
accuracy_scores = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    print(model_name)
    print(accuracy)

In [None]:
# Compare hold-out accuracy across the fitted models (labels in the same order as the scores).
model_abbreviations = ['LR', 'RFC', 'DT', 'KNN', 'CB', 'XGB']
sns.barplot(x=model_abbreviations, y=accuracy_scores)

**It is clear that CatBoostClassifier and XGBClassifier are best for the data.**

**However, hyperparameter tuning with CatBoostClassifier takes a lot of time, so I decided to tune only the XGBClassifier model for the sake of my old laptop xD**

# Hyperparameter Tuning

In [None]:
# Search space for the XGBoost hyper-parameters.
# 'Stay' has three classes, so a multi-class objective replaces 'binary:logistic'.
params = {
    'objective': ['multi:softprob'],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 5, 10, 12],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0.5, 1, 1.5, 2],
}

xgb = XGBClassifier(n_estimators=600, random_state=42)

# Seeded randomized search with 3-fold cross-validation over 5 sampled settings.
grid = RandomizedSearchCV(
    xgb,
    param_distributions=params,
    n_iter=5,
    cv=3,
    verbose=3,
    random_state=42,
)
# Fit on the training split only. Fitting on the full X, y would let the model see the
# X_test rows, so any score later computed on X_test would be optimistically biased.
grid.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter setting found by the search.
grid.best_score_

In [None]:
# The estimator selected by the search, with its chosen hyper-parameters.
grid.best_estimator_

In [None]:
# Score the tuned model on the hold-out split: accuracy, confusion matrix, per-class report.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

# Using the test data

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_csv_path = '/kaggle/input/av-healthcare-analytics-ii/healthcare/test_data.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Number of missing values per column in the test frame.
test.isna().sum()

In [None]:
# Impute 'Bed Grade' and 'City_Code_Patient' in the test frame with their most frequent
# value. Series.mode() skips NaN by default, and assigning the result back avoids calling
# fillna(inplace=True) on a column selection, which can act on a temporary copy.
for col in ['Bed Grade', 'City_Code_Patient']:
    test[col] = test[col].fillna(test[col].mode().iloc[0])

In [None]:
# Column dtypes and non-null counts of the test frame after imputation.
test.info()

In [None]:
# Drop the identifier columns from the test frame. Reassigning with errors='ignore'
# lets this cell be re-run without a KeyError once the columns are gone.
test = test.drop(columns=['case_id', 'patientid'], errors='ignore')

In [None]:
# Encode the test features with the category -> integer mapping of the TRAINING data.
# Re-fitting an encoder on the test frame can give the same category a different integer
# whenever the two files do not contain exactly the same set of values, which would
# silently change the meaning of the features the model was trained on.
# NOTE: run this cell once on a freshly loaded `test`; it expects the raw category values.

# The target column does not exist in the test data; filtering is safe to re-run.
cat_cols = [col for col in cat_cols if col != 'Stay']

# `train` has already been label-encoded in place, so the original category values are
# read again from the raw training file.
train_categories = pd.read_csv(
    '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv',
    usecols=cat_cols,
)
for col in cat_cols:
    # LabelEncoder numbers the sorted unique values 0..n-1; rebuild that mapping.
    known_values = sorted(train_categories[col].dropna().unique())
    code_of = {value: code for code, value in enumerate(known_values)}
    encoded = test[col].map(code_of)
    unseen = encoded.isna()
    if unseen.any():
        # Values that never occur in the training data get the sentinel code -1.
        print(f"{col}: {int(unseen.sum())} test rows with categories unseen in training -> encoded as -1")
    test[col] = encoded.fillna(-1).astype(int)

In [None]:
# Apply the already-fitted scaler `ss` to the numerical test columns (transform only, no re-fit).
scaled_test_values = ss.transform(test[num_cols].values)
test[num_cols] = scaled_test_values

In [None]:
# Inspect the preprocessed test frame.
test

In [None]:
# Predict the stay class for every row of the prepared test frame with the tuned model.
tuned_model = grid.best_estimator_
predictions = tuned_model.predict(test)

# Final Predictions

In [None]:
# Show a short preview and the per-class counts instead of printing the whole array.
# Lifting NumPy's print threshold is a global setting that stays in force for every
# later cell, and the full dump floods the notebook output.
print(predictions[:20])
pd.Series(predictions).value_counts().sort_index()

In [None]:
# Distribution of the predicted classes. The data is passed as `x=` because seaborn
# no longer accepts it as a positional argument.
sns.countplot(x=predictions)

In [None]:
# Load the sample submission file to use as the output template.
sample_submission_path = '/kaggle/input/av-healthcare-analytics-ii/healthcare/sample_sub.csv'
submission = pd.read_csv(sample_submission_path)

In [None]:
# Preview the submission template.
submission.head()

In [None]:
# Store the predicted class ids in the 'Stay' column (values are assigned by position).
submission = submission.assign(Stay=predictions)

In [None]:
# Translate the numeric class ids back into readable duration labels.
stay_labels = {0: 'Less than 20 days', 1: '21-50 days', 2: '51-100+ days'}
submission['Stay'] = submission['Stay'].replace(stay_labels)

In [None]:
# Inspect the final submission frame.
submission

# Upvote and Comment if you liked my Notebook