# Predict how long a patient will stay in the hospital
* Cleaning and Rearranging data
* Feature Selection
* Model Selection and Training
* Catboost - 42.96%
* Submit

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


## Cleaning and Rearranging data

In [None]:
# Load the train and test tables; the first CSV column becomes the index.
DATA_DIR = "/kaggle/input/av-healthcare-analytics-ii/healthcare"
df_train = pd.read_csv(f"{DATA_DIR}/train_data.csv", index_col=0)
df_test = pd.read_csv(f"{DATA_DIR}/test_data.csv", index_col=0)
for frame in (df_train, df_test):
    print(frame.shape)

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Distinct values of the "Stay" column.
df_train["Stay"].unique()

### Convert Healthcare data to numerical format

In [None]:
def Convert_to_Num(df):
    """Label-encode the categorical hospital columns of ``df`` in place.

    For each column listed below, the distinct values are sorted and numbered
    0, 1, 2, ... in that order. The column is then overwritten with those
    integer codes and the value-to-code mapping is printed.

    Args:
        df: DataFrame containing the columns ``Department``,
            ``Hospital_region_code``, ``Ward_Type``, ``Ward_Facility_Code``,
            ``Type of Admission``, ``Severity of Illness``, ``Age`` and
            ``Hospital_type_code``. It is modified in place.

    Returns:
        None.

    Raises:
        KeyError: if one of the expected columns is missing from ``df``.

    Note:
        The mapping is derived from the frame passed in, so two frames receive
        the same codes only when they contain the same set of values in each
        column.
    """
    categorical_columns = (
        "Department",
        "Hospital_region_code",
        "Ward_Type",
        "Ward_Facility_Code",
        "Type of Admission",
        "Severity of Illness",
        "Age",
        "Hospital_type_code",
    )
    for column in categorical_columns:
        categories = sorted(df[column].unique())
        mapping = {category: code for code, category in enumerate(categories)}
        # Assign the encoded column back instead of calling
        # ``replace(..., inplace=True)`` on ``df[column]``: that chained
        # in-place call acts on a temporary Series and does not update ``df``
        # when pandas Copy-on-Write is active.
        df[column] = df[column].map(mapping)
        print(mapping)
    
# Encode the training features, then label-encode the "Stay" column the same
# way: its sorted distinct values are numbered from 0.
Convert_to_Num(df_train)
stay_list = np.sort(df_train["Stay"].unique())
dept_Stay = dict(zip(stay_list, range(len(stay_list))))
# Assign the encoded column back; a chained ``replace(..., inplace=True)`` on
# ``df_train["Stay"]`` would act on a temporary Series and leave ``df_train``
# unchanged when pandas Copy-on-Write is active.
df_train["Stay"] = df_train["Stay"].map(dept_Stay)
print(dept_Stay)

In [None]:
# Preview the training data after the categorical columns were encoded.
df_train.head()

In [None]:
# Print the column dtypes and non-null counts of the training data.
df_train.info()

In [None]:
# Distinct values of "Bed Grade", inspected before missing entries are filled.
df_train["Bed Grade"].unique()

### Fill empty records

In [None]:
# Missing "Bed Grade" entries are imputed with the constant 2; the distinct
# values are shown afterwards to confirm no NaN remains.
bed_grade_filled = df_train["Bed Grade"].fillna(2)
df_train["Bed Grade"] = bed_grade_filled
bed_grade_filled.unique()

In [None]:
# Number of missing values per column.
df_train.isna().sum()

In [None]:
# Summary statistics, transposed so that each row describes one column.
df_train.describe().transpose()

## Feature Selection
The `City_Code_Patient` column has a lot of NaN values, and I think it is not very important, so I am removing it.

In [None]:
# Target is the encoded "Stay" column; features are all other columns except
# the city code and the patient id.
y = df_train["Stay"]
x = df_train.drop(["Stay", "City_Code_Patient", "patientid"], axis=1)

### Create a hold-out set to measure accuracy

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Model Selection

In [None]:
from catboost import CatBoostClassifier, Pool
from catboost.utils import get_confusion_matrix

# Validation pool: used for CatBoost's per-iteration evaluation during fit and
# for the confusion matrix below.
eval_dataset = Pool(x_test, y_test)

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=10,
    l2_leaf_reg=3,
    loss_function='MultiClass',
    eval_metric='MultiClass',
)

model.fit(
    x_train,
    y_train,
    eval_set=eval_dataset,
    verbose=True,
)

print(model.get_best_score())
cm = get_confusion_matrix(model, eval_dataset)
print(cm)
# Accuracy over ALL classes: correctly classified rows (the diagonal) divided
# by the total number of rows. Using only cm[0,0], cm[1,1], cm[0,1], cm[1,0]
# would cover just classes 0 and 1 of this multi-class matrix.
predict_accuracy_on_test_set = np.trace(cm) / np.sum(cm)
ax = sns.heatmap(cm, linewidth=1)
plt.show()
print("catboost Acc : ", predict_accuracy_on_test_set)

### Predict on the actual test dataset

In [None]:
# Show the confusion matrix computed on the validation pool.
print(cm)

In [None]:
import catboost
# Class predictions for the validation pool.
pred = model.predict(eval_dataset, verbose=True)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of validation rows whose predicted class equals the true class.
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Preview the test data before it is encoded.
df_test.head()

In [None]:
# Encode the test features and drop the same two columns that were removed
# from the training features.
Convert_to_Num(df_test)
df_test = df_test.drop(["City_Code_Patient", "patientid"], axis=1)
df_test.head()

In [None]:
# Wrap the prepared test features in a CatBoost Pool for prediction.
eval_dt = Pool(df_test)
eval_dt

In [None]:
# Predict a class for every test row and check the shape of the result.
preds_class = model.predict(eval_dt)
preds_class.shape

### Submit

In [None]:
# Build the submission frame on the test index and translate the predicted
# class codes back to the original "Stay" labels.
df_sub = pd.DataFrame(index=df_test.index)
df_sub["Stay"] = preds_class
# Inverse of the target encoding: code -> original label.
swap_dict_Stay = {code: label for label, code in dept_Stay.items()}
# Assign the result back; a chained ``replace(..., inplace=True)`` on
# ``df_sub["Stay"]`` would act on a temporary Series and leave ``df_sub``
# unchanged when pandas Copy-on-Write is active.
df_sub["Stay"] = df_sub["Stay"].replace(swap_dict_Stay)
df_sub.head()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Hyper-parameter search space sampled by the randomized search over the MLP.
parameter_space = {
    # Every entry is a tuple of layer widths; note the trailing comma in
    # one-element tuples -- ``(1024)`` would be a plain int, not a tuple.
    'hidden_layer_sizes': [
        (1024,), (50,), (50, 100, 50), (48,), (48, 48, 48), (96,),
        (144,), (192,), (96, 144, 192), (240,), (144, 192, 240),
    ],
    'activation': ['tanh', 'logistic', 'relu'],
    'solver': ['adam', 'lbfgs', 'sgd'],
    'alpha': [0.0001, 0.001, 0.05, 0.1, 1],
    'beta_1': [0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99],
    'beta_2': [0.990, 0.991, 0.992, 0.993, 0.994, 0.995, 0.996, 0.997, 0.998, 0.999],
    'learning_rate': ['constant', 'adaptive'],
}

In [None]:
from sklearn.neural_network import MLPClassifier
# Base MLP estimator for the search; the seed is fixed and up to 1000
# training iterations are allowed.
mlp = MLPClassifier(max_iter=1000, random_state=42)

In [None]:
import sklearn
# Candidates are ranked by the weighted-average F1 score.
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average='weighted')
# Randomized search: 10 sampled configurations, 3-fold CV, all cores.
# refit=True refits the best configuration on the whole training split; with a
# single scorer there is no metric name to select, so a name such as
# 'precision' would only act as a truthy flag.
clf = RandomizedSearchCV(
    mlp,
    parameter_space,
    n_jobs=-1,
    n_iter=10,
    cv=3,
    refit=True,
    scoring=scorer,
    random_state=0,
)

In [None]:
# Run the search on the training split, then score the refit best model on
# the hold-out split.
clf.fit(x_train, y_train)
# clf.score uses the scorer passed as ``scoring`` (weighted F1), not accuracy,
# so the printed label names that metric.
score = clf.score(x_test, y_test)
print("Validation weighted F1", score*100, "%")

In [None]:
# Confusion matrix of the tuned MLP on the hold-out split.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; this cell needs an older scikit-learn release.
plot_confusion_matrix(estimator=clf, X=x_test, y_true=y_test)

In [None]:
# Write the submission frame (its index plus the "Stay" column) to disk.
df_sub.to_csv("Submission.csv")

# Please UPVOTE if you like the notebook.