The recent Covid-19 pandemic has raised alarms over one of the most overlooked areas of focus: healthcare management. While healthcare management offers various use cases for data science, patient length of stay (LOS) is one critical parameter to observe and predict if one wants to improve the efficiency of healthcare management in a hospital.
This parameter helps hospitals identify patients at high LOS risk (patients who will stay longer) at the time of admission. Once identified, patients with high LOS risk can have their treatment plan optimized to minimize LOS and lower the chance of staff/visitor infection. Also, prior knowledge of LOS can aid in logistics such as room and bed allocation planning.
Suppose you have been hired as a Data Scientist at HealthMan – a not-for-profit organization dedicated to managing the functioning of hospitals in a professional and optimal manner.
The task is to accurately predict the length of stay for each patient on a case-by-case basis so that hospitals can use this information for optimal resource allocation and better functioning. The length of stay is divided into 11 different classes ranging from 0-10 days to more than 100 days.
 
Data Description

Train.zip contains 1 csv alongside the data dictionary that contains definitions for each variable
train.csv – File containing features related to patient, hospital and Length of stay on case basis
train_data_dict.csv – File containing the information of the features in train file

Test Set
test.csv – File containing features related to the patient and the hospital. The length of stay must be predicted for each case_id


Sample Submission:
case_id: Unique id for each case
Stay: Length of stay for the patient w.r.t each case id in test data
Evaluation Metric
The evaluation metric for this hackathon is 100*Accuracy Score.



In [None]:
import pandas as pd
import numpy as np

In [None]:
import os

In [None]:
# Show the current working directory (the CSV paths below are relative to it).
os.getcwd()

In [None]:
# Load the training data.
train=pd.read_csv("../input/janatahack-healthcare-analytics-part-2/train.csv")

In [None]:
# Column names, dtypes and non-null counts of the training data.
train.info()

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Number of missing values per training column.
train.isnull().sum()

In [None]:
# Load the test data.
test=pd.read_csv("../input/janatahack-healthcare-analytics-part-2/test.csv")

In [None]:
# Column names, dtypes and non-null counts of the test data.
test.info()

In [None]:
# Summary statistics of the test data.
test.describe()

In [None]:
# Number of missing values per test column.
test.isnull().sum()

In [None]:
# Same missing-value count as computed for train earlier, shown again next to the test result.
train.isnull().sum()

In [None]:
# (rows, columns) of the training data.
train.shape

In [None]:
# (rows, columns) of the test data.
test.shape

In [None]:
# Column labels of the training data.
train.columns

In [None]:
# Column labels of the test data.
test.columns

In [None]:
# Training features only: drop the 'Stay' column, which is used as the target later on.
train_df=train.drop(['Stay'],axis=1)

In [None]:
train_df.columns

In [None]:
# Mark these rows with flag=1 so they can be selected again after being
# concatenated with the test rows.
train_df['flag']=1

In [None]:
train_df.head(2)

In [None]:
# Work on a copy of the test frame: a plain `test_df = test` only binds a second
# name to the same object, so adding the helper 'flag' column to test_df would
# also add it to `test`, which is still needed unmodified for the submission.
test_df=test.copy()

In [None]:
# Mark the test rows with flag=0 (training rows carry flag=1).
test_df['flag']=0

In [None]:
test_df.head(2)

In [None]:
# Stack training and test rows into one frame so both receive the same cleaning.
# No ignore_index is passed, so each part keeps its own row labels.
clean=pd.concat([train_df,test_df])

In [None]:
clean

In [None]:
clean.columns

In [None]:
# Remove the case_id column from the combined frame.
clean.drop(['case_id'],axis=1,inplace=True)

In [None]:
clean.columns

In [None]:
# Remove the patientid column from the combined frame.
clean.drop(['patientid'],axis=1,inplace=True)

In [None]:
clean.columns

In [None]:
clean.shape

In [None]:
# Number of missing values per column in the combined frame.
clean.isnull().sum()

In [None]:
# Impute missing values in 'Bed Grade' and 'City_Code_Patient' with each
# column's most frequent value (mode).
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on clean[column] is a chained in-place update, which
# pandas 2.x warns about and which does not modify `clean` under Copy-on-Write.
for column in ['Bed Grade', 'City_Code_Patient']:
    clean[column] = clean[column].fillna(clean[column].mode()[0])

In [None]:
# Re-count missing values per column after the imputation cell above.
clean.isnull().sum()

In [None]:
# Column labels of the combined frame.
clean.columns

In [None]:
train_set=clean.loc[clean['flag']==1]

In [None]:
train_set.shape

In [None]:
test_set=clean.loc[clean['flag']==0]

In [None]:
test_set.shape

In [None]:
train_set.columns

In [None]:
train_set.shape

In [None]:
# The helper column has served its purpose (separating train from test rows);
# remove it so it is not used as a feature.
train_set.drop('flag',axis=1,inplace=True)

In [None]:
train_set.shape

In [None]:
test_set.shape

In [None]:
test_set.columns

In [None]:
# Same removal of the helper column for the test rows.
test_set.drop('flag',axis=1,inplace=True)

In [None]:
train_set.head(2)

In [None]:
# Cast every column listed here to pandas' 'category' dtype, one after the
# other, in the order given.
train_categorical_cols = [
    'Hospital_code',
    'Hospital_type_code',
    'City_Code_Hospital',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Bed Grade',
    'City_Code_Patient',
    'Type of Admission',
    'Severity of Illness',
    'Age',
]
for col_name in train_categorical_cols:
    train_set[col_name] = train_set[col_name].astype('category')

In [None]:
from sklearn.preprocessing import LabelEncoder 


In [None]:
# NOTE(review): `le` is not referenced again in the visible notebook; the
# encoding below is done with pd.get_dummies instead.
le = LabelEncoder() 

In [None]:
# Replace the categorical columns with indicator (dummy) columns.
train_set= pd.get_dummies(train_set)

In [None]:
train_set.columns

In [None]:
train_set.info()

In [None]:
train_set.shape

In [None]:
train_set.info()

In [None]:
# Re-attach the target column to the encoded features. concat(axis=1) aligns
# rows on the index, so this relies on train_set and train sharing row labels
# (presumably true because train_set was selected from train's rows - verify).
training_set=pd.concat([train_set,train['Stay']],axis=1)

In [None]:
training_set.head(2)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the target 'Stay'.
x=training_set.loc[:, training_set.columns != 'Stay']


In [None]:
# Target vector.
y=training_set['Stay']

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [None]:
x_train.columns

In [None]:
x_test.head(2)

In [None]:
y_train.head()

In [None]:
y_test.head()

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Baseline: LightGBM classifier with the library's default settings.
model = LGBMClassifier()
# Disabled alternative: repeated stratified 3-fold cross-validation on x_train.
#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
#n_scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, error_score='raise')
#print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# Fit on the training split only (x_train / y_train), not on the full data.
model.fit(x_train, y_train)
# Predict every row of the hold-out split and report accuracy against y_test.
predict = model.predict(x_test)
print(accuracy_score(y_test,predict))

In [None]:
# Explicitly configured LightGBM classifier: 100 boosting rounds, up to 100
# leaves per tree, 100 histogram bins (max_bin).
# `verbose=-1` replaces the former `silent=True`: `silent` was deprecated in
# LightGBM 3.3 and is no longer a constructor argument in 4.0, while
# `verbose=-1` requests the same quiet training output on old and new versions.
model = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=100, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, verbose=-1,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0, max_bin= 100)

# Fit on the training split only.
model.fit(x_train, y_train)
# Predict every row of the hold-out split and report accuracy against y_test.
predict = model.predict(x_test)
print(accuracy_score(y_test,predict))

In [None]:
# Explicitly configured LightGBM classifier; differs from the preceding cell's
# configuration in subsample_for_bin (300000 here instead of 200000).
# `verbose=-1` replaces the former `silent=True`: `silent` was deprecated in
# LightGBM 3.3 and is no longer a constructor argument in 4.0, while
# `verbose=-1` requests the same quiet training output on old and new versions.
model = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=100, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, verbose=-1,
               subsample=1.0, subsample_for_bin=300000, subsample_freq=0, max_bin= 100)

# Fit on the training split only.
model.fit(x_train, y_train)
# Predict every row of the hold-out split and report accuracy against y_test.
predict = model.predict(x_test)
print(accuracy_score(y_test,predict))

In [None]:
import pandas as pd
import numpy as np
# Wrap the hold-out predictions in a DataFrame.
# NOTE(review): `pred` is reassigned by the later `model.predict(test_set1)`
# cell before anything reads this value.
pred=pd.DataFrame(predict)

In [None]:
test_set.head(2)

In [None]:
# Cast every column listed here to pandas' 'category' dtype, one after the
# other, in the order given.
test_categorical_cols = [
    'Hospital_code',
    'Hospital_type_code',
    'City_Code_Hospital',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Bed Grade',
    'City_Code_Patient',
    'Type of Admission',
    'Severity of Illness',
    'Age',
]
for col_name in test_categorical_cols:
    test_set[col_name] = test_set[col_name].astype('category')

In [None]:
test_set1= pd.get_dummies(test_set)

In [None]:
test_set1.columns

In [None]:
pred= model.predict(test_set1)

In [None]:
final_predictions=pd.DataFrame(pred)

In [None]:
final_predictions.columns=['Stay']

In [None]:
Submission=pd.concat((test['case_id'],final_predictions['Stay']),axis=1)

In [None]:
import os

In [None]:
os.getcwd()

In [None]:
Submission.to_csv("submission.csv",index=False)

Different hyperparameter tuning settings

In [None]:
# Alternative configuration: lower learning rate, column subsampling (0.8),
# L1/L2 regularisation, larger min_child_samples and fewer histogram bins.
# The objective is spelled 'multiclass', the name documented by LightGBM
# ('multi_class' is not a listed objective or alias).
# NOTE(review): n_estimators=50000 with no early stopping requests all 50000
# boosting rounds and may take very long; consider an eval_set with early
# stopping.
model1 = LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.8,
               learning_rate=0.04, min_child_samples=228,
               n_estimators=50000, n_jobs=-1, objective='multiclass',
               reg_alpha=1, reg_lambda=1,
               max_bin=63, num_class=11)

# Fit and score model1 itself. Previously this cell fitted and evaluated the
# earlier `model`, so the configuration above was never actually used.
model1.fit(x_train, y_train)
# Predict every row of the hold-out split and report accuracy against y_test.
predict = model1.predict(x_test)
print(accuracy_score(y_test, predict))