In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))
# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
from sklearn.experimental import enable_iterative_imputer, enable_hist_gradient_boosting
from sklearn.impute import KNNImputer, IterativeImputer

from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, PolynomialFeatures, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomTreesEmbedding

from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.model_selection import learning_curve, validation_curve, train_test_split
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, cross_val_predict, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, roc_curve
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Competition files: train/test tables, the data dictionary and the sample
# submission (the latter is reused as the template inside `submission`).
train = pd.read_csv("/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv")
test = pd.read_csv("/kaggle/input/av-healthcare-analytics-ii/healthcare/test_data.csv")
attrib = pd.read_csv("/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data_dictionary.csv")
sub = pd.read_csv("/kaggle/input/av-healthcare-analytics-ii/healthcare/sample_sub.csv")

# Swap spaces for underscores so columns can be reached as attributes.
train.columns = train.columns.str.replace(" ", "_", regex=False)
test.columns = test.columns.str.replace(" ", "_", regex=False)

# Test rows get a NaN target so both sets can be stacked into one frame;
# the NaN in `Stay` is what later separates train rows from test rows.
test = test.assign(Stay=np.nan)
alldata = pd.concat([train, test])
alldata.head()

In [None]:
# (rows, columns) of the train and test frames; `test` already carries the placeholder `Stay` column.
train.shape, test.shape

In [None]:
# Dtypes and non-null counts for the stacked train+test frame.
alldata.info()

In [None]:
# Missing values per column; `Stay` is NaN for every test row by construction.
alldata.isna().sum()

In [None]:
# For each column: how many distinct values it has, followed by the values themselves.
for col in alldata:
    print(f"{col}, unique values :{alldata[col].nunique()}")
    # The extra newline reproduces the blank separator line between columns.
    print(alldata[col].unique(), end="\n\n")

In [None]:
# Ten-year buckets shared by Age and Stay, listed in ascending order so the
# position of each label is its ordinal code.
decade_bins = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70',
               '71-80', '81-90', '91-100']
map_age = {bucket: code for code, bucket in enumerate(decade_bins)}
# Stay has one extra open-ended bucket on top of the ten shared ones.
map_stay = {**map_age, 'More than 100 Days': 10}

alldata['Age'] = alldata['Age'].replace(map_age)
alldata['Stay'] = alldata['Stay'].replace(map_stay)

In [None]:
# Row count per encoded `Stay` class (NaN test rows are left out by value_counts' default).
alldata.Stay.value_counts()

In [None]:
# Fill the two columns that have gaps with constant placeholders.
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# attribute-accessed column: that form acts on an intermediate Series, emits a
# FutureWarning in pandas 2.x and does not update the frame under Copy-on-Write.
alldata["Bed_Grade"] = alldata["Bed_Grade"].fillna(1)
alldata["City_Code_Patient"] = alldata["City_Code_Patient"].fillna(0)

In [None]:
# Re-check missing values after the fills; only the test rows' `Stay` should remain NaN.
alldata.isna().sum()

In [None]:
# Mean-encode several categorical columns with the average encoded `Stay` of
# their group, then add a per-patient visit count.
#
# `mean()` skips NaN, so only labelled (train) rows contribute to each group
# mean, while the left merge attaches the value to train and test rows alike.
# NOTE(review): the means are computed over *all* labelled rows, including the
# ones later held out by train_test_split, so hold-out scores are optimistic.

# (group-by column, name of the resulting feature), in the order they are merged.
stay_mean_features = [
    ("Hospital_type_code", "hosptype_stay_mean"),
    ("Department", "dept_stay_mean"),
    ("Ward_Type", "Ward_stay_mean"),
    ("Bed_Grade", "bed_stay_mean"),
    ("Type_of_Admission", "admittype_stay_mean"),
    ("Severity_of_Illness", "severity_stay_mean"),
    ("Age", "age_stay_mean"),
]
for group_col, feature_name in stay_mean_features:
    group_mean = (
        alldata.groupby(group_col)["Stay"]
        .mean()
        .rename(feature_name)
        .reset_index()
    )
    alldata = alldata.merge(group_mean, on=group_col, how='left')

# Number of rows (train + test) recorded for each patient.
patient_visits = (
    alldata.groupby("patientid")["Stay"]
    .size()
    .rename("visits")
    .reset_index()
)
alldata = alldata.merge(patient_visits, on='patientid')

In [None]:
# Preview the frame with the group-mean and visit-count columns appended.
alldata.head()

In [None]:
# Columns removed before modelling: identifiers and the target itself.
drop = ["case_id","Stay","patientid"]
# Categorical columns plus Bed_Grade (not referenced elsewhere in this notebook).
dummy_bed = ["Type_of_Admission","Severity_of_Illness","Department","Hospital_region_code",
         "Hospital_type_code","Ward_Type","Ward_Facility_Code","Bed_Grade"]
# Categorical columns that get one-hot encoded.
dummy = ["Type_of_Admission","Severity_of_Illness","Department","Hospital_region_code",
         "Hospital_type_code","Ward_Type","Ward_Facility_Code"]
# Everything else is treated as numerical. Built by filtering the frame's own
# column order rather than via set difference: set iteration order for strings
# changes between interpreter runs, which would reshuffle the feature matrix
# and make the seeded models non-reproducible.
excluded = set(dummy + drop)
numerical = [col for col in alldata.columns if col not in excluded]

In [None]:
def submission(pred, model, template=None):
    """Write predicted stay classes to ``model_<model>.csv`` in submission format.

    Parameters
    ----------
    pred : array-like
        Encoded class predictions (0..10), one per test row. A column vector of
        shape (n, 1) is flattened first, so 2-D predictions are accepted.
    model : str
        Tag used in the output file name and in the confirmation message.
    template : pandas.DataFrame, optional
        Frame whose ``Stay`` column is overwritten and which is then written
        out. Defaults to the notebook-level sample submission ``sub``.

    Notes
    -----
    The template frame is modified in place: its ``Stay`` column holds the
    decoded labels after the call.
    """
    stay_map = {0:'0-10', 1:'11-20', 2:'21-30', 3:'31-40', 4:'41-50', 5:'51-60', 6:'61-70',
                7:'71-80', 8:'81-90', 9:'91-100', 10:'More than 100 Days'}
    frame = sub if template is None else template
    # Flatten so both (n,) and (n, 1) prediction arrays land in a single column.
    frame["Stay"] = np.asarray(pred).ravel()
    frame["Stay"] = frame["Stay"].replace(stay_map)
    frame.to_csv("model_"+model+".csv", index=False)
    # A space before "created" was missing, which glued it to the model tag.
    print("Submission for "+model+" created")
    
# Rows with a known target are training data; rows whose `Stay` is NaN are the test set.
has_target = alldata["Stay"].notna()
newtrain = alldata.loc[has_target].drop(columns=drop)
newtest = alldata.loc[~has_target].drop(columns=drop)
y = alldata.loc[has_target, "Stay"]

# Stratified 70/30 hold-out split of the labelled rows.
xtrain, xtest, ytrain, ytest = train_test_split(
    newtrain, y, train_size=0.7, stratify=y, random_state=1
)

## Pipelining

In [None]:
# Preprocessing: standardise the numerical columns and one-hot encode the
# categorical ones; categories unseen during fit are encoded as all zeros.
label = LabelEncoder()  # created here but not referenced by the transformer below
onehot = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()
transformer = [
    ('scaler', scaler, numerical),
    ('onehot', onehot, dummy),
]
# Columns not listed in either group are passed through unchanged.
ct = ColumnTransformer(transformer, remainder='passthrough')

In [None]:
# Baseline multiclass estimators, all seeded for repeatability.
model_lgbm = LGBMClassifier(random_state=1, n_jobs=4, objective='multiclass')
# The objective was misspelled as 'muti:softmax', which is not an XGBoost
# objective. 'multi:softprob' is used because class probabilities are requested
# from this model later (predict_proba for the blended submission).
model_xgb = XGBClassifier(random_state=1, n_jobs=4, objective='multi:softprob', num_class=11)
model_rf = RandomForestClassifier( random_state=1, n_jobs=4)
model_cat = CatBoostClassifier(random_state=1, objective='MultiClass', verbose=0)
# Feature selection: keep features whose random-forest importance is at least 0.0001.
feature = SelectFromModel( model_rf, threshold=0.0001 )

In [None]:
# (display name, estimator) pairs consumed by pipe_models / pipe_models_cv.
models = [
    ('LGBM', model_lgbm),
    ('XGB', model_xgb),
    ('CAT', model_cat),
]

In [None]:
def pipe_models(models, ct, feature, xtrain, ytrain, xtest, ytest):
    """Fit every model behind the shared preprocessing steps and print its scores.

    For each ``(name, model)`` pair a pipeline ``ct -> feature -> model`` is
    fitted on ``xtrain``/``ytrain``; accuracy and weighted F1 are printed for
    the training data and, when ``xtest`` is not None, for ``xtest``/``ytest``.
    Nothing is returned.
    """
    evaluate_holdout = xtest is not None
    for name, model in models:
        steps = [('transform', ct), ('fselect', feature), ('estimator', model)]
        pipe = Pipeline(steps)
        pipe.fit(xtrain, ytrain)

        fitted = pipe.predict(xtrain)
        train_scores = (accuracy_score(ytrain, fitted),
                        f1_score(ytrain, fitted, average='weighted'))
        print("Model "+name+"\n")
        print("Train accuracy : %.4f  Train f1 score : %.4f" % train_scores)

        if evaluate_holdout:
            holdout = pipe.predict(xtest)
            test_scores = (accuracy_score(ytest, holdout),
                           f1_score(ytest, holdout, average='weighted'))
            print("Test accuracy : %.4f  Test f1 score : %.4f" % test_scores)
        print("-------------------------------------")
        
def pipe_models_cv(models, ct, feature, cv, x, y):
    """Cross-validate every model and draw a box plot of the fold scores.

    Each ``(name, model)`` pair is wrapped in a ``ct -> feature -> model``
    pipeline and scored with weighted F1 over the folds defined by ``cv``.
    One box per model is plotted; nothing is returned.
    """
    labels, fold_scores = [], []
    for label_text, estimator in models:
        steps = [('transform', ct), ('fselect', feature), ('estimator', estimator)]
        fold_scores.append(
            cross_val_score(Pipeline(steps), x, y, cv=cv,
                            scoring='f1_weighted', n_jobs=-1)
        )
        labels.append(label_text)

    plt.boxplot(fold_scores)
    # Boxes are positioned at 1..n, so the tick labels use the same positions.
    plt.xticks(np.arange(1, len(labels)+1), labels)
    plt.title("Model score comparison")
    plt.xlabel("Model")
    plt.ylabel("F1 score")
    plt.show()

In [None]:
#pipe_models(models, ct, feature, xtrain, ytrain, xtest, ytest)

In [None]:
# Fresh, unfitted estimators for the full-data fits that follow.
model_lgbm = LGBMClassifier(random_state=1, n_jobs=4, objective='multiclass')
# The objective was misspelled as 'muti:softmax', which is not an XGBoost
# objective. 'multi:softprob' is used because predict_proba is called on this
# model when the three models are blended.
model_xgb = XGBClassifier(random_state=1, n_jobs=4, objective='multi:softprob', num_class=11)
model_cat = CatBoostClassifier(random_state=1, objective='MultiClass', verbose=0)

In [None]:
# Fit the preprocessing and the feature selector on all labelled rows ...
train_encoded = ct.fit_transform(newtrain)
x_select_transform = feature.fit_transform(train_encoded, y)
# ... and apply the already-fitted steps to the test rows.
test_encoded = ct.transform(newtest)
test_select_transform = feature.transform(test_encoded)

### Training the individual models and combining their predictions

In [None]:
# Fit LightGBM on every labelled row; the metrics below are in-sample only.
model_lgbm.fit(x_select_transform, y)
trainpred = model_lgbm.predict(x_select_transform)
train_accuracy, train_f1 = (accuracy_score(y, trainpred),
                            f1_score(y, trainpred, average='weighted'))
print("Train accuracy : %.4f  Train f1 score : %.4f"%(train_accuracy, train_f1))

# Predict the test rows and write the submission file.
pred_lgbm = model_lgbm.predict(test_select_transform)
submission(pred_lgbm, 'lgbm')

In [None]:
# Fit XGBoost on every labelled row; the metrics below are in-sample only.
model_xgb.fit(x_select_transform, y)
trainpred = model_xgb.predict(x_select_transform)
train_accuracy, train_f1 = (accuracy_score(y, trainpred),
                            f1_score(y, trainpred, average='weighted'))
print("Train accuracy : %.4f  Train f1 score : %.4f"%(train_accuracy, train_f1))

# Predict the test rows and write the submission file.
pred_xgb = model_xgb.predict(test_select_transform)
submission(pred_xgb, 'xgb')

In [None]:
# Fit CatBoost on every labelled row; the metrics below are in-sample only.
model_cat.fit(x_select_transform, y)
trainpred = model_cat.predict(x_select_transform)
train_accuracy, train_f1 = (accuracy_score(y, trainpred),
                            f1_score(y, trainpred, average='weighted'))
print("Train accuracy : %.4f  Train f1 score : %.4f"%(train_accuracy, train_f1))

# Predict the test rows and write the submission file.
pred_cat = model_cat.predict(test_select_transform)
submission(pred_cat, 'cat')

In [None]:
# Per-class probabilities on the test rows from each fitted model.
lgbm_proba, xgb_proba, cat_proba = (
    fitted.predict_proba(test_select_transform)
    for fitted in (model_lgbm, model_xgb, model_cat)
)

In [None]:
# Soft vote: average the three probability matrices element-wise, then take
# the most probable column per row as the predicted class.
# NOTE(review): assumes all three models order their probability columns
# identically (class 0..10) — confirm via each model's `classes_`.
probas = np.stack([lgbm_proba, xgb_proba, cat_proba]).mean(axis=0)
submission(probas.argmax(axis=1), 'all')

## LGBM Tuning

In [None]:
# Search space for LightGBM, expressed with the scikit-learn wrapper's own
# argument names:
#   * 'lambdal1' / 'lambdal2' are not LightGBM parameter names (the native
#     spelling is lambda_l1 / lambda_l2), so the search was sampling values
#     that had no effect; the wrapper names are reg_alpha / reg_lambda.
#   * 'boosting' and 'min_data_in_leaf' are aliases of the wrapper arguments
#     boosting_type and min_child_samples; using the wrapper names avoids
#     passing the same setting twice under two names.
params = {'n_estimators':np.arange(50, 500),
          'reg_alpha':np.linspace(0.001, 10),
          'reg_lambda':np.linspace(0.001, 10),
          'min_child_samples':np.arange(10,30),
          'boosting_type':['gbdt','dart'],
          'max_depth':np.arange(2,30),
          'colsample_bytree':np.linspace(0.7,1)}
cv = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
model_lgbm = LGBMClassifier(random_state=1, n_jobs=4, objective='multiclass')
# random_state fixes which 100 candidates are sampled, so a re-run explores
# the same configurations.
rsearch_lgbm = RandomizedSearchCV(model_lgbm,  param_distributions=params,
                                  n_iter=100, scoring='f1_weighted', n_jobs=4, cv=cv, verbose=1,
                                  random_state=1)
rsearch_lgbm.fit(x_select_transform, y)

In [None]:
# Best sampled LightGBM hyper-parameters and their mean cross-validated weighted-F1.
rsearch_lgbm.best_params_, rsearch_lgbm.best_score_

In [None]:
# Refit LightGBM on every labelled row using the best hyper-parameters found.
model_lgbm = LGBMClassifier(random_state=1, n_jobs=4, objective='multiclass')
model_lgbm.set_params(**rsearch_lgbm.best_params_)

model_lgbm.fit(x_select_transform, y)
trainpred_lgbm = model_lgbm.predict(x_select_transform)
# Score the tuned model's own predictions; the previous code scored the stale
# `trainpred` left over from an earlier cell and referenced an undefined `name`.
train_accuracy = accuracy_score(y, trainpred_lgbm)
train_f1 = f1_score(y, trainpred_lgbm, average='weighted')
print("Model LGBM\n")
print("Train accuracy : %.4f  Train f1 score : %.4f"%(train_accuracy, train_f1))

## CatBoost Tuning

In [None]:
# Search space for CatBoost, restricted to names its Python API documents:
# n_estimators (iterations), reg_lambda (l2_leaf_reg), max_depth (depth) and
# colsample_bylevel (rsm).
# NOTE(review): the previous dict also listed 'reg_alpha' and
# 'colsample_bytree', which do not appear among CatBoost's documented training
# parameters, and an empty 'min_data_in_leaf' entry that was a syntax error.
# CatBoost documents min_data_in_leaf as usable only with the Lossguide /
# Depthwise grow policies, so it is left out — confirm against the installed version.
params = {'n_estimators':np.arange(50, 500),
          'reg_lambda':np.linspace(0.001, 10),
          'max_depth':np.arange(2,15),
          'colsample_bylevel':np.linspace(0.7,1)}

model_cat = CatBoostClassifier(random_state=1, loss_function='MultiClass', verbose=0)
# The search space must be passed as `param_distributions`; the data goes to
# `fit`. Previously the data was passed positionally to the constructor and the
# search was never run, so `best_params_` did not exist.
# cv=None keeps scikit-learn's default cross-validation strategy.
rsearch_cat = RandomizedSearchCV(model_cat, param_distributions=params, n_iter=100,
                                 scoring='f1_weighted', n_jobs=4, cv=None,
                                 random_state=1)
rsearch_cat.fit(x_select_transform, y)

In [None]:
# Best sampled CatBoost hyper-parameters and their mean cross-validated weighted-F1
# (both attributes exist only after `rsearch_cat` has been fitted).
rsearch_cat.best_params_, rsearch_cat.best_score_

In [None]:
# Refit CatBoost on every labelled row using the best hyper-parameters found.
model_cat = CatBoostClassifier(random_state=1, loss_function='MultiClass', verbose=0)
model_cat.set_params(**rsearch_cat.best_params_)

model_cat.fit(x_select_transform, y)
trainpred_cat = model_cat.predict(x_select_transform)
# Score the tuned model's own predictions; the previous code scored the stale
# `trainpred` left over from an earlier cell and referenced an undefined `name`.
train_accuracy = accuracy_score(y, trainpred_cat)
train_f1 = f1_score(y, trainpred_cat, average='weighted')
print("Model CAT\n")
print("Train accuracy : %.4f  Train f1 score : %.4f"%(train_accuracy, train_f1))