In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, OrthogonalMatchingPursuit
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor, KernelDensity, KDTree
from sklearn.metrics import *

import sys, os
import random 

# Silence every warning, but only when the interpreter was started without
# explicit -W options (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
from IPython import display, utils

# Raise pandas' display limits so long / wide frames are shown untruncated.
# All three options are addressed by their fully qualified "display." key.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_colwidth', 400)


def set_seed(seed=4242):
    """Seed Python's and NumPy's global random generators with ``seed``.

    The value is also exported as the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): that variable is normally read at interpreter start-up, so
    it presumably only influences processes spawned afterwards -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


# Apply the default seed once for the whole notebook.
set_seed()

In [None]:
# Location of the training split (Kaggle-style relative input path).
TRAIN_CSV = '../input/covid19-hospital-treatment/host_train.csv'
train = pd.read_csv(TRAIN_CSV)
train.head()

In [None]:
# Per-column dtypes and non-null counts of the raw frame.
train.info()

In [None]:
# Summary statistics of the numeric columns.
train.describe()

In [None]:
# Fraction of missing values per column, restricted to the columns that
# actually have gaps and ordered from most to least affected.
missing = train.isnull().sum() / len(train)
missing = missing[missing > 0].sort_values(ascending=False)

plt.style.use('ggplot')
# 'sans-serif' is a family matplotlib can always resolve ('normal' is a
# weight, not a family, and only triggers font-lookup warnings).
font = {'family': 'sans-serif',
        'weight': 'normal',
        'size': 14}
plt.rc('font', **font)

if missing.empty:
    # Plotting an empty Series raises, so report the finding instead.
    print('No missing values in train')
else:
    # Draw on one explicitly created figure so no stray empty canvas is shown.
    fig, ax = plt.subplots(figsize=(8, 5))
    missing.plot.bar(color='teal', ax=ax)
    ax.set(title='Share of missing values per column',
           xlabel='column', ylabel='fraction missing');

In [None]:
# Summary of the object-dtype columns only.
train.describe(include=['O'])

In [None]:
# Names of all columns stored with object dtype, in frame order.
col_dtypes = train.dtypes
cats = col_dtypes[col_dtypes == 'object'].index.tolist()
cats

In [None]:
# Hand-picked list of the columns treated as numeric.
nums = ['Patient_Visitors', 'Admission_Deposit']
nums

In [None]:
# Every column of train that is not listed in nums, in frame order.
numeric_names = set(nums)
true_cats = [col for col in train.columns if col not in numeric_names]
true_cats

Numerics

In [None]:
# Class distribution of the target column.
target = train['Stay_Days']

fig, ax = plt.subplots(figsize=(20, 8))
# The series is passed as `x=` explicitly: recent seaborn releases no longer
# interpret a positional vector as the x variable.
sns.countplot(x=target, palette='bone_r', ax=ax)
ax.set_title('Number of cases per Stay_Days class');

In [None]:
# Exclude the target from the categorical feature list. Rebuilding the list
# (instead of list.remove) keeps this cell safe to run more than once.
cats = [c for c in cats if c != 'Stay_Days']

In [None]:


def analyse_cats(df, cat_cols):
    """Summarise the given columns of ``df``.

    Returns a DataFrame with one row per entry of ``cat_cols`` holding the
    column name, the array of its unique values, the number of unique values
    and the number of nulls in that column.
    """
    names = list(cat_cols)
    # Compute the unique values once per column and reuse them for the count.
    unique_values = [df[name].unique() for name in names]

    summary = pd.DataFrame()
    summary['"feat"'] = names
    summary["uniques"] = unique_values
    summary["cardinality"] = [values.size for values in unique_values]
    summary["nans"] = [df[name].isnull().sum() for name in names]
    return summary

# Summary table (uniques, cardinality, null count) for the columns in cats.
catanadf = analyse_cats(train, cats)
catanadf

In [None]:

# Integer-encode the categorical features in place. A column is encoded only
# while it still has object dtype: encoding it a second time would cast the
# integer codes to strings and sort them lexicographically ('10' < '2'),
# silently permuting the codes if this cell is run again.
for col in cats:
    if train[col].dtype != object:
        continue
    feature_le = LabelEncoder()
    # astype(str) turns missing values into their own 'nan' category.
    train[col] = feature_le.fit_transform(train[col].astype(str))

# Encode the target the same way. `le` and `le_name_mapping` are kept so the
# integer classes can be translated back to the original labels.
if train['Stay_Days'].dtype == object:
    le = LabelEncoder()
    train['Stay_Days'] = le.fit_transform(train['Stay_Days'].astype(str))
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)



In [None]:
# The seaborn style sheets carry a 'seaborn-v0_8-' prefix since matplotlib 3.6
# and the unprefixed names are gone from newer releases; use whichever name
# this installation provides and keep the current style if neither exists.
poster_style = next(
    (name for name in ('seaborn-v0_8-poster', 'seaborn-poster')
     if name in plt.style.available),
    None,
)
if poster_style is not None:
    plt.style.use(poster_style)

# 'sans-serif' is a resolvable font family ('normal' is a weight, not a family).
font = {'family': 'sans-serif',
        'weight': 'normal',
        'size': 9}
plt.rc('font', **font)

# Pairwise correlations of all (by now numeric) columns, target included.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(train.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Pairwise correlation of the encoded train columns');

- The more stay days, the more visitors and the better the Ward_Type.
- As expected, Illness_Severity and Available_Extra_Rooms have a negative correlation with the target.
- Different hospitals and departments operate almost equally in treatment.


In [None]:
# Split the target off the feature frame. The guard keeps the existing
# `target` when the cell is run again after the column has been removed.
if 'Stay_Days' in train.columns:
    target = train.pop('Stay_Days')

In [None]:
from sklearn.impute import SimpleImputer

cols = train.columns
# Despite the variable name, gaps are filled with each column's most frequent
# value (strategy='most_frequent'), not the mean.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# Rebuild the frame with the original column names AND row index so that
# `train` stays aligned with the separately held `target`.
train = pd.DataFrame(imp_mean.fit_transform(train), columns=cols, index=train.index)
train.isnull().sum()

In [None]:
# 5-fold stratified cross-validation of a shallow random forest.
scores = []
# Out-of-fold class predictions; kept as a float array so the saved
# 'oof_rf' file has the same dtype as before.
oof = np.zeros(len(train))
y_le = target.values
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_, (train_ind, val_ind) in enumerate(folds.split(train, y_le)):
    print('fold:', fold_)
    # X_test / y_test hold the validation part of the current fold.
    X_tr, X_test = train.iloc[train_ind], train.iloc[val_ind]
    y_tr, y_test = y_le[train_ind], y_le[val_ind]

    # A fixed random_state makes the forest independent of how much of the
    # global NumPy random stream earlier cells have consumed.
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=5,
                                 max_features=0.8, random_state=42)
    clf.fit(X_tr, y_tr)

    # predict() maps the most probable column back through clf.classes_, so
    # the labels stay correct even if a class is absent from a training fold
    # (a bare argmax over predict_proba would only yield column positions).
    oof[val_ind] = clf.predict(X_test)
    train_acc = accuracy_score(y_tr, clf.predict(X_tr))
    val_acc = accuracy_score(y_test, oof[val_ind])
    print('train:', train_acc, 'val :', val_acc)
    print(20 * '-')

    scores.append(val_acc)

print('Random Forest accuracy=  ', np.mean(scores))
np.save('oof_rf', oof)


*We could also use class weights in the Random Forest to see the difference (imbalanced data).*

### LGB rf booster

In [None]:
import lightgbm as lgb

# LightGBM in random-forest mode ('boosting': 'rf') on the multiclass target.
params = {
    'objective': 'multiclass',
    'boosting': 'rf',
    'metric': 'multi_logloss',
    'max_depth': -1,
    'num_leaves': 12,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l2': 2.0,
    'lambda_l1': 2.0,
    # No class re-weighting is configured: 'unbalanced' is not a LightGBM
    # parameter, and the real switch ('is_unbalance') only applies to the
    # binary / multiclassova objectives.
    'num_class': len(np.unique(target)),
}

scores = []
# Out-of-fold predicted class index for every training row.
oof = np.zeros(len(train))

# One row per feature; a 'fold_<k>' column is added for every CV fold.
feature_importances_gain = pd.DataFrame()
feature_importances_gain['feature'] = train.columns

feature_importances_split = pd.DataFrame()
feature_importances_split['feature'] = train.columns

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4242)

for fold_, (train_ind, val_ind) in enumerate(folds.split(train, target)):
    print("fold :::::::: " , fold_)
    trn_data = lgb.Dataset(train.iloc[train_ind], target.iloc[train_ind])
    val_data = lgb.Dataset(train.iloc[val_ind], target.iloc[val_ind])

    # Early stopping and periodic logging are passed as callbacks; the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments are no longer
    # accepted by lgb.train in LightGBM 4 (callbacks need LightGBM >= 3.3).
    model = lgb.train(
        params,
        trn_data,
        num_boost_round=1000,
        valid_sets=[trn_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )

    # predict() returns one probability column per class; argmax picks the class.
    val_proba = model.predict(train.iloc[val_ind], num_iteration=model.best_iteration)
    oof[val_ind] = np.argmax(val_proba, axis=1)

    fold_f1 = f1_score(target.iloc[val_ind], oof[val_ind], average='micro')
    print('f1 :', fold_f1)
    scores.append(fold_f1)

    feature_importances_gain['fold_{}'.format(fold_ + 1)] = model.feature_importance(importance_type='gain')
    feature_importances_split['fold_{}'.format(fold_ + 1)] = model.feature_importance(importance_type='split')

print('mean f1: ', np.mean(scores))


In [None]:
# Average the per-fold gain importances, persist the table and plot the
# (up to) 100 features with the highest average.
fold_columns = ['fold_{}'.format(k) for k in range(1, folds.n_splits + 1)]
feature_importances_gain['average'] = feature_importances_gain[fold_columns].mean(axis=1)
feature_importances_gain.to_csv('feature_importances.csv')

top_features = feature_importances_gain.sort_values(by='average', ascending=False).head(100)

plt.figure(figsize=(15, 10))
sns.barplot(data=top_features, palette='bone', x='average', y='feature');
plt.title('TOP feature importance over {} folds average'.format(folds.n_splits));