In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from collections import Counter
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
df_train = pd.read_csv('data/train.csv', header=0)
df_test = pd.read_csv('data/test.csv', header=0)
submission = pd.read_csv('data/sample_submission.csv', header=0)

In [3]:
df_train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [4]:
df_train.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [5]:
df_test.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                              35
patientid                               0
City_Code_Patient                    2157
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
dtype: int64

In [6]:
cat_cols = df_train.select_dtypes('object').columns.tolist()

In [7]:
cat_cols.remove('Stay')

In [8]:
cat_cols

['Hospital_type_code',
 'Hospital_region_code',
 'Department',
 'Ward_Type',
 'Ward_Facility_Code',
 'Type of Admission',
 'Severity of Illness',
 'Age']

In [9]:
label_encoders = {}
for col_ in cat_cols:
    lbl_encoder = preprocessing.LabelEncoder()
    lbl_encoder.fit(df_train[col_])
    df_train[col_] = lbl_encoder.transform(df_train[col_])
    if col_ not in ['Stay']:
        df_test[col_] = lbl_encoder.transform(df_test[col_])
    label_encoders[col_] = lbl_encoder
    

In [10]:
label_encoders

{'Hospital_type_code': LabelEncoder(),
 'Hospital_region_code': LabelEncoder(),
 'Department': LabelEncoder(),
 'Ward_Type': LabelEncoder(),
 'Ward_Facility_Code': LabelEncoder(),
 'Type of Admission': LabelEncoder(),
 'Severity of Illness': LabelEncoder(),
 'Age': LabelEncoder()}

In [11]:
missing_impute = -9999
df_train.fillna(missing_impute, inplace=True)
df_test.fillna(missing_impute, inplace=True)

In [12]:
df_train.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

In [13]:
pid_hc_counts_train = df_train.groupby(['patientid', 'Hospital_code']).size().reset_index(name='pid_hc_counts')
df_train = df_train.merge(pid_hc_counts_train, on=['patientid', 'Hospital_code'])

pid_hc_counts_test = df_test.groupby(['patientid', 'Hospital_code']).size().reset_index(name='pid_hc_counts')
df_test = df_test.merge(pid_hc_counts_test, on=['patientid', 'Hospital_code'])

In [14]:
pid_htc_counts_train = df_train.groupby(['patientid', 'Hospital_type_code']).size().reset_index(name='pid_htc_counts')
df_train = df_train.merge(pid_htc_counts_train, on=['patientid', 'Hospital_type_code'])

pid_htc_counts_test = df_test.groupby(['patientid', 'Hospital_type_code']).size().reset_index(name='pid_htc_counts')
df_test = df_test.merge(pid_htc_counts_test, on=['patientid', 'Hospital_type_code'])

In [15]:
df_train_folds = df_train[['patientid']].drop_duplicates()
df_train_folds['fold'] = df_train_folds['patientid'].apply(lambda _: 1 if np.random.random() > 0.3 else 0)

In [16]:
df_train = df_train.merge(df_train_folds, on=['patientid'])
df_train, df_eval = df_train[df_train.fold == 0], df_train[df_train.fold == 1]

In [17]:
df_train.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay', 'pid_hc_counts', 'pid_htc_counts', 'fold'],
      dtype='object')

In [19]:
feature_cols = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit',  'pid_hc_counts', 'pid_htc_counts']
label_col = 'Stay'

In [24]:
params = {
    'learning_rate': 0.030220439365535014, 
    'max_depth': 12.628092517544065, 
    'min_data_in_leaf': 7.494872015543642, 
    'n_estimators': 168.85320409201972, 
    'num_leaves': 99.59032585474291, 
    'reg_alpha': 3.8776144684024416,
    'reg_lambda': 2.0208715786620166
}
print(params)

{'learning_rate': 0.030220439365535014, 'max_depth': 12.628092517544065, 'min_data_in_leaf': 7.494872015543642, 'n_estimators': 168.85320409201972, 'num_leaves': 99.59032585474291, 'reg_alpha': 3.8776144684024416, 'reg_lambda': 2.0208715786620166}


In [25]:
params['n_estimators'] = 1000 #int(params['n_estimators'])
params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
params['max_depth'] = int(params['max_depth'])
params['num_leaves'] = int(params['num_leaves'])
params['objective'] = 'multiclass'
params['boosting_type'] = 'gbdt'
params['subsample'] = 0.7
params['colsample_bytree'] = 0.7

In [26]:
clf = lgb.LGBMClassifier(**params)

clf.fit(df_train[feature_cols], df_train[label_col], early_stopping_rounds=100, eval_set=[(df_eval[feature_cols], df_eval[label_col])], eval_metric='multi_error', verbose=False, categorical_feature=cat_cols)

# eval_score_auc = roc_auc_score(df_train[label_col], clf.predict(df_train[feature_cols]))
train_score_acc = accuracy_score(df_train[label_col], clf.predict(df_train[feature_cols]))
eval_score_acc = accuracy_score(df_eval[label_col], clf.predict(df_eval[feature_cols]))

print('Train ACC: {}, Eval ACC: {}'.format(train_score_acc, eval_score_acc))

Train ACC: 0.48779044415374073, Eval ACC: 0.424788402665226


In [27]:
best_iter = clf.best_iteration_
params['n_estimators'] = best_iter

In [28]:
print(params)

{'learning_rate': 0.030220439365535014, 'max_depth': 12, 'min_data_in_leaf': 7, 'n_estimators': 182, 'num_leaves': 99, 'reg_alpha': 3.8776144684024416, 'reg_lambda': 2.0208715786620166, 'objective': 'multiclass', 'boosting_type': 'gbdt', 'subsample': 0.7, 'colsample_bytree': 0.7}


In [29]:
df_train = pd.concat((df_train, df_eval))

In [30]:
clf = lgb.LGBMClassifier(**params)

clf.fit(df_train[feature_cols], df_train[label_col], eval_metric='multi_error', verbose=True, categorical_feature=cat_cols)

# eval_score_auc = roc_auc_score(df_train[label_col], clf.predict(df_train[feature_cols]))
eval_score_acc = accuracy_score(df_train[label_col], clf.predict(df_train[feature_cols]))

print('ACC: {}'.format(eval_score_acc))

ACC: 0.45377436110012


In [31]:
preds = clf.predict(df_test[feature_cols])

In [32]:
submission = pd.DataFrame({'case_id':df_test['case_id'], 'Stay':preds})

In [33]:
submission.to_csv('lgb_baseline_feats.csv', index=None)