In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import pandas_profiling as pd_prof
import missingno as misno
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
from subprocess import check_output
# Show which files are available in the input directory.
input_listing = check_output(['ls', 'input'])
print(input_listing.decode('utf8'))

In [None]:
# Load both competition files; low_memory=False makes pandas read each file
# in one pass instead of inferring dtypes chunk by chunk.
csv_options = {'low_memory': False}
train = pd.read_csv('input/train.csv', **csv_options)
test = pd.read_csv('input/test.csv', **csv_options)

In [None]:
# Report (rows, columns) for the train and test frames.
train_shape, test_shape = train.shape, test.shape
print(train_shape, test_shape)


In [None]:
# Encode object (categorical) columns as integers and downcast numeric columns
# to 32-bit types, in place, for both train and test.
# 'UCIC_ID' and 'Responders' are deliberately left untouched.
le = LabelEncoder()
data_sets = [train, test]
id_and_target = ['UCIC_ID', 'Responders']
int8_max = np.iinfo(np.int8).max  # 127: largest code that fits in int8

# Union of the feature columns of all frames, in first-seen order.
feature_cols = list(dict.fromkeys(
    col for data in data_sets for col in data.columns if col not in id_and_target
))

for col in feature_cols:
    frames = [data for data in data_sets if col in data.columns]

    if any(data[col].dtypes == 'object' for data in frames):
        # Missing values become their own 'none' category. Values are cast to
        # str so that a column with mixed Python types can still be sorted by
        # the encoder.
        filled = [data[col].astype(object).fillna('none').astype(str) for data in frames]

        # Fit ONCE on the values of every frame so a given category maps to the
        # same integer code in train and in test (fitting per frame would give
        # each frame its own, incompatible mapping).
        le.fit(pd.concat(filled, ignore_index=True))

        # Codes run from 0 to len(classes_) - 1; use int8 only when the largest
        # code really fits, otherwise values above 127 would wrap around.
        largest_code = len(le.classes_) - 1
        code_dtype = np.int8 if largest_code <= int8_max else np.int32
        for data, values in zip(frames, filled):
            data[col] = le.transform(values).astype(code_dtype)
        continue

    for data in frames:
        if data[col].dtypes == 'int64':
            data[col] = data[col].fillna(-99).astype(np.int32)
        elif data[col].dtypes == 'float64':
            # -99.0 is used as the missing-value sentinel for float features.
            data[col] = data[col].fillna(-99.0).astype(np.float32)

In [None]:
# Feature columns of `train` that hold exactly one distinct (non-null) value.
excluded = {'UCIC_ID', 'Responders'}
constant_cols = [
    name
    for name in train.columns
    if name not in excluded and train[name].nunique() == 1
]

In [None]:
# Remove the constant columns from every frame in place, so the `train` and
# `test` names keep pointing at the trimmed frames.
for frame in data_sets:
    frame.drop(columns=constant_cols, inplace=True)

In [None]:
#test.head(5)

In [None]:
# Pull the target out of `train` and the id out of `test`, leaving only
# feature columns in both frames (all removals happen in place).
responders = train.pop('Responders').values
train.drop(columns=['UCIC_ID'], inplace=True)
ucic_id = test.pop('UCIC_ID').values

In [None]:
# Remove the helper list binding; `train` and `test` themselves stay bound.
del data_sets

In [None]:
# LightGBM parameters shared by every cross-validation and training run below.
params = {
    'learning_rate': 0.02,
    'num_leaves': 78,
    'min_data_in_leaf': 130,
    'max_depth': 6,
    'colsample_bytree': 0.522,
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'seed': 32,
}

In [None]:
# Training set built from every remaining feature column.
lgb_train = lgb.Dataset(data=train, label=responders)

In [None]:
# 4-fold shuffled, stratified cross-validation of the full-feature model:
# at most 3000 boosting rounds, with early stopping set to 40 rounds.
lgb_cv = lgb.cv(
    params,
    train_set=lgb_train,
    num_boost_round=3000,
    nfold=4,
    stratified=True,
    shuffle=True,
    early_stopping_rounds=40,
    verbose_eval=20,
)

In [None]:
# Number of boosting rounds that gave the best mean CV AUC.
# Entry i of 'auc-mean' is the score after i + 1 rounds, so the 0-based
# position of the maximum must be shifted by one; using the raw position
# would train one round too few (and zero rounds if the first entry is best).
nround = int(np.argmax(lgb_cv['auc-mean'])) + 1
#print(nround)

In [None]:
# Fit the full-feature model for the round count chosen by cross-validation.
model_0 = lgb.train(params=params, train_set=lgb_train, num_boost_round=nround)

In [None]:
# Test-set predictions of the full-feature model.
pred_0 = model_0.predict(data=test)

In [None]:
# Tabulate model_0's per-feature importance and keep the features whose
# importance score is above 20.
future_df = pd.DataFrame({
    'futures': model_0.feature_name(),
    'score': model_0.feature_importance(),
})
is_important = future_df['score'] > 20
col_to_use = future_df.loc[is_important, 'futures'].tolist()


In [None]:
# Training set restricted to the currently selected feature columns.
lgb_train = lgb.Dataset(data=train[col_to_use], label=responders)

In [None]:
# 4-fold shuffled, stratified cross-validation on the reduced feature set:
# at most 3000 boosting rounds, with early stopping set to 40 rounds.
lgb_cv = lgb.cv(
    params,
    train_set=lgb_train,
    num_boost_round=3000,
    nfold=4,
    stratified=True,
    shuffle=True,
    early_stopping_rounds=40,
    verbose_eval=20,
)

In [None]:
# Number of boosting rounds that gave the best mean CV AUC.
# Entry i of 'auc-mean' is the score after i + 1 rounds, so the 0-based
# position of the maximum must be shifted by one; using the raw position
# would train one round too few (and zero rounds if the first entry is best).
nround = int(np.argmax(lgb_cv['auc-mean'])) + 1
#print(nround)

In [None]:
# Fit the first reduced-feature model for the CV-selected round count.
model_1 = lgb.train(params=params, train_set=lgb_train, num_boost_round=nround)

In [None]:
# Test-set predictions of model_1, using the same column subset it was fit on.
pred_1 = model_1.predict(data=test[col_to_use])

In [None]:
# Re-tabulate model_0's per-feature importance (the full-feature model) and
# keep only the features whose importance score is above 50.
future_df = pd.DataFrame({
    'futures': model_0.feature_name(),
    'score': model_0.feature_importance(),
})
is_important = future_df['score'] > 50
col_to_use = future_df.loc[is_important, 'futures'].tolist()


In [None]:
# Training set restricted to the currently selected feature columns.
lgb_train = lgb.Dataset(data=train[col_to_use], label=responders)

In [None]:
# 4-fold shuffled, stratified cross-validation on the reduced feature set:
# at most 3000 boosting rounds, with early stopping set to 40 rounds.
lgb_cv = lgb.cv(
    params,
    train_set=lgb_train,
    num_boost_round=3000,
    nfold=4,
    stratified=True,
    shuffle=True,
    early_stopping_rounds=40,
    verbose_eval=20,
)

In [None]:
# Number of boosting rounds that gave the best mean CV AUC.
# Entry i of 'auc-mean' is the score after i + 1 rounds, so the 0-based
# position of the maximum must be shifted by one; using the raw position
# would train one round too few (and zero rounds if the first entry is best).
nround = int(np.argmax(lgb_cv['auc-mean'])) + 1

In [None]:
# Fit the second reduced-feature model for the CV-selected round count.
model_2 = lgb.train(params=params, train_set=lgb_train, num_boost_round=nround)

In [None]:
# Test-set predictions of model_2. It must be model_2 (not model_1) here:
# model_2 is the model that was fit on the current `col_to_use` subset,
# whereas model_1 was fit on a different set of columns.
pred_2 = model_2.predict(test[col_to_use])

In [None]:
# Submission frame: the id column followed by the average of the two
# reduced-feature model predictions.
blended = (pred_1 + pred_2) / 2.
sub = pd.DataFrame(
    {'UCIC_ID': ucic_id, 'Responders': blended},
    columns=['UCIC_ID', 'Responders'],
)
sub.head(5)

In [None]:
# Write the submission without the DataFrame index column.
sub.to_csv(path_or_buf='lgb_v10.csv', index=False)