In [12]:
import numpy as np
import pandas as pd
import gc; gc.enable()
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
#from sklearn.cross_validation import KFold
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import lightgbm as lgb
from rgf.sklearn import RGFClassifier 
from catboost import CatBoostClassifier
from subprocess import check_output
print(check_output(['ls', 'input']).decode('utf8'))

data_dict.xlsx
sample_submission_fBo3EW5.csv
test_9tJUnaB.zip
test.csv
train.csv
train_zsTMYVA.zip



In [13]:
train = pd.read_csv('input/train.csv', low_memory=False)
test = pd.read_csv('input/test.csv', low_memory=False)
print(train.shape, test.shape)

(300000, 377) (200000, 376)


In [14]:
le = LabelEncoder()
data_sets = [train,test]
for data in data_sets:
    for col in [x for x in data.columns if x not in ['UCIC_ID','Responders']]:
        if data[col].dtypes == 'object':
            data[col] = data[col].fillna('none')
            data[col] = le.fit_transform(data[col])
            if data[col].max() < 255:
                data[col] = data[col].astype(np.int8)
            else:
                data[col] = data[col].astype(np.int32)
                
        if data[col].dtypes == 'int64':        
            data[col] = data[col].fillna(-99).astype(np.int32)
            
        if data[col].dtypes == 'float64':        
            data[col] = data[col].fillna(-99.0).astype(np.float32)

In [15]:
constant_cols = []
for col in [x for x in train.columns if x not in ['UCIC_ID','Responders']]:
    if len(train[col].value_counts()) == 1:
        constant_cols.append(col)

In [6]:
for data in data_sets:
    data.drop(constant_cols, inplace=True, axis=1)

In [16]:
y_train= train['Responders'].values
train.drop(['UCIC_ID','Responders'], inplace=True, axis=1)
ucic_id = test['UCIC_ID'].values
test.drop(['UCIC_ID'], inplace=True, axis=1)

In [17]:
X_train = train.values
X_test = test.values
ntrain = train.shape[0]
ntest= test.shape[0]
NFOLDS = 3
SEED = 4
#kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)
skf = list(StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2016).split(X_train, y_train))

In [19]:
del data_sets, train, test

In [20]:
class BaseModels(object):
    
    def __init__(self, clf, seed=0, params=None):
        #params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self,X_train, y_train):
        self.clf.fit(X_train, y_train)
    
    def predict(self, X):
        return self.clf.predict_proba(X)[:,1]       
        
    
class XgbModel(object):
    
    def __init__(self, seed=0, params=None):
        self.params = params
        self.params['seed'] = seed
        self.nrounds = params.pop('nrouns', 250)
        
    def train(self, X_train, y_train):
        dtrain = xgb.DMatrix(X_train, y_train)
        self.mdl = xgb.train(self.params, dtrain, self.nrounds) 
        
    def predict(self, X):
        return self.mdl.predict(xgb.DMatrix(X))
    

In [31]:
def get_oof(clf, s):#, X_train, y_train, X_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_kf = np.zeros((NFOLDS,ntest))
    
    for i, (train_idx, valid_idx) in enumerate(skf):
        trn_X = X_train[train_idx]
        trn_y = y_train[train_idx]
        val_X = X_train[valid_idx]
        
        print ("Fit Model %s fold %d" % (s, i+1))
        clf.train(trn_X, trn_y)
        
        oof_train[valid_idx] = clf.predict(val_X)
        oof_test_kf[i,:] = clf.predict(X_test)
        
    oof_test[:] = oof_test_kf.mean(axis=0) 
    
    return oof_train.reshape(-1,1), oof_test.reshape(-1,1)
    

In [32]:
# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 6,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'error',
    'nrounds': 350
}

# LightGBM params
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 1800
lgb_params['max_depth'] = 6
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8   
lgb_params['min_child_samples'] = 500

# CatBoost params
cat_params = {}
cat_params['iterations'] = 900
cat_params['depth'] = 8
cat_params['rsm'] = 0.95
cat_params['learning_rate'] = 0.03
cat_params['l2_leaf_reg'] = 3.5  
cat_params['border_count'] = 8
cat_params['gradient_iterations'] = 4

# Regularized Greedy Forest params
rgf_params = {}
rgf_params['max_leaf'] = 2000
rgf_params['learning_rate'] = 0.5
rgf_params['algorithm'] = "RGF_Sib"
rgf_params['test_interval'] = 100
rgf_params['min_samples_leaf'] = 3 
rgf_params['reg_depth'] = 1.0
rgf_params['l2'] = 0.5  
rgf_params['sl2'] = 0.005
rgf_params['n_jobs'] = -1


In [33]:
rgf = BaseModels(clf=RGFClassifier, seed=SEED, params=rgf_params)
lg = BaseModels(clf=lgb.LGBMClassifier, seed=SEED, params=lgb_params)
cb  = BaseModels(clf=CatBoostClassifier, seed=SEED, params=cat_params)
gb = BaseModels(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
xg = XgbModel(seed=SEED, params=xgb_params)

In [34]:
rgf_oof_train, rgf_oof_test = get_oof(rgf, 'RGF')
lg_oof_train, lg_oof_test = get_oof(lg, 'LGB')
gb_oof_train, gb_oof_test = get_oof(gb, 'GBC')
cb_oof_train, cb_oof_test = get_oof(cb, 'CBC' )
xgb_oof_train, xgb_oof_test = get_oof(xg, 'XGB')
print("complete")

Fit Model RGF fold 1
Fit Model RGF fold 2
Fit Model RGF fold 3
Fit Model LGB fold 1
Fit Model LGB fold 2
Fit Model LGB fold 3
Fit Model GBC fold 1
Fit Model GBC fold 2
Fit Model GBC fold 3
Fit Model CBC fold 1
Fit Model CBC fold 2
Fit Model CBC fold 3
Fit Model XGB fold 1
Fit Model XGB fold 2
Fit Model XGB fold 3
complete


In [37]:
stk_train = np.concatenate((rgf_oof_train, lg_oof_train, gb_oof_train, cb_oof_train, xgb_oof_train), axis=1)
stk_test = np.concatenate((rgf_oof_test, lg_oof_test, gb_oof_test, cb_oof_test, xgb_oof_test), axis=1)

In [38]:
params = {'learning_rate': 0.02, 
          'num_leaves':78,
          'min_data_in_leaf': 130,
          'max_depth': 6, 
          'colsample_bytree': 0.522,
          'boosting': 'gbdt', 
          'objective': 'binary', 
          'metric': 'auc',           
          'seed': 32}

In [39]:
lgb_train = lgb.Dataset(stk_train, y_train)
model_lgb = lgb.train(params, lgb_train, 1800)   

In [40]:
pred = model_lgb.predict(stk_test)

In [41]:
sub = pd.DataFrame({'UCIC_ID':ucic_id, 'Responders':pred})
sub = sub[['UCIC_ID','Responders']]
sub.head(5)

Unnamed: 0,UCIC_ID,Responders
0,337734,0.25606
1,488166,0.017199
2,410785,0.041041
3,389145,0.151335
4,221090,0.155564


In [42]:
sub.to_csv('stacking_v1.csv', index=False)