In [2]:
import numpy as np
import pandas as pd
import zipfile as zp
pd.set_option('display.max_columns', 1000)
from myfeatures import engineer
from myfeatures2 import engineer2
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

### Try LightGBM

In [24]:
import pickle

In [19]:
# Load the prepared train / test frames from their pickle files (train first).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
train, test = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('papa_train.pkl', 'papa_test.pkl')
)

In [20]:
# `train` and `test` are read by every cell below (shape check, feature list,
# train/valid split, prediction, stacking), so they must stay alive here.
# Deleting them at this point makes a top-to-bottom run fail with NameError;
# free them only after the last cell that uses them.

In [10]:
# Show the shape of the train frame, then the test frame, one per line.
print(train.shape, test.shape, sep='\n')

(300000, 521)
(200000, 520)


In [4]:
# Model inputs: every column of `train` except `UCIC_ID` and `Responders`.
feature_names = [
    column
    for column in train.columns
    if column not in ['UCIC_ID', 'Responders']
]
# The `Responders` column is the training label.
target = train['Responders']

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation, stratified on the label.
# A fixed random_state keeps the split (and every score computed on it)
# identical across kernel restarts; 2017 matches the seed in the LightGBM params.
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    target,
    test_size=0.3,
    stratify=target,
    random_state=2017,
)

In [6]:
import lightgbm as lgb

In [7]:
# Wrap the feature columns and labels of each split in a LightGBM Dataset.
ltrain = lgb.Dataset(data=X_train[feature_names], label=y_train)
lvalid = lgb.Dataset(data=X_valid[feature_names], label=y_valid)

In [8]:
# LightGBM settings for the first run.
# NOTE(review): LightGBM reports early stopping as unavailable in dart mode;
# confirm the `boosting` choice if early stopping is expected to take effect.
paramslgb = dict(
    learning_rate=0.1,
    max_depth=6,
    boosting='dart',
    objective='binary',
    metric='auc',
    seed=2017,
    feature_fraction=1,
    bagging_fraction=1,
    num_leaves=30,
    lambda_l1=16,
    lambda_l2=16,
)

In [11]:
# First run: up to 100 boosting rounds, logging the validation metric every
# 20 rounds, with a 40-round early-stopping window.
mod1 = lgb.train(
    paramslgb,
    ltrain,
    num_boost_round=100,
    valid_sets=lvalid,
    verbose_eval=20,
    early_stopping_rounds=40,
)

Train until valid scores didn't improve in 40 rounds.
[20]	valid_0's auc: 0.840819
[40]	valid_0's auc: 0.847633
[60]	valid_0's auc: 0.852485
[80]	valid_0's auc: 0.855439
[100]	valid_0's auc: 0.856508


In [15]:
# Display the best iteration recorded on the booster (the saved output is 0).
# NOTE(review): presumably 0 because early stopping never fired in this run - confirm.
mod1.best_iteration

0

In [90]:
# Score the test rows using the same feature columns the model was trained on.
pred = mod1.predict(test[feature_names])

In [95]:
# Start from the sample submission file and fill in ids and predicted scores.
sub = pd.read_csv('sample_submission_fBo3EW5.csv')
# Assign by position (.values): a plain Series assignment aligns on the index
# and would silently produce NaN ids if `test` does not carry a 0..n-1 index.
# A length mismatch now raises instead of being hidden by alignment.
sub['UCIC_ID'] = test['UCIC_ID'].values
sub['Responders'] = pred

In [96]:
# Write the submission without the index column; the `submissions/` directory must already exist.
# NOTE(review): the trailing number is presumably the score obtained with this file - confirm.
sub.to_csv('submissions/lgb_forest.csv', index=False) ## 0.68245125

### LightGBM - 2 (Tuning)

In [105]:
# LightGBM settings for the tuning run: shallower trees (max_depth 4) and 0.9
# feature / bagging fractions. `lambda_l1` / `lambda_l2` (16 each in the first
# run) are not set here.
# NOTE(review): LightGBM applies `bagging_fraction` only when `bagging_freq`
# is non-zero - confirm whether bagging is meant to be active.
paramslgb = dict(
    learning_rate=0.1,
    max_depth=4,
    boosting='dart',
    objective='binary',
    metric='auc',
    seed=2017,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    num_leaves=30,
)

In [106]:
# Tuning run: same datasets, up to 1000 boosting rounds, validation metric
# logged every 20 rounds, 40-round early-stopping window.
mod1 = lgb.train(
    paramslgb,
    ltrain,
    num_boost_round=1000,
    valid_sets=lvalid,
    verbose_eval=20,
    early_stopping_rounds=40,
)

Train until valid scores didn't improve in 40 rounds.
[20]	valid_0's auc: 0.838126
[40]	valid_0's auc: 0.841155
[60]	valid_0's auc: 0.846211
[80]	valid_0's auc: 0.84912
[100]	valid_0's auc: 0.851312
[120]	valid_0's auc: 0.852827
[140]	valid_0's auc: 0.854367
[160]	valid_0's auc: 0.856813
[180]	valid_0's auc: 0.858489
[200]	valid_0's auc: 0.861245
[220]	valid_0's auc: 0.862738
[240]	valid_0's auc: 0.863574
[260]	valid_0's auc: 0.863941
[280]	valid_0's auc: 0.864526
[300]	valid_0's auc: 0.865103
[320]	valid_0's auc: 0.865259
[340]	valid_0's auc: 0.865534
[360]	valid_0's auc: 0.865968
[380]	valid_0's auc: 0.866088
[400]	valid_0's auc: 0.866567
[420]	valid_0's auc: 0.866799
[440]	valid_0's auc: 0.866854
[460]	valid_0's auc: 0.867041
[480]	valid_0's auc: 0.867158
[500]	valid_0's auc: 0.867477
[520]	valid_0's auc: 0.867699
[540]	valid_0's auc: 0.867862
[560]	valid_0's auc: 0.86795
[580]	valid_0's auc: 0.868069
[600]	valid_0's auc: 0.868115
[620]	valid_0's auc: 0.868123
[640]	valid_0's auc: 0

## Keras 

In [134]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.regularizers import l1, l2, l1_l2
from keras.callbacks import EarlyStopping

In [135]:
def build_model(tr):
    """Build and compile a small dense classifier sized to the columns of ``tr``.

    Parameters
    ----------
    tr : object with a ``shape`` attribute
        Training features laid out as (n_samples, n_features). Only
        ``tr.shape[1]`` is read, to size the input layer.

    Returns
    -------
    Sequential
        Compiled model: Dense(64) -> relu -> Dropout(0.1) -> Dense(30, L1
        kernel penalty 1e-6) -> relu -> Dropout(0.4) -> Dense(2) -> sigmoid,
        using binary cross-entropy loss and the 'adagrad' optimizer.
    """
    # The input layer width is the number of feature columns (axis 1), not the
    # number of rows (axis 0).
    input_dim = tr.shape[1]
    classes = 2

    model = Sequential()
    model.add(Dense(64, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.1))
    # Keras 2 argument names; `init` / `W_regularizer` are the Keras 1
    # spellings and are rejected by later releases.
    model.add(Dense(30, kernel_initializer='glorot_uniform', kernel_regularizer=l1(1e-6)))
    model.add(Activation('relu'))
    model.add(Dropout(0.4))

    # Two sigmoid outputs, matching the two-column categorical labels.
    model.add(Dense(classes))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adagrad')
    return model

# Early-stopping callback: watches `val_acc` with a patience of 3 epochs.
# NOTE(review): the compile() calls in this notebook request no accuracy metric
# and the visible fit() call passes neither validation data nor this callback,
# so `val_acc` may never be produced - confirm before relying on it.
callback = EarlyStopping(monitor='val_acc',patience=3)

In [148]:
# Model architecture
def keras_model(train):
    """Return a compiled 100-30-2 dense network sized to ``train``'s columns.

    ``train`` only needs a ``shape`` attribute; ``train.shape[1]`` sets the
    input width. Both hidden layers use relu, the two-unit output uses
    sigmoid, and the model is compiled with the 'adam' optimizer and binary
    cross-entropy loss.
    """
    n_features = train.shape[1]
    n_outputs = 2

    net = Sequential()
    stack = (
        Dense(100, activation='relu', input_shape=(n_features,)),  # hidden layer 1
        Dense(30, activation='relu'),                              # hidden layer 2
        Dense(n_outputs, activation='sigmoid'),                    # output layer
    )
    for layer in stack:
        net.add(layer)

    net.compile(optimizer='adam', loss='binary_crossentropy')
    return net

# Early-stopping callback: watches `val_acc` with a patience of 3 epochs.
# NOTE(review): keras_model() compiles without an accuracy metric and the
# visible fit() call passes neither validation data nor this callback, so
# `val_acc` may never be produced - confirm before relying on it.
callback = EarlyStopping(monitor='val_acc',patience=3)

In [149]:
# Convert each split's label vector to the categorical matrix form produced by
# to_categorical, which is what the two-unit Keras output layer is fitted against.
ky_train, ky_valid = (
    np_utils.to_categorical(np.array(labels))
    for labels in (y_train, y_valid)
)

In [118]:
# Keras training settings: number of epochs and a batch size.
# NOTE(review): the visible fit() call passes batch_size=32 and does not read
# `nb_batch_size` - confirm which value is intended.
nb_epoch, nb_batch_size = 120, 8

In [152]:
# Size the network from the feature columns only. In notebook order `X_train`
# still holds every column of `train` at this point (UCIC_ID and Responders
# included), which would make the input layer wider than the feature matrices
# later passed to fit(). keras_model() only reads .shape[1], so a one-row
# slice of `train` is enough and works whether or not `X_train` has already
# been converted to an array.
model = keras_model(train.head(1)[feature_names])

In [150]:
# Rebind each split to a plain 2-D array holding only the model features.
# np.asarray replaces np.matrix, which NumPy no longer recommends, and the
# DataFrame guard keeps the cell safe to re-run once the names already point
# at arrays (column-name indexing would fail on an array).
if isinstance(X_train, pd.DataFrame):
    X_train = np.asarray(X_train[feature_names])
if isinstance(X_valid, pd.DataFrame):
    X_valid = np.asarray(X_valid[feature_names])

In [None]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback


class IntervalEvaluation(Callback):
    """Keras callback that prints the validation ROC AUC every ``interval`` epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair: the inputs fed to the model and the targets
        handed to ``roc_auc_score``. Must contain exactly two items; the
        default empty tuple raises ``ValueError`` on unpacking.
    interval : int, default 10
        Evaluate whenever ``epoch % interval == 0``.
    """

    def __init__(self, validation_data=(), interval=10):
        # Name this class in super() so that Callback.__init__ actually runs;
        # super(Callback, self) starts the lookup *after* Callback and skips it.
        super(IntervalEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the held-out data and print the AUC on every ``interval``-th epoch."""
        if epoch % self.interval == 0:
            # predict() returns the sigmoid outputs; predict_proba was only an
            # alias of it and has been removed from later Keras releases.
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # print() instead of logging.info(): `logging` is never imported in
            # this notebook, and INFO records on an unconfigured root logger
            # would not be shown anyway.
            print("interval evaluation - epoch: {:d} - score: {:.6f}".format(epoch, score))


In [None]:
# Evaluate on the hold-out split every 10 epochs. `X_test` / `y_test` are never
# defined in this notebook; the validation data are `X_valid` (feature array)
# and `ky_valid` (categorical labels, matching the model's two-column output).
ival = IntervalEvaluation(validation_data=(X_valid, ky_valid), interval=10)

In [None]:
# Train for `nb_epoch` epochs in mini-batches of 32, running the interval
# evaluation callback at the end of each epoch.
model.fit(
    X_train,
    ky_train,
    batch_size=32,
    epochs=nb_epoch,
    callbacks=[ival],
)

### XGB Stacker

In [None]:
from sklearn.model_selection import StratifiedKFold

n_folds = 5
# random_state only has an effect when shuffling is enabled, and recent
# scikit-learn versions raise a ValueError when it is set without
# shuffle=True. Shuffling with a fixed seed keeps the folds reproducible.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=420)

In [None]:
def xgbStackerModel(train, test, target, features, increase=True):
    """Produce out-of-fold and test-set XGBoost predictions for stacking.

    For each fold yielded by the module-level ``skf`` splitter, a booster is
    trained with early stopping and used to score that fold's held-out rows.
    A final booster is then trained on all of ``train`` for the mean number of
    rounds found across the folds and used to score ``test``.

    Relies on module-level names: ``skf`` (fold splitter), ``n_folds`` (used
    only in the progress message) and ``params`` (XGBoost parameter dict).
    NOTE(review): ``params`` is not defined in the visible notebook cells -
    confirm it exists before calling.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain ``UCIC_ID`` and every column in ``features``.
    test : pandas.DataFrame
        Rows to score; must contain every column in ``features``.
    target : pandas.Series
        Labels for ``train``, in the same row order.
    features : list
        Column names fed to the model.
    increase : bool, default True
        When true, the rows of each training fold whose label equals 1 are
        duplicated once and the fold is shuffled before fitting.

    Returns
    -------
    oof_pred : pandas.DataFrame
        ``UCIC_ID`` and ``Responders`` (out-of-fold prediction) for every row
        of ``train``, carrying ``train``'s index.
    all_predictions : pandas.DataFrame
        One column named ``'pred<k>'`` (``k`` = index of the last fold, e.g.
        ``'pred4'`` for five folds) holding the final model's test predictions.
    """
    # Out-of-fold scores are collected positionally so that the result does
    # not depend on what index `train` happens to carry.
    oof_scores = np.zeros(len(train))
    best_rounds = []
    last_fold = None

    for i, (train_index, valid_index) in enumerate(skf.split(train, target)):
        print('[{}/{} Folds]'.format(i+1, n_folds))
        last_fold = i

        X_train, X_valid = train.iloc[train_index], train.iloc[valid_index]
        y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]

        if increase:
            # Build the mask from this fold's own labels (as a plain array) so
            # it lines up row-for-row with X_train / y_train without relying
            # on index re-alignment.
            pos = (y_train == 1).values

            X_train = pd.concat([X_train, X_train[pos]], axis=0)
            y_train = pd.concat([y_train, y_train[pos]], axis=0)

            # Shuffle so the duplicated rows are not all at the end.
            idx = np.arange(len(X_train))
            np.random.shuffle(idx)

            X_train = X_train.iloc[idx]
            y_train = y_train.iloc[idx]

        dtrain = xgb.DMatrix(X_train[features], y_train, missing=np.nan)
        dvalid = xgb.DMatrix(X_valid[features], y_valid, missing=np.nan)

        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        clf1 = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist, maximize=True, verbose_eval=20, early_stopping_rounds=40)

        # best_iteration is a 0-based index; the number of rounds is one more.
        best_rounds.append(clf1.best_iteration + 1)

        # valid_index holds row positions, so write by position.
        oof_scores[valid_index] = clf1.predict(dvalid)

    oof_pred = pd.DataFrame({'UCIC_ID': train['UCIC_ID'], 'Responders': oof_scores})

    ## for test, predict with a model refit on the whole training data
    dtest = xgb.DMatrix(test[features], missing=np.nan)
    Ndtrain = xgb.DMatrix(data=train[features], label=target, missing=np.nan)

    n_round = int(np.round(np.mean(best_rounds)))
    clf2 = xgb.train(params, Ndtrain, n_round)
    preds2 = clf2.predict(dtest)

    # Column name kept as 'pred<last fold index>' because the downstream cell
    # renames that column to 'Responders'.
    all_predictions = pd.DataFrame({'pred' + str(last_fold): preds2})

    return oof_pred, all_predictions

In [None]:
# Run the XGBoost stacker: returns the out-of-fold predictions for `train`
# and the final model's predictions for `test`.
oof_train, test_pred = xgbStackerModel(train, test, target, feature_names)

In [None]:
# Attach ids by position (.values): `test_pred` is built from a bare prediction
# array, so index-aligned assignment would give NaN ids whenever `test` does
# not carry a 0..n-1 index.
test_pred['UCIC_ID'] = test['UCIC_ID'].values
# The stacker names its prediction column after the last fold index
# ('pred4' for five folds); derive the name from n_folds instead of
# hard-coding it so a different fold count still renames the column.
test_pred = test_pred.rename(columns={'pred' + str(n_folds - 1): 'Responders'})

In [None]:
# Save the out-of-fold train predictions and the test predictions without the
# index column; the `l1/` directory must already exist.
# NOTE(review): presumably these are level-1 inputs for a later stacking stage - confirm.
oof_train.to_csv(path_or_buf='l1/xgb_train.csv', index=False)
test_pred.to_csv(path_or_buf='l1/xgb_test.csv', index=False)