### Now creating oof_preds and training another xgb on top

### steps from here:
### create a dataset without normalising, drop missing values and set missing as -999 (create data v4)

In [1]:
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score



In [2]:
## loading non-scaled data

train = pd.read_pickle('submissions/train_standard.pkl')
test = pd.read_pickle('submissions/test_standard.pkl')

### logistic

In [4]:
feature_names = [x for x in train.columns if x not in ['CUSTOMER_ID','RESPONDERS']]

In [5]:
params = {"objective": "binary:logistic",
          "booster": "gblinear",
          "nthread": 4,
          "alpha": 2**3,
          "lambda": 2**3,
         "eval_metric":"auc"}

In [5]:
train['RESPONDERS'] = train['RESPONDERS'].map(lambda x: 1 if x == 'Y' else 0)

In [6]:
target = train['RESPONDERS']

In [6]:
oof_train = pd.DataFrame({'CUSTOMER_ID':train['CUSTOMER_ID'], 'RESPONDERS':0})
allpredictions = pd.DataFrame()
score = []

In [7]:
nfolds = 7
skf = StratifiedKFold(n_splits=nfolds, random_state=20178)

In [16]:
## with upscaling

increase = True
for i, (train_index, test_index) in enumerate(skf.split(train, target)):
    print('Fold %d/%d'%(i+1, nfolds))
    X_train, X_valid = train.iloc[train_index], train.iloc[test_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[test_index]
    
    if increase:
        pos = pd.Series(target == 1)

        X_train = pd.concat([X_train, X_train[pos]], axis=0)
        y_train = pd.concat([y_train, y_train.loc[pos]], axis=0)
        
        idx = np.arange(len(X_train))
        np.random.shuffle(idx)
        X_train = X_train.iloc[idx]
        y_train = y_train.iloc[idx]
    
    dtrain = xgb.DMatrix(X_train[feature_names], label=y_train)
    dvalid = xgb.DMatrix(X_valid[feature_names], label=y_valid)
    
    watchlist = [(dtrain, 'train'),(dvalid,'valid')]
    
    clf = xgb.train(params,
                    dtrain,
                    num_boost_round=10000,
                    evals=watchlist,
                    early_stopping_rounds=40,
                    verbose_eval=20,
                    maximize = True)
    
    pred1 = clf.predict(dvalid)
    oof_train.loc[test_index, 'RESPONDERS'] = pred1
    
    scr = roc_auc_score(y_valid, pred1)
    dtest = xgb.DMatrix(test[feature_names])
    preds2 = clf.predict(dtest)
    
    allpredictions['p'+str(i)] = preds2
    score.append(scr)
    
    del X_train, X_valid, y_train, y_valid, clf, pred1, preds2, scr

Fold 1/7




[0]	train-auc:0.823646	valid-auc:0.813217
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 40 rounds.
[20]	train-auc:0.853555	valid-auc:0.839755
[40]	train-auc:0.854175	valid-auc:0.841716
[60]	train-auc:0.854181	valid-auc:0.842037
[80]	train-auc:0.85422	valid-auc:0.842039
[100]	train-auc:0.854271	valid-auc:0.841869
Stopping. Best iteration:
[64]	train-auc:0.854183	valid-auc:0.842105

Fold 2/7
[0]	train-auc:0.822069	valid-auc:0.832578
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 40 rounds.
[20]	train-auc:0.851731	valid-auc:0.851937
[40]	train-auc:0.852651	valid-auc:0.850707
Stopping. Best iteration:
[17]	train-auc:0.851208	valid-auc:0.852072

Fold 3/7
[0]	train-auc:0.827616	valid-auc:0.824697
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved 

In [19]:
sub = pd.read_csv('sample_submission_ROqqAAN.csv')
sub['CUSTOMER_ID'] = test['CUSTOMER_ID']
sub['RESPONDERS'] = allpredictions.mean(axis=1).values

In [20]:
sub.to_csv('submissions/oof_preds/xgb_LR_test.csv', index=False)
oof_train.to_csv('submissions/oof_preds/xgb_LR_train.csv', index=False)

### random forest

In [7]:
oof_train = pd.DataFrame({'CUSTOMER_ID':train['CUSTOMER_ID'], 'RESPONDERS':0})
allpredictions = pd.DataFrame()
score = []

In [8]:
nfolds = 7
skf = StratifiedKFold(n_splits=nfolds, random_state=20178)

In [9]:
param_dart = {'booster': 'dart',
         'max_depth': 5, 'learning_rate': 0.1,
         'objective': 'binary:logistic',
            'sample_type': 'uniform',
         'normalize_type': 'tree',
         'rate_drop': 0.1,
         'skip_drop': 0.5,
             'eval_metric':'auc'}

In [10]:
## with upscaling

increase = True
for i, (train_index, test_index) in enumerate(skf.split(train, target)):
    print('Fold %d/%d'%(i+1, nfolds))
    X_train, X_valid = train.iloc[train_index], train.iloc[test_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[test_index]
    
    if increase:
        pos = pd.Series(target == 1)

        X_train = pd.concat([X_train, X_train[pos]], axis=0)
        y_train = pd.concat([y_train, y_train.loc[pos]], axis=0)
        
        idx = np.arange(len(X_train))
        np.random.shuffle(idx)
        X_train = X_train.iloc[idx]
        y_train = y_train.iloc[idx]
    
    dtrain = xgb.DMatrix(X_train[feature_names], label=y_train)
    dvalid = xgb.DMatrix(X_valid[feature_names], label=y_valid)
    
    watchlist = [(dtrain, 'train'),(dvalid,'valid')]
    
    clf = xgb.train(param_dart,
                    dtrain,
                    num_boost_round=10000,
                    evals=watchlist,
                    early_stopping_rounds=40,
                    verbose_eval=20,
                    maximize = True)
    
    pred1 = clf.predict(dvalid)
    oof_train.loc[test_index, 'RESPONDERS'] = pred1
    
    scr = roc_auc_score(y_valid, pred1)
    dtest = xgb.DMatrix(test[feature_names])
    preds2 = clf.predict(dtest)
    
    allpredictions['p'+str(i)] = preds2
    score.append(scr)
    
    del X_train, X_valid, y_train, y_valid, clf, pred1, preds2, scr

Fold 1/7




[0]	train-auc:0.823002	valid-auc:0.806643
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 40 rounds.
[20]	train-auc:0.881757	valid-auc:0.854044
[40]	train-auc:0.894689	valid-auc:0.861931
[60]	train-auc:0.908237	valid-auc:0.87082
[80]	train-auc:0.918199	valid-auc:0.874581
[100]	train-auc:0.924495	valid-auc:0.874997
[120]	train-auc:0.929634	valid-auc:0.877916
[140]	train-auc:0.933171	valid-auc:0.879144
[160]	train-auc:0.937841	valid-auc:0.880437
[180]	train-auc:0.941114	valid-auc:0.882856
[200]	train-auc:0.943189	valid-auc:0.883432
[220]	train-auc:0.946161	valid-auc:0.884324
[240]	train-auc:0.948729	valid-auc:0.884734


KeyboardInterrupt: 

In [19]:
sub = pd.read_csv('sample_submission_ROqqAAN.csv')
sub['CUSTOMER_ID'] = test['CUSTOMER_ID']
sub['RESPONDERS'] = allpredictions.mean(axis=1).values

In [20]:
sub.to_csv('submissions/oof_preds/xgb_RF_test.csv', index=False)
oof_train.to_csv('submissions/oof_preds/xgb_RF_train.csv', index=False)

### keras


In [19]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers.core import Dense, Dropout, Activation

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [14]:
scaler = StandardScaler().fit(train[feature_names])

In [15]:
strain = scaler.transform(train[feature_names])
stest = scaler.transform(test[feature_names])

In [17]:
# train validation split
X_train, X_valid, Y_train, Y_valid = train_test_split(strain, target, test_size = 0.7, stratify = target, random_state=2017)

In [27]:
# model architechture
def keras_model(feature_names):
    
    input_dim = len(feature_names)
    classes = 2
    
    model = Sequential()
    model.add(Dense(128, activation = 'relu', input_shape = (input_dim,))) #layer 1
    model.add(Dropout(0.2))
    model.add(Dense(64, activation = 'relu')) #layer 2
    model.add(Dropout(0.2))
    model.add(Dense(64, activation = 'relu')) #layer 3
    model.add(Dense(classes, activation = 'sigmoid')) #output
    model.compile(optimizer = 'adam', loss='binary_crossentropy',metrics = ['accuracy'])
    return model

In [28]:
model = keras_model(feature_names)

In [23]:
oof_train = pd.DataFrame({'CUSTOMER_ID':train['CUSTOMER_ID'], 'RESPONDERS':0})
allpredictions = pd.DataFrame()
score = []

In [24]:
nfolds = 7
skf = StratifiedKFold(n_splits=nfolds, random_state=20178)

In [31]:
## class weight

from sklearn.utils import class_weight

cw = class_weight.compute_class_weight('balanced', np.unique(target), target)

In [None]:
## with upscaling

increase = True
for i, (train_index, test_index) in enumerate(skf.split(strain, target)):
    print('Fold %d/%d'%(i+1, nfolds))
    X_train, X_valid = strain[train_index], strain[test_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[test_index]
    
#     if increase:
#         pos = pd.Series(target == 1)

#         X_train = pd.concat([X_train, X_train[pos]], axis=0)
#         y_train = pd.concat([y_train, y_train.loc[pos]], axis=0)
        
#         idx = np.arange(len(X_train))
#         np.random.shuffle(idx)
#         X_train = X_train.iloc[idx]
#         y_train = y_train.iloc[idx]
    
    # one hot target columns
    Y_train = to_categorical(y_train)
    Y_valid = to_categorical(y_valid)
    
    callback = EarlyStopping(monitor='val_acc',patience=3)
    model.fit(X_train, Y_train, 500, 10, callbacks=[callback],validation_data=(X_valid, Y_valid), class_weight=cw)
    
    pred1 = model.predict_proba(X_valid)[:,1]
    oof_train.loc[test_index, 'RESPONDERS'] = pred1
    
    scr = roc_auc_score(y_valid, pred1)
    preds2 = model.predict_proba(stest)[:,1]
    
    allpredictions['p'+str(i)] = preds2
    score.append(scr)
    
    del X_train, X_valid, y_train, y_valid, pred1, preds2, scr

Fold 1/7
Train on 257142 samples, validate on 42858 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

In [37]:
del strain, stest

### xgb

In [9]:
feature_names = [x for x in train.columns if x not in ['CUSTOMER_ID','RESPONDERS']]

In [3]:
params = {"objective": "binary:logistic",
          "booster": "gbtree",
          "nthread": 4,
          "eta": 0.1, # 0.1
          "max_depth": 5, # 7
          "subsample": 0.8,
          "colsample_bytree": 0.4,
          "min_child_weight": 2**3,
          "seed": 2016, 
          "tree_method": "exact",
         "eval_metric":"auc"}

In [4]:
train['RESPONDERS'] = train['RESPONDERS'].map(lambda x: 1 if x == 'Y' else 0)

In [5]:
target = train['RESPONDERS']

In [6]:
oof_train = pd.DataFrame({'CUSTOMER_ID':train['CUSTOMER_ID'], 'RESPONDERS':0})
allpredictions = pd.DataFrame()
score = []

In [7]:
nfolds = 7
skf = StratifiedKFold(n_splits=nfolds, random_state=20178)

In [10]:
## with upscaling

increase = True
for i, (train_index, test_index) in enumerate(skf.split(train, target)):
    print('Fold %d/%d'%(i+1, nfolds))
    X_train, X_valid = train.iloc[train_index], train.iloc[test_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[test_index]
    
    if increase:
        pos = pd.Series(target == 1)

        X_train = pd.concat([X_train, X_train[pos]], axis=0)
        y_train = pd.concat([y_train, y_train.loc[pos]], axis=0)
        
        idx = np.arange(len(X_train))
        np.random.shuffle(idx)
        X_train = X_train.iloc[idx]
        y_train = y_train.iloc[idx]
    
    dtrain = xgb.DMatrix(X_train[feature_names], label=y_train)
    dvalid = xgb.DMatrix(X_valid[feature_names], label=y_valid)
    
    watchlist = [(dtrain, 'train'),(dvalid,'valid')]
    
    clf = xgb.train(params,
                    dtrain,
                    num_boost_round=10000,
                    evals=watchlist,
                    early_stopping_rounds=40,
                    verbose_eval=20,
                    maximize = True)
    
    pred1 = clf.predict(dvalid)
    oof_train.loc[test_index, 'RESPONDERS'] = pred1
    
    scr = roc_auc_score(y_valid, pred1)
    dtest = xgb.DMatrix(test[feature_names])
    preds2 = clf.predict(dtest)
    
    allpredictions['p'+str(i)] = preds2
    score.append(scr)
    
    del X_train, X_valid, y_train, y_valid, clf, pred1, preds2, scr

Fold 1/7




[0]	train-auc:0.830076	valid-auc:0.800329
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 40 rounds.
[20]	train-auc:0.883565	valid-auc:0.85243
[40]	train-auc:0.900354	valid-auc:0.864264
[60]	train-auc:0.914598	valid-auc:0.872302
[80]	train-auc:0.924065	valid-auc:0.878817
[100]	train-auc:0.930343	valid-auc:0.880814
[120]	train-auc:0.935609	valid-auc:0.881091
[140]	train-auc:0.939979	valid-auc:0.882412
[160]	train-auc:0.94411	valid-auc:0.882767
[180]	train-auc:0.948222	valid-auc:0.883248
[200]	train-auc:0.951981	valid-auc:0.883406
[220]	train-auc:0.955035	valid-auc:0.883809
[240]	train-auc:0.957791	valid-auc:0.883783
Stopping. Best iteration:
[214]	train-auc:0.954131	valid-auc:0.883901

Fold 2/7
[0]	train-auc:0.825935	valid-auc:0.8196
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 40 rounds.
[20]	train-auc:0.880976	valid-

In [11]:
sub = pd.read_csv('sample_submission_ROqqAAN.csv')
sub['CUSTOMER_ID'] = test['CUSTOMER_ID']
sub['RESPONDERS'] = allpredictions.mean(axis=1).values

In [13]:
sub.to_csv('submissions/oof_preds/xgb_XG_test.csv', index=False)
oof_train.to_csv('submissions/oof_preds/xgb_XG_train.csv', index=False)

### extra trees

In [15]:
from sklearn.ensemble import ExtraTreesClassifier

In [32]:
def runET(train_X, train_y,val_X, val_y, test_X):
        
    et = ExtraTreesClassifier(n_estimators=100,
                              criterion = "entropy",
                              max_depth = 5,
                              max_features = 0.7,
                              min_samples_leaf = 1,
                              random_state=2017,
                              n_jobs=-1,
                              class_weight = {0:0.5, 1:32.33})

    et.fit(train_X, train_y)

    pred_val_y = et.predict_proba(val_X)[:, 1]
    pred_test = et.predict_proba(test_X)[:, 1]
    
    return pred_val_y, pred_test

In [33]:
oof_train = pd.DataFrame({'CUSTOMER_ID':train['CUSTOMER_ID'], 'RESPONDERS':0})
allpredictions = pd.DataFrame()
score = []

In [34]:
nfolds = 7
skf = StratifiedKFold(n_splits=nfolds, random_state=20178)

In [35]:
## with upscaling

increase = True
for i, (train_index, test_index) in enumerate(skf.split(train, target)):
    print('Fold %d/%d'%(i+1, nfolds))
    X_train, X_valid = train.iloc[train_index], train.iloc[test_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[test_index]
    
    if increase:
        pos = pd.Series(target == 1)

        X_train = pd.concat([X_train, X_train[pos]], axis=0)
        y_train = pd.concat([y_train, y_train.loc[pos]], axis=0)
        
        idx = np.arange(len(X_train))
        np.random.shuffle(idx)
        X_train = X_train.iloc[idx]
        y_train = y_train.iloc[idx]
    
    pred1, preds2 = runET(X_train[feature_names], y_train, X_valid[feature_names], y_valid, test[feature_names])
    
    
    oof_train.loc[test_index, 'RESPONDERS'] = pred1
    
    scr = roc_auc_score(y_valid, pred1)
    print(scr)
    
    allpredictions['p'+str(i)] = preds2
    score.append(scr)
    
    del X_train, X_valid, y_train, y_valid, pred1, preds2, scr

Fold 1/7




0.830864876409
Fold 2/7




0.844425471466
Fold 3/7




0.838671938296
Fold 4/7




0.845827974357
Fold 5/7




0.857631800974
Fold 6/7




0.831169461174
Fold 7/7




0.845568820084


In [36]:
sub = pd.read_csv('sample_submission_ROqqAAN.csv')
sub['CUSTOMER_ID'] = test['CUSTOMER_ID']
sub['RESPONDERS'] = allpredictions.mean(axis=1).values

In [37]:
sub.to_csv('submissions/oof_preds/et_test.csv', index=False)
oof_train.to_csv('submissions/oof_preds/et_train.csv', index=False)