In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

In [3]:
# Notebook-wide settings.
NFOLDS, SEED, NROWS = (
    3,     # number of cross-validation folds
    0,     # random seed for the fold splitter and the model wrappers
    None,  # presumably an optional row cap for data loading (None = no cap) — confirm
)

In [18]:
# Load the raw tables. NROWS (defined in the settings cell) limits how many
# rows are read from each file, which is handy for quick dry runs; with
# NROWS = None pandas reads every row. Set NROWS back to None before
# producing a submission, otherwise the test table is truncated as well.
data = pd.read_csv('application_train.csv', nrows=NROWS)
test = pd.read_csv("application_test.csv", nrows=NROWS)
prev = pd.read_csv('previous_application.csv', nrows=NROWS)

In [19]:
# Integer-encode every object-typed column of the application tables. The
# code book is learned on the training frame and then applied to the test
# frame through the returned index of unique values.
categorical_feats = [
    col for col, col_dtype in data.dtypes.items() if col_dtype == 'object'
]

for col in categorical_feats:
    codes, indexer = pd.factorize(data[col])
    data[col] = codes
    test[col] = indexer.get_indexer(test[col])

gc.enable()

# Separate the label from the training features.
y_train = data.pop('TARGET')

# Previous applications are encoded independently; only the codes are kept.
prev_cat_features = [
    col for col, col_dtype in prev.dtypes.items() if col_dtype == 'object'
]

for col in prev_cat_features:
    prev[col] = pd.factorize(prev[col])[0]

In [20]:
# Per-SK_ID_CURR summary of the previous applications: the mean of every
# column plus the number of previous applications (np_app). The averaged
# SK_ID_PREV identifier is dropped.
cnt_prev = prev.loc[:, ['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev = (
    prev.groupby('SK_ID_CURR')
    .mean()
    .assign(np_app=cnt_prev['SK_ID_PREV'])
    .drop(columns='SK_ID_PREV')
)

In [21]:
# Left-join the per-client aggregates onto both application tables and
# replace every missing value (including unmatched clients) with 0.
x_train = data.merge(avg_prev.reset_index(), on='SK_ID_CURR', how='left').fillna(0)
x_test = test.merge(avg_prev.reset_index(), on='SK_ID_CURR', how='left').fillna(0)

ntrain, ntest = len(x_train), len(x_test)

# SK_ID_CURR is the join key and is kept out of the model inputs.
excluded_feats = ['SK_ID_CURR']
features = [col for col in x_train.columns if col not in excluded_feats]

x_train = x_train.loc[:, features]
x_test = x_test.loc[:, features]

# Shuffled, seeded fold splitter.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

In [25]:
class SklearnWrapper(object):
    """Adapter exposing ``train``/``predict`` on top of a classifier class.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Passed to the estimator as ``random_state``.
    params : dict or None, default None
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is never modified; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed = 0, params = None):
        # Work on a copy: writing into the caller's dict leaks state between
        # wrappers, and the None default would otherwise raise TypeError.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and labels."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return column 1 of ``predict_proba(x)``."""
        return self.clf.predict_proba(x)[:, 1]
    
class CatboostWrapper(object):
    """Adapter exposing ``train``/``predict`` on top of a CatBoost-style classifier class.

    Parameters
    ----------
    clf : callable
        Classifier class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Passed to the classifier as ``random_state``.
    params : dict or None, default None
        Keyword arguments for the classifier. The dict is copied, so the
        caller's object is never modified; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed = 0, params = None):
        # Copy first: the original wrote the seed into the caller's dict and
        # raised TypeError when params was left at its None default.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on the given features and labels."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return column 1 of ``predict_proba(x)``."""
        return self.clf.predict_proba(x)[:, 1]

class LightGBMWrapper(object):
    """Adapter exposing ``train``/``predict`` on top of a LightGBM-style classifier class.

    Parameters
    ----------
    clf : callable
        Classifier class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Used for both ``feature_fraction_seed`` and ``bagging_seed``.
    params : dict or None, default None
        Keyword arguments for the classifier. The dict is copied, so the
        caller's object is never modified; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed = 0, params = None):
        # Copy so the caller's dict is untouched and None is accepted.
        params = dict(params) if params is not None else {}
        # The seed belongs in feature_fraction_seed. Writing it to
        # feature_fraction (as before) replaced the column-sampling ratio
        # with the seed value, i.e. 0 for the default seed.
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on the given features and labels."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return column 1 of ``predict_proba(x)``."""
        return self.clf.predict_proba(x)[:, 1]
    
class XgbWrapper(object):
    """Adapter exposing ``train``/``predict`` on top of ``xgb.train``.

    Parameters
    ----------
    seed : int, default 0
        Stored in the booster parameters under ``'seed'``.
    params : dict or None, default None
        Booster parameters. An optional ``'nrounds'`` entry is taken out and
        used as the number of boosting rounds (250 when absent). The dict is
        copied, so the caller's object is never modified.
    """

    def __init__(self, seed = 0, params = None):
        # Copy before popping: removing 'nrounds' from the caller's dict made
        # every later wrapper built from that dict fall back to 250 rounds.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label = y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))
    
def get_oof(clf):
    """Build out-of-fold predictions for one wrapped model.

    For every split produced by the module-level ``kf``, ``clf`` is trained on
    the training part of ``x_train``/``y_train``, predicts the held-out rows
    (stored at their positions in the out-of-fold vector) and predicts all of
    ``x_test``. The test predictions are then averaged over the folds.

    Parameters
    ----------
    clf : object
        Wrapper exposing ``train(x, y)`` and ``predict(x)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` as column vectors of shape ``(ntrain, 1)``
        and ``(ntest, 1)``.

    Notes
    -----
    Reads the globals ``x_train``, ``y_train``, ``x_test``, ``kf``,
    ``NFOLDS``, ``ntrain`` and ``ntest``.
    """
    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        # KFold yields positional row numbers, so select with .iloc; .loc is
        # label-based and is only equivalent while the index is 0..n-1.
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the per-fold test predictions into a single vector.
    oof_test = oof_test_skf.mean(axis = 0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Hyper-parameters for the base models, one dict per model wrapper.

# Extra-trees classifier.
et_params = dict(
    n_jobs=16,
    n_estimators=200,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
)

# Random-forest classifier.
rf_params = dict(
    n_jobs=16,
    n_estimators=200,
    max_features=0.2,
    max_depth=12,
    min_samples_leaf=2,
)

# XGBoost booster; 'nrounds' is consumed by XgbWrapper as the round count.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.075,
    objective='binary:logistic',
    max_depth=4,
    num_parallel_tree=1,
    min_child_weight=1,
    nrounds=200,
)

# CatBoost classifier.
catboost_params = dict(
    iterations=200,
    learning_rate=0.5,
    depth=3,
    l2_leaf_reg=40,
    bootstrap_type='Bernoulli',
    subsample=0.7,
    scale_pos_weight=5,
    eval_metric='AUC',
    od_type='Iter',
    allow_writing_files=False,
)

# LightGBM classifier.
lightgbm_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    num_leaves=123,
    colsample_bytree=0.8,
    subsample=0.9,
    max_depth=15,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_split_gain=0.01,
    min_child_weight=2,
)

In [26]:
# One wrapper per base model, all sharing the same seed.
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)
# NOTE: lg is instantiated but is not included in the stack below.
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)

# Out-of-fold train predictions and fold-averaged test predictions per model.
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

# RMSE of each model's out-of-fold predictions against the labels.
# The CatBoost line used to be printed under the "RF-CV" label.
print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("CB-CV: {}".format(sqrt(mean_squared_error(y_train, cb_oof_train))))

# Second-level inputs: one column per base model.
# NOTE: this rebinds x_train/x_test to NumPy arrays, so get_oof cannot be
# called again until the feature frames are rebuilt by the earlier cells.
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

# Meta-model: logistic regression on the stacked predictions.
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train,y_train)

# Store the meta-model's class-1 probability on the test frame.
test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]

0:	total: 115ms	remaining: 22.9s
1:	total: 156ms	remaining: 15.5s
2:	total: 195ms	remaining: 12.8s
3:	total: 234ms	remaining: 11.5s
4:	total: 274ms	remaining: 10.7s
5:	total: 313ms	remaining: 10.1s
6:	total: 376ms	remaining: 10.4s
7:	total: 412ms	remaining: 9.88s
8:	total: 451ms	remaining: 9.56s
9:	total: 490ms	remaining: 9.31s
10:	total: 527ms	remaining: 9.06s
11:	total: 570ms	remaining: 8.93s
12:	total: 629ms	remaining: 9.05s
13:	total: 670ms	remaining: 8.89s
14:	total: 710ms	remaining: 8.75s
15:	total: 750ms	remaining: 8.63s
16:	total: 783ms	remaining: 8.43s
17:	total: 820ms	remaining: 8.3s
18:	total: 872ms	remaining: 8.31s
19:	total: 909ms	remaining: 8.18s
20:	total: 946ms	remaining: 8.06s
21:	total: 985ms	remaining: 7.97s
22:	total: 1.02s	remaining: 7.89s
23:	total: 1.07s	remaining: 7.83s
24:	total: 1.12s	remaining: 7.84s
25:	total: 1.16s	remaining: 7.76s
26:	total: 1.2s	remaining: 7.67s
27:	total: 1.24s	remaining: 7.59s
28:	total: 1.27s	remaining: 7.5s
29:	total: 1.31s	remaining:

42:	total: 1.81s	remaining: 6.62s
43:	total: 1.85s	remaining: 6.57s
44:	total: 1.89s	remaining: 6.51s
45:	total: 1.93s	remaining: 6.46s
46:	total: 1.97s	remaining: 6.42s
47:	total: 2.02s	remaining: 6.39s
48:	total: 2.06s	remaining: 6.36s
49:	total: 2.1s	remaining: 6.31s
50:	total: 2.14s	remaining: 6.26s
51:	total: 2.18s	remaining: 6.2s
52:	total: 2.22s	remaining: 6.15s
53:	total: 2.26s	remaining: 6.12s
54:	total: 2.31s	remaining: 6.08s
55:	total: 2.34s	remaining: 6.03s
56:	total: 2.38s	remaining: 5.97s
57:	total: 2.42s	remaining: 5.94s
58:	total: 2.47s	remaining: 5.9s
59:	total: 2.51s	remaining: 5.87s
60:	total: 2.56s	remaining: 5.82s
61:	total: 2.59s	remaining: 5.77s
62:	total: 2.63s	remaining: 5.71s
63:	total: 2.66s	remaining: 5.66s
64:	total: 2.71s	remaining: 5.62s
65:	total: 2.75s	remaining: 5.57s
66:	total: 2.79s	remaining: 5.54s
67:	total: 2.83s	remaining: 5.49s
68:	total: 2.86s	remaining: 5.44s
69:	total: 2.91s	remaining: 5.4s
70:	total: 2.96s	remaining: 5.38s
71:	total: 2.99s	r

83:	total: 3.53s	remaining: 4.87s
84:	total: 3.57s	remaining: 4.83s
85:	total: 3.6s	remaining: 4.78s
86:	total: 3.64s	remaining: 4.73s
87:	total: 3.68s	remaining: 4.69s
88:	total: 3.72s	remaining: 4.63s
89:	total: 3.77s	remaining: 4.61s
90:	total: 3.83s	remaining: 4.59s
91:	total: 3.88s	remaining: 4.55s
92:	total: 3.91s	remaining: 4.5s
93:	total: 3.96s	remaining: 4.46s
94:	total: 4s	remaining: 4.42s
95:	total: 4.04s	remaining: 4.38s
96:	total: 4.08s	remaining: 4.33s
97:	total: 4.12s	remaining: 4.29s
98:	total: 4.16s	remaining: 4.24s
99:	total: 4.2s	remaining: 4.2s
100:	total: 4.25s	remaining: 4.16s
101:	total: 4.29s	remaining: 4.12s
102:	total: 4.33s	remaining: 4.08s
103:	total: 4.37s	remaining: 4.03s
104:	total: 4.41s	remaining: 3.99s
105:	total: 4.44s	remaining: 3.94s
106:	total: 4.49s	remaining: 3.9s
107:	total: 4.53s	remaining: 3.86s
108:	total: 4.57s	remaining: 3.81s
109:	total: 4.61s	remaining: 3.77s
110:	total: 4.64s	remaining: 3.72s
111:	total: 4.68s	remaining: 3.68s
112:	total

In [28]:
# Preview the identifier column next to the predicted TARGET column.
test.loc[:, ['SK_ID_CURR', 'TARGET']]

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.047240
1,100005,0.110492
2,100013,0.026113
3,100028,0.038719
4,100038,0.148671
...,...,...
48739,456221,0.042605
48740,456222,0.075681
48741,456223,0.027298
48742,456224,0.064708


In [None]:
# Write the identifier and prediction columns to CSV, without the index and
# with floats formatted to 8 decimals.
test.loc[:, ['SK_ID_CURR', 'TARGET']].to_csv(
    'first_submission.csv',
    float_format='%.8f',
    index=False,
)