In [1]:
from sklearn import cross_validation
from sklearn.datasets import load_boston

import pandas as pd
import numpy as np
from tqdm import tqdm
from logging import getLogger
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, ParameterGrid
import xgboost as xgb
from sklearn.metrics import log_loss, roc_auc_score, roc_curve, auc
import pickle

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import svm
from sklearn.model_selection import KFold



In [2]:
TRAIN_DATA = '../input/train-1.csv'
TEST_DATA = '../input/test-1.csv'

In [3]:
logger = getLogger(__name__)

In [4]:
train = pd.read_csv(TRAIN_DATA)
train.head(10)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0
5,19,0,5,1,4,0,0,0,0,0,...,4,2,0,9,0,1,0,1,1,1
6,20,0,2,1,3,1,0,0,1,0,...,3,0,0,10,0,1,0,0,1,0
7,22,0,5,1,4,0,0,1,0,0,...,7,1,3,6,1,0,1,0,1,0
8,26,0,5,1,3,1,0,0,0,1,...,4,2,1,5,0,1,0,0,0,1
9,28,1,1,1,2,0,0,0,1,0,...,3,5,0,6,0,1,0,0,1,0


In [5]:
logger = getLogger(__name__)

In [6]:
DIR = 'result_tmp/'
SAMPLE_SUBMIT_FILE = '../input/sample_submission.csv'

In [7]:
def gini(y, pred):
    fpr, tpr, thr = roc_curve(y, pred, pos_label = 1)
    g = 2 * auc(fpr, tpr) - 1
    return g

In [8]:
def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', - gini(y, pred)

In [9]:
def read_csv(path):
    logger.debug('enter')
    df = pd.read_csv(path)
    logger.debug('exit')
    return df

In [10]:
def load_train_data():
    logger.debug('enter')
    df = read_csv(TRAIN_DATA)
    logger.debug('exit')
    return df

In [11]:
def load_test_data():
    logger.debug('enter')
    df = read_csv(TEST_DATA)
    logger.debug('exit')
    return df

In [12]:
if __name__ == '__main__':
    print(load_train_data().head())
    print(load_test_data().head())

   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0   7       0          2              2          5              1   
1   9       0          1              1          7              0   
2  13       0          5              4          9              1   
3  16       0          0              1          2              0   
4  17       0          0              2          0              1   

   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin       ...        \
0              0              0              1              0       ...         
1              0              0              0              1       ...         
2              0              0              0              1       ...         
3              0              1              0              0       ...         
4              0              1              0              0       ...         

   ps_calc_11  ps_calc_12  ps_calc_13  ps_calc_14  ps_calc_15_bin  \
0           9           1    

In [13]:
if __name__ == '__main__':
    log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
    handler = StreamHandler()
    handler.setLevel('INFO')
    handler.setFormatter(log_fmt)
    logger.addHandler(handler)

    handler = FileHandler(DIR + 'train.py.log', 'a')
    handler.setLevel(DEBUG)
    handler.setFormatter(log_fmt)
    logger.setLevel(DEBUG)
    logger.addHandler(handler)

    logger.info('start')

2017-11-29 18:18:44,450 __main__ 14 [INFO][<module>] start 


In [14]:
df = load_train_data()
df.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [17]:
drop_col = ['target', 'ps_ind_05_cat','ps_reg_03','ps_car_03_cat','ps_car_05_cat','ps_car_07_cat','ps_car_09_cat','ps_car_14','ps_calc_01','ps_calc_02','ps_calc_03','ps_calc_04','ps_calc_05','ps_calc_06','ps_calc_07','ps_calc_08','ps_calc_09','ps_calc_10','ps_calc_11','ps_calc_12','ps_calc_13','ps_calc_14','ps_calc_15_bin','ps_calc_16_bin','ps_calc_17_bin','ps_calc_18_bin','ps_calc_19_bin','ps_calc_20_bin']

In [18]:
x_train = df.drop(drop_col,axis = 1)
y_train = df['target'].values

In [19]:
use_cols = x_train.columns.values

In [20]:
logger.debug('train columns:{} {}'.format(use_cols.shape, use_cols))
logger.info('data preparation end{}'.format(x_train.shape))

2017-11-29 18:42:25,945 __main__ 2 [INFO][<module>] data preparation end(595212, 31) 


In [21]:
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)

In [22]:
all_params = {'max_depth': [6],
                         'learning_rate': [0.2],
                         'min_child_weight': [3],
                         'n_estimators': [10000],
                         'colsample_bytree': [0.8],
                         'colsample_bylevel':[0.8],
                         'reg_alpha': [0.1],
                         'max_delta_step': [0.1],
                         'seed': [0],}

In [23]:
min_score = 100
min_params = None

In [24]:
for params in tqdm(list(ParameterGrid(all_params))):
    logger.info('params:{}'.format(params))
    
    list_gini_score = []
    list_logloss_score = []
    list_best_iterations = []
    for train_idx, valid_idx in cv.split(x_train, y_train):
        trn_x = x_train.iloc[train_idx, :]
        val_x = x_train.iloc[valid_idx, :]
        
        trn_y = y_train[train_idx]
        val_y = y_train[valid_idx]
        
        clf = xgb.sklearn.XGBClassifier(**params)
        clf.fit(trn_x, trn_y, eval_set = [(val_x, val_y)],early_stopping_rounds = 100, eval_metric = gini_xgb)
        
        pred = clf.predict_proba(val_x, ntree_limit = clf.best_ntree_limit)[:, 1]
        sc_logloss = log_loss(val_y, pred)
        sc_gini = -gini(val_y, pred)
        
        list_logloss_score.append(sc_logloss)
        list_gini_score.append(sc_gini)
        list_best_iterations.append(clf.best_iteration)
        logger.debug('  logloss:{},  gini:{}'.format(sc_logloss, sc_gini))
        break

  0%|          | 0/1 [00:00<?, ?it/s]2017-11-29 18:42:28,859 __main__ 2 [INFO][<module>] params:{'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_delta_step': 0.1, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 10000, 'reg_alpha': 0.1, 'seed': 0} 


[0]	validation_0-error:0.036449	validation_0-gini:-0
Multiple eval metrics have been passed: 'validation_0-gini' will be used for early stopping.

Will train until validation_0-gini hasn't improved in 100 rounds.
[1]	validation_0-error:0.036449	validation_0-gini:-0
[2]	validation_0-error:0.036449	validation_0-gini:-0
[3]	validation_0-error:0.036449	validation_0-gini:-0
[4]	validation_0-error:0.036449	validation_0-gini:-0
[5]	validation_0-error:0.036449	validation_0-gini:-0
[6]	validation_0-error:0.036449	validation_0-gini:-0
[7]	validation_0-error:0.036449	validation_0-gini:-0
[8]	validation_0-error:0.036449	validation_0-gini:-0
[9]	validation_0-error:0.036449	validation_0-gini:-0
[10]	validation_0-error:0.036449	validation_0-gini:-0
[11]	validation_0-error:0.036449	validation_0-gini:-0
[12]	validation_0-error:0.036449	validation_0-gini:-0
[13]	validation_0-error:0.036449	validation_0-gini:-0
[14]	validation_0-error:0.036449	validation_0-gini:-0
[15]	validation_0-error:0.036449	validat

[134]	validation_0-error:0.036432	validation_0-gini:-0.052179
[135]	validation_0-error:0.036432	validation_0-gini:-0.054771
[136]	validation_0-error:0.036432	validation_0-gini:-0.05607
[137]	validation_0-error:0.036432	validation_0-gini:-0.056288
[138]	validation_0-error:0.036424	validation_0-gini:-0.058117
[139]	validation_0-error:0.036424	validation_0-gini:-0.062144
[140]	validation_0-error:0.036424	validation_0-gini:-0.09369
[141]	validation_0-error:0.036424	validation_0-gini:-0.103586
[142]	validation_0-error:0.036424	validation_0-gini:-0.120818
[143]	validation_0-error:0.036424	validation_0-gini:-0.126002
[144]	validation_0-error:0.036424	validation_0-gini:-0.126471
[145]	validation_0-error:0.036424	validation_0-gini:-0.131758
[146]	validation_0-error:0.036424	validation_0-gini:-0.132948
[147]	validation_0-error:0.036424	validation_0-gini:-0.134735
[148]	validation_0-error:0.036424	validation_0-gini:-0.140893
[149]	validation_0-error:0.036424	validation_0-gini:-0.145879
[150]	vali

[267]	validation_0-error:0.036432	validation_0-gini:-0.262378
[268]	validation_0-error:0.036432	validation_0-gini:-0.262321
[269]	validation_0-error:0.036424	validation_0-gini:-0.262338
[270]	validation_0-error:0.036432	validation_0-gini:-0.262359
[271]	validation_0-error:0.036432	validation_0-gini:-0.262348
[272]	validation_0-error:0.036432	validation_0-gini:-0.262256
[273]	validation_0-error:0.036432	validation_0-gini:-0.262319
[274]	validation_0-error:0.036432	validation_0-gini:-0.262238
[275]	validation_0-error:0.036432	validation_0-gini:-0.262207
[276]	validation_0-error:0.036432	validation_0-gini:-0.262315
[277]	validation_0-error:0.036432	validation_0-gini:-0.262021
[278]	validation_0-error:0.036432	validation_0-gini:-0.261911
[279]	validation_0-error:0.036432	validation_0-gini:-0.261777
[280]	validation_0-error:0.036432	validation_0-gini:-0.261637
[281]	validation_0-error:0.036432	validation_0-gini:-0.261466
[282]	validation_0-error:0.036432	validation_0-gini:-0.261423
[283]	va

100%|██████████| 1/1 [04:35<00:00, 275.91s/it]


In [25]:
params['n_estimators'] = int(np.mean(list_best_iterations))
sc_logloss = np.mean(list_gini_score)
if min_score > sc_gini:
    min_score = sc_gini
    min_params = params
logger.info('logloss:{}, gini: {}'.format(sc_logloss, sc_gini))
logger.info('current min score:{}, params: {}'.format(min_score, min_params))

2017-11-29 18:47:04,780 __main__ 6 [INFO][<module>] logloss:-0.2629604249506956, gini: -0.2629604249506956 
2017-11-29 18:47:04,783 __main__ 7 [INFO][<module>] current min score:-0.2629604249506956, params: {'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_delta_step': 0.1, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 232, 'reg_alpha': 0.1, 'seed': 0} 


In [26]:
logger.info('minimum params:{}'.format(min_params))
logger.info('minimum gini:{}'.format(min_score))

2017-11-29 18:47:04,793 __main__ 1 [INFO][<module>] minimum params:{'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_delta_step': 0.1, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 232, 'reg_alpha': 0.1, 'seed': 0} 
2017-11-29 18:47:04,795 __main__ 2 [INFO][<module>] minimum gini:-0.2629604249506956 


In [None]:
clf = xgb.sklearn.XGBClassifier(**min_params)
clf.fit(x_train, y_train)

In [None]:
with open(DIR + 'model.pkl', 'wb') as f:
    pickle.dump(clf, f, -1)

logger.info('train end')

In [None]:
with open(DIR + 'model.pkl', 'rb') as f:
    clf = pickle.load(f)
df = load_test_data()

In [None]:
x_test = df[use_cols].sort_values('id')

In [None]:
for col in use_cols:
    if col not in df.columns:
        logger.info('{} is not in test data'.format(col))
        df[col] = np.zeros(df.shape[0])

In [None]:
logger.info('test data load end{}'.format(x_test.shape))

In [None]:
pred_test = clf.predict_proba(x_test)[:, 1]

df_submit = pd.read_csv(SAMPLE_SUBMIT_FILE).sort_values('id')
df_submit['target'] = pred_test
df_submit.to_csv(DIR + 'submit.csv', index = False)

logger.info('end')