In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import StratifiedKFold



In [2]:
# Row cap applied to BOTH files while prototyping, so only the first N_ROWS
# rows of each CSV are loaded. Set to None to read the complete files.
N_ROWS = 1000

train = pd.read_csv('../input/train.csv', nrows=N_ROWS)
test = pd.read_csv('../input/test.csv', nrows=N_ROWS)

In [3]:
def preprocessing(train, test):
    """Split the raw frames into features, labels and test features.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame. Must expose a ``target`` column; every column from
        position 2 onward is taken as a feature.
    test : pandas.DataFrame
        Test frame containing an ``id`` column. The frame passed in is left
        untouched; a copy without ``id`` is returned instead.

    Returns
    -------
    tuple
        ``(X, y, test)``: the feature columns of ``train``, the ``target``
        column of ``train``, and ``test`` without its ``id`` column.

    Raises
    ------
    AttributeError
        If ``train`` has no ``target`` column.
    KeyError
        If ``test`` has no ``id`` column.
    """
    # Positional slice: assumes the first two columns of `train` are the
    # non-feature columns (presumably id and target) -- TODO confirm.
    X = train.iloc[:, 2:]
    y = train.target

    # Return a new frame rather than dropping in place, so the caller's
    # DataFrame is not mutated as a side effect of calling this function.
    test_features = test.drop('id', axis=1)

    # other preprocessing

    return X, y, test_features

In [4]:
# `test` is rebound to the frame returned by preprocessing(), which no longer
# has the `id` column. Re-run the data-loading cell before re-running this
# cell, otherwise preprocessing() will not find `id` on `test`.
X, y, test = preprocessing(train, test)

In [9]:
# Booster settings handed to xgb.train() by do_xgb().
# NOTE(review): in the logged run train-auc stays at 0.5 for 150+ rounds;
# gamma=10 together with reg_alpha=8 may be too strong for a 1000-row
# sample -- confirm on the full data before tuning further.
params = {
    'objective': 'binary:logistic',
    'max_depth': 10,
    'learning_rate': 0.01,
    'eval_metric': 'auc',
    'min_child_weight': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 1971,
    'reg_lambda': 1.3,
    'reg_alpha': 8,
    'gamma': 10,
    'scale_pos_weight': 1.6,
    # Was 'n_thread', which is not an XGBoost parameter name and is therefore
    # not applied; the thread-count key is 'nthread'.
    'nthread': -1,
}

In [10]:
def create_feature_map(features, filename='xgb.fmap'):
    """Write a feature-map file with one line per feature.

    Each line has the form ``<index>\\t<feature name>\\tq`` where ``index`` is
    the zero-based position of the feature in ``features``.

    Parameters
    ----------
    features : iterable
        Feature names, in the order they should be numbered.
    filename : str, optional
        Path of the file to (over)write. Defaults to ``'xgb.fmap'`` in the
        current working directory.
    """
    # The context manager guarantees the handle is closed even if a write
    # fails part-way through.
    with open(filename, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

In [11]:
def do_xgb(X, y, test, n_splits, xgb_params=None, num_boost_round=2500,
           early_stopping_rounds=256, verbose_eval=50):
    """Train one XGBoost booster per stratified fold and average test predictions.

    For every fold a booster is trained with early stopping on the held-out
    part, its best score is recorded, its prediction on ``test`` is added to a
    running sum, and its prediction on the held-out rows is stored as the
    out-of-fold prediction for those rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; rows are selected positionally.
    y : array-like
        Training labels, aligned row-for-row with ``X``.
    test : pandas.DataFrame
        Features to predict on.
    n_splits : int
        Number of ``StratifiedKFold`` folds.
    xgb_params : dict, optional
        Booster parameters. When omitted, the notebook-level ``params`` dict
        is used.
    num_boost_round : int, optional
        Maximum number of boosting rounds per fold.
    early_stopping_rounds : int, optional
        Rounds without improvement on the held-out fold before stopping.
    verbose_eval : int or bool, optional
        Logging period forwarded to ``xgb.train``.

    Returns
    -------
    tuple
        ``(prediction, feature_imp, score, var, oof_train, oof_test)``:

        * ``prediction`` - mean over folds of the predictions on ``test``.
        * ``feature_imp`` - always ``0.0``; see the note in the body.
        * ``score`` / ``var`` - mean and variance of the per-fold best scores.
        * ``oof_train`` - one-column DataFrame of out-of-fold predictions.
        * ``oof_test`` - ``prediction`` wrapped in a DataFrame.
    """
    if xgb_params is None:
        # Fall back to the notebook-level configuration.
        xgb_params = params

    # Positional label access: indexing a Series with `y[idx]` is label-based
    # and only matches the fold indices when the index is the default 0..n-1.
    labels = np.asarray(y)

    # The test matrix does not depend on the fold, so build it once.
    test_data = xgb.DMatrix(test)

    skf = StratifiedKFold(n_splits=n_splits)

    prediction = 0
    oof_train = pd.DataFrame(np.zeros((X.shape[0], 1)))
    scores = []
    # NOTE(review): nothing is ever added to `feature_imp`, so the value
    # returned is always 0.0. Presumably per-fold importances were meant to be
    # accumulated here -- confirm the intended format before implementing.
    feature_imp = 0

    for train_index, valid_index in skf.split(X, y):
        train_data = xgb.DMatrix(X.iloc[train_index, :], labels[train_index])
        valid_data = xgb.DMatrix(X.iloc[valid_index, :], labels[valid_index])
        watchlist = [(train_data, 'train'), (valid_data, 'test')]

        model = xgb.train(xgb_params,
                          train_data,
                          num_boost_round,
                          watchlist,
                          verbose_eval=verbose_eval,
                          early_stopping_rounds=early_stopping_rounds)
        scores.append(model.best_score)

        # NOTE(review): `ntree_limit` / `best_ntree_limit` are the legacy
        # prediction API; newer XGBoost releases use `iteration_range`
        # instead -- confirm against the installed version before upgrading.
        prediction += model.predict(test_data,
                                    ntree_limit=model.best_ntree_limit)
        # The validation matrix is reused for the out-of-fold prediction.
        oof_train.iloc[valid_index, 0] = model.predict(
            valid_data, ntree_limit=model.best_ntree_limit)

    prediction /= n_splits
    feature_imp /= n_splits
    score = np.mean(scores)
    var = np.var(scores)

    oof_test = pd.DataFrame(prediction)

    return prediction, feature_imp, score, var, oof_train, oof_test

# Run the 5-fold training loop on the preprocessed data. `score` and
# `variance` receive the mean and variance of the per-fold best scores.
prediction, feature_imp, score, variance, oof_train, oof_test = do_xgb(X, y, test, 5)

[0]	train-auc:0.5	test-auc:0.5
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 256 rounds.
[50]	train-auc:0.5	test-auc:0.5
[100]	train-auc:0.5	test-auc:0.5
[150]	train-auc:0.5	test-auc:0.5
[200]	train-auc:0.625093	test-auc:0.617552
[250]	train-auc:0.625093	test-auc:0.617552
[300]	train-auc:0.625093	test-auc:0.617552
[350]	train-auc:0.625093	test-auc:0.617552
[400]	train-auc:0.625093	test-auc:0.617552
Stopping. Best iteration:
[151]	train-auc:0.625093	test-auc:0.617552

[0]	train-auc:0.5	test-auc:0.5
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 256 rounds.
[50]	train-auc:0.5	test-auc:0.5
[100]	train-auc:0.5	test-auc:0.5
[150]	train-auc:0.5	test-auc:0.5
[200]	train-auc:0.5	test-auc:0.5
[250]	train-auc:0.5	test-auc:0.5
Stopping. Best iteration:
[0]	train-auc:0.5	test-auc:0.5

[0]	train-auc:0.5	test-auc:0.5
Multiple eval metr