In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import xgboost as xgb

In [None]:
# Load the prepared training features, test features and training labels.
X_train = pd.read_csv(
    '../input/xgb-fraud-with-magic-0-9600/X_train.csv'
)
X_test = pd.read_csv(
    '../input/xgb-fraud-with-magic-0-9600/X_test.csv'
)
# The label file is read without a header row, so its columns are named 0, 1, ...
y_train = pd.read_csv(
    '../input/xgb-fraud-with-magic-0-9600/y_train.csv',
    header=None,
)

In [None]:
def fast_auc(y_true, y_prob):
    """Compute the ROC AUC of scores ``y_prob`` against binary labels ``y_true``.

    The AUC is the fraction of (positive, negative) pairs in which the positive
    sample receives the higher score. Pairs with identical scores count as half
    a concordant pair, so the result does not depend on how ties are ordered.

    Parameters
    ----------
    y_true : array-like
        Labels, 1 for the positive class and 0 for the negative class.
    y_prob : array-like
        Predicted scores; must have the same number of elements as ``y_true``.

    Returns
    -------
    float
        AUC in [0, 1]. NaN when the input is empty or contains a single class,
        because no (positive, negative) pair exists.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` have different numbers of elements.
    """
    labels = np.asarray(y_true, dtype=np.float64).ravel()
    scores = np.asarray(y_prob).ravel()
    if labels.shape != scores.shape:
        raise ValueError(
            'y_true and y_prob must have the same length, got %d and %d'
            % (labels.size, scores.size)
        )
    if labels.size == 0:
        return float('nan')

    # Bucket samples by distinct score; np.unique returns the buckets in
    # ascending score order, so a cumulative sum walks from low to high score.
    _, bucket_of, bucket_size = np.unique(
        scores, return_inverse=True, return_counts=True
    )
    pos_per_bucket = np.bincount(
        bucket_of.ravel(), weights=labels, minlength=bucket_size.size
    )
    neg_per_bucket = bucket_size - pos_per_bucket

    n_pos = pos_per_bucket.sum()
    n_neg = neg_per_bucket.sum()
    if n_pos == 0 or n_neg == 0:
        # Only one class present: AUC is undefined.
        return float('nan')

    # Negatives with a strictly lower score than each bucket.
    neg_below = np.cumsum(neg_per_bucket) - neg_per_bucket
    # Each positive beats every negative below it and half-beats the
    # negatives sharing its score.
    concordant = np.sum(pos_per_bucket * (neg_below + 0.5 * neg_per_bucket))
    # Accumulate and divide in float64 to avoid float32 rounding on large inputs.
    return float(concordant / (n_pos * n_neg))

In [None]:
from sklearn.model_selection import KFold


# 10-fold cross-validation: fit one XGBoost classifier per fold, score it on
# the held-out fold with AUC, and save it to disk for test-time ensembling.
# KFold is created without shuffle, so the default (unshuffled) split is used.
kf = KFold(n_splits=10)
fold_score = []

# Column 1 of the header-less label frame is used as the target
# (column 0 is presumably a saved row index — TODO confirm).
target = y_train.iloc[:, 1]

for fold, (train_idx, test_idx) in enumerate(kf.split(X_train)):
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # label-based .loc is only equivalent while the index is a default RangeIndex.
    X_fit, y_fit = X_train.iloc[train_idx], target.iloc[train_idx]
    X_val, y_val = X_train.iloc[test_idx], target.iloc[test_idx]

    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        n_estimators=2000,
        max_depth=12,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.4,
        missing=-1,  # -1 is the missing-value marker passed to XGBoost
        # NOTE(review): per the XGBoost docs, when several metrics are listed
        # the last one (logloss here) drives early stopping, while folds are
        # scored by AUC below — confirm this is intended.
        eval_metric=['auc', 'logloss'],
        nthread=4,
        tree_method='hist'
    )
    clf.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        verbose=50,
        early_stopping_rounds=100,
    )
    # Score the held-out fold with the positive-class probability.
    fold_score.append(fast_auc(y_val, clf.predict_proba(X_val)[:, 1]))
    clf.save_model('model_fold_' + str(fold) + '.json')

In [None]:
# Report the average of the per-fold validation AUCs.
mean_fold_auc = np.mean(fold_score)
print('Cross validation score = %1.5f' % mean_fold_auc)

In [None]:
# Predict the test set with every saved fold model; `result` collects one
# vector of positive-class probabilities per fold.
result = []

for fold in range(10):

    # The fold models were trained with missing=-1. Pass it again here so the
    # test matrix is interpreted the same way even on XGBoost versions that do
    # not restore wrapper hyper-parameters from the saved model file.
    clf = xgb.XGBClassifier(missing=-1)
    clf.load_model('model_fold_' + str(fold) + '.json')
    result.append(clf.predict_proba(X_test)[:, 1])

In [None]:
# Read the sample submission file; its 'isFraud' column is overwritten below.
submission_df = pd.read_csv(
    '../input/ieee-fraud-detection/sample_submission.csv'
)

In [None]:
# Ensemble: element-wise mean of the per-fold prediction vectors.
submission_df['isFraud'] = np.asarray(result).mean(axis=0)

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(
    'submission.csv',
    index=False,
)