In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import joblib

In [None]:
# Feature tables: the first CSV column is read as the index and then replaced
# by a fresh 0..n-1 RangeIndex.
X_train = pd.read_csv('../input/to-the-top-v2/X_train.csv', index_col=[0])
X_train = X_train.reset_index(drop=True)

X_test = pd.read_csv('../input/to-the-top-v2/X_test.csv', index_col=[0])
X_test = X_test.reset_index(drop=True)

# Training labels: the first CSV column is kept as the index (no reset here).
y_train = pd.read_csv('../input/to-the-top-v2/y_train.csv', index_col=[0])

In [None]:
def fast_auc(y_true, y_prob):
    """Compute the ROC AUC of binary labels against predicted scores.

    Uses the rank-sum (Mann-Whitney U) formulation: the AUC is the fraction
    of (negative, positive) pairs in which the positive sample gets the
    higher score, with tied pairs counted as one half. Everything is
    vectorised with NumPy, so there is no per-row Python loop.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0 = negative, 1 = positive). The input is flattened,
        so a single-column 2-D array or DataFrame is accepted.
    y_prob : array-like
        Predicted scores or probabilities for the positive class, one per
        label.

    Returns
    -------
    float
        The AUC in [0, 1], or ``nan`` when ``y_true`` contains only one
        class (the AUC is undefined in that case).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` do not contain the same number of
        elements.
    """
    labels = np.asarray(y_true, dtype=np.float64).ravel()
    scores = np.asarray(y_prob, dtype=np.float64).ravel()
    if labels.size != scores.size:
        raise ValueError(
            'y_true and y_prob must have the same number of elements '
            '(got %d and %d)' % (labels.size, scores.size)
        )

    n_pos = labels.sum()
    n_neg = labels.size - n_pos
    if n_pos == 0 or n_neg == 0:
        # No (negative, positive) pair exists, so the AUC is undefined.
        return float('nan')

    # Average 1-based rank of every score. All members of a tie group share
    # the mean of the ranks the group occupies, which is what makes tied
    # (negative, positive) pairs count as one half instead of depending on
    # an arbitrary sort order.
    _, inverse, counts = np.unique(scores, return_inverse=True, return_counts=True)
    last_rank = np.cumsum(counts)
    mean_rank = last_rank - (counts - 1) / 2.0
    ranks = mean_rank[inverse]

    # Mann-Whitney U of the positive class, normalised by the number of pairs.
    pos_rank_sum = np.dot(labels, ranks)
    u_stat = pos_rank_sum - n_pos * (n_pos + 1) / 2.0
    return float(u_stat / (n_pos * n_neg))

In [None]:
from sklearn.model_selection import KFold


# 10-fold cross-validation of a logistic-regression baseline. Each fold's
# fitted model is written to disk as 'model_fold_<fold>.joblib'.
kf = KFold(n_splits=10)

# Flatten the labels once so fitting and scoring both receive 1-D arrays.
# NOTE(review): assumes y_train holds a single target column -- confirm.
y_values = y_train.values.ravel()

fold_score = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X_train)):
    clf = LogisticRegression(solver='liblinear')
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # .loc would only be correct while the index happens to be 0..n-1.
    clf.fit(X_train.iloc[train_idx], y_values[train_idx])

    valid_prob = clf.predict_proba(X_train.iloc[test_idx])[:, 1]
    # Store a plain float per fold rather than a NumPy array.
    fold_score.append(float(fast_auc(y_values[test_idx], valid_prob)))
    joblib.dump(clf, 'model_fold_' + str(fold) + '.joblib')

In [None]:
# Report the mean of the per-fold AUC values collected in `fold_score`.
mean_cv_auc = np.mean(fold_score)
print('Cross Validation score = %1.6f' % mean_cv_auc)

In [None]:
# Reload each saved fold model and collect its positive-class probabilities
# for the test set; `result` ends up with one array per fold.
result = []

for fold in range(10):
    model_path = 'model_fold_' + str(fold) + '.joblib'
    clf = joblib.load(model_path)
    fold_prob = clf.predict_proba(X_test)
    result.append(fold_prob[:, 1])

In [None]:
# Read the sample submission file; its TARGET column is overwritten later.
submission_df = pd.read_csv(
    filepath_or_buffer='../input/santander-customer-satisfaction/sample_submission.csv'
)

In [None]:
# Blend the folds: element-wise mean across the per-fold prediction arrays.
fold_predictions = np.asarray(result)
submission_df['TARGET'] = fold_predictions.mean(axis=0)

In [None]:
# Write the submission to 'submission.csv' without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)