In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier
import joblib
# Ask pandas to treat +/-inf as missing values, so that the later `fillna`
# calls in this notebook replace infinities as well as NaNs.
# NOTE(review): this option is reported as deprecated in recent pandas
# releases (2.1+) -- confirm against the installed pandas version.
pd.set_option('use_inf_as_na', True)

In [None]:
# Load the pre-computed training features, training labels and test features.
# The helper columns saved alongside the features ('index', and 'TARGET' in the
# test file) are dropped so only model inputs remain.
X_train = pd.read_csv('../input/base-model-with-0-804-auc-on-home-credit/X_train.csv')
X_train = X_train.drop(columns=['index'])

y_train = pd.read_csv('../input/base-model-with-0-804-auc-on-home-credit/y_train.csv')

X_test = pd.read_csv('../input/base-model-with-0-804-auc-on-home-credit/X_test.csv')
X_test = X_test.drop(columns=['index', 'TARGET'])

In [None]:
# Replace every missing value with the sentinel -1 before model fitting.
# Infinities are converted to NaN explicitly first, so this cell no longer
# depends on the process-wide `use_inf_as_na` pandas option being active;
# when that option is on, the result is identical to a plain `fillna(-1)`.
MISSING_SENTINEL = -1
X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(MISSING_SENTINEL)
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(MISSING_SENTINEL)

In [None]:
def fast_auc(y_true, y_prob):
    """Compute the ROC AUC of binary labels against predicted scores.

    The AUC is obtained from the rank-sum (Mann-Whitney U) statistic using
    *average* ranks, so tied scores contribute 0.5 per positive/negative pair
    instead of depending on the arbitrary order a sort happens to produce.
    The computation is fully vectorised and done in float64.

    Parameters
    ----------
    y_true : array-like
        Binary labels encoded as 0 (negative) and 1 (positive).
    y_prob : array-like
        Predicted scores/probabilities, same length as ``y_true``.

    Returns
    -------
    numpy.float64
        The AUC, or ``nan`` when ``y_true`` does not contain both classes
        (the metric is undefined in that case).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` have different lengths.
    """
    labels = np.asarray(y_true, dtype=float).ravel()
    scores = np.asarray(y_prob, dtype=float).ravel()
    if labels.shape != scores.shape:
        raise ValueError(
            'y_true and y_prob must have the same length, got %d and %d'
            % (labels.size, scores.size)
        )

    n_pos = labels.sum()
    n_neg = labels.size - n_pos
    if n_pos == 0 or n_neg == 0:
        # Only one class present (or empty input): AUC is undefined.
        return np.float64(np.nan)

    # Average 1-based rank for each distinct score: a group of `c` tied values
    # occupying ranks (last - c + 1 .. last) gets the mean rank last - (c-1)/2.
    _, inverse, counts = np.unique(scores, return_inverse=True, return_counts=True)
    last_rank = np.cumsum(counts)
    avg_rank = last_rank - (counts - 1) / 2.0
    ranks = avg_rank[inverse.ravel()]

    # Mann-Whitney U of the positives, normalised by the number of pairs.
    u_stat = (ranks * labels).sum() - n_pos * (n_pos + 1) / 2.0
    return np.float64(u_stat / (n_pos * n_neg))

In [None]:
from sklearn.model_selection import KFold

# Number of CV folds; one model file is written per fold, so keep this in sync
# with the number of models loaded at prediction time.
N_FOLDS = 10
# Fixed seed so the forests, the CV score and the saved models are reproducible
# across a fresh Restart & Run All.
SEED = 42

# The labels are taken from the second column of y_train (presumably TARGET,
# with the first column being a saved index -- verify against the CSV).
y_labels = y_train.iloc[:, 1]

fold_score = []
kf = KFold(n_splits=N_FOLDS)
for fold, (train_idx, test_idx) in enumerate(kf.split(X_train)):
    # KFold yields *positional* indices, so select rows with .iloc; label-based
    # lookup is only equivalent when the index is a default RangeIndex.
    clf = RandomForestClassifier(random_state=SEED, n_jobs=-1)
    clf.fit(X_train.iloc[train_idx], y_labels.iloc[train_idx])

    # Probability of the positive class on the held-out fold.
    val_prob = clf.predict_proba(X_train.iloc[test_idx])[:, 1]
    fold_score.append(fast_auc(y_labels.iloc[test_idx], val_prob))

    # Persist the fold model so it can be reloaded for test-set prediction.
    joblib.dump(clf, 'model_fold_' + str(fold) + '.joblib')

In [None]:
# Report the mean AUC over all cross-validation folds.
mean_fold_auc = np.mean(fold_score)
print('Cross validation score = %1.5f' % mean_fold_auc)

In [None]:
# Collect, for every saved fold model, the predicted positive-class
# probability of each test row; `result` ends up with one array per fold.
result = []

for fold in range(10):
    model_path = 'model_fold_' + str(fold) + '.joblib'
    clf = joblib.load(model_path)
    fold_prob = clf.predict_proba(X_test)[:, 1]
    result.append(fold_prob)



In [None]:
# Start from the competition's sample submission file; its TARGET column is
# overwritten with the model predictions further down.
sample_submission_path = '../input/home-credit-default-risk/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path)

In [None]:
# Stack the per-fold prediction vectors into a (folds, rows) array and average
# across folds to obtain the final score for each test row.
fold_predictions = np.asarray(result)
submission_df['TARGET'] = fold_predictions.mean(axis=0)

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)