In [None]:
import os
import pickle as pkl
import time

import numpy as np
import pandas as pd

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
import xgboost as xgb

In [None]:
from sklearn.model_selection import KFold
from tensorflow import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.layers import LeakyReLU

In [None]:
# One seed shared by every random number generator the notebook configures.
random_seed = 8789
# `random` here is the module imported from tensorflow, not the standard library one.
random.set_seed(random_seed)
np.random.seed(random_seed)

In [None]:
def pkl_load(name, base_dir="/kaggle/input/"):
    """Load and return the object stored in a pickle file under ``base_dir``.

    Parameters
    ----------
    name : str
        Path of the pickle file, relative to ``base_dir``.
    base_dir : str, optional
        Directory that ``name`` is resolved against. Defaults to the Kaggle
        input mount, so existing one-argument calls resolve to the same path
        as before.

    Returns
    -------
    object
        Whatever object the pickle file contains.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # come from a trusted source.
    # The context manager closes the handle even when unpickling fails.
    with open(os.path.join(base_dir, name), 'rb') as pkl_file:
        return pkl.load(pkl_file)

In [None]:
def func_xgbc(name, data, labels):
    """Cross-validate an XGBoost classifier and plot a hold-out confusion matrix.

    The classifier is first scored with repeated stratified k-fold
    cross-validation (11 splits, 3 repeats). It is then refitted on a
    stratified 70 % split of ``data`` and its confusion matrix on the
    remaining 30 % is plotted.

    Parameters
    ----------
    name : str
        Label of the experiment; used as the title of the confusion matrix.
    data : array-like
        Feature matrix.
    labels : array-like
        Binary target, one entry per row of ``data``.

    Returns
    -------
    tuple
        ``(xgbc, scores)`` where ``xgbc`` is the classifier as fitted on the
        70 % training split and ``scores`` is the dictionary returned by
        ``cross_validate``.
    """
    xgbc = xgb.XGBClassifier(
        n_estimators=150,
        scale_pos_weight=6,
        max_depth=7,
        objective="binary:logistic",
        eval_metric="auc",
        use_label_encoder=False,
    )
    rskf = RepeatedStratifiedKFold(n_splits=11, n_repeats=3, random_state=random_seed)

    scoring = ('roc_auc', 'f1', 'recall')
    scores = cross_validate(xgbc, data, labels, scoring=scoring, cv=rskf)

    # Report the mean of every entry in the cross-validation result.
    for key in scores:
        print("Mean "+key+": %.3f" % np.mean(scores[key]))

    # Seed the hold-out split as well, so the confusion matrix and the
    # returned model do not depend on how often the global RNG was consumed.
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.3, stratify=labels, random_state=random_seed
    )
    xgbc.fit(X_train, y_train)
    cm_display = plot_confusion_matrix(
        xgbc,
        X_test,
        y_test,
        display_labels=['NoDlqin2yrs', 'SeriousDlqin2yrs'],
        cmap='viridis',
    )
    # Tag the figure with the experiment name so plots can be told apart.
    cm_display.ax_.set_title(name)

    return xgbc, scores

In [None]:
# Raw competition training set, read straight from the CSV.
df_train = pd.read_csv("/kaggle/input/GiveMeSomeCredit/cs-training.csv")
# Pre-computed pickles stored under the gmsc-data-overview-eda input:
# the test features and the PCA frame, loaded in that order.
df_test, df_pca = (
    pkl_load(pickle_name)
    for pickle_name in (
        "gmsc-data-overview-eda/x_test.pkl",
        "gmsc-data-overview-eda/pca.pkl",
    )
)

In [None]:
# Class balance: (rows, columns) of the subset with target 0, then target 1,
# one tuple per output line.
print(
    *(df_train.loc[df_train["SeriousDlqin2yrs"] == target].shape for target in (0, 1)),
    sep="\n",
)

# XGBoost

In [None]:
# "legacy" experiment: features are the raw training columns minus the target.
# X_train / y_train are also what the XGBoost submission cell fits on.
name = "legacy"
X_train = df_train.drop(columns=["SeriousDlqin2yrs"])
y_train = df_train["SeriousDlqin2yrs"].astype('int32')

# The cross-validation call is currently disabled, so the timer below
# brackets no work and prints a value close to zero.
start_time = time.time()
# classifier, scores = func_xgbc(name, X_train, y_train)
print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
# "curated" experiment: features come from the x_train pickle.
df_train_curated = pkl_load("gmsc-data-overview-eda/x_train.pkl")
# NOTE(review): labels are still taken from the raw CSV; this assumes the
# curated frame keeps the same rows in the same order. Confirm against the
# notebook that produced the pickle.
y_train_curated = df_train.SeriousDlqin2yrs.astype('int32')
# Fix: derive the features from the curated frame. The raw df_train was used
# here before, which made this experiment identical to the "legacy" one.
# errors="ignore" covers the case where the pickle holds features only.
X_train_curated = df_train_curated.drop(columns=["SeriousDlqin2yrs"], errors="ignore")
name = "curated"

# The cross-validation call is currently disabled, so the timer below
# brackets no work.
start_time = time.time()
# classifier, scores = func_xgbc(name, X_train_curated, y_train_curated)
print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
# "pca" experiment: every column of the PCA pickle except `label` is a feature.
name = "pca"
df_pca = pkl_load("gmsc-data-overview-eda/pca.pkl")
X_pca = df_pca.drop(columns=["label"])
y_pca = df_pca["label"]

# Time the full cross-validation + hold-out run.
start_time = time.time()
classifier, scores = func_xgbc(name, X_pca, y_pca)
print("--- %s minutes ---" % ((time.time() - start_time)/60))

# Deep Learning

In [None]:
def get_model(feature_count, hidden_layer_size=1024, drop_out=0.2):
    """Build and compile a fully connected binary classifier.

    The network stacks three ``Dense`` layers of ``hidden_layer_size`` units,
    each followed by ``LeakyReLU(alpha=0.2)``; the first two are additionally
    followed by ``Dropout(drop_out)``. A single sigmoid unit produces the
    output. The model is compiled with binary cross-entropy loss, the Adam
    optimizer and the AUC metric.

    Parameters
    ----------
    feature_count : int
        Number of input features (width of the first layer's input).
    hidden_layer_size : int, optional
        Number of units in each hidden ``Dense`` layer.
    drop_out : float, optional
        Rate passed to both ``Dropout`` layers.

    Returns
    -------
    Sequential
        The compiled, untrained model.
    """
    model = Sequential()
    # Fix: use the function's own arguments. The body previously read the
    # notebook globals `hiddenLayerSize` / `dropOut`, so the parameters (and
    # their defaults) were silently ignored.
    # Only the first layer declares the input width.
    model.add(Dense(hidden_layer_size, input_dim=feature_count))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(drop_out))
    model.add(Dense(hidden_layer_size))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(drop_out))
    model.add(Dense(hidden_layer_size))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["AUC"])
    return model

In [None]:
# --- cross-validation / training-loop settings ---
nSplits = 7        # number of K-fold splits
epochCount = 32    # epochs per fold
batchSize = 5000   # mini-batch size passed to fit()

# --- network architecture ---
feature_count = X_pca.shape[1]   # one input per column of X_pca
hiddenLayerSize = 1024           # units per hidden Dense layer
dropOut = 0.1                    # rate handed to get_model(drop_out=...)

In [None]:
# Train one network per fold, keeping every fitted model and its history.
models = []
history = {}
verbosity = 1

# Fix: seed the shuffle so the fold assignment is identical on every run.
# An unseeded shuffle made the per-fold scores impossible to reproduce.
kfold = KFold(n_splits=nSplits, shuffle=True, random_state=random_seed)
start_time = time.time()
for j, (train_idx, val_idx) in enumerate(kfold.split(X_pca)):
    # A fresh, untrained model per fold so no weights leak between folds.
    model = get_model(feature_count=feature_count, hidden_layer_size=hiddenLayerSize, drop_out=dropOut)
    history[j] = model.fit(
        X_pca.iloc[train_idx],
        y_pca.iloc[train_idx],
        validation_data=(X_pca.iloc[val_idx], y_pca.iloc[val_idx]),
        batch_size=batchSize,
        epochs=epochCount,
        verbose=verbosity,
    )
    # Score the fitted model on the held-out fold; the first two returned
    # values are reported next to their metric names.
    scores = model.evaluate(X_pca.iloc[val_idx], y_pca.iloc[val_idx], verbose=verbosity)
    print('Fold %d: %s of %f' % (j,model.metrics_names[0],scores[0]))
    print('Fold %d: %s of %f' % (j,model.metrics_names[1],scores[1]))
    models.append(model)
print("--- %s minutes ---" % ((time.time() - start_time)/60))

# Predictions

In [None]:
# XGBoost submission: refit on the full training frame, score the test pickle.
# NOTE(review): X_train / y_train are the raw "legacy" frames while X_test is
# the x_test pickle. Confirm both carry the same feature columns.
X_test = pkl_load("gmsc-data-overview-eda/x_test.pkl")
xgbc = xgb.XGBClassifier(
    n_estimators=150,
    scale_pos_weight=6,
    max_depth=7,
    objective="binary:logistic",
    eval_metric="auc",
    use_label_encoder=False,
)
xgbc.fit(X_train, y_train)
y_hat = xgbc.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the positive class.
submission = pd.read_csv("/kaggle/input/GiveMeSomeCredit/sampleEntry.csv").assign(
    Probability=y_hat[:, 1]
)
submission.to_csv("submission-xgboost.csv", index=False)

In [None]:
# Deep-learning submission: train one network on all PCA rows, score the
# PCA-transformed test pickle.
X_test = pkl_load("gmsc-data-overview-eda/pca_test.pkl")
dl_model = get_model(feature_count=feature_count, hidden_layer_size=hiddenLayerSize, drop_out=dropOut)
# Fix: train with the same batch size and epoch count as the cross-validated
# models. A bare fit() falls back to the Keras defaults (one epoch, batch
# size 32), so the submitted model was barely trained.
dl_model.fit(X_pca, y_pca, batch_size=batchSize, epochs=epochCount, verbose=verbosity)
# Fix: Sequential.predict_proba is not available in newer TensorFlow
# releases. predict() returns the sigmoid output directly.
y_hat = dl_model.predict(X_test)
submission = pd.read_csv("/kaggle/input/GiveMeSomeCredit/sampleEntry.csv")
# The single output unit yields one column per row; flatten it to 1-D
# before storing it as a DataFrame column.
submission['Probability'] = np.asarray(y_hat).ravel()
submission.to_csv("submission-deeplearning.csv", index=False)