In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset paths under /content/drive/MyDrive used by this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# modeling
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers import PReLU
from keras.optimizers import Adam
from sklearn.model_selection import KFold

# memory management
import gc

# Loading the datasets

In [None]:
# Read the train and test predictor tables from Drive.
train_csv = '/content/drive/MyDrive/Thesis/Home Credit Section/Final [Small]/predictor_train_small.csv'
test_csv = '/content/drive/MyDrive/Thesis/Home Credit Section/Final [Small]/predictor_test_small.csv'
application_train = pd.read_csv(train_csv)
application_test = pd.read_csv(test_csv)

# Stack both tables row-wise under a fresh 0..n-1 index, without sorting the columns.
merged = pd.concat(
    objs=[application_train, application_test],
    sort=False,
    ignore_index=True,
)

In [None]:
# Keep the row identifier together with the two rate columns in a separate
# frame, so they can be joined back onto the predictions by SK_ID_CURR.
rate_columns = ['SK_ID_CURR', 'drawdown_rate', 'utilization_rate']
merged_rates = merged[rate_columns].copy()

In [None]:
print("Raw shape: ", merged.shape)

# Label column.
y = merged['TARGET']

# Every column of the training table except the label and the id/index
# columns is used as a model feature.
non_features = ['TARGET', 'SK_ID_CURR', 'index']
feats = [col for col in application_train.columns if col not in non_features]
X = merged[feats]

print("X shape: ", X.shape, "    y shape:", y.shape)

print("\nPreparing data...")
# Impute missing values with the column mean, then bound every value to +/-1e11.
column_means = X.mean()
X = X.fillna(column_means).clip(lower=-1e11, upper=1e11)

Raw shape:  (356255, 340)
X shape:  (356255, 338)     y shape: (356255,)

Preparing data...


In [None]:
def rank_gauss(x):
    """Rank-based Gaussianisation ("RankGauss") of a 1-D numeric array.

    Each value is replaced by the inverse error function of its centred,
    rescaled rank, so the output depends only on the ordering of the input
    and has zero mean.

    Tied values share the average of their ranks, so equal inputs always map
    to equal outputs. Ranking with a plain argsort-of-argsort would instead
    spread a group of identical values (e.g. a binary flag or mean-imputed
    cells) over different outputs in arbitrary order. For inputs without ties
    the result is the same as the plain argsort-of-argsort ranking.

    Parameters
    ----------
    x : array-like, 1-D
        Values to transform.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``x``; empty if ``x`` is empty.
    """
    from scipy.special import erfinv

    x = np.asarray(x)
    N = x.shape[0]
    if N == 0:
        # Nothing to rank; also avoids dividing by N == 0 below.
        return np.zeros(0, dtype=float)

    # Zero-based ordinal rank of every element (position in sorted order).
    ordinal = x.argsort().argsort()

    # Average the ordinal ranks inside each group of identical values.
    _, group = np.unique(x, return_inverse=True)
    rank_sum = np.bincount(group, weights=ordinal)
    group_size = np.bincount(group)
    rank_x = (rank_sum / group_size)[group] / N

    # Centre and stretch the ranks into the open interval (-1, 1), where
    # erfinv is finite.
    rank_x -= rank_x.mean()
    rank_x *= 2
    efi_x = erfinv(rank_x)
    efi_x -= efi_x.mean()
    return efi_x

In [None]:
# Replace each feature column of X by its rank_gauss transform.
for column_name in X.columns:
    transformed = rank_gauss(X[column_name].values)
    X[column_name] = transformed

In [None]:
# Rows with a label form the training set; rows whose label is missing form
# the test set.
training = y.notnull()
testing = ~training

X_train = X.loc[training].values
X_test = X.loc[testing].values
y_train = np.array(y.loc[training])

print(X_train.shape, X_test.shape, y_train.shape)
gc.collect()

(307511, 338) (48744, 338) (307511,)


73

In [None]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
class roc_callback(Callback):
    """Keras callback that prints ROC-AUC on two datasets after every epoch.

    Parameters
    ----------
    training_data : tuple
        ``(x, y)`` pair. ``x`` is passed to ``self.model.predict`` and ``y``
        is used as the true labels for ``roc_auc_score``.
    validation_data : tuple
        ``(x_val, y_val)`` pair, used in the same way.

    Only ``on_epoch_end`` is overridden. The remaining hooks are inherited
    from ``Callback``, which already provides do-nothing versions; leaving
    the batch-level hooks alone also avoids making Keras dispatch into this
    callback on every batch (NOTE(review): the dispatch rule depends on the
    Keras version - confirm if timing matters).
    """

    def __init__(self, training_data, validation_data):
        # Let the base class set up the attributes Keras expects on a callback.
        super().__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model on both datasets and print the two AUCs."""
        # Flatten the predictions so roc_auc_score receives a 1-D score vector.
        y_pred = self.model.predict(self.x).ravel()
        roc = roc_auc_score(self.y, y_pred)
        y_pred_val = self.model.predict(self.x_val).ravel()
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        # Leading '\r' plus trailing padding overwrites whatever is currently
        # on the console line before moving to the next one.
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')
        return

In [None]:
def build_network(n_features):
    """Build and compile the feed-forward binary classifier used for each fold.

    Parameters
    ----------
    n_features : int
        Width of the input layer, i.e. the number of feature columns.

    Returns
    -------
    A compiled ``Sequential`` model with hidden Dense layers of 400, 160, 64,
    26 and 12 units. Each is followed by PReLU and 30% dropout, and all but
    the first also by batch normalisation. The output is a single sigmoid
    unit; the model is compiled with binary cross-entropy loss and the
    'adam' optimizer.
    """
    model = Sequential()

    # First hidden layer: the only one that declares the input width and the
    # only one without batch normalisation.
    model.add(Dense(units=400, kernel_initializer='normal', input_dim=n_features))
    model.add(PReLU())
    model.add(Dropout(.3))

    # Remaining hidden layers share one pattern.
    for width in (160, 64, 26, 12):
        model.add(Dense(units=width, kernel_initializer='normal'))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.3))

    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model


folds = KFold(n_splits=5, shuffle=True, random_state=42)
# Accumulates the test-set predictions averaged over all folds.
sub_preds = np.zeros(X_test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X_train)):
    trn_x, trn_y = X_train[trn_idx], y_train[trn_idx]
    val_x, val_y = X_train[val_idx], y_train[val_idx]

    print( 'Setting up neural network...' )
    # The input width follows the data instead of a hard-coded column count,
    # so the cell keeps working when the feature set changes.
    nn = build_network(X_train.shape[1])

    print( 'Fitting neural network...' )
    nn.fit(trn_x, trn_y, validation_data = (val_x, val_y), epochs=10, verbose=2,
          callbacks=[roc_callback(training_data=(trn_x, trn_y),validation_data=(val_x, val_y))])

    print( 'Predicting...' )
    # Each fold contributes 1/n_splits of the final test prediction.
    sub_preds += nn.predict(X_test).flatten().clip(0,1) / folds.n_splits

    gc.collect()

Setting up neural network...
Fitting neural network...
Epoch 1/10
roc-auc: 0.7602 - roc-auc_val: 0.7564                                                                                                    
7688/7688 - 80s - loss: 0.2764 - val_loss: 0.2468 - 80s/epoch - 10ms/step
Epoch 2/10
roc-auc: 0.7758 - roc-auc_val: 0.768                                                                                                    
7688/7688 - 69s - loss: 0.2518 - val_loss: 0.2432 - 69s/epoch - 9ms/step
Epoch 3/10
roc-auc: 0.778 - roc-auc_val: 0.7689                                                                                                    
7688/7688 - 69s - loss: 0.2489 - val_loss: 0.2432 - 69s/epoch - 9ms/step
Epoch 4/10
roc-auc: 0.7837 - roc-auc_val: 0.7671                                                                                                    
7688/7688 - 69s - loss: 0.2467 - val_loss: 0.2431 - 69s/epoch - 9ms/step
Epoch 5/10
roc-auc: 0.7901 - roc-auc_val: 0.7698          

In [None]:
print( 'Saving results...' )
# Pair the id of every test row with its prediction from sub_preds.
sub = pd.DataFrame({
    'SK_ID_CURR': merged.loc[testing, 'SK_ID_CURR'],
    'TARGET': sub_preds,
})

# Attach the rate columns by matching rows on SK_ID_CURR.
sub = sub.merge(merged_rates, on='SK_ID_CURR')

print( sub.head() )

Saving results...
   SK_ID_CURR    TARGET  drawdown_rate  utilization_rate
0      100001  0.014165       0.008140          0.024832
1      100005  0.113689       0.023754          0.072462
2      100013  0.073343       0.008975          0.027380
3      100028  0.053413       0.003909          0.005133
4      100038  0.142832       0.009657          0.033126


In [None]:
# Write the submission table to Drive as CSV, omitting the index column.
path = '/content/drive/MyDrive/Thesis/Home Credit Section/Final [Small]/'
output_csv = os.path.join(path, 'predictor_small_NN.csv')
sub.to_csv(output_csv, index=False)