In [None]:
# Data manipulation and set-up
import numpy as np
import pandas as pd

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1)

# Modelling (including set-up & evaluation)
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

# Ignore warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

#for neural network
import lightgbm as lgb

from sklearn.metrics import roc_auc_score, log_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold,KFold
from scipy.stats import norm, skew

from tqdm import tqdm_notebook as tqdm
from copy import copy
from multiprocessing import Pool

np.random.seed(42)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# archive stored under MyDrive can be read by the extraction cell below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile
!unzip '/content/drive/MyDrive/santander-customer-transaction-prediction.zip'

Archive:  /content/drive/MyDrive/santander-customer-transaction-prediction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


## Importing Data

In [None]:
# Fix the global NumPy seed before anything stochastic runs, then read the
# two competition CSV files from the current working directory.
random_state = 42
np.random.seed(random_state)
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

## Data Oversampling
* We increase the number of training samples by raising the share of samples where Target = 1. Only 10% of the samples have Target = 1. With the default `t=2`, two column-shuffled copies of the Target = 1 rows are appended, so the positive class ends up 3 times its original size.

In [None]:
def oversampling(x,y,t=2):
    """Augment the positive class with column-shuffled synthetic rows.

    In each of the ``t`` rounds the rows of ``x`` whose label is positive
    (``y > 0``) are copied and every column of the copy is permuted
    independently, so a synthetic row combines feature values taken from
    different positive rows. All synthetic rows are appended to ``x`` with
    label 1.

    Parameters
    ----------
    x : array-like, 2-D
        Feature matrix, one row per sample.
    y : array-like, 1-D
        Labels aligned with the rows of ``x``; values ``> 0`` are positive.
    t : int, default 2
        Number of shuffled copies of the positive rows to append. With
        ``t <= 0`` the inputs are returned without any augmentation.

    Returns
    -------
    tuple of np.ndarray
        ``(x_aug, y_aug)``: the original rows followed by the synthetic
        rows, and the matching labels.

    Notes
    -----
    Shuffling uses the global NumPy random state (``np.random.shuffle``), so
    results are reproducible only if ``np.random.seed`` was called first.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Nothing to generate: np.vstack([]) would raise, so return early.
    if t <= 0:
        return x, y

    # The positive rows do not change between rounds; select them once.
    positives = x[y > 0]

    samples = []
    for _ in range(t):
        x1 = positives.copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            # A new permutation per column breaks the row-wise pairing of
            # feature values while keeping every column's value set intact.
            np.random.shuffle(ids)
            # Index the single column directly instead of materialising a
            # permuted copy of the whole matrix for every column.
            x1[:, c] = x1[ids, c]
        samples.append(x1)

    samples = np.vstack(samples)
    sample_targets = np.ones(samples.shape[0])
    x = np.vstack([x, samples])
    y = np.concatenate([y, sample_targets])
    return x, y

In [None]:
# Per-fold validation AUCs are collected here by the training loop.
val_aucs = list()
# Stratified 3-fold splitter, shuffled with the notebook-wide seed.
skf = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=random_state,
)

In [None]:
# Every training column except the label and the row identifier is a feature.
features = [
    name
    for name in train.columns
    if name != 'target' and name != 'ID_code'
]
# Test-set feature matrix as a plain NumPy array.
X_test = test.loc[:, features].values

# Neural Network(CNN) - 4 layers

### CNN Model parameters, Model, learning rate scheduler

In [None]:
import keras

# Hyperparameters shared by the model builder, the learning-rate schedule and
# the training loop.

# NOTE(review): n_splits is not referenced in the visible cells; the
# StratifiedKFold created earlier in the notebook uses n_splits=3.
n_splits = 7
# Used by get_model_3 as the kernel size and stride of the first Conv1D and
# as a multiplier of the input width.
num_preds = 1
# Number of training epochs passed to model.fit; also read by lr_scheduler.
epochs = 60
# Initial Adam learning rate; lr_scheduler returns it or a tenth of it.
learning_rate_init = 0.02
# Mini-batch size passed to model.fit.
batch_size = 4000
# Width of the model input; presumably equals len(features) - TODO confirm.
num_features = 200



def get_model_3():
    """Build the (uncompiled) 1-D convolutional classifier.

    The flat input of ``num_features*num_preds`` values is reshaped to a
    single-channel sequence and passed through four Conv1D layers with
    32, 24, 16 and 4 filters (ELU activation), with batch normalisation after
    each of the first three. The result is flattened, reshaped back to a
    single-channel sequence of ``num_features*4`` values, average-pooled with
    pool size 2, flattened, batch-normalised and fed to a one-unit sigmoid
    Dense layer.

    Reads the module-level ``num_features`` and ``num_preds``.

    Returns
    -------
    keras.Model
        Model mapping the flat feature vector to a single sigmoid output.
    """
    layers = keras.layers
    input_width = num_features*num_preds

    inp = layers.Input((input_width,))
    hidden = layers.Reshape((input_width, 1))(inp)

    # First convolution: kernel size and stride both equal num_preds.
    hidden = layers.Conv1D(32, num_preds, strides=num_preds, activation='elu')(hidden)
    hidden = layers.BatchNormalization()(hidden)

    # Two further kernel-size-1 convolutions, each followed by batch norm.
    for n_filters in (24, 16):
        hidden = layers.Conv1D(n_filters, 1, activation='elu')(hidden)
        hidden = layers.BatchNormalization()(hidden)

    # Last convolution (4 filters), then pool over the flattened outputs.
    hidden = layers.Conv1D(4, 1, activation='elu')(hidden)
    hidden = layers.Flatten()(hidden)
    hidden = layers.Reshape((num_features*4, 1))(hidden)
    hidden = layers.AveragePooling1D(2)(hidden)
    hidden = layers.Flatten()(hidden)
    hidden = layers.BatchNormalization()(hidden)

    out = layers.Dense(1, activation='sigmoid')(hidden)
    return keras.Model(inputs=inp, outputs=out)


def lr_scheduler(epoch):
    """Return the learning rate to use for the given epoch index.

    The rate is ``learning_rate_init`` while ``epoch <= epochs*0.8`` and one
    tenth of it afterwards. Reads the module-level ``epochs`` and
    ``learning_rate_init``.
    """
    in_initial_phase = epoch <= epochs*0.8
    return learning_rate_init if in_initial_phase else learning_rate_init * 0.1

# Frame that collects one prediction column per fold next to the test IDs.
# Take an explicit copy: the training loop assigns new columns to this frame,
# and assigning into a bare column selection of `test` is the chained-
# assignment pattern pandas warns about (SettingWithCopyWarning).
predictions_nn1 = test[['ID_code']].copy()

In [None]:
# Train one model per cross-validation fold, record its validation AUC in
# `val_aucs` and store its test-set predictions as column `fold<k>` of
# `predictions_nn1`.
for fold, (trn_idx, val_idx) in enumerate(skf.split(train, train['target'])):
    # Slice the current fold into its training and validation parts.
    X_train, y_train = train.iloc[trn_idx][features], train.iloc[trn_idx]['target']
    X_valid, y_valid = train.iloc[val_idx][features], train.iloc[val_idx]['target']

    # Augment only the training part; the validation rows are left untouched.
    X_t, y_t = oversampling(X_train.values, y_train.values)
    # Label the augmented matrix with the actual feature names so it lines up
    # with X_valid and test[features] regardless of how the columns are named.
    X_t = pd.DataFrame(X_t, columns=features)

    # Fresh optimizer and model for every fold. `learning_rate` replaces the
    # deprecated `lr` alias.
    # NOTE(review): `decay` is kept from the original; some newer Keras
    # releases may not accept it - confirm against the installed version.
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate_init, decay=0.00001)
    model = get_model_3()
    callbacks = [keras.callbacks.LearningRateScheduler(lr_scheduler)]
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_t, y_t, validation_data=(X_valid, y_valid), epochs=epochs, verbose=2, batch_size=batch_size, callbacks=callbacks)

    # The model outputs shape (n, 1); flatten both prediction sets to 1-D so
    # they can be scored and stored as ordinary columns.
    p_valid = model.predict(X_valid, batch_size=2000)[:, 0]
    yp = model.predict(test[features])[:, 0]

    # Score once and reuse the value for both the log and the printout.
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    print(val_score)

    predictions_nn1['fold{}'.format(fold+1)] = yp

Epoch 1/60
41/41 - 73s - loss: 0.4247 - accuracy: 0.8063 - val_loss: 0.3574 - val_accuracy: 0.9012
Epoch 2/60
41/41 - 75s - loss: 0.3667 - accuracy: 0.8406 - val_loss: 0.6236 - val_accuracy: 0.8996
Epoch 3/60
41/41 - 74s - loss: 0.3613 - accuracy: 0.8429 - val_loss: 1.0190 - val_accuracy: 0.8995
Epoch 4/60
41/41 - 75s - loss: 0.3566 - accuracy: 0.8458 - val_loss: 0.8598 - val_accuracy: 0.8995
Epoch 5/60
41/41 - 74s - loss: 0.3591 - accuracy: 0.8449 - val_loss: 0.8315 - val_accuracy: 0.8995
Epoch 6/60
41/41 - 71s - loss: 0.3535 - accuracy: 0.8474 - val_loss: 0.4749 - val_accuracy: 0.9001
Epoch 7/60
41/41 - 73s - loss: 0.3541 - accuracy: 0.8465 - val_loss: 0.3927 - val_accuracy: 0.9024
Epoch 8/60
41/41 - 81s - loss: 0.3558 - accuracy: 0.8469 - val_loss: 0.5499 - val_accuracy: 0.9000
Epoch 9/60
41/41 - 82s - loss: 0.3525 - accuracy: 0.8481 - val_loss: 0.6774 - val_accuracy: 0.8996
Epoch 10/60
41/41 - 90s - loss: 0.3511 - accuracy: 0.8484 - val_loss: 0.2559 - val_accuracy: 0.9134
Epoch 11/

In [None]:
    # These statements repeat the tail of the fold loop body and rely on the
    # variables it leaves behind in the kernel (model, fold, y_valid, p_valid,
    # val_aucs, predictions_nn1); they cannot run on a fresh kernel by themselves.
    # NOTE(review): looks like a manual re-run after the training cell was
    # interrupted - confirm. Running it after a completed loop appends a second
    # score for the same fold to val_aucs and rewrites that fold's column.
    yp = model.predict(test[features])
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)

    print(roc_auc_score(y_valid, p_valid))
  
    predictions_nn1['fold{}'.format(fold+1)] = yp

0.8905509676338804


In [None]:
# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() would add a stray "None" line after the table.
get_model_3().summary()
    

In [None]:
# Submission
# Average the per-fold prediction columns (everything except the ID and a
# previously computed 'target') row by row.
predictions_nn1['target'] = (
    predictions_nn1
    .loc[:, [name for name in predictions_nn1.columns if name != 'ID_code' and name != 'target']]
    .values
    .mean(axis=1)
)
# Two-column submission frame: test IDs plus the averaged prediction.
sub = pd.DataFrame({"ID_code": test["ID_code"].values}).assign(
    target=predictions_nn1['target']
)
sub.to_csv("nn_submission_2.csv", index=False)