In [None]:
import pandas as pd
import numpy as np
import keras
from tensorflow.keras import layers, optimizers, callbacks, utils, losses, metrics, backend as K
from keras.models import Sequential
import tensorflow_addons as tfa
import tensorflow as tf
from keras.layers import Flatten, Activation, Dropout,BatchNormalization
from keras.layers import Dense
from tensorflow.keras import layers
from tensorflow.keras import activations,callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, roc_auc_score, plot_roc_curve, classification_report
from matplotlib import pyplot as plt
plt.style.use('dark_background')
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [None]:
# The training CSV is parsed twice on purpose: `initial_csv` is kept as the
# reference frame that preprocess_x reads its dummy columns from, while `train`
# is the working frame that gets preprocessed further down.
train_csv_path = '../input/dataset/train.csv'
initial_csv = pd.read_csv(train_csv_path)
train = pd.read_csv(train_csv_path)

# Label column, pulled out before `train` is transformed.
target = train['target']

<h1> One-hot encoding </h1>

In [None]:
# Load the competition train and test tables.
train_data = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test_data = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')

# Numeric block: columns 20 onward of both tables, train rows first.
# The train slice stops before its last column (presumably the target).
numeric_parts = (
    train_data.iloc[:, 20:-1].to_numpy(),
    test_data.iloc[:, 20:].to_numpy(),
)
X_nums = np.concatenate(numeric_parts, axis=0)

# Standardise every numeric column using statistics of train + test together.
column_mean = X_nums.mean(axis=0)
column_std = X_nums.std(axis=0)
X_nums = (X_nums - column_mean) / column_std

# Categorical block: columns 1..19 of both tables, train rows first.
categorical_parts = (
    train_data.iloc[:, 1:20].to_numpy(),
    test_data.iloc[:, 1:20].to_numpy(),
)
X_cat = np.concatenate(categorical_parts, axis=0)

# Fit the encoder on train + test at once so both share the same categories.
encoder = OneHotEncoder(sparse=False)
X_cat = encoder.fit_transform(X_cat)

# Final feature matrix: encoded categoricals on the left, numerics on the right.
X = np.concatenate((X_cat, X_nums), axis=1)
# Labels as a column vector (one row per training example).
y = train_data['target'].to_numpy().reshape(-1, 1)

In [None]:
# X stacks the train rows first and the test rows after them, so the training
# part is the first len(train_data) rows. Deriving the count from the frame
# keeps features and labels aligned instead of relying on a fixed 300000.
n_train_rows = len(train_data)
train = X[:n_train_rows, :]
target = train_data.iloc[:n_train_rows, -1]  # last column is taken as the label

# Hold out 20% for validation; the seed is fixed so the split (and the scores
# computed from it) can be reproduced across runs.
x_train, x_test, y_train, y_test = train_test_split(
    train, target, test_size=0.2, random_state=42
)

In [None]:
# Display the dimensions of the training split produced by the cell above.
x_train.shape

<h2> get_dummies encoding </h2>

In [None]:
def preprocess_x(df):
    """Dummy-encode a feature frame and pad it with the reference columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. If it has an ``id`` column, that column is moved into
        the index (in place, so the caller's frame is modified as well).

    Returns
    -------
    pandas.DataFrame
        ``pd.get_dummies(df)`` plus a zero-filled column for every column of
        the dummy-encoded ``initial_csv`` (without ``target``) that ``df``
        lacks. Added columns are appended at the end; existing columns are
        neither reordered nor removed.

    Notes
    -----
    Relies on the module-level ``initial_csv`` frame. While ``initial_csv``
    still carries ``id`` as a regular column, the returned frame receives a
    zero-filled ``id`` column, because ``id`` has been moved to the index of
    ``df`` by then.
    """
    # Explicit check instead of a bare `except: pass`, which would also hide
    # unrelated failures (wrong input type, typos, ...).
    if 'id' in df.columns:
        df.set_index('id', inplace=True)

    df = pd.get_dummies(df, drop_first=False)

    reference_columns = pd.get_dummies(
        initial_csv.drop(columns=['target']), drop_first=False
    ).columns
    for col in reference_columns:
        if col not in df.columns:
            df[col] = 0

    return df

def preprocess(df):
    """Turn a raw frame into the dummy-encoded feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. If it has an ``id`` column, that column is moved into the
        index (in place, so the caller's frame is modified as well). A
        ``target`` column, when present, is left out of the result.

    Returns
    -------
    pandas.DataFrame
        The output of ``preprocess_x`` applied to the features of ``df``.
    """
    # Explicit check instead of a bare `except: pass`, which would also hide
    # unrelated failures.
    if 'id' in df.columns:
        df.set_index('id', inplace=True)

    # errors='ignore' lets the same function handle frames without labels.
    x = df.drop(columns=['target'], errors='ignore')
    x = preprocess_x(x)
    return x

# Build the features from a fresh copy of the raw training frame rather than
# from `train` itself: the one-hot section rebinds `train` to a NumPy array,
# and preprocess() cannot be applied twice to its own output, so the original
# call only worked for one particular execution order.
train = preprocess(initial_csv.copy())

In [None]:
# Display the dimensions of the dummy-encoded training features.
train.shape

In [None]:
# Hold out 20% for validation. Labels are read from `initial_csv` (the file the
# get_dummies features come from) because `target` may have been rebound by the
# one-hot section. The seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train, initial_csv['target'], test_size=0.2, random_state=42
)


In [None]:
# Report the dimensions of each piece of the train/validation split.
for split_label, split_part in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_label, split_part.shape)

<h1> Training </h1>

<h3> Model definition </h3>

In [None]:
# Feed-forward binary classifier: two 300-unit ReLU layers, each followed by
# 30% dropout, and a single sigmoid output unit.
# The input width is taken from x_train so the network matches whichever
# encoding produced the current split, instead of a hard-coded 642.
n_features = x_train.shape[1]

model = Sequential()

model.add(Dense(300, activation='relu', input_dim=n_features))
model.add(Dropout(0.3))
model.add(Dense(300, activation='relu'))
#model.add(Dropout(0.3))
#model.add(Dense(30, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Stand-alone AUC metric instance. It is not bound to a name, so it only shows
# up as this cell's output; the metric used for training is created in
# model.compile.
auc_options = dict(
    name='val_AUC',
    curve="ROC",
    num_thresholds=200,
    summation_method="interpolation",
    thresholds=None,
    multi_label=False,
    label_weights=None,
    dtype=None,
)
tf.keras.metrics.AUC(**auc_options)

In [None]:
# Adam (lr = 1e-4) wrapped in stochastic weight averaging, binary cross-entropy
# loss, and AUC as the tracked metric. The metric is named "AUC", so Keras logs
# it as 'AUC' on training data and 'val_AUC' on validation data - the keys the
# callbacks and the plot rely on.
base_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    optimizer=tfa.optimizers.SWA(base_optimizer),
    loss=losses.BinaryCrossentropy(),
    # Passed as a list, the form documented for Model.compile.
    metrics=[metrics.AUC(name="AUC")],
)
 


# Stop once validation AUC has not improved (by more than 1e-7) for 5
# consecutive epochs, then roll back to the best weights seen.
early_stopping_options = {
    'monitor': 'val_AUC',
    'mode': 'max',
    'patience': 5,
    'min_delta': 0.0000001,
    'baseline': None,
    'restore_best_weights': True,
    'verbose': 1,
}
es = callbacks.EarlyStopping(**early_stopping_options)

# Halve the learning rate (down to a floor of 1e-7) when validation loss has
# not improved for 2 epochs.
# mode='min' because the monitored quantity is a loss: lower is better. With
# mode='max' a falling loss counts as "no improvement", so the learning rate
# would be cut while the model is still getting better.
plateau = callbacks.ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.5,
                                      patience=2,
                                      mode='min',
                                      min_delta=0.00001,
                                      cooldown=0,
                                      min_lr=1e-7,
                                      verbose=1)

# Checkpoint: write the weights (not the full model) to disk every time
# validation AUC reaches a new maximum.
checkpoint_options = {
    'filepath': './nn_model.w8',
    'monitor': 'val_AUC',
    'mode': 'max',
    'save_best_only': True,
    'save_weights_only': True,
    'verbose': 1,
}
sb = callbacks.ModelCheckpoint(**checkpoint_options)


In [None]:

"""
history=model.fit(x=x_train,
                  y=y_train,
                  validation_data=(x_test, y_test),
                  batch_size=256,
                  epochs=20,
                  shuffle=False,
                  verbose=1,
                  callbacks=[es,sb,plateau])
"""

# Train for 20 epochs with Keras' default batch size, scoring the held-out
# split after every epoch. No callbacks are passed in this run.
validation_pair = (x_test, y_test)
history = model.fit(
    x_train,
    y_train,
    epochs=20,
    validation_data=validation_pair,
)


In [None]:
# Per-epoch AUC on the training and validation data, as recorded by fit().
auc_train = history.history['AUC']
auc_val = history.history['val_AUC']
epoch_numbers = range(1, len(auc_train) + 1)

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(epoch_numbers, auc_train, 'y', label='Training AUC')
ax.plot(epoch_numbers, auc_val, 'r', label='Validation AUC')
ax.set_title('Training and validation AUC')
ax.set_xlabel('Epochs')
ax.set_ylabel('AUC')
ax.legend()
plt.show()

In [None]:
# Score the held-out split with the trained model (sigmoid outputs).
pred=model.predict(x_test)

In [None]:
# ROC AUC of the model's predictions on the held-out split.
holdout_auc = roc_auc_score(y_test, pred)
print(holdout_auc)

In [None]:
# Mean of the held-out labels - the positive rate, assuming 0/1 targets
# (TODO confirm).
np.mean(y_test)

In [None]:
# Average predicted probability on the held-out split, for comparison with the
# label means shown in the neighbouring cells.
pred.mean()

In [None]:
# Mean of the training labels - the positive rate, assuming 0/1 targets
# (TODO confirm).
np.mean(y_train)