# Importing modules

In [None]:
# to load and manipulate data
import pandas as pd
import numpy as np
from collections import defaultdict

# to visualize the data
import matplotlib.pyplot as plt

# to preprocess the data
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# to fit the neural network
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import ReLU
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Loading data

In [None]:
# reading the training set from the competition input folder
train_data = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-mar-2021/train.csv',
)
# displaying (rows, columns) as the cell output
train_data.shape

In [None]:
# reading the test set from the competition input folder
test_data = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-mar-2021/test.csv',
)
# displaying (rows, columns) as the cell output
test_data.shape

# Feature engineering

In [None]:
# merging train and test data so both go through the same feature engineering
# and encoding; train rows come first, which the later positional split relies on.
# ignore_index=True gives the merged frame a unique 0..n-1 index instead of
# repeating the row labels of both source frames.
df = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# extracting the first letter of each level into '<column>_first'
for source in ('cat5', 'cat7', 'cat8', 'cat10'):
    df[f'{source}_first'] = df[source].str.extract(pat=r'(^[A-Z])', expand=False)

# extracting the last letter of two-letter levels into '<column>_second';
# values where the pattern finds no second letter are flagged with 'NS'
for source in ('cat5', 'cat7', 'cat8', 'cat10'):
    second_letter = df[source].str.extract(pat=r'(?<=[A-Z])([A-Z]$)', expand=False)
    df[f'{source}_second'] = second_letter.fillna('NS')

# Encoding categorical data

In [None]:
# collecting every column whose name contains 'cat'
# (the original categoricals plus the engineered *_first / *_second ones)
cat_columns = [name for name in df.columns if 'cat' in name]

In [None]:
# counting how many rows carry each level of cat10 (absolute counts)
frequencies_cat10 = df['cat10'].value_counts()

# giving every row the count of its own cat10 level, to filter on next
masking = df['cat10'].map(frequencies_cat10)

# collapsing levels seen in fewer than 500 rows into a single 'Other' level
df['cat10'] = df['cat10'].mask(masking.lt(500), 'Other')

In [None]:
# creating a dictionary that lazily builds one label encoder per column name
dict_le = defaultdict(LabelEncoder)

# label encoding categorical columns, keeping each fitted encoder in dict_le
for column in cat_columns:
    df[column] = dict_le[column].fit_transform(df[column])

In [None]:
# listing columns to drop or retain
## columns that were split into first/second letters, plus 'id'
cat_drop_split = ['cat5', 'cat7', 'cat8', 'cat10', 'id']

## categorical columns that have poor support from Information Value
cat_drop_support = [
    'cat13', 'cat9', 'cat6', 'cat10_second', 'cat8_second', 'cat10_first',
    'cat5', 'cat5_second', 'cat12', 'cat3', 'cat5_first',
]

## numerical columns that have poor support from Information Value
cont_drop_support = [
    'cont8', 'cont3', 'cont9', 'cont4',
    'cont2', 'cont10', 'cont7', 'cont0',
]

## engineered columns: every '<column>_first', then every '<column>_second', plus 'id'
cat_drop_eng = [
    f'{column}_{part}'
    for part in ('first', 'second')
    for column in ('cat5', 'cat7', 'cat8', 'cat10')
] + ['id']

## columns with more support from Information Value
columns_support = [
    'cat16', 'cat15', 'cat18', 'cat10', 'cat1',
    'cat8', 'cat0', 'cat14', 'cat2', 'cat7',
    'cat11', 'cat17', 'cat4', 'cat8_first', 'cont5',
    'cat6', 'cat7_second', 'cont1', 'cat7_first', 'target',
]

In [None]:
# selecting columns to try: the engineered columns and 'id' are removed
df = df.drop(cat_drop_eng, axis='columns')

# updating the lists of numerical and categorical columns after the drop
cat_columns = [name for name in df.columns if 'cat' in name]
num_columns = [name for name in df.columns if 'cont' in name]

In [None]:
# splitting training and test data once again: the merged frame holds the
# training rows first, so the split is purely positional
train_df = df.iloc[:len(train_data)]
test_df = df.iloc[len(train_data):]

# Separating inputs from targets

In [None]:
# applying the split for the training data: features first, then the target
X = train_df[cat_columns + num_columns]
y = train_df['target']

# encoding the target values as integer class labels (returns a numpy array)
y = LabelEncoder().fit_transform(y)

In [None]:
# applying the split for the test data, with the same column order used for X
X_test = test_df.loc[:, cat_columns + num_columns]

# Instantiating the StratifiedKFold

In [None]:
# 10 shuffled, class-stratified folds with a fixed seed for reproducibility
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

# Function to separate categorical from numerical features

In [None]:
def separate_inputs(X_input, categoricals, numerics):
    """Split a feature frame into the two inputs expected by the network.

    Parameters
    ----------
    X_input : pandas.DataFrame
        Frame holding both the categorical and the numerical columns.
    categoricals : list of str
        Names of the categorical columns.
    numerics : list of str
        Names of the numerical columns.

    Returns
    -------
    tuple
        ``(X_num, X_cat_enc)``: a 2-D array with the numerical columns and a
        list with one 1-D array per categorical column, in the given order.
    """
    # the numerical columns go in together as a single 2-D array
    numeric_block = np.array(X_input[numerics])

    # each categorical column becomes its own 1-D array
    categorical_frame = X_input[categoricals]
    categorical_arrays = [
        categorical_frame.iloc[:, position].values
        for position in range(categorical_frame.shape[1])
    ]

    return numeric_block, categorical_arrays

In [None]:
## applying the function to the test set (done once; reused for every fold)
X_test_num, X_test_cat_enc = separate_inputs(X_test, cat_columns, num_columns)

# Creating the model architecture

In [None]:
# wrapping the architecture and compilation into a function
def get_model(X_numeric, X_categorical):
    """Build and compile the embedding network for one training run.

    Parameters
    ----------
    X_numeric : 2-D array
        Numerical features; only ``X_numeric.shape[1]`` is read, to size the
        numerical input layer.
    X_categorical : list of 1-D arrays
        One array of non-negative integer label codes per categorical column.
        Each array is only inspected to size that column's embedding.

    Returns
    -------
    Model
        Compiled Keras model whose inputs are
        ``[numerical_input, [one input per categorical column]]`` and whose
        output is a single sigmoid unit. It is compiled with Adam
        (learning rate 0.0006), binary cross-entropy loss and an AUC metric.
    """
    # creating the input layer for the numerical values (shape must be a tuple)
    input_layer_numerical = Input(shape=(X_numeric.shape[1],))

    # creating a dense layer to encode the numerical features
    dense_numerical = Dense(units=64, activation='relu')(input_layer_numerical)

    # creating the input layers for the categories that will go through the embedding
    ## creating empty lists to store each of the input and embedding layers
    input_layer_categorical = list()
    embedding_layers = list()

    ## looping through each categorical column and creating its input and embedding layers
    for column_codes in X_categorical:
        # np.unique returns the distinct codes sorted, so the last one is the highest
        unique_codes = np.unique(column_codes)
        n_labels = len(unique_codes)
        highest_code = int(unique_codes[-1]) if n_labels else -1

        # the embedding table must cover the highest code, not just the number of
        # distinct codes: the data passed in may skip some codes, and a lookup
        # beyond the table is invalid. With contiguous codes this is n_labels + 1,
        # exactly as before.
        vocabulary_size = max(n_labels, highest_code + 1) + 1

        # binary columns get a 2-dimensional embedding, every other column 20
        embedding_size = n_labels if n_labels == 2 else 20

        # defining the input layer of the column
        input_layer = Input(shape=(1,))
        # defining the embedding layer of the column
        embedding_layer = Embedding(input_dim=vocabulary_size, output_dim=embedding_size)(input_layer)

        # storing the input-embedding layer pairs
        input_layer_categorical.append(input_layer)
        embedding_layers.append(embedding_layer)

    ## concatenating the embedding layers (a single embedding is used as it is,
    ## since there is nothing to concatenate it with)
    if len(embedding_layers) == 1:
        embedding = embedding_layers[0]
    else:
        embedding = Concatenate()(embedding_layers)

    ## flattening the embedding layer
    embedding_flat = Flatten()(embedding)

    ## creating a dense representation of the embedding
    dense_embedding = Dense(units=128, activation='relu')(embedding_flat)

    # combining the embedding and the numerical inputs
    combined_inputs = Concatenate()([dense_numerical, dense_embedding])

    # batch normalizing, then dropout, on the combined representation
    bn_0 = BatchNormalization()(combined_inputs)
    dropout_1 = Dropout(rate=0.5)(bn_0)

    # first dense block: linear layer -> batch norm -> relu -> dropout
    dense_1 = Dense(units=256)(dropout_1)
    bn_1 = BatchNormalization()(dense_1)
    relu_1 = ReLU()(bn_1)
    dropout_2 = Dropout(rate=0.5)(relu_1)

    # second dense block: relu layer -> dropout
    dense_2 = Dense(units=512, activation='relu')(dropout_2)
    dropout_3 = Dropout(rate=0.5)(dense_2)

    # creating the output layer
    output_layer = Dense(units=1, activation='sigmoid')(dropout_3)

    # instantiating the model; the nested input structure mirrors the
    # [numerical_array, list_of_categorical_arrays] structure passed to fit/predict
    embedding_model = Model(inputs=[input_layer_numerical, input_layer_categorical], outputs=output_layer)

    # compiling the model
    embedding_model.compile(optimizer=Adam(learning_rate=0.0006), loss='binary_crossentropy', metrics=[AUC()])

    # returning the model
    return embedding_model

# Fitting the model

In [None]:
# creating a numpy array to accumulate (sum) the test-set predictions of every fold
predictions = np.zeros(shape=(test_df.shape[0], 1))

# looping over each fold and fitting a fresh model to a different subset of the data
for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):

    print(f'\nStarting fold {fold}.\n')

    # skf.split yields positional indices, so rows are selected by position (iloc)
    # rather than by label; y is a numpy array and is indexed positionally too
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y[train_index], y[test_index]

    # separate the categorical and numerical inputs for each dataset
    X_train_num, X_train_cat = separate_inputs(X_input=X_train, categoricals=cat_columns, numerics=num_columns)
    X_val_num, X_val_cat = separate_inputs(X_input=X_val, categoricals=cat_columns, numerics=num_columns)

    # instantiating the model
    embedding_model = get_model(X_train_num, X_train_cat)

    # defining the callbacks: stop after 10 epochs without a val_loss improvement
    # of at least 0.0001 and roll back to the best weights seen
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.0001, mode='min',
                                   restore_best_weights=True)

    # fitting the model
    embedding_model.fit(x=[X_train_num, X_train_cat], y=y_train, batch_size=512, epochs=100,
                        callbacks=[early_stopping], validation_data=([X_val_num, X_val_cat], y_val))

    # getting the test-set predictions of the model trained on this fold
    predicted_probas = embedding_model.predict(x=[X_test_num, X_test_cat_enc], verbose=1, batch_size=128)

    # summing up the predictions made by the model (averaged after the loop)
    predictions += predicted_probas

# Summarizing predictions

In [None]:
# averaging the summed fold predictions and putting them in the target column;
# the divisor comes from the splitter itself so it always matches the number of
# folds, and the (n, 1) array is flattened to one value per test row
test_data['target'] = predictions[:, 0] / skf.get_n_splits()

In [None]:
# creating the submission data frame with just the 'id' and 'target' columns
submission = test_data[['id', 'target']]

# Saving predictions

In [None]:
# Writing the submission to disk without the data frame index
submission.to_csv(path_or_buf='submission.csv', index=False)