![](https://storage.googleapis.com/kaggle-competitions/kaggle/28009/logos/header.png?)

# **This is an updated version of the public notebook. I have updated it with a full explanation. I have also added references for each line, so it will be easier for everyone to modify it.**

# Your task should be,
# **Focus on the hyperparameters of the neural network. Try to change the values and check whether your auc value is improving or not.**

# If you find the notebook useful, please give it a thumbs-up.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os 
#The functions that the OS module provides allows you to interface with the operating system that Python is running on
from sklearn.pipeline import Pipeline 
#Pipeline is used to assemble several steps that can be cross-validated together while setting different parameters. 


import tensorflow as tf

from tensorflow.keras import layers 
#for building up the different layers of neural network
#Keras tensor flow deep learning library to create a deep learning model for both regression and classification problems.
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Embedding,  Flatten
#importing the layer classes needed for building the network
from tensorflow.keras.models import Model, Sequential
#Sequential model allows us to create a deep learning model by adding layers to it. 
#Here, every unit in a layer is connected to every unit in the previous layer. 
from keras.callbacks import ReduceLROnPlateau
#ReduceLROnPlateau is used to Reduce learning rate when a metric has stopped improving.
from keras.optimizers import RMSprop
#Optimizer that implements the RMSprop algorithm.

from tensorflow.data import Dataset
#tf.data.Dataset represents a (potentially large) sequence of elements and is used to build input pipelines for TensorFlow models
from sklearn.model_selection import train_test_split
#For splitting the dataset for training and testing
from sklearn.preprocessing import QuantileTransformer,  KBinsDiscretizer
#QuantileTransformer method transforms the features to follow a uniform or a normal distribution
#KBinsDiscretizer bins continuous data into intervals.
from tensorflow import keras

from sklearn.impute import SimpleImputer
#Imputation transformer for completing missing values.
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
#For quantifying the quality of predictions

In [None]:
#You need to first install datatable with pip command
#pip install datatable

In [None]:
#datatable package is like pandas which can read data up to 10 times faster than pandas.
#import datatable as dt 

References:

1)https://www.tensorflow.org/datasets/overview

2)https://scikit-learn.org/stable/modules/model_evaluation.html

3)https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html

4)https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html

5)https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

6)https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

7)https://keras.io/api/optimizers/learning_rate_schedules/

8)https://keras.io/api/optimizers/

9)https://www.tutorialspoint.com/keras/keras_model_compilation.htm

10)https://www.kdnuggets.com/2019/08/overview-python-datatable-package.html#:~:text=Modern%20machine%20learning%20applications%20need%20to%20process%20a,a%20single-node%20machine%2C%20at%20the%20maximum%20possible%20speed.

# Load Dataset

In [None]:
%%time
# %%time prints the wall time for the entire cell whereas %time gives you the time for first line only

# Read the training set, the test set and the sample submission (in that order)
# straight into pandas DataFrames.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

# Preprocessing

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
%%time
# Count the missing values in each row and keep the count as an extra feature.
# This is done before imputation, because afterwards there are no NaNs left to count.
train['n_missing'] = train.isna().sum(axis=1)
test['n_missing'] = test.isna().sum(axis=1)

# The 'claim' target is kept exactly as loaded from the CSV. The training cell
# converts it with np.float32, so casting it to str first only added a
# str -> float round trip over every row.

# Model inputs: every column except the target ('claim') and the row identifier ('id').
non_feature_columns = ('claim', 'id')
features = [col for col in train.columns if col not in non_feature_columns]

# A Pipeline is a list of (name, transformer) steps applied one after another.
pipe = Pipeline([
    # Replace each NaN with the median of its column.
    ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
    # Map every feature onto a uniform distribution estimated from 128 quantiles.
    # random_state fixes the random row subsample the transformer draws on large
    # inputs, so re-running the notebook produces the same transformed features.
    ('scaler', QuantileTransformer(n_quantiles=128, output_distribution='uniform',
                                   random_state=42)),
    # Cut each feature into 128 equal-width bins and encode the bin index as an
    # ordinal value, which is what the Embedding layer of the model consumes.
    ('bin', KBinsDiscretizer(n_bins=128, encode='ordinal', strategy='uniform')),
])

# Fit the pipeline on the training features only, then apply the very same fitted
# transformation to the test features so both share the same bin edges.
train[features] = pipe.fit_transform(train[features])
test[features] = pipe.transform(test[features])

# Modeling

In [None]:
# Display the training frame after preprocessing.
train

In [None]:
# Network: integer bin indices -> per-feature embeddings -> small MLP -> claim probability.
model = Sequential([
    # The first layer tells the model what input shape to expect:
    # one value per feature column (train[features].shape[1:] drops the row dimension).
    Input(train[features].shape[1:]),
    # Look up a trainable 4-dimensional vector for every integer input value.
    #   input_dim  : size of the vocabulary (must exceed the largest index fed in;
    #                the preprocessing step bins into 128 bins, so 512 leaves headroom)
    #   output_dim : length of the vector learned for each index
    Embedding(input_dim=512, output_dim=4),
    # Concatenate the per-feature embedding vectors into one flat vector per row.
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),  # randomly drops half of the activations during training (regularisation)
    Dense(32, activation='relu'),
    Dropout(0.5),
    # A single sigmoid unit outputs the predicted probability of a claim.
    Dense(1, activation='sigmoid'),
])

# Metric used to monitor the model: area under the ROC curve, reported as 'aucroc'.
auc = tf.keras.metrics.AUC(name='aucroc')

# Exponentially decaying learning-rate schedule (starts at 5e-2, multiplied by 0.8
# every 3000 steps).
# NOTE: this schedule is NOT used below - the optimizer runs with a constant
# learning rate. Pass learning_rate=lr_schedule to the optimizer to try it out.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=5e-2,
    decay_steps=3000,
    decay_rate=0.8,
)

# RMSprop optimizer with a constant learning rate of 1e-3.
# "rho" is the decay factor of the moving average over the squared gradients.
# - `learning_rate` is the documented argument name; `lr` is a deprecated alias and
#   `decay` is not accepted by newer Keras optimizers, so neither is passed here
#   (leaving `decay` out is equivalent to the previous decay=0.0).
# - The optimizer is taken from the same tensorflow.keras package as the model,
#   instead of mixing in the standalone `keras` package.
# Check reference (8) for details
optimizer = keras.optimizers.RMSprop(learning_rate=1e-3, rho=0.9, epsilon=1e-08)

# compile() configures the model for training: loss function, optimizer and metrics.
# Binary cross-entropy matches the single sigmoid output.
# Check reference (9) for details
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[auc])

In [None]:
# Train the network with fit(), which takes NumPy arrays.
# Features and labels are both cast to float32 first.
X_train = np.float32(train[features])
y_train = np.float32(train.claim)

# 10 passes over the data in mini-batches of 512 rows, reshuffled every epoch.
# No validation data is passed, so the metrics Keras prints are computed on the
# training data itself.
# Check reference (9) for details
model.fit(x=X_train, y=y_train, batch_size=512, epochs=10, shuffle=True)

In [None]:
# Predict the claim probability for every test row, store it in the 'claim'
# column of the sample submission, index the frame by 'id' and write the
# submission file.
test_inputs = np.float32(test[features])
sub = sub.assign(claim=model.predict(test_inputs)).set_index('id')
sub.to_csv('submission.csv')

In [None]:
# Display the final submission frame.
sub