In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

Read the **training and testing datasets** from CSV files into DataFrames.

In [None]:
# Load the competition's training and test CSV files into DataFrames.
TRAIN_CSV = r'../input/cat-in-the-dat/train.csv'
TEST_CSV = r'../input/cat-in-the-dat/test.csv'
dataset = pd.read_csv(TRAIN_CSV)
testset = pd.read_csv(TEST_CSV)

Separating the training set into input and target.

In [None]:
# Inputs: every column up to and including 'month' (label slices are inclusive).
X = dataset.loc[:, :'month']
# Target: the 'target' column as a Series.
Y = dataset['target']

Combining data for **OneHotEncoding**

In [None]:
# Stack the training inputs on top of the test set so that one encoder is
# fitted on the categories of both, then discard the 'id' column.
alldata = pd.concat((X, testset)).drop(columns='id')

In [None]:
# Row counts of the training inputs, the test set and their concatenation.
print(f"{X.shape[0]} rows of X")
print(f"{testset.shape[0]} rows of testSet")
print(f"{alldata.shape[0]} rows of Combined")

# Applying OneHotEncoding
I chose One Hot Encoding because Label Encoding gave a lower score (0.71) and a higher loss in the given number of epochs, i.e. the model was less accurate.

<a href='#comparison'>**See comparison**</a>

In [None]:
# Fit the encoder on the combined frame and encode it in a single step;
# `alldata` is rebound to the encoded result.
ohcInstance = OneHotEncoder()
alldata = ohcInstance.fit_transform(alldata)

After one-hot encoding we get roughly 16,000 columns, up from 24.

In [None]:
# Width of the encoded matrix.
print(f"After one hot encoding no. of columns become {alldata.shape[1]}")

Reducing back to X and Test_X(testSet).

In [None]:
# Split the encoded matrix back into its training and test parts.
# The training rows were concatenated first, so they occupy the first
# len(dataset) rows. Deriving the boundary from the training frame instead of
# hard-coding a row count keeps the split correct if the data size changes,
# and keeps this cell re-runnable (unlike X, `dataset` is not rebound here).
n_train_rows = len(dataset)
X = alldata[:n_train_rows]
Test_X = alldata[n_train_rows:]

# Defining the Model
Used a multilayer perceptron.
Since our problem is essentially a logistic regression problem, a single-layer perceptron would have done the work.
But to increase the accuracy, multiple layers are used (also for earlier convergence) and non-linearity is introduced by 'ReLU' activation functions.


In [None]:
# Multilayer perceptron for binary classification: three ReLU hidden layers
# (512 -> 128 -> 64 units) followed by a single sigmoid output unit.
# The input width is taken from the feature matrix X.
model = keras.models.Sequential()
model.add(keras.layers.Dense(512, input_dim=X.shape[1], activation='relu'))
model.add(keras.layers.Dense(128, activation='relu'))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy loss, Adam optimizer; accuracy is recorded under the
# 'acc' key, which the plotting cells read from the training history.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:
# Print the summary of the compiled model.
model.summary()

# **Model Fitting**
To avoid overfitting, the number of epochs and the batch size were tuned by trial and error to get the highest accuracy and the lowest loss.

In [None]:
# Train on the one-hot encoded inputs; the target is cast to an int32 array.
# The returned History object is kept for the comparison plots.
history_OneHot = model.fit(
    X,
    np.asarray(Y).astype(np.int32),
    epochs=11,
    batch_size=10000,
    verbose=1,
)


# Writing to the output file
i.e. ./submission.csv

In [None]:
# Score the encoded test rows, pair the first (only) output column with the
# test ids, and write the result as submission.csv without the index.
predictions = model.predict(Test_X)
target_column = pd.Series(predictions[:, 0], name='target')
submit = pd.concat([testset['id'], target_column], axis=1)
submit.to_csv('submission.csv', index=False, header=True)

In [None]:
# Display the submission frame as the cell output.
submit

# Training the same model, but with Label Encoding
**To back up our comparison**

In [None]:
# Re-read the training data for the label-encoding experiment.
dataset = pd.read_csv(r'../input/cat-in-the-dat/train.csv')

# Positional split: columns 1..23 become the feature array, column 24 the
# target array. Both are NumPy arrays, so X can be modified in place later.
X = dataset.iloc[:, 1:24].to_numpy()
Y = dataset.iloc[:, 24].to_numpy()


In [None]:
def labelEncode(listData,index):
    """Label-encode one column of a 2-D array in place.

    A fresh LabelEncoder is fitted on column `index` of `listData` and the
    column is overwritten with the encoded values. Nothing is returned.
    """
    column_values = listData[:, index]
    listData[:, index] = LabelEncoder().fit_transform(column_values)

In [None]:
# Show the feature columns (positions 1 through 23) of the training frame.
dataset.iloc[:,1:24]

If we observe the dataset, only columns 3-14 and 16-20 contain **categorical data**.
Hence, encoding is applied to those columns only.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode columns 3-14 and 16-20 of X in place; the remaining columns
# are left as they are.
for col_idx in (*range(3, 15), *range(16, 21)):
    labelEncode(X, col_idx)


In [None]:
# Same architecture as the one-hot model, rebuilt for the label-encoded
# features: three ReLU hidden layers (512 -> 128 -> 64) and a sigmoid output.
# Rebinding `model` replaces the previously trained network.
model = keras.models.Sequential()
model.add(keras.layers.Dense(512, input_dim=X.shape[1], activation='relu'))
model.add(keras.layers.Dense(128, activation='relu'))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Same training configuration as the one-hot run so the two histories are
# comparable: binary cross-entropy, Adam, accuracy recorded as 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:
# Train on the label-encoded features; inputs and target are both cast to
# int32 arrays. Epochs and batch size match the one-hot run.
history_label = model.fit(
    np.asarray(X).astype(np.int32),
    np.asarray(Y).astype(np.int32),
    epochs=11,
    batch_size=10000,
    verbose=1,
)

In [None]:
# PLOT LOSS AND ACCURACY
%matplotlib inline

import matplotlib.image  as mpimg
import matplotlib.pyplot as plt
oneacc=history_OneHot.history['acc']
labelacc=history_label.history['acc']
oneloss=history_OneHot.history['loss']
labelloss=history_label.history['loss']

# Label Encoding vs One Hot Encoding
<a id="comparison">**The Following is the comparison of Accuracy and Loss for Both the techniques.**</a>

In [None]:
# Accuracy per epoch for both encodings.
epochs = range(len(oneacc))  # one x-position per training epoch

fig, ax = plt.subplots()
# Legend text must be passed as `label=`: a bare 4th positional string is
# parsed by plot() as more data, not as a label. The captions also match the
# series they describe (oneacc is the one-hot run, labelacc the label run).
ax.plot(epochs, oneacc, 'b', label="OneHotEncoding Accuracy")
ax.plot(epochs, labelacc, 'r', label="Label Encoding Accuracy")
ax.set_title('Difference between accuracy of Label and OneHotEncoder')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
# Render this figure; a trailing plt.figure() would only open an empty one.
plt.show()




As you can see in the above plot, the accuracy with OneHotEncoder approaches 1 around the 6th epoch, whereas with the Label Encoder the slope is very low, so it will take many more epochs to get there.

In [None]:
# Loss per epoch for both encodings.
fig, ax = plt.subplots()
# Legend text must be passed as `label=`: a bare 4th positional string is
# parsed by plot() as more data, not as a label. The captions match the series
# they describe, and the colours follow the accuracy plot
# (one-hot = blue, label encoding = red).
ax.plot(epochs, oneloss, 'b', label="OneHotEncoding Loss")
ax.plot(epochs, labelloss, 'r', label="Label Encoding  Loss")
ax.set_title('Difference between Loss of Label and OneHotEncoder')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
# Render this figure; a trailing plt.figure() would only open an empty one.
plt.show()

As you can see in the above plot, the loss with OneHotEncoder is very low from the initial epochs, whereas with the Label Encoder it will take a large number of epochs to get that low.
