In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the competition files: training data, test data and the sample submission
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
)

# **1. Basic EDA**

In [None]:
# Preview the first five rows of the training data
train.head(n=5)

In [None]:
# Preview the first five rows of the test data
test.head(n=5)

In [None]:
# Report the (rows, columns) shape of each data set
for label, frame in (("Training data shape:", train), ("Test data shape:", test)):
    print(label, frame.shape)

In [None]:
# Total number of missing cells in each frame (summed over all columns)
for frame in (train, test):
    print(frame.isna().sum().sum())

The ID column in both the training and test sets is unnecessary.

In [None]:
# Drop the identifier column from both frames.
# Rebinding the names (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op instead of
# raising KeyError because 'id' is already gone.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [None]:
# Column names of the training frame as a plain Python list
cols = list(train.columns)

## Plotting the distribution of each feature

In [None]:
# Draw one histogram per column of `train`, laid out on a 4-wide grid.
# The grid height and figure height are derived from the actual number of
# columns instead of assuming exactly 100 features named f0..f99 plus `loss`,
# so the cell keeps working if the column set changes.
plot_cols = train.columns.tolist()
n_grid_cols = 4
n_grid_rows = -(-len(plot_cols) // n_grid_cols)  # ceiling division
plt.figure(figsize=(24, 6 * n_grid_rows))  # 6 inches per row (26 rows -> 156)
for position, col in enumerate(plot_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    plt.hist(train[col])
    # Keep the capitalised axis label for the target column
    plt.xlabel('Loss' if col == 'loss' else col)
plt.show()

We can see that all the features are continuous and there are no discrete values.

## Plotting the correlation matrix

In [None]:
# Heatmap of the pairwise correlations between all columns of `train`
plt.figure(figsize=(20, 20))
corr = train.corr()
sns.heatmap(data=corr, cmap='coolwarm', annot=False)
plt.show()

We can see that the correlation between each feature and the loss is very low.

## Data preparation

In [None]:
# Re-check the shapes now that the ID column has been removed
for label, frame in (
    ("Training data shape after droping ID colunmn:", train),
    ("Test data shape after droppig ID column:", test),
):
    print(label, frame.shape)

In [None]:
# Feature names are taken from the test frame's columns; the next cell uses
# them to select the matching columns from `train`.
cols = test.columns

In [None]:
# Split the training frame into the feature matrix and the regression target.
# (The former `test = test` self-assignment was a no-op and has been removed.)
X = train[cols]
y = train['loss']

In [None]:
# First five rows of the feature matrix
X.head(n=5)

In [None]:
# First five rows of the test features
test.head(n=5)

In [None]:
# First five values of the target
y.head(n=5)

In [None]:
# Standardise the features.
# The scaler is fitted on the training features only; the test set is then
# transformed with those same fitted statistics. Re-fitting on the test set
# (fit_transform) would scale it with different statistics than the data
# the network is trained on.
ss = StandardScaler()
X_scaled = ss.fit_transform(X)
test_scaled = ss.transform(test)

# Model training

## Neural Network Model

Import libraries

In [None]:
import tensorflow as tf
from tensorflow import keras

We create the model

In [None]:
# Fully connected regression network: five hidden ReLU layers
# (1024 -> 512 -> 256 -> 128 -> 64), batch normalisation after the first four,
# and a single output unit.
# The input width is declared with an explicit Input object rather than the
# deprecated `input_dim=` argument on the first Dense layer.
n_features = X_scaled.shape[1]
model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(1024, kernel_initializer='normal', activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    # ReLU on the output unit clamps predictions to be non-negative.
    # NOTE(review): if training ever stalls at a constant 0 prediction,
    # consider a linear output here - confirm against the target's range.
    tf.keras.layers.Dense(1, activation='relu'),
])
model.summary()

Set the optimizer and loss

In [None]:
# Adam optimiser with its default settings, minimising mean squared error
adam = tf.keras.optimizers.Adam()
model.compile(
    optimizer=adam,
    loss='mean_squared_error',
)

Training

In [None]:
# Train for 5 epochs, holding out 25% of the rows for validation;
# `history` keeps the per-epoch losses used by the plots below.
history = model.fit(
    x=X_scaled,
    y=y,
    epochs=5,
    validation_split=0.25,
    shuffle=True,
    verbose=1,
)

# Results

In [None]:
# Set up the canvas for the learning curves
f = plt.figure(figsize=(20, 20))

# One line per tracked metric: training loss and validation loss per epoch
for metric in ("loss", "val_loss"):
    plt.plot(history.epoch, history.history[metric], label=metric)

# Title, axis labels, grid and legend
plt.title("Loss Curve", fontsize=18)
plt.xlabel("Epochs", fontsize=15)
plt.ylabel("Loss", fontsize=15)
plt.grid(alpha=0.3)
plt.legend()

plt.show()

In [None]:
# Predictions of the fitted network on the scaled training features
train_pred = model.predict(x=X_scaled)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error over the full training frame (this includes the
# rows that were held out for validation during fit)
mse = mean_squared_error(y, train_pred)
print("RMSE for Neural Network Model", np.sqrt(mse))

## Predictions

In [None]:
# Predictions of the fitted network on the scaled test features
y_pred = model.predict(x=test_scaled)

In [None]:
# Display the sample-submission frame before the predictions are written into it
sub

In [None]:
# The network ends in a single-unit Dense layer, so `y_pred` has one column
# per row (shape (n_rows, 1)). Flatten it to 1-D before assigning so the
# column assignment does not depend on pandas' handling of 2-D arrays.
sub['loss'] = np.ravel(y_pred)

In [None]:
# Display the submission frame after its `loss` column has been set from the predictions
sub

In [None]:
# Write the submission file without the DataFrame index
sub.to_csv(path_or_buf='NN_submission.csv', index=False)

You can use hyperparameter tuning with this network too