Hello! In this notebook we will test a simple deep neural network (DNN) model for regression. For now we will use a Keras Sequential model with 3 hidden dense layers.

There is plenty of room for improvement here, so please suggest any changes that could make the model better.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Step 1 : Importing the required Libraries

In [None]:
# Step 1 - Importing the required Libraries
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
# NOTE(review): the first call ignores every warning category, so the
# DeprecationWarning-specific filter after it adds nothing. The blanket filter
# also hides any warnings that later cells would otherwise surface.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
from xgboost import XGBRegressor   # We will compare our NN performance using a XGboost base model

# Step 2 - Fetching the Data

In [None]:
# Step 2 - Get the data
def get_data(data_dir='/kaggle/input/tabular-playground-series-feb-2021'):
    """Load the training and test tables from ``data_dir``.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv`` and ``test.csv``. Defaults to
        the Kaggle input folder used by this notebook, so calling
        ``get_data()`` with no arguments reads the same files as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, in that order.
    """
    train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    return train_df, test_df

# Read both CSVs once; the cells that follow work from `train` and `test`.
train, test = get_data()

# Step 3 - Data Pre Processing

In [None]:
# Summary statistics of the training frame, shown as this cell's output.
train.describe()

Some interesting observations can be made seeing this table:

1. Almost all the continuous variables appear to be scaled to roughly the 0–1 range. Only a few variables have values greater than one.
2. The mean value of our target variable is 7.5 and the standard deviation is 0.88.

In [None]:
# List every column name in the training frame
train.columns

From the above output, we see that we have 10 categorical features and 14 continuous features. 

In [None]:
# Dividing the data into categorical and continuous features for better
# visualization and pre-processing.
cat_features = [f'cat{i}' for i in range(10)]
cont_features = [f'cont{i}' for i in range(14)]

# The training subsets also carry 'target' so it can be plotted and correlated
# alongside the features. The feature-name lists themselves never include it.
# .copy() makes each subset an independent frame: later cells assign encoded
# columns into them and remove 'target', which should not act on a slice of
# `train` / `test`.
train_cat = train[cat_features + ['target']].copy()
train_cont = train[cont_features + ['target']].copy()

# Doing the same for test data (there is no 'target' column to include here)
test_cat = test[cat_features].copy()
test_cont = test[cont_features].copy()


In [None]:
# Analysis of Categorical Features: one count plot per feature on a 5 x 2 grid
plt.figure(figsize=(12, 9))
for position, feature in enumerate(cat_features, start=1):
    plt.subplot(5, 2, position)
    sns.countplot(x=feature, data=train_cat)

plt.show()

Please note that the seemingly empty bars just indicate that those categories have very few values compared to the scale of the graph. For instance, see the value counts for 'cat4' in the next cell.

In [None]:
# Count how many training rows fall into each level of 'cat4'
train_cat['cat4'].value_counts()

In [None]:
# Plotting the distribution of each continuous variable.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the same kind of histogram-plus-curve figure.
fig, axes = plt.subplots(7, 2, figsize=(16, 10))
for ax, con in zip(axes.ravel(), cont_features):
    sns.histplot(train_cont[con], kde=True, stat='density', ax=ax)

# Keep axis labels of neighbouring panels from overlapping, then render.
fig.tight_layout()
plt.show()

Now that we have seen the plots, we can move on to the pre-processing part. That will include two main tasks:

1. Label Encoding the categorical features. (Code Reference from the starter notebook)
2. Checking the correlation of continuous variables to remove or combine correlated features.

In [None]:
# Label Encoding categorical features
# Code Reference : https://www.kaggle.com/inversion/get-started-feb-tabular-playground-competition

for col in train_cat.columns:
    # Only object-dtype columns are encoded; everything else is left as it is.
    if train_cat[col].dtype != 'object':
        continue

    # Fit on the train and test values together so both frames share one
    # label -> integer mapping.
    encoder = LabelEncoder()
    combined_values = list(train_cat[col].values) + list(test_cat[col].values)
    encoder.fit(combined_values)

    train_cat[col] = encoder.transform(train_cat[col].values)
    test_cat[col] = encoder.transform(test_cat[col].values)

display(train_cat.head())

In [None]:
# Checking the correlation matrix for continuous features:
# compute the pairwise correlations first, then draw them as an annotated heatmap.
corr_matrix = train_cont.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Separate the target from the continuous features. Re-binding `train_cont`
# to a frame without 'target' (instead of popping in place) lets this cell be
# re-run without a KeyError, while later cells still see `train_cont` without
# the target column.
Y = train['target']
train_cont = train_cont.drop(columns='target', errors='ignore')
X = train_cont

# Feature importance from a small extra-trees ensemble; random_state pins the
# randomised splits so the importances are the same on every run.
model = ExtraTreesRegressor(n_estimators=10, random_state=42)
model.fit(X, Y)

# Label each importance with its feature name and sort, so the ranking is readable.
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
importances

We cannot draw a firm conclusion from these importances because they are all roughly equal. Hence, for now we will keep all the continuous features and see how our model performs.

# Creating our Neural Network and training it

In [None]:
# Getting our final train and test data.
# `train_cat` still carries the 'target' column, so drop it from the features.
X_train = pd.concat([train_cat, train_cont], axis=1).drop(columns='target')
y_train = Y
X_test = pd.concat([test_cat, test_cont], axis=1)

# The network receives the columns by position, so the train and test feature
# frames must have identical columns in identical order.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

In [None]:
# Our DNN Model! Test 1

# Seed NumPy and TensorFlow so weight initialisation (and hence training) is
# as repeatable between runs as the backend allows.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

NN_model = Sequential()

# First dense layer; input_dim tells Keras how many feature columns to expect.
NN_model.add(Dense(128, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'))

# The remaining hidden layers, widest first.
for units in (512, 256, 128):
    NN_model.add(Dense(units, kernel_initializer='normal', activation='relu'))

# The Output Layer: a single linear unit for the regression target.
NN_model.add(Dense(1, kernel_initializer='normal', activation='linear'))

# Compile the network. The loss is MSE; RMSE is reported as the metric because
# it is on the same scale as the target, whereas an MSE metric would only
# repeat the loss value.
NN_model.compile(loss='mse', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError()])
NN_model.summary()

In [None]:
# Training the model.
# fit() returns a History object; keeping it makes the per-epoch training and
# validation losses available for inspection or plotting afterwards.
# validation_split=0.2 holds out 20% of the rows for validation.
history = NN_model.fit(X_train, y_train, epochs=50, batch_size=2000, validation_split=0.2)

In [None]:
# Run the trained network on the test features; the result is used to build the submission.
predictions = NN_model.predict(X_test)

In [None]:
# Echo the predictions as the cell output for a quick visual check
predictions

In [None]:
submission = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv', index_col='id')

# The network has a single output unit, so predict() yields one value per row
# in a 2-D array; flatten it to 1-D before storing it as a column.
flat_predictions = np.ravel(predictions)
assert len(flat_predictions) == len(submission), "prediction count does not match the sample submission"

# NOTE(review): assumes sample_submission.csv lists ids in the same order as test.csv — confirm.
submission['target'] = flat_predictions
submission.to_csv('final_submission.csv')