In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%bash
# List the competition files. `cd` needs a directory, so change into the
# dataset folder itself rather than into train.csv.
cd ../input/tabular-playground-series-jan-2021
ls

In [None]:
import pandas as pd
# Load the competition train and test CSV files into DataFrames.
# Later cells read `id` from the first column and use the last column of
# `data_train` as the regression target.
data_train = pd.read_csv('../input/tabular-playground-series-jan-2021/train.csv')
data_test = pd.read_csv('../input/tabular-playground-series-jan-2021/test.csv')

# Exploratory Data Analysis

Let's explore the spread of the dataset and check whether there are any missing values.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test set's numeric columns.
data_test.describe()

In [None]:
# Column dtypes and non-null counts; a non-null count below the row count would reveal missing values.
data_train.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as  plt

# Plot the distribution of every feature column, one subplot per feature.
# Features are all columns except the first (`id`) and the last (the target),
# which is the same set the original fixed-size loop iterated over.
feature_cols = data_train.columns[1:-1]

# Derive the grid size from the data instead of hard-coding 14 rows, and keep
# roughly the original height per subplot. squeeze=False guarantees a 2-D array
# of axes even when there is only one feature.
fig, axes = plt.subplots(
    len(feature_cols), 1,
    figsize=(15, 3.5 * len(feature_cols)),
    squeeze=False,
)
for axis, name in zip(axes[:, 0], feature_cols):
    # Draw on an explicit axis rather than through the pyplot "current axes" state.
    # NOTE(review): sns.distplot is deprecated since seaborn 0.11; consider
    # sns.histplot(..., kde=True, stat="density", ax=axis) once the seaborn
    # version in this environment is confirmed.
    sns.distplot(data_train[name], ax=axis)
    axis.set_xlabel(name)
plt.show()


The plots give us a small insight into the nature of the data. We can see that the dataset is quite intact and has no missing values. However, some of the features show a certain degree of correlation. Let's explore this a bit more.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns except `id`.
train_corr = data_train.iloc[:, 1:].corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(train_corr, ax=ax, annot=True)

The heat map above shows that some of the features are correlated. Let us also study the relative range of the data.

In [None]:
# Box plot of every feature column side by side, to compare their value ranges.
# pd.melt reshapes the feature columns into long form ("variable", "value").
features_long = pd.melt(data_train.iloc[:, 1:-1])
fig, ax = plt.subplots(figsize=(25, 15))
sns.boxplot(data=features_long, x="variable", y="value", ax=ax)

We will do some feature engineering to further improve the quality of the data.

We will use PCA to reduce the dimensionality.

In [None]:
from sklearn.decomposition import PCA

# Project the feature columns (everything between `id` and the target) onto
# 7 principal components.
pca_input = data_train.iloc[:, 1:-1]
pca = PCA(n_components=7)
out = pca.fit_transform(pca_input)



In [None]:
# Display the PCA-transformed training features as a DataFrame (one column per component).
pd.DataFrame(out)

In [None]:
# Annotated correlation heatmap of the PCA components.
pca_corr = pd.DataFrame(out).corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(pca_corr, ax=ax, annot=True)

# Ablation experiment

We will split the data into training, test and evaluation sets, and then start training our neural network.

In [None]:
def train_test_split(data, train_split=0.1):
    """Split `data` into train and test parts by hashing the `id` column.

    Each row's `id` is hashed into one of 10000 buckets; rows whose bucket is
    below ``10000 * train_split`` go to the train part, all others to the test
    part. A row's assignment depends only on its id, so the split is stable
    across calls, kernel restarts and row order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an `id` column. Columns at positions 1..14 are returned as
        features and the last column as the target.
    train_split : float, default 0.1
        Approximate fraction of rows assigned to the train part.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` where the x parts are DataFrames
        (columns 1:15) and the y parts are Series (last column).
    """
    # Local import keeps this cell self-contained.
    import zlib

    # The builtin hash() of a str is salted per interpreter process, which made
    # the split differ on every kernel restart. crc32 is a stable hash.
    buckets = data['id'].map(lambda v: zlib.crc32(str(v).encode('utf-8')) % 10000)

    # Build the mask once, from the frame that was passed in (not a global),
    # and derive the test mask as its complement.
    in_train = (buckets < 10000 * train_split).to_numpy(dtype=bool)
    in_test = ~in_train

    return (
        data.iloc[in_train, 1:15],
        data.iloc[in_test, 1:15],
        data.iloc[in_train, -1],
        data.iloc[in_test, -1],
    )

In [None]:
# Split with train_split=0.7; the *_test parts are used as validation data when fitting the model below.
x_train, x_test,y_train,y_test = train_test_split(data_train,0.7)

Now we will define our preprocessing pipeline with TensorFlow, which makes preprocessing part of the model itself. Since we are using a neural network, we will also use regularisation so that it acts as a dimensionality reducer.

In [None]:
import tensorflow as tf

def generate_feature_column(x):
    """Build the list of TensorFlow feature columns for the model.

    Returns one numeric column per column of `x`, followed by a single
    100-dimensional embedding of a hashed cross (3000 buckets) of
    cont6, cont9, cont10, cont11, cont12 and cont13.
    """
    features = [tf.feature_column.numeric_column(name) for name in x.columns]

    # NOTE(review): crossed_column is documented for string/integer or
    # categorical inputs; these look like float features, so confirm this
    # cross works as intended or bucketize the inputs first.
    crossed = tf.feature_column.crossed_column(
        ['cont6', 'cont9','cont10','cont11','cont12','cont13'],
        hash_bucket_size=3000,
    )
    features.append(tf.feature_column.embedding_column(crossed, dimension=100))
    return features
    

In [None]:
# Display the generated feature columns as a quick sanity check.
generate_feature_column(x_train)

Now we will define the functions for dataset generation. We will first use a small dataset for ablation experiments, and then feed in the full data.

In [None]:
def generate_train_data(x, y, albation=False, batch=100, epochs=10, shuffle_buffer=None):
    """Build a shuffled, repeated and batched tf.data.Dataset of (features, label).

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame; each column becomes one entry of the feature dict.
    y : pandas.Series
        Labels aligned row-for-row with `x`.
    albation : bool, default False
        When True, only the first 1000 rows are used (small ablation run).
        (The spelling is kept so existing keyword callers keep working.)
    batch : int, default 100
        Batch size.
    epochs : int, default 10
        Number of times the dataset is repeated.
    shuffle_buffer : int or None, default None
        Shuffle buffer size. None means "as large as the dataset", which gives
        a full uniform shuffle.

    Returns
    -------
    tf.data.Dataset
        Batches of ``(dict of feature tensors, label tensor)``.
    """
    if albation:
        # Positional slice on both, so features and labels stay aligned.
        x = x.iloc[0:1000, :]
        y = y.iloc[0:1000]

    if shuffle_buffer is None:
        # A buffer as small as one batch only shuffles locally and leaves the
        # rows close to their original order. The buffer must be at least 1.
        shuffle_buffer = max(len(x), 1)

    dataset = tf.data.Dataset.from_tensor_slices((dict(x), y.values))
    return dataset.shuffle(buffer_size=shuffle_buffer).repeat(count=epochs).batch(batch)

In [None]:
def generate_test_data(x,batch = 100):
    """Return an unshuffled, label-free tf.data.Dataset over `x` in batches of `batch` rows."""
    feature_dict = dict(x)
    unbatched = tf.data.Dataset.from_tensor_slices(feature_dict)
    return unbatched.batch(batch)

In [None]:
# Pull a single batch from the training dataset to inspect its (features, label) structure.
next(iter(generate_train_data(x_train,y_train)))

Our data generator is working fine, so let's start building our model.

In [None]:
# Training configuration.
batch = 100
epochs = 5

# Cut the learning rate by 10x when the training loss stops improving for one
# epoch, down to a floor of 1e-5.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1,
                              patience=1, min_lr=0.00001)

# Linear model on top of the feature columns (numeric inputs plus the embedded cross).
model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures(generate_feature_column(x_train)),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.0001), loss = 'mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])

# The datasets are built with epochs=1 because `fit` already iterates the
# dataset once per epoch. Repeating it inside the pipeline as well would
# multiply the number of passes, and would evaluate the validation set several
# times per epoch.
train_ds = generate_train_data(x_train, y_train, batch=batch, epochs=1)
valid_ds = generate_train_data(x_test, y_test, batch=batch, epochs=1)

# `batch_size` and `workers` are not passed: the tf.data pipeline already
# batches, and `workers` only applies to generator/Sequence inputs.
# The callback must be handed to `fit`, otherwise it never runs.
model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=epochs,
    callbacks=[reduce_lr],
    verbose=1,
)

In [None]:
# Predict on the test set. `data_test` is passed whole, so the dataset also carries its `id` column.
# NOTE(review): presumably DenseFeatures ignores inputs that no feature column references — confirm.
pred = model.predict(generate_test_data(data_test))

In [None]:
# Write the submission file: one row per test id with its predicted target.
# np.ravel flattens the predictions whatever the number of test rows is,
# instead of assuming exactly 200000.
submission = pd.DataFrame({'id': data_test.iloc[:, 0].values, 'target': np.ravel(pred)})
submission.to_csv('./result.csv', index=False)