In [None]:
import numpy as np 
import pandas as pd
import os

from sklearn import preprocessing

import matplotlib.pyplot as plt
import seaborn as sns

# Read Data

In [None]:
# Both competition CSVs sit in the same Kaggle input folder; the `id` column
# becomes the row index of each frame.
DATA_DIR = '../input/tabular-playground-series-aug-2021'
train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id')
test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id')

# Visualize Data

In [None]:
# Draw one histogram per column of `train` on a single 20x15 figure,
# without grid lines and with size-5 tick labels on both axes.
hist_options = dict(figsize=(20, 15), grid=False, xlabelsize=5, ylabelsize=5)
train.hist(**hist_options)
plt.show()

# Correlation

In [None]:
# Pairwise correlation matrix of the columns of `train`, with its rows ordered
# by their correlation to `loss` (largest value first).
corrs = train.corr().sort_values(by='loss', ascending=False)

In [None]:
# Horizontal bar chart of each column's correlation with `loss`.
# The first row of the sorted matrix is skipped; presumably that row is `loss`
# itself (self-correlation), since the rows were sorted in descending order.
fig, ax = plt.subplots(figsize=(15, 20))
sns.barplot(x=corrs['loss'].values[1:],
            y=corrs.index[1:],
            orient="h",
            ax=ax)
ax.set_title("Correlation Between Feature Columns and Target Column (Loss)")
ax.set_xlabel("Correlation with Target")
ax.set_ylabel("Feature Columns")
plt.show()

In [None]:
# Separate the target column from the features.
# The guard makes this cell safe to re-run: once `loss` has been removed from
# `train`, a second execution leaves both `train` and `target` untouched
# instead of raising a KeyError. `train` is rebound to a new frame rather than
# mutated in place.
if 'loss' in train.columns:
    target = train['loss'].copy()
    train = train.drop(columns='loss')

In [None]:
# Count of rows per distinct `loss` value.
# Distribution of loss is censored
fig, ax = plt.subplots(figsize=(14, 8))
sns.countplot(x=target, ax=ax)
plt.show()

In [None]:
# Standardise every feature column: the scaler learns its statistics from
# `train` and the same fitted scaler is then applied to `test`.
# NOTE(review): the scaler is fitted on all of `train`, before the
# train/validation split made further down, so validation rows contribute to
# its statistics - confirm this is acceptable.
feature_cols = list(test.columns)
scaler = preprocessing.StandardScaler()
scaler.fit(train[feature_cols])
train[feature_cols] = scaler.transform(train[feature_cols])
test[feature_cols] = scaler.transform(test[feature_cols])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for validation; the split is seeded and stratified
# on the target values.
split_options = dict(test_size=0.25, random_state=0, stratify=target)
X_train, X_valid, y_train, y_valid = train_test_split(train, target, **split_options)

# Print the shape of each of the four pieces, one per line.
for part in (X_train, X_valid, y_train, y_valid):
    print(part.shape)

In [None]:
# Cast the features and targets of both splits to 32-bit floats.
X_train, X_valid, y_train, y_valid = (
    part.astype(np.float32)
    for part in (X_train, X_valid, y_train, y_valid)
)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Seed TensorFlow's global random generator so that repeated runs start from
# the same state (same seed value as the train/validation split).
tf.random.set_seed(0)

# Width of the input layer is taken from the training frame instead of being
# hard-coded, so the network follows the data if the feature set changes.
n_features = X_train.shape[1]

# Fully connected regression network: four tanh hidden layers, dropout after
# the last two, and a single linear output unit.
model = tf.keras.Sequential([
    layers.Dense(128, input_shape=(n_features,), activation="tanh"),
    layers.Dense(256, activation="tanh"),
    layers.Dense(256, activation="tanh"),
    layers.Dropout(0.2),
    layers.Dense(128, activation="tanh"),
    layers.Dropout(0.2),
    layers.Dense(1)
])


def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are flattened to 1-D before subtracting, so a rank-1 target
    and a ``(batch, 1)`` prediction are compared element by element instead of
    being broadcast against each other. ``y_true`` is cast to the dtype of
    ``y_pred``.

    Args:
        y_true: Target values.
        y_pred: Predicted values with the same number of elements.

    Returns:
        A scalar tensor holding the RMSE over all elements passed in.
    """
    y_pred = tf.reshape(tf.convert_to_tensor(y_pred), [-1])
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))


# The metric is the streaming RootMeanSquaredError rather than the `rmse`
# function above: a plain function metric is evaluated per batch and the batch
# results are averaged, and a mean of per-batch RMSEs is not the RMSE of the
# whole epoch. Naming it "rmse" keeps the `rmse` / `val_rmse` history keys.
# `rmse` itself stays defined for direct use.
model.compile(loss="mse",
              optimizer=tf.keras.optimizers.Adagrad(),
              metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse")])

# Both callbacks watch the validation RMSE. `mode="min"` states explicitly
# that lower is better instead of relying on Keras inferring the direction
# from the metric name.
lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_rmse',
                                          mode="min",
                                          patience=2,
                                          factor=0.2)
cb = tf.keras.callbacks.EarlyStopping(monitor="val_rmse",
                                      mode="min",
                                      patience=10,
                                      restore_best_weights=True)

In [None]:
# Train for up to 200 epochs, evaluating on the held-out split after each one;
# the early-stopping and learning-rate callbacks are passed in that order.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=200,
    validation_data=(X_valid, y_valid),
    validation_batch_size=64,
    callbacks=[cb, lr],
)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt


# Per-epoch curves recorded by `model.fit`.
train_rmse = history.history['rmse']
val_rmse = history.history['val_rmse']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(train_rmse))  # one x position per recorded epoch

# Figure 1: RMSE metric. The y-axis is labelled "RMSE" (it previously read
# "Accuracy", which this regression metric is not).
fig_rmse, ax_rmse = plt.subplots()
ax_rmse.plot(epochs, train_rmse, 'r', label="Training")
ax_rmse.plot(epochs, val_rmse, 'b', label="Validation")
ax_rmse.set_title('Training and validation RMSE')
ax_rmse.set_xlabel("Epochs")
ax_rmse.set_ylabel("RMSE")
ax_rmse.set_ylim(top=8.1)  # cap the axis so early, larger values do not flatten the curve
ax_rmse.legend()

# Figure 2: the MSE loss that is actually optimised.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'r', label="Training")
ax_loss.plot(epochs, val_loss, 'b', label="Validation")
ax_loss.set_title('Training and validation loss (MSE)')
ax_loss.set_xlabel("Epochs")
ax_loss.set_ylabel("Loss")
ax_loss.set_ylim(top=66)  # same idea as above, on the squared scale
ax_loss.legend()

plt.show()

In [None]:
# Predict the target for every row of `test` and write the submission file.
predictions = model.predict(test)

# Flatten to 1-D explicitly: unlike `np.squeeze`, this still yields a length-1
# array (not a 0-d scalar) when `test` contains a single row.
output = pd.DataFrame({'id': test.index, 'loss': np.ravel(predictions)})

# Exactly one prediction per test row is expected.
assert len(output) == len(test), "prediction count does not match number of test rows"

output.to_csv('submission.csv', index=False)