# Tabular Playground Series - Aug 2021

## 1. Download Datasets & Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.utils.data as Data
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings 
warnings.filterwarnings('ignore')

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Load the competition train/test tables; the `id` column becomes the index.
train_df = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/train.csv',
    index_col='id',
)
test_df = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/test.csv',
    index_col='id',
)

# Report the (rows, columns) shape of each table.
print('Shape of Train data:', train_df.shape)
print('Shape of Test data:', test_df.shape)

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Total number of missing values in the training table:
# the first sum() counts nulls per column, the second adds those counts up.
train_df.isnull().sum().sum()

### Target Distribution

In [None]:
# Frequency of each distinct value of the target column train_df['loss'].
train_df['loss'].value_counts()

In [None]:
# Bar chart of how many training rows take each `loss` value.
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(data=train_df, x='loss', ax=ax)
ax.set_title('Distribution of Target Variable (loss)', fontsize=14)
ax.set_xlabel('Target variable (loss)')
ax.set_ylabel('N', rotation=0)
plt.show()

### Description Table of Train data

In [None]:
# Summary statistics with one row per column (transposed); the mean, min and
# max cells are drawn as in-cell bars so columns on very different scales stand out.
train_df.describe().T.style.bar(subset=['mean', 'min', 'max'], color='#d65f5f')

The values of column 'f60' have a very wide distribution,  
and other columns (like 'f16', 'f27', 'f52') also have wide distributions.

In [None]:
# Histogram of the `f60` feature in the training table.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=train_df, x='f60', ax=ax)
ax.set_title('Distribution of f60 column in Train data')
plt.show()

In [None]:
# Mean / median / min / max of every column, with the columns ordered from
# the largest mean to the smallest.
train_df.agg(func=['mean','median', 'min', 'max']).sort_values(by='mean', axis=1, ascending=False)

### Correlation with Target Variable

In [None]:
# Pairwise correlation matrix of all columns (reused by the heatmap below).
corr_mat = train_df.corr()

plt.figure(figsize=(25, 6))
# Correlation of each column with `loss`, minus the final entry
# (presumably `loss` itself, if it is the last column - verify).
target_corr = corr_mat["loss"].iloc[:-1]
target_corr.plot(kind="bar", grid=True)
plt.title("Features correlation to target label", fontdict={"fontsize": 20})

In [None]:
# correlation heatmap
fig, ax = plt.subplots(1, 1, figsize=(12, 12))

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr_mat, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(
    corr_mat,
    ax=ax,
    square=True,
    center=0,
    linewidth=1,
    cmap=sns.diverging_palette(240, 10, as_cmap=True),
    cbar_kws={"shrink": .82},
    mask=mask,
)

ax.set_title('Correlation', loc='left', fontweight='bold')

plt.show()

In [None]:
# scatter plots
# One feature-vs-target scatter per column (every column except the last),
# laid out row by row on a 10 x 10 grid of axes.
train_columns = train_df.columns[:-1].tolist()
fig_columns = 10
fig_rows = 10

f, axes = plt.subplots(fig_rows, fig_columns, sharex=False, sharey=False)
f.set_size_inches((4 * fig_rows, 4 * fig_columns))

target = train_df['loss']
for i, col in enumerate(train_columns):
    row_idx, col_idx = divmod(i, 10)
    panel = axes[row_idx][col_idx]
    panel.scatter(train_df[col], target, alpha=0.4)
    panel.set_title(col)
plt.show()

### Distribution of Train data features

In [None]:
# Overlay the train (blue) and test (salmon) histograms of every feature,
# three panels per row, on one tall figure.
# plt.figure is used rather than plt.subplots so that no extra full-figure
# Axes is created behind the panels.
plt.figure(figsize=(16, 150))
# Spacing is the same for every panel, so it is set once up front.
plt.subplots_adjust(wspace=.2, hspace=.5)

length = len(train_columns)
n_cols = 3
# Ceiling division: plt.subplot needs an integer row count, and this is the
# number of rows actually required to hold `length` panels in `n_cols` columns.
n_rows = -(-length // n_cols)

for j, i in enumerate(train_columns):
    ax = plt.subplot(n_rows, n_cols, j + 1)
    sns.histplot(x=train_df[i], color='skyblue', edgecolor='black', ax=ax)
    sns.histplot(x=test_df[i], color='salmon', edgecolor='black', ax=ax)
    ax.legend(labels=('Train', 'Test'))

## 3. Preprocessing

In [None]:
# Separate the `loss` target from the feature columns, then hold out 10% of
# the rows (randomly chosen, no fixed seed) for validation.
y = train_df['loss']
X = train_df.drop(columns='loss')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

### Normalization

In [None]:
# Scale the features. The scaler is fitted on the training split only and the
# same fitted scaler is then applied to the held-out split, so no statistics
# from the held-out rows are used. `std` is reused later for the test data.
std = StandardScaler()
X_train = std.fit_transform(X_train)
X_test = std.transform(X_test)

### Dimension Reduction

In [None]:
from sklearn.decomposition import PCA

# Reduce the scaled features to 12 principal components. As with the scaler,
# the projection is fitted on the training split and re-applied unchanged to
# the held-out split; `pca` is reused later for the test data.
pca = PCA(n_components=12)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

## 4. Modeling with MLP (using Pytorch)

In [None]:
# --- training configuration ---
BATCH_SIZE = 64
EPOCHS = 10
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- tensors ---
# Features are already arrays at this point; the targets are converted to
# arrays via .to_numpy() first. torch.Tensor(...) yields float tensors.
X_train = torch.Tensor(X_train)
X_test = torch.Tensor(X_test)
y_train = torch.Tensor(y_train.to_numpy())
y_test = torch.Tensor(y_test.to_numpy())

# --- datasets and loaders ---
train_dataset = Data.TensorDataset(X_train, y_train)
test_dataset = Data.TensorDataset(X_test, y_test)

# Training batches are reshuffled every epoch; held-out batches keep their order.
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = Data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class MLP_Regressor(nn.Module):
    """Fully connected regressor: five hidden layers and a single linear output.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> dropout, with widths
    128, 256, 128, 64 and 32. The final layer maps 32 units to one value, so
    ``forward`` returns a tensor of shape ``(batch, 1)``.

    Args:
        in_features: Width of each input row. Defaults to 12, the number of
            PCA components kept during preprocessing.
        dropout_prob: Dropout probability applied after every hidden
            activation. Defaults to 0.5.
    """

    def __init__(self, in_features=12, dropout_prob=0.5):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 32)
        self.fc6 = nn.Linear(32, 1)

        self.dropout_prob = dropout_prob

        self.batchnorm1 = nn.BatchNorm1d(128)
        self.batchnorm2 = nn.BatchNorm1d(256)
        self.batchnorm3 = nn.BatchNorm1d(128)
        self.batchnorm4 = nn.BatchNorm1d(64)
        self.batchnorm5 = nn.BatchNorm1d(32)

    def _hidden_block(self, x, linear, batchnorm):
        """Apply one Linear -> BatchNorm -> ReLU -> dropout stage."""
        out = F.relu(batchnorm(linear(x)))
        # Functional dropout does not track module mode by itself, so the
        # train/eval flag is passed explicitly.
        return F.dropout(out, p=self.dropout_prob, training=self.training)

    def forward(self, x):
        """Return predictions of shape ``(batch, 1)`` for input ``(batch, in_features)``."""
        out = self._hidden_block(x, self.fc1, self.batchnorm1)
        out = self._hidden_block(out, self.fc2, self.batchnorm2)
        out = self._hidden_block(out, self.fc3, self.batchnorm3)
        out = self._hidden_block(out, self.fc4, self.batchnorm4)
        out = self._hidden_block(out, self.fc5, self.batchnorm5)
        # No activation on the output layer: this is an unbounded regression head.
        return self.fc6(out)

In [None]:
def weight_init(m):
    """Re-draw the weights of an ``nn.Linear`` module with Kaiming-uniform init.

    Intended for ``model.apply(weight_init)``. Modules that are not
    ``nn.Linear`` are left untouched, and biases are not modified.
    """
    if not isinstance(m, nn.Linear):
        return
    init.kaiming_uniform_(m.weight.data)

In [None]:
# Build the network, move it to the selected device, then re-initialise
# every linear layer's weights via weight_init.
model = MLP_Regressor()
model = model.to(DEVICE)
model.apply(weight_init)

# Mean-squared-error criterion (the training code takes its square root)
# and an Adam optimiser over all model parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train(model, train_loader, optimizer, log_interval=200, epoch=None):
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``DEVICE`` and ``criterion``. The per-batch loss is
    the square root of ``criterion`` (RMSE when ``criterion`` is MSE).

    Args:
        model: Network to train; switched to training mode.
        train_loader: Iterable of ``(feature, label)`` batches.
        optimizer: Optimiser stepping ``model``'s parameters.
        log_interval: Print progress every this many batches.
        epoch: Epoch number shown in the progress line. When omitted, the
            notebook-level loop variable ``epoch`` is used if it exists,
            matching the earlier behaviour of reading that global directly.

    Returns:
        The mean of the per-batch losses (0.0 for an empty loader).
    """
    if epoch is None:
        epoch = globals().get('epoch', '?')

    model.train()
    train_epoch_loss = 0.0

    for batch_idx, (feature, label) in enumerate(train_loader):
        feature = feature.to(DEVICE)
        label = label.to(DEVICE)
        optimizer.zero_grad()
        output = model(feature)
        # The model emits (batch, 1) while the labels are (batch,). Without
        # aligning the shapes the criterion broadcasts to (batch, batch) and
        # compares every prediction with every label.
        loss = torch.sqrt(criterion(output, label.reshape(output.shape)))
        loss.backward()
        optimizer.step()
        train_epoch_loss += loss.item()

        if batch_idx % log_interval == 0:
            print("Train Epoch: {} [{}/{} ({:.0f}%)]\tTrain Loss: {:.6f}".format(
                epoch, batch_idx * len(feature),
                len(train_loader.dataset), 100. * batch_idx / len(train_loader),
                loss.item()))

    # Average over the number of batches actually seen; dataset_size / BATCH_SIZE
    # is fractional whenever the last batch is partial.
    return train_epoch_loss / max(len(train_loader), 1)

In [None]:
def validate(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` without updating it.

    Uses the module-level ``DEVICE`` and ``criterion``. The per-batch loss is
    the square root of ``criterion`` (RMSE when ``criterion`` is MSE).

    Args:
        model: Network to evaluate; switched to evaluation mode and left there.
        test_loader: Iterable of ``(feature, label)`` batches.

    Returns:
        The mean of the per-batch losses (0.0 for an empty loader).
    """
    model.eval()
    valid_epoch_loss = 0.0

    with torch.no_grad():
        for feature, label in test_loader:
            feature = feature.to(DEVICE)
            label = label.to(DEVICE)
            output = model(feature)
            # Align (batch,) labels with the (batch, 1) output so the criterion
            # does not broadcast to a (batch, batch) comparison.
            loss = torch.sqrt(criterion(output, label.reshape(output.shape)))
            valid_epoch_loss += loss.item()

    # Average over the number of batches actually seen; dataset_size / BATCH_SIZE
    # is fractional whenever the last batch is partial.
    return valid_epoch_loss / max(len(test_loader), 1)

In [None]:
# Per-epoch loss histories, plotted afterwards.
train_loss, val_loss = [], []

# NOTE: the loop variable must stay named `epoch`; train() uses it in its
# progress line.
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1} of {EPOCHS}")

    # One optimisation pass, then one evaluation pass on the held-out split.
    train_epoch_loss = train(model, train_loader, optimizer, log_interval=1000)
    val_epoch_loss = validate(model, test_loader)

    train_loss.append(train_epoch_loss)
    val_loss.append(val_epoch_loss)

    print("\n[EPOCH: {}], \tTest Loss: {:.4f}\n".format(epoch+1, val_epoch_loss))

In [None]:
# Training vs. validation loss per epoch.
plt.figure(figsize=(10, 7))
plt.plot(train_loss, color='orange', label='train loss')
# Legend label corrected (was misspelled 'validataion loss').
plt.plot(val_loss, color='red', label='validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

## 5. Submission on Test Data

In [None]:
# Apply the scaler and the PCA projection that were fitted on the training
# split to the raw test features, then move the result to DEVICE as a tensor.
test_data = pca.transform(std.transform(test_df.values))
test_data = torch.Tensor(test_data).to(DEVICE)

In [None]:
# Display the preprocessed test tensor as a sanity check.
test_data

In [None]:
sample_submission = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# Put the model in evaluation mode explicitly so dropout is disabled and batch
# norm uses its running statistics, whatever mode an earlier cell left it in.
model.eval()
with torch.no_grad():
    prediction = model(test_data)

# The model emits shape (n_rows, 1); flatten it to one value per row.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv - confirm.
sample_submission['loss'] = prediction.cpu().numpy().ravel()
sample_submission.to_csv("submission.csv", index=False)