In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
import pytorch_lightning as pl
from torch import nn
import torch.nn.functional as F
import torch
import math
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import TensorDataset, DataLoader
from pathlib import Path
from tqdm.notebook import tqdm

In [None]:
class RMSLELoss(nn.Module):
    """Root mean squared logarithmic error between two tensors.

    Computes ``sqrt(mean((log1p(pred) - log1p(actual)) ** 2))`` over all
    elements. Both inputs must be greater than -1 for the logarithm to be
    defined.
    """

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        """Return the scalar RMSLE of ``pred`` against ``actual``."""
        # log1p keeps precision for values close to zero; log(x + 1) rounds
        # x + 1 to exactly 1 in float32 and silently drops small errors.
        return torch.sqrt(self.mse(torch.log1p(pred), torch.log1p(actual)))
    
def RMSLE(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Thin wrapper that takes the square root of ``mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [None]:
class MLP(pl.LightningModule):
    """Multi-output MLP regressor that also owns its train/val/test data.

    The network is three Linear -> BatchNorm -> ReLU stages followed by a
    Linear layer and a Sigmoid, so predictions lie in (0, 1); targets are
    therefore expected to be scaled into that range (``y_scaler`` is used to
    map them back when computing the validation metric).

    Args:
        X: 2-D array of training features, shape ``(n_samples, n_features)``.
        y: 2-D array of training targets, shape ``(n_samples, n_targets)``.
        X_test: 2-D array of test features with the same columns as ``X``.
        learning_rate: Initial Adam learning rate (may be overwritten by the
            Lightning learning-rate finder, which looks up this attribute).
        y_scaler: Fitted scaler exposing ``inverse_transform`` for targets.
        seed: ``random_state`` for the 85/15 train/validation split.
    """

    def __init__(self, X, y, X_test, learning_rate, y_scaler, seed):
        super().__init__()
        # Stores every constructor argument (data arrays and scaler included)
        # as hyperparameters; `load_from_checkpoint` called with only a path
        # depends on this to rebuild the module, so it must stay.
        self.save_hyperparameters()

        # Size the network from the data instead of hard-coding 11 inputs and
        # 3 outputs, so changing the feature set does not break the model.
        n_features = X.shape[1]
        n_targets = y.shape[1]

        self.layers = nn.Sequential(
            nn.Linear(n_features, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, n_targets),
            nn.Sigmoid())

        self.X = X
        self.y = y
        self.X_test = X_test
        self.learning_rate = learning_rate
        self.seed = seed
        self.y_scaler = y_scaler
        self.loss = RMSLELoss()

    def forward(self, x):
        """Return predictions in the scaled target space for a feature batch."""
        return self.layers(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the RMSLE training loss on scaled targets."""
        x, y = batch
        y_hat = self(x)
        loss = self.loss(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return ``(y_true, y_pred)`` for one batch in original target units.

        The metric itself is computed once per epoch in
        ``validation_epoch_end`` so that it is the RMSLE of the whole
        validation set rather than an average of per-batch values.
        """
        x, y = batch
        y_hat = self(x)
        y_true = self.y_scaler.inverse_transform(y.detach().cpu().numpy())
        y_pred = self.y_scaler.inverse_transform(y_hat.detach().cpu().numpy())
        return y_true, y_pred

    def validation_epoch_end(self, val_step_outputs):
        """Log ``val_loss`` as the RMSLE over all validation batches combined."""
        # Averaging per-batch RMSLE values is not the RMSLE of the full set
        # (sqrt is non-linear and the last batch is smaller), so pool first.
        y_true = np.concatenate([true for true, _ in val_step_outputs])
        y_pred = np.concatenate([pred for _, pred in val_step_outputs])
        self.log('val_loss', RMSLE(y_true, y_pred))

    def configure_optimizers(self):
        """Adam plus a plateau scheduler that reacts to ``val_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = 0.75, patience=6, verbose = 1,mode = 'min', cooldown = 0, min_lr = 10e-7)
        optimizer_dict = {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}
        return optimizer_dict

    def setup(self, stage=None):
        """Split the training data 85/15 into train and validation parts.

        The ``*_scaled`` attributes hold the arrays exactly as passed to the
        constructor; no scaling is performed here.
        """
        X = self.X
        y = self.y
        X_test = self.X_test

        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.85, random_state=self.seed)

        self.X_train_scaled = X_train
        self.X_val_scaled = X_val
        self.X_test_scaled = X_test

        self.y_train_scaled = y_train
        self.y_val_scaled = y_val

    def train_dataloader(self):
        """Shuffled loader over the training split (batch size 256)."""
        dataset = TensorDataset(torch.FloatTensor(self.X_train_scaled), torch.FloatTensor(self.y_train_scaled))
        train_loader = DataLoader(dataset, batch_size=256, num_workers=8, shuffle=True)
        return train_loader

    def val_dataloader(self):
        """Ordered loader over the validation split (batch size 256)."""
        val_dataset = TensorDataset(torch.FloatTensor(self.X_val_scaled), torch.FloatTensor(self.y_val_scaled))
        val_loader = DataLoader(val_dataset, batch_size=256, num_workers=8, shuffle=False)
        return val_loader

    def test_dataloader(self):
        """Ordered, feature-only loader over the test set (batch size 512)."""
        test_dataset = TensorDataset(torch.FloatTensor(self.X_test_scaled))
        test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=8, shuffle=False)
        return test_dataloader

In [None]:
# Raw competition data.
train = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/test.csv')

# Derive the same calendar features from the timestamp for both frames.
for frame in (train, test):
    stamps = pd.DatetimeIndex(frame['date_time'])
    frame['hourofday'] = stamps.hour.values.astype(np.float32)
    frame['dayoftheweek'] = stamps.dayofweek.values.astype(np.float32)
    # NOTE: despite its name, the 'year' column stores the day of the year.
    frame['year'] = stamps.dayofyear.values.astype(np.float32)

# Split into features and the three regression targets.
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
X = train.drop(['date_time'] + target_cols, axis=1)
y = train[target_cols]
X_test = test.drop(['date_time'], axis=1)

# Standardise features; min-max scale the targets.
X_scaler = StandardScaler()
y_scaler = MinMaxScaler()

# Scalers are fitted on the training data only and reused for the test set.
X = X_scaler.fit_transform(X)
y = y_scaler.fit_transform(y)
X_test = X_scaler.transform(X_test)

In [None]:
N_FOLDS = 10

# Each "fold" trains an independent model on its own random 85/15
# train/validation split (selected through the seed); it is not a K-fold
# partition of the data.
scores = list()
preds = list()
for fold in tqdm(range(N_FOLDS)):
    fold_seed = 42 + fold
    # Seed the RNGs so weight initialisation and batch shuffling are
    # reproducible across Restart & Run All.
    pl.seed_everything(fold_seed)

    early_stop_callback = EarlyStopping(
       monitor='val_loss',
       min_delta=0.00,
       patience=20,
       verbose=True,
       mode='min',
    )
    ckpt_callback = ModelCheckpoint(mode="min",
                                    monitor="val_loss",
                                    dirpath='/kaggle/temp/', filename=f'fold_{N_FOLDS}_{fold}')

    model = MLP(X, y, X_test, 1e-3, y_scaler=y_scaler, seed=fold_seed)
    # First trainer only runs the learning-rate finder, which updates
    # model.learning_rate; the second trainer does the actual fit.
    trainer = pl.Trainer(auto_lr_find=True)
    trainer.tune(model)
    print('Learning rate:', model.learning_rate)
    trainer = pl.Trainer(callbacks=[early_stop_callback, ckpt_callback])
    trainer.fit(model)
    test_loader = model.test_dataloader()

    best_score = ckpt_callback.best_model_score.item()
    print(f'FOLD #{fold}| best rmsle: {best_score:.5g}')

    # Load exactly the checkpoint this run wrote. Globbing the directory can
    # return a stale or version-suffixed file from an earlier run, and the
    # 'fold_N_1*' pattern also matches fold 10+ when N_FOLDS > 10.
    model = MLP.load_from_checkpoint(ckpt_callback.best_model_path)
    model.eval()
    fold_batches = list()
    with torch.no_grad():  # inference only: skip building the autograd graph
        for x, in test_loader:
            fold_batches.append(model(x.to(model.device)).cpu().numpy())
    fold_pred = y_scaler.inverse_transform(np.concatenate(fold_batches))

    preds.append(fold_pred)
    scores.append(best_score)

In [None]:
# Average of the best validation RMSLE reached by each run.
np.asarray(scores).mean()

In [None]:
# Ensemble the per-run test predictions with an unweighted element-wise mean.
if not preds:
    # Fail loudly instead of dividing by zero or reusing a leftover `y_test`.
    raise ValueError('preds is empty; run the training loop before averaging')
y_test = np.mean(preds, axis=0)

In [None]:
# Load the sample submission file; its target columns are overwritten with
# the model predictions before it is written back out.
submission = pd.read_csv(
    filepath_or_buffer='/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv'
)

In [None]:
# Column i of y_test holds the predictions for the i-th target listed here.
for col_idx, target in enumerate(
        ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']):
    submission[target] = y_test[:, col_idx]

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)