In [None]:
import numpy as np
import pandas as pd
import os

# Normalization and train/validation split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
def normalize(X_train, X_valid, X_test, normalize_opt, excluded_feat):
    """Scale feature columns of the three splits with a scaler fitted on the training split.

    The scaler is fitted on ``X_train`` only and then applied to all three
    frames, so no statistics from the validation or test data enter the fit.

    Args:
        X_train, X_valid, X_test: DataFrames with the same feature columns.
            They are modified IN PLACE; pass copies to keep the originals.
        normalize_opt: ``'min_max'`` to apply ``MinMaxScaler``, or ``None`` to
            return the frames untouched.
        excluded_feat: iterable of column names to leave unscaled
            (``None`` is treated as "exclude nothing").

    Returns:
        The tuple ``(X_train, X_valid, X_test)`` (the same objects that were passed in).

    Raises:
        ValueError: if ``normalize_opt`` is neither ``None`` nor ``'min_max'``.
    """
    if normalize_opt is None:
        return X_train, X_valid, X_test
    # Fail early with a clear message; previously an unknown option fell
    # through to an unbound `scaler` variable.
    if normalize_opt != 'min_max':
        raise ValueError(
            f"Unsupported normalize_opt {normalize_opt!r}; expected 'min_max' or None"
        )

    excluded = excluded_feat if excluded_feat is not None else ()
    feats = [f for f in X_train.columns if f not in excluded]
    if not feats:
        # Nothing left to scale once the exclusions are applied.
        return X_train, X_valid, X_test

    scaler = preprocessing.MinMaxScaler().fit(X_train[feats])
    for split in (X_train, X_valid, X_test):
        split[feats] = scaler.transform(split[feats])
    return X_train, X_valid, X_test

In [None]:
# Directory holding the CSV files read below.
PATH_DATASET = '/kaggle/input/vulcanic-preprocessing/'

# os.path.join avoids the doubled '//' that plain string concatenation
# produced because PATH_DATASET already ends with a slash.
train_sample = pd.read_csv(os.path.join(PATH_DATASET, 'train_sample.csv'))
targets = pd.read_csv(os.path.join(PATH_DATASET, 'targets.csv'))
# The first column of test.csv is dropped before use.
# NOTE(review): presumably an index/id column — confirm against the file.
test = pd.read_csv(os.path.join(PATH_DATASET, 'test.csv')).iloc[:, 1:]

# LSTM-based neural network

In [None]:
# Install pywick, which provides the Nadam optimizer imported below.
!pip install pywick

In [None]:
import torch

from sklearn import preprocessing
from torch.nn import functional as F
from torch import nn
from pytorch_lightning.core.lightning import LightningModule
from pywick.optimizers.nadam import Nadam
from torch.utils.data import TensorDataset, DataLoader
from torchvision import transforms
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Number of models trained in the ensemble loop.
NUM_MODELS = 20
# Batch size handed to every DataLoader built by VolcanicLSTM.
BATCH_SIZE = 9999
# Upper bound on training epochs (passed to Trainer as max_epochs).
NUM_EPOCHS = 1500
# Root directory for checkpoints; each model gets its own numbered subdirectory.
PATH_MODEL = '/kaggle/working/models/'
# Directory from which sample_submission.csv is read.
PATH_DATA = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'

In [None]:
class VolcanicLSTM(LightningModule):
    """BatchNorm -> LSTM -> three strided Conv1d layers -> three Linear layers, one output.

    Training and validation use L1 loss; the optimizer is Nadam.

    The dataloader hooks read the module-level names ``train_x``/``train_y``,
    ``valid_x``/``valid_y``, ``test_scaled`` and ``BATCH_SIZE``, so those must
    be defined before the model is handed to a ``Trainer``.
    """

    def __init__(self, num_features):
        """Build the layers.

        Args:
            num_features: number of input features per sample (size of the
                flattened input fed to BatchNorm and the LSTM).
        """
        super().__init__()

        self.bn = nn.BatchNorm1d(num_features=num_features)
        # batch_first=True is required: forward() feeds (batch, 1, features).
        # With the default layout nn.LSTM would read dim 0 as the time axis and
        # run its recurrence ACROSS the samples of a batch, making each
        # prediction depend on the rows that precede it.
        self.lstm = nn.LSTM(input_size=num_features, hidden_size=128, num_layers=1, batch_first=True)

        # With kernel_size=2, padding=1, stride=2 a length-1 input stays length 1
        # (floor((1 + 2 - 2) / 2) + 1 = 1), so Flatten yields exactly 64 features.
        self.conv1 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=2, padding=1, stride=2)
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=84, kernel_size=2, padding=1, stride=2)
        self.conv3 = nn.Conv1d(in_channels=84, out_channels=64, kernel_size=2, padding=1, stride=2)

        self.flat = nn.Flatten()
        self.lin1 = nn.Linear(in_features=64, out_features=64)
        self.lin2 = nn.Linear(in_features=64, out_features=32)
        self.lin3 = nn.Linear(in_features=32, out_features=1)

    def forward(self, x):
        """Return one prediction per sample, shape ``(batch, 1)``.

        Args:
            x: tensor whose trailing dimensions flatten to ``num_features``
                values per sample (the notebook passes ``(batch, 1, num_features)``).
        """
        x = x.view(x.size(0), -1)       # (batch, features)
        x = self.bn(x)
        x = torch.unsqueeze(x, 1)       # (batch, seq_len=1, features)
        x, _ = self.lstm(x)             # (batch, 1, 128)

        x = x.permute(0, 2, 1)          # (batch, 128, 1): channels first for Conv1d
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))

        x = self.flat(x)
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        # NOTE(review): the ReLU on the output clamps predictions to >= 0; if the
        # pre-activation is negative for every input, the output and its gradient
        # are zero. Confirm this is intended.
        x = F.relu(self.lin3(x))

        return x

    def configure_optimizers(self):
        """Optimize all parameters with Nadam at a learning rate of 0.005."""
        return Nadam(self.parameters(), lr=0.005)

    def training_step(self, batch, batch_idx):
        """Return the L1 loss of the predictions for one training batch."""
        x, y = batch
        return F.l1_loss(self(x), y)

    def validation_step(self, batch, batch_idx):
        """Compute the L1 loss for one validation batch and log it as ``val_loss``."""
        x, y = batch
        loss = F.l1_loss(self(x), y)
        self.log('val_loss', loss)
        return loss

    @staticmethod
    def _make_loader(*arrays):
        """Wrap the given arrays as tensors in a DataLoader with batch size BATCH_SIZE."""
        dataset = TensorDataset(*(torch.Tensor(a) for a in arrays))
        return DataLoader(dataset, batch_size=BATCH_SIZE)

    def train_dataloader(self):
        """DataLoader over the module-level ``train_x`` / ``train_y`` arrays."""
        return self._make_loader(train_x, train_y)

    def val_dataloader(self):
        """DataLoader over the module-level ``valid_x`` / ``valid_y`` arrays."""
        return self._make_loader(valid_x, valid_y)

    def test_dataloader(self):
        """DataLoader over the module-level ``test_scaled`` array (inputs only)."""
        return self._make_loader(test_scaled)

In [None]:
submission = pd.read_csv(f'{PATH_DATA}/sample_submission.csv')

# Running sums over the accepted models; a later cell divides them by num_used_models.
sub_final = np.zeros(len(submission))
avg_valid_mae = 0
num_used_models = 0

# A model only joins the ensemble if its validation MAE is below this value.
MAX_VALID_MAE = 10000000

# The split uses a fixed random_state and the scaler is fitted on the same
# training rows every time, so the prepared arrays are identical for every
# model: build them once instead of once per iteration.
train_x, valid_x, train_y, valid_y = train_test_split(train_sample, targets, test_size=0.2, random_state=0)
train_x, valid_x, test_scaled = normalize(train_x.copy(), valid_x.copy(), test.copy(), 'min_max', [])
# Reshape to (n_samples, 1, n_features), the 3-D layout the model is fed.
train_x = train_x.values.reshape(train_x.shape[0], 1, train_x.shape[1])
valid_x = valid_x.values.reshape(valid_x.shape[0], 1, valid_x.shape[1])
train_y = train_y.to_numpy()
valid_y = valid_y.to_numpy()
test_scaled = test_scaled.values.reshape(test_scaled.shape[0], 1, test_scaled.shape[1])

for i in range(NUM_MODELS):
    print('\nRunning model ', i)

    os.makedirs(f'{PATH_MODEL}/{str(i)}', exist_ok=True)

    model = VolcanicLSTM(num_features=train_x.shape[-1])
    # Keep the checkpoint with the lowest logged 'val_loss'.
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath=f'{PATH_MODEL}/{str(i)}',
        filename='best_model-{epoch}'
    )
    trainer = Trainer(
        # Fall back to CPU when no GPU is available instead of failing at start-up.
        gpus=1 if torch.cuda.is_available() else None,
        callbacks=[checkpoint_callback],
        min_epochs=1,
        max_epochs=NUM_EPOCHS,
        progress_bar_refresh_rate=0,
    )

    # The model's dataloader hooks read train_x / train_y / valid_x / valid_y defined above.
    trainer.fit(model)

    print(checkpoint_callback.best_model_path)
    test_model = VolcanicLSTM.load_from_checkpoint(checkpoint_callback.best_model_path, num_features=train_x.shape[-1])
    # Inference mode: BatchNorm must use its running statistics, not the
    # statistics of whatever batch is being scored.
    test_model.eval()

    with torch.no_grad():
        valid_preds = test_model(torch.Tensor(valid_x)).reshape(-1).numpy()
    # True mean absolute error, matching the L1 training loss and the printed
    # label (the previous value was RMSE reported as "MAE").
    mae = float(np.mean(np.abs(valid_y.reshape(-1) - valid_preds)))
    print(f'{i} MAE: {mae:.0f}')
    if mae < MAX_VALID_MAE:
        avg_valid_mae += mae
        with torch.no_grad():
            sub_final += test_model(torch.Tensor(test_scaled)).reshape(-1).numpy()
        num_used_models += 1

In [None]:
# Without at least one accepted model there is nothing to average; fail with a
# clear message instead of a ZeroDivisionError.
if num_used_models == 0:
    raise RuntimeError('No model passed the validation threshold; cannot build a submission.')

# Turn the running sums into means.
# NOTE: these in-place divisions are not idempotent — re-running this cell
# without re-running the training cell divides the values again.
avg_valid_mae /= num_used_models
sub_final /= num_used_models

print(f'\nNumber of used models: {num_used_models}')
print(f'\nAverage validation MAE for used models: {avg_valid_mae:.0f}')

submission['time_to_eruption'] = sub_final
submission.to_csv('submission.csv', index=False)