In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
from matplotlib import pyplot as plt


from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
import pytorch_lightning as pl

# Seed the random number generators once, up front; `workers=True` also
# requests seeding inside DataLoader worker processes.
pl.seed_everything(42, workers=True)


In [None]:
# Load the competition training table into memory.
data_df = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/train.csv')


In [None]:
# Relative frequency of each value of the `claim` target.
data_df['claim'].value_counts(normalize=True)

In [None]:
# Number of missing values in each column.
data_df.isnull().sum()

In [None]:
# Drop the first column by position. `.copy()` makes the result an independent
# frame, so the fold assignments below write into `data_df` itself rather than
# into a view/slice of the frame that was read from disk.
# NOTE: re-running this cell without reloading `data_df` drops one more column.
data_df = data_df.iloc[:, 1:].copy()

# Tag every row with the index of the stratified fold in which it is held out.
# -1 marks "not assigned yet"; after the loop every row has a fold in [0, 13).
data_df['fold'] = -1
fold_col = data_df.columns.get_loc('fold')

# No explicit random_state: the shuffle draws from NumPy's global RNG
# (presumably the one seeded at the top of the notebook -- verify if the
# seeding call is ever moved or removed).
skf = StratifiedKFold(n_splits=13, shuffle=True)
for fold_i, (_, val_inds) in enumerate(skf.split(data_df, data_df['claim'])):
    # One positional write on the frame itself. The former chained form
    # `data_df.fold.iloc[val_inds] = fold_i` assigns through a temporary Series,
    # which triggers SettingWithCopyWarning and does not update the frame at all
    # when pandas copy-on-write is active.
    data_df.iloc[val_inds, fold_col] = fold_i

<h1> Network implementation </h1>

In [None]:
# Install MONAI from the wheel attached as a dataset. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install ../input/monaiweekly07/monai_weekly-0.7.dev2135-py3-none-any.whl

In [None]:
from torch.utils.data import TensorDataset, DataLoader
import torch
from torch import nn
from torch.nn import functional as F
from monai.metrics import ROCAUCMetric
from pytorch_lightning.core.memory import ModelSummary


class Model(pl.LightningModule):
    """Binary classifier for tabular rows: pointwise 1-D convolutions plus an MLP head.

    Each sample (a vector of ``in_size`` features) is reshaped to a one-channel
    sequence, passed through four ``kernel_size=1`` convolutions, flattened and
    mapped to a single logit by two linear layers. Training minimises binary
    cross-entropy on the logits; validation accumulates ROC AUC.
    """

    def __init__(self, in_size, lr):
        """Create the layers and initialise the convolution weights.

        Args:
            in_size: Number of features per input row. It sizes the first
                linear layer (64 convolution channels per feature).
            lr: Learning rate passed to the RMSprop optimizer.
        """
        super().__init__()

        self.conv1 = nn.Conv1d(1, 8, kernel_size=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=1)
        self.conv4 = nn.Conv1d(32, 64, kernel_size=1)
        # kernel_size=1 keeps the sequence length, so the flattened width is
        # (conv4 channels) * in_size -- 7552 for the 118-feature data set.
        self.fc1 = nn.Linear(self.conv4.out_channels * in_size, 256)
        self.fc2 = nn.Linear(256, 1)
        self.relu = F.relu
        self.flatten = nn.Flatten()
        self.roc_auc_metric = ROCAUCMetric()
        self.bnorm1 = nn.BatchNorm1d(32)
        # Not used in forward(); kept registered so the module's state_dict
        # keys stay the same as before.
        self.bnorm2 = nn.BatchNorm1d(64)
        self.lr = lr
        self.apply(self.init_weights)

    def init_weights(self, m):
        """Initialise ``m`` if it is a Conv1d: Kaiming-normal weights, bias 0.01.

        Called once per sub-module via ``self.apply``; every other module type
        keeps its default initialisation.
        """
        if isinstance(m, nn.Conv1d):
            torch.nn.init.kaiming_normal_(m.weight)
            m.bias.data.fill_(0.01)

    def forward(self, x):
        """Return one logit per row.

        Args:
            x: 2-D tensor of shape ``(n, s)`` (rows, features).

        Returns:
            Tensor of shape ``(n, 1)`` holding raw (pre-sigmoid) scores.
        """
        n, s = x.shape
        # Add a channel axis: (n, s) -> (n, 1, s).
        x = x.reshape(n, 1, s)
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.bnorm1(self.conv3(x)))
        x = self.relu(self.conv4(x))
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute the binary cross-entropy (with logits) loss for one batch."""
        X, y = batch
        y_hat = self(X).squeeze(1)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        return {
            'loss': loss,
        }

    def validation_step(self, batch, batch_idx):
        """Feed this batch's logits and targets to the ROC AUC accumulator."""
        X, y = batch
        y_hat = self(X).squeeze(1)
        self.roc_auc_metric(y_hat, y)

    def validation_epoch_end(self, training_step_outputs):
        """Aggregate, reset, print and log ROC AUC for the finished validation pass.

        ``training_step_outputs`` is not used; the metric object already holds
        everything accumulated in ``validation_step``.
        """
        roc_auc = self.roc_auc_metric.aggregate()
        self.roc_auc_metric.reset()
        print('roc_auc:', roc_auc)
        self.log('roc_auc', roc_auc)

    def predict_step(self, X, batch_idx: int, dataloader_idx: int = None):
        """Return raw logits for a batch; ``X[0]`` is the feature tensor of the batch."""
        return self(X[0])

    def configure_optimizers(self):
        """Optimise all parameters with RMSprop at the configured learning rate."""
        optimizer = torch.optim.RMSprop(self.parameters(), lr=self.lr)
        return optimizer

In [None]:
import gc

# Settings for the cross-validated training run.
LEARNING_RATE = 0.0001
BATCH_SIZE = 256
MAX_EPOCHS = 10
# Up to 16 loader workers, but never more than the machine has CPUs.
NUM_WORKERS = min(16, os.cpu_count() or 1)

# One fitted preprocessing pipeline per fold, indexed by fold number; the
# inference cell reuses them to transform the test set.
pipes = []

# Fold labels are 0..k-1, so derive k from the data instead of repeating it here.
n_folds = int(data_df['fold'].max()) + 1
for fold_i in range(n_folds):
    print('Fold:', fold_i)
    in_val_fold = data_df['fold'] == fold_i
    train_data = data_df[~in_val_fold]
    val_data = data_df[in_val_fold]
    X_train = train_data.drop(['claim', 'fold'], axis=1)
    y_train = train_data.claim
    X_val = val_data.drop(['claim', 'fold'], axis=1)
    y_val = val_data.claim

    # Mean-impute missing values, then standardise. Fitted on the training
    # part only so validation rows do not influence the statistics.
    pipe = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ("scaler", StandardScaler()),
    ])

    pipe.fit(X_train)
    pipes.append(pipe)

    # The transformed arrays go straight into tensors; there is no need to
    # wrap them in DataFrames again.
    X_train = pipe.transform(X_train)
    X_val = pipe.transform(X_val)

    train_ds = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train.values))
    val_ds = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val.values))

    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
    val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

    model = Model(X_train.shape[1], LEARNING_RATE)

    # Keep the checkpoint with the highest validation ROC AUC for this fold;
    # the fold number and the score end up in the file name.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(dirpath='models', filename=f'model_{fold_i}_' + '{roc_auc:.3}', monitor='roc_auc', mode='max', save_weights_only=True)

    print(ModelSummary(model))
    # NOTE(review): `auto_lr_find=True` presumably has no effect here because
    # `trainer.tune(model)` is never called -- confirm whether an LR search
    # was intended.
    trainer = pl.Trainer(
        fast_dev_run=False, max_epochs=MAX_EPOCHS,
         gpus=1,precision=16,
         auto_lr_find=True,  limit_train_batches=1.0, limit_val_batches=1.0,
         num_sanity_val_steps=0, val_check_interval=0.33,
         callbacks=[checkpoint_callback]
     )

    trainer.fit(model, train_dl, val_dl)

    # Release this fold's objects and cached GPU memory before the next fold.
    del model, trainer, val_data, train_data, X_train, X_val, y_train, y_val, train_ds, val_ds, train_dl, val_dl, in_val_fold
    gc.collect()
    torch.cuda.empty_cache()

<h1> Inference </h1>

In [None]:
# Show the checkpoint files written to the `models` directory.
!ls models

In [None]:
def select_best_checkpoints(filenames):
    """Pick one checkpoint file name per fold from ``filenames``.

    Recognised names look like ``model_<fold>_roc_auc=<score>.ckpt``, optionally
    with a ``-v<N>`` suffix before the extension; anything else is ignored.
    When a fold has several checkpoints, the one with the highest score in its
    name is kept.

    Args:
        filenames: Iterable of bare file names (no directory part).

    Returns:
        List of file names ordered by fold number.
    """
    best_by_fold = {}
    # Sorted so that ties between equal scores are resolved deterministically.
    for name in sorted(filenames):
        if not (name.startswith('model_') and name.endswith('.ckpt')):
            continue
        stem = name[:-len('.ckpt')]
        try:
            fold = int(stem.split('_')[1])
            score = float(stem.rsplit('=', 1)[1].split('-v')[0])
        except (IndexError, ValueError):
            continue
        if fold not in best_by_fold or score > best_by_fold[fold][0]:
            best_by_fold[fold] = (score, name)
    return [best_by_fold[fold][1] for fold in sorted(best_by_fold)]


# Discover the checkpoints instead of hard-coding their names: the names embed
# the validation score, so a hand-copied list stops matching the files as soon
# as a re-run produces a slightly different score.
trained_models = select_best_checkpoints(os.listdir('models'))
if not trained_models:
    raise FileNotFoundError("no 'model_<fold>_roc_auc=<score>.ckpt' files found in 'models'")
trained_models

In [None]:
# Test features (the `id` column is not a model input) and the submission template.
test_df = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
test_df = test_df.drop(['id'], axis=1)

sample_df = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv')
trainer = pl.Trainer(gpus=1)

# One network instance is reused; only its weights are swapped per checkpoint.
# The input width comes from the data rather than a hard-coded feature count.
model = Model(test_df.shape[1], 0.0001)

# One vector of raw model outputs (logits, no sigmoid applied) per checkpoint.
all_preds = []
for model_name in trained_models:
    # The fold number is the second '_'-separated token of the file name; it
    # selects the preprocessing pipeline fitted for that fold.
    fold_n = int(model_name.split('_')[1])
    pipe = pipes[fold_n]
    test_data = pipe.transform(test_df)

    checkpoint = torch.load('models/' + model_name)
    model.load_state_dict(checkpoint['state_dict'])
    test_ds = TensorDataset(torch.FloatTensor(test_data))
    test_dl = DataLoader(test_ds, batch_size=256, shuffle=False, num_workers=16)
    batch_preds = trainer.predict(model, test_dl)
    preds = torch.cat(batch_preds).cpu().numpy().flatten()
    all_preds.append(preds)

In [None]:
# Stack the per-checkpoint prediction vectors: one row per checkpoint.
np_all_preds = np.asarray(all_preds)
# Show the first and the last four test rows for every checkpoint.
(np_all_preds[:, :4], np_all_preds[:, -4:])

In [None]:
# Ensemble by averaging over checkpoints (axis 0), leaving one value per test row.
avg_preds = np_all_preds.mean(axis=0)
(avg_preds[:4], avg_preds[-4:])

In [None]:
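# Disabled experiment, kept for reference: rescale the averaged predictions so
# that their mean and standard deviation match those of the training target `claim`.
# avg_preds = data_df.claim.mean() + (avg_preds - avg_preds.mean()) * data_df.claim.std() / avg_preds.std()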

In [None]:
# Quick look at the first five averaged predictions before writing the submission.
avg_preds[:5]

In [None]:
# Put the averaged predictions into the `claim` column of the submission
# template and write it to disk without the index column.
sample_df['claim'] = avg_preds
sample_df.to_csv('submission.csv', index=False)

In [None]:
# List the contents of the `models` directory once more at the end of the run.
!ls models