In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from functools import partial
from collections import defaultdict

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split, Subset

import catalyst
from catalyst import dl

import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

import os

# Locate the input files: a local "data" folder by default, or the first
# entry under /kaggle/input when running inside a Kaggle kernel (in which case
# every available input file is also listed for convenience).
if not os.path.exists("/kaggle"):
    DATA_DIRECTORY = "data"
else:
    DATA_DIRECTORY = "/kaggle/input/" + os.listdir('/kaggle/input')[0]

    for root, _, names in os.walk('/kaggle/input'):
        for name in names:
            print(os.path.join(root, name))

# Single seed shared by every source of randomness in the notebook.
RANDOM_SEED = 1235

def file_path(filename):
    """Return the path of `filename` inside the module-level DATA_DIRECTORY."""
    # DATA_DIRECTORY is only read here, so no `global` declaration is needed.
    return os.path.join(DATA_DIRECTORY, filename)

# Seed both random number generators used below (NumPy and PyTorch) with the
# same value so that repeated runs draw the same random numbers.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
# Load features and targets in the same sig_id order so they line up row by row.
# sort_values() keeps the original (now shuffled) row labels; resetting the
# index makes labels equal positions again, so later steps that address rows by
# position and steps that address rows by label agree with each other.
train = (
    pd.read_csv(file_path("train_features.csv"))
    .sort_values(by='sig_id')
    .reset_index(drop=True)
)
targets = (
    pd.read_csv(file_path("train_targets_scored.csv"))
    .sort_values(by='sig_id')
    .reset_index(drop=True)
)
test = pd.read_csv(file_path("test_features.csv"))

# Baseline submission: every test row gets the per-target mean of the training
# targets (first column of `targets` is sig_id and is skipped).
submission = test[['sig_id']].assign(**targets.iloc[:, 1:].mean())

In [None]:
# `mask` is True for test rows that are NOT control-vehicle samples.
mask = test['cp_type'] != 'ctl_vehicle'
# Control rows get 0 for every target column (column 0 is sig_id).
# iloc is positional: hand it integer positions rather than a labelled boolean
# Series, whose acceptance by iloc is not part of its documented inputs.
submission.iloc[np.flatnonzero(~mask.to_numpy()), 1:] = 0

In [None]:
def basic_preprocess(X, y=None, pca=None):
    """Drop control rows and encode the categorical columns, in place.

    Rows of `X` whose ``cp_type`` is ``'ctl_vehicle'`` are removed; when `y`
    is given, the rows at the same positions are removed from it as well and
    its ``sig_id`` column is dropped. ``cp_type`` and ``sig_id`` are removed
    from `X`, ``cp_dose`` becomes +1.0 for ``'D2'`` and -1.0 otherwise, and
    ``cp_time`` is replaced by two +/-1.0 indicator columns ``cp_time1``
    (== 24) and ``cp_time2`` (== 48).

    Args:
        X: feature frame with ``sig_id``, ``cp_type``, ``cp_time`` and
            ``cp_dose`` columns. Modified in place.
        y: optional target frame, row-aligned with `X` by position, with a
            ``sig_id`` column. Modified in place.
        pca: unused; kept so existing callers keep working.

    Returns:
        ``(X, y)`` when `y` is given, otherwise `X` (the same objects that
        were passed in).
    """
    # Boolean position mask of the control rows. Rows are removed through the
    # labels found at those positions, because DataFrame.drop works on labels:
    # feeding it raw positions removes the wrong rows whenever the index is
    # not 0..n-1 (e.g. after sort_values).
    is_control = (X['cp_type'] == 'ctl_vehicle').to_numpy()

    if y is not None:
        y.drop(index=y.index[is_control], inplace=True)
        y.drop(columns='sig_id', inplace=True)

    X.drop(index=X.index[is_control], inplace=True)
    X.drop(columns=['cp_type', 'sig_id'], inplace=True)

    # Map booleans {False, True} to {-1.0, +1.0}. The builtin `int` replaces
    # the `np.int` alias, which no longer exists in current NumPy releases.
    X['cp_dose'] = ((X['cp_dose'] == 'D2').astype(int) - 0.5) * 2
    X['cp_time1'] = ((X['cp_time'] == 24).astype(int) - 0.5) * 2
    X['cp_time2'] = ((X['cp_time'] == 48).astype(int) - 0.5) * 2
    X.drop(columns='cp_time', inplace=True)

    if y is not None:
        return X, y
    return X

def preprocess(train, targets, test, pca=None, scaler=None, label_smoothing=0):
    """Run the full preprocessing pipeline on train, targets and test.

    Both feature frames first go through `basic_preprocess`. Then each of the
    optional transformers (`pca` first, `scaler` second) is applied to the
    ``g-`` columns and afterwards to the ``c-`` columns: it is fitted on the
    train columns and the fitted transform is applied to the same test
    columns, with the results written back in place. Finally, when
    `label_smoothing` is non-zero, targets are clipped in place to
    ``[label_smoothing, 1 - label_smoothing]``.

    Returns:
        The tuple ``(train, targets, test)``.
    """
    train, targets = basic_preprocess(train, targets)
    test = basic_preprocess(test)

    for transformer in (pca, scaler):
        if not transformer:
            continue
        # The same transformer object is re-fitted for each column group, so
        # the test transform must follow directly after the matching fit.
        for prefix in ('g-', 'c-'):
            group = train.columns.str.startswith(prefix)
            train.loc[:, group] = transformer.fit_transform(train.loc[:, group])
            test.loc[:, group] = transformer.transform(test.loc[:, group])

    if label_smoothing:
        targets.clip(label_smoothing, 1 - label_smoothing, inplace=True)

    return train, targets, test

# Clipping bound reused further down (model output clamp and final prediction
# clamp). The training targets themselves are left unsmoothed: preprocess is
# called with label_smoothing=0.
label_smoothing = 5e-4
train, targets, test = preprocess(
    train=train,
    targets=targets,
    test=test,
    label_smoothing=0,
)

In [None]:
# Convert the three preprocessed frames to float tensors.
X, y, X_t = (
    torch.tensor(frame.values, dtype=torch.float)
    for frame in (train, targets, test)
)

dataset = TensorDataset(X, y)

# The test set has no labels; pair it with an all-zero target tensor of the
# same width and dtype as `y` so it has the same (features, targets) layout.
placeholder_targets = torch.zeros(X_t.shape[0], y.shape[1], dtype=y.dtype)
test_dataset = TensorDataset(X_t, placeholder_targets)

In [None]:
class CrafterModel(nn.Module):
    """Fully connected network that outputs clamped logits.

    The network is three stages of BatchNorm1d -> Dropout -> weight-normalised
    Linear (widths 2048, 2048, `output_size`), with an ELU after every stage
    except the last. The output logits are clamped to ``[-b, b]`` where
    ``b = log((1 - label_clip) / label_clip)``, i.e. the logit range that
    corresponds to probabilities in ``[label_clip, 1 - label_clip]``.

    Args:
        input_size: number of input features.
        output_size: number of output logits.
        label_clip: probability clipping bound; a falsy value disables the
            clamp.
    """

    def __init__(self, input_size, output_size, label_clip=label_smoothing):
        super().__init__()
        self.model = nn.Sequential()
        # Decide on the constructor argument, not on the module-level
        # `label_smoothing`, so an explicitly passed `label_clip` is honoured.
        if label_clip:
            # logit(label_clip), stored as a plain Python float so the clamp
            # in forward() does not depend on which device the input is on.
            self.clip_min = (
                torch.log(torch.tensor(label_clip))
                - torch.log(torch.tensor(1 - label_clip))
            ).item()
        else:
            self.clip_min = -float("inf")

        hidden_sizes = [2048, 2048, output_size]
        dropouts = [0.2, 0.4, 0.4]
        assert len(hidden_sizes) == len(dropouts)
        for i, (hidden_size, dropout) in enumerate(zip(hidden_sizes, dropouts), start=1):
            self.model.add_module(f"batch_norm{i}", nn.BatchNorm1d(input_size))
            self.model.add_module(f"dropout{i}", nn.Dropout(dropout))
            linear_layer = torch.nn.utils.weight_norm(nn.Linear(input_size, hidden_size))
            self.model.add_module(f"linear{i}", linear_layer)

            # No activation after the last stage: it produces the raw logits.
            if i != len(hidden_sizes):
                self.model.add_module(f"activation{i}", nn.ELU())

            # The next stage consumes this stage's output.
            input_size = hidden_size

    def forward(self, input):
        """Return the network's logits for `input`, clamped to ``[clip_min, -clip_min]``."""
        logits = self.model(input)
        return torch.clamp(logits, self.clip_min, -self.clip_min)
    
class CustomRunner(dl.SupervisedRunner):
    """Supervised runner whose per-batch step records a BCE-with-logits loss."""

    # One loss module shared by every instance; averages over all elements.
    loss = nn.BCEWithLogitsLoss(reduction='mean')

    def _handle_batch(self, batch):
        """Run the model on ``batch['features']`` and store the loss under ``"loss"``."""
        logits = self.model(batch['features'])
        batch_loss = CustomRunner.loss(logits, batch['targets'])
        self.batch_metrics.update({"loss": batch_loss})
        
class EpochMetricSaverCallback(dl.Callback):
    """Callback that appends selected epoch metrics to per-metric lists.

    Args:
        metrics: iterable of metric names to record; ``None`` records every
            key present in ``runner.epoch_metrics`` at the end of each epoch.
    """

    def __init__(self, metrics=None):
        super().__init__(dl.CallbackOrder.Logging)
        self.metrics = metrics
        # metric name -> list of values, one entry per finished epoch
        self.metrics_log = defaultdict(list)

    def on_epoch_end(self, runner):
        """Append the current value of every tracked metric to its history."""
        names = runner.epoch_metrics if self.metrics is None else self.metrics
        for name in names:
            self.metrics_log[name].append(runner.epoch_metrics[name])

In [None]:
def train_model(train_dataset, valid_dataset, num_epochs=15):
    """Train a fresh CrafterModel on one train/validation split.

    Args:
        train_dataset: dataset for the shuffled "train" loader (batch size 128).
        valid_dataset: dataset for the "valid" loader (batch size 1024).
        num_epochs: maximum number of epochs passed to the runner.

    Returns:
        ``(model, metrics_log)``: the trained model and the per-epoch metric
        history collected by an EpochMetricSaverCallback.
    """
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=1024, shuffle=False)

    # Layer sizes come from the module-level feature/target tensors X and y.
    n_features, n_targets = X.shape[1], y.shape[1]
    model = CrafterModel(n_features, n_targets)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.1,
        patience=3,
        eps=1e-4,
    )

    metric_save = EpochMetricSaverCallback()
    callbacks = [dl.EarlyStoppingCallback(5), metric_save]

    CustomRunner().train(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders={"train": train_loader, "valid": valid_loader},
        num_epochs=num_epochs,
        logdir="./logs",
        callbacks=callbacks,
        verbose=True,
        load_best_on_end=True,
    )

    return model, metric_save.metrics_log

def predict(model, dataset):
    """Return sigmoid probabilities of `model` over `dataset` as one CPU tensor.

    The dataset is read in batches of 1024 through a SupervisedRunner; the
    ``'logits'`` entry of each batch result is passed through a sigmoid, moved
    to the CPU, and the batches are concatenated in order.
    """
    loader = DataLoader(dataset, batch_size=1024)
    batch_outputs = dl.SupervisedRunner().predict_loader(model=model, loader=loader)

    probabilities = [
        torch.sigmoid(output['logits']).to('cpu') for output in batch_outputs
    ]
    return torch.cat(probabilities)

In [None]:
# K-fold cross-validation: train one model per fold and keep, for each fold,
# the model, its metric history and its predictions on the test set.
n_splits = 5
models, predictions, metric_logs = [], [], []

kfold = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

for train_idx, valid_idx in tqdm(kfold.split(dataset), total=n_splits):
    train_dataset = Subset(dataset, train_idx)
    valid_dataset = Subset(dataset, valid_idx)

    model, metric_log = train_model(train_dataset, valid_dataset)

    # Move the model to the CPU before storing it and predicting with it.
    models.append(model.to('cpu'))
    metric_logs.append(metric_log)
    predictions.append(predict(model, test_dataset))

# Stack the per-fold prediction tensors along a new leading (fold) dimension.
predictions = torch.stack(predictions)

In [None]:
# Average the folds (dimension 0) and keep every probability inside
# [label_smoothing, 1 - label_smoothing].
total_results = predictions.mean(dim=0).clamp_(label_smoothing, 1 - label_smoothing)

Best train loss: 0.120  
Best valid loss: 0.0160   
Real loss: 0.01991  
num_epochs: 3  

In [None]:
# Write the averaged predictions into the rows where `mask` is True
# (column 0 is sig_id and is left untouched).
treated_positions = mask.to_numpy().nonzero()[0]
submission.iloc[treated_positions, 1:] = total_results

In [None]:
# Save the submission without the row index, floats with 7 decimal places.
submission.to_csv(
    "submission.csv",
    index=False,
    float_format='%.7f',
)