In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from typing import List
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import random
from sklearn import metrics


In [None]:
# Directory that holds the competition CSV files; defined once so the
# location only has to be changed in a single place.
DATA_DIR = "../input/tabular-playground-series-may-2022"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# The DataLoader wrapper defined further down reads df["target"] and drops it
# from the features for every frame, so the test frame gets a constant
# placeholder column here.
test_df["target"] = 0

# Hold out 10% of the training rows as a validation split; random_state is
# fixed so the split is the same on every run.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [None]:
# Run configuration shared by the cells below.
# Keys read by the code visible in this notebook: "~lr", "~batch_size",
# "~epochs", "~early_stopping_patience" and "activation".
# "~optimizer", "~loss" and "model" are not read anywhere in the visible
# cells (Adam and BCELoss are hard-coded in the training cell).
# NOTE(review): the meaning of the "~" key prefix is not shown here -- confirm
# before relying on it.
params = {
        "~lr": 0.01,
        "~batch_size": 2048,
        "~epochs": 40,
        "~early_stopping_patience": 6,
        "~optimizer": "adam",
        "~loss": "bce",
        "activation": "swish",
        "model": "baseline"
    }

# Feature engineering

In [None]:
class DataProcess:
    """Feature engineering for the tabular frames used in this notebook.

    The constructor fits a ``StandardScaler`` on the frame it is given;
    ``preprocess`` then applies that fitted scaler plus a fixed set of derived
    features to any frame with the same raw columns.

    Attributes:
        scaler: the fitted ``StandardScaler``.
        numerical_cols: columns that are standardised (``f_00`` .. ``f_26``
            and ``f_28``).
        float_cols: columns whose dtype was ``float64`` in the frame passed to
            the constructor; the row-wise statistics are computed over these.
    """

    # Levels that are one-hot encoded for each categorical column.  Using a
    # fixed list (instead of whatever levels happen to occur in a frame)
    # guarantees every processed frame has the same columns.
    _ONE_HOT_LEVELS = (("f_29", (0, 1)), ("f_30", (0, 1, 2)))

    def __init__(self, df: pd.DataFrame) -> None:
        """Fit the scaler and record the float columns of ``df``."""
        self.scaler = StandardScaler()
        self.numerical_cols = [f"f_{i:02d}" for i in range(27)] + ["f_28"]
        self.float_cols = [i for i in df.columns if df[i].dtype == "float64"]
        self.scaler.fit(df[self.numerical_cols].values)

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a new, feature-engineered copy of ``df``.

        The input frame is left untouched.  Steps, in order:

        1. standardise ``numerical_cols`` with the fitted scaler;
        2. replace ``f_29`` / ``f_30`` by 0/1 indicator columns
           (``f_29_0``, ``f_29_1``, ``f_30_0``, ``f_30_1``, ``f_30_2``);
        3. replace the string column ``f_27`` by one integer per character
           position (offset from ``"A"``) and the number of distinct
           characters;
        4. append row-wise statistics over ``float_cols``.

        Args:
            df: frame containing the raw columns listed above.

        Returns:
            The processed frame (a copy).
        """
        # Work on a copy so the caller's frame (possibly a slice produced by
        # train_test_split) is never modified in place.
        df = df.copy()
        df[self.numerical_cols] = self.scaler.transform(df[self.numerical_cols].values)

        # f_29, f_30 -> one-hot.  Indicators are built explicitly so that a
        # level missing from this particular frame still yields a column, and
        # so the dtype is numeric (uint8) on every pandas version.
        for col, levels in self._ONE_HOT_LEVELS:
            codes = df[col]
            df = df.drop(columns=col)
            for level in levels:
                df[f"{col}_{level}"] = (codes == level).astype(np.uint8)

        # https://www.kaggle.com/code/ambrosm/tpsmay22-gradient-boosting-quickstart/notebook
        # https://www.kaggle.com/code/ambrosm/tpsmay22-eda-which-makes-sense
        # f_27 -> each of the first 10 characters as an integer offset from "A"
        # f_27 -> number of distinct characters
        text = df["f_27"]
        for i in range(10):
            df[f"f_27_{i}_int"] = text.str[i].map(ord) - ord("A")
        df["f_27_nunique"] = text.apply(lambda c: len(set(c)))
        df = df.drop(columns="f_27")

        # Row-wise statistics over the float columns.
        # https://www.kaggle.com/code/cv13j0/tps-may22-eda-neuronal-nets/notebook
        floats = df[self.float_cols]
        row_mean = floats.mean(axis=1)
        df["f_sum"] = floats.sum(axis=1)
        df["f_min"] = floats.min(axis=1)
        df["f_max"] = floats.max(axis=1)
        df["f_mean"] = row_mean
        df["f_std"] = floats.std(axis=1)
        # Mean absolute deviation, spelled out because DataFrame.mad() no
        # longer exists in pandas >= 2.0.
        df["f_mad"] = floats.sub(row_mean, axis=0).abs().mean(axis=1)
        df["f_kurt"] = floats.kurt(axis=1)
        # Number of strictly positive values per row.  (.count() would count
        # non-missing cells of the boolean mask, i.e. a constant.)
        df["f_count_pos"] = floats.gt(0).sum(axis=1)

        return df


In [None]:
# Fit the scaler (and record the float columns) on the training split only,
# then run the same fitted preprocessing over the train, validation and test
# frames.
processor = DataProcess(train_df)
train_df = processor.preprocess(train_df)
val_df = processor.preprocess(val_df)
test_df = processor.preprocess(test_df)

# DataLoader

In [None]:
class DataLoader:
    """Builds a ``torch.utils.data.DataLoader`` from a processed DataFrame.

    Every column except ``"id"`` and ``"target"`` becomes a feature; the
    ``"target"`` column becomes the label.
    """

    class Dataset(torch.utils.data.Dataset):
        """Map-style dataset over a pair of parallel arrays."""

        def __init__(self, x: np.ndarray, y: np.ndarray):
            self.x = x
            self.y = y
            self.len = len(self.x)

        def __getitem__(self, index):
            return self.x[index], self.y[index]

        def __len__(self):
            return self.len

    class Sampler(torch.utils.data.Sampler):
        """Yields the indices ``0 .. l-1``, optionally in shuffled order."""

        def __init__(self, l: int, shuffle: bool) -> None:
            # The base-class initialiser is deliberately not called with an
            # argument: its ``data_source`` parameter is unused and deprecated
            # in recent torch releases (passing it emits a warning), and the
            # base class keeps no state of its own.
            self.len = l
            self.shuffle = shuffle

        def __iter__(self):
            """Iterate over the indices; reshuffled on every new iteration."""
            order = list(range(self.len))
            if self.shuffle:
                # Uses Python's global ``random`` state.
                random.shuffle(order)
            yield from order

        def __len__(self) -> int:
            return self.len

    def __init__(self, df: pd.DataFrame) -> None:
        self.x = df.drop(columns=["id", "target"]).values
        self.y = df["target"].values

    def get(self, is_train=False, batch_size=None) -> torch.utils.data.DataLoader:
        """Create the torch loader.

        Args:
            is_train: when true, indices are shuffled and a trailing
                incomplete batch is dropped.
            batch_size: optional explicit batch size.  When omitted, training
                loaders use ``params["~batch_size"]`` and evaluation loaders
                use a single batch spanning the whole dataset.

        Returns:
            A ``torch.utils.data.DataLoader`` yielding ``(x, y)`` batches.
        """
        dataset = self.Dataset(self.x, self.y)
        sampler = self.Sampler(len(self.x), shuffle=is_train)
        if batch_size is None:
            batch_size = params["~batch_size"] if is_train else len(dataset)

        return torch.utils.data.DataLoader(
            dataset=dataset,
            sampler=sampler,
            batch_size=batch_size,
            drop_last=is_train,
        )

    
# Training loader: shuffled mini-batches.  The validation and test loaders
# use the default is_train=False, i.e. no shuffling and a single batch that
# spans the whole frame.
train_ds = DataLoader(train_df).get(is_train=True)
val_ds = DataLoader(val_df).get()
test_ds = DataLoader(test_df).get()


# Model

In [None]:
class Model(nn.Module):
    """Small fully connected binary classifier.

    Architecture: ``input -> 64 -> 128 -> 32 -> 1``.  Each hidden linear layer
    is followed by batch normalisation and the configured activation; the
    output layer is followed by a sigmoid, so ``forward`` returns
    probabilities.

    Args:
        input_size: number of input features.
        activation: ``"relu"`` or ``"swish"``.  When omitted, the value of
            ``params["activation"]`` is used.

    Raises:
        ValueError: if the activation name is not supported.
    """

    # Supported activation names and the functions they map to.
    _ACTIVATIONS = {"relu": F.relu, "swish": F.silu}

    def __init__(self, input_size, activation=None):
        name = params["activation"] if activation is None else activation
        # Fail at construction time; otherwise an unknown name would leave
        # ``self.activation`` undefined and only surface inside ``forward``.
        if name not in self._ACTIVATIONS:
            raise ValueError(
                f"unsupported activation {name!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}"
            )

        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc5 = nn.Linear(128, 32)
        self.bn5 = nn.BatchNorm1d(32)
        self.fc6 = nn.Linear(32, 1)
        self.activation = self._ACTIVATIONS[name]

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch,)`` probabilities."""
        x = self.activation(self.bn1(self.fc1(x)))
        x = self.activation(self.bn2(self.fc2(x)))
        x = self.activation(self.bn5(self.fc5(x)))
        x = torch.sigmoid(self.fc6(x))

        # Drop only the trailing feature axis.  A bare squeeze() would also
        # remove the batch axis when the batch holds a single row.
        return x.squeeze(-1)

# Prefer the GPU, but fall back to the CPU so this cell does not fail on a
# machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Input width = every column except the row identifier and the label.
model = Model(len(set(train_df.columns) - {"id", "target"})).to(device)


# Train

In [None]:
def get_scheduler(optimizer, train_dataloader, epochs=None, warmup_ratio=0.1):
    """Build a linear warm-up / linear decay learning-rate schedule.

    The schedule is sized for one ``scheduler.step()`` per training batch:
    the total number of steps is ``epochs * len(train_dataloader)``.

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        train_dataloader: training loader; only its length is used.
        epochs: number of epochs the schedule should span.  Defaults to
            ``params["~epochs"]``.
        warmup_ratio: fraction of the total steps spent warming up.

    Returns:
        The scheduler returned by ``get_linear_schedule_with_warmup``.
    """
    if epochs is None:
        epochs = params["~epochs"]

    num_training_steps = int(epochs * len(train_dataloader))
    num_warmup_steps = int(warmup_ratio * num_training_steps)

    return get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps
    )


In [None]:
class Callback:
    """Base class for training-loop hooks.

    Every hook is a no-op here; subclasses override the ones they need.
    """

    def __init__(self) -> None:
        pass

    def on_val_end(self, preds: torch.Tensor, gts: torch.Tensor, loss):
        """Called after validation with predictions, targets and the loss."""
        pass

    def on_train_batch_end(self, preds: torch.Tensor, gts: torch.Tensor, loss):
        """Called after each training batch."""
        pass

    def on_epoch_end(self, loss, val_loss, model: torch.nn.Module) -> bool:
        """Called after each epoch.

        Returns:
            ``True`` to keep training, ``False`` to request a stop.  The base
            implementation never requests a stop.
        """
        return True

    def on_train_finish(self, model: torch.nn.Module):
        """Called once after the training loop has ended."""
        pass


class EarlyStopping(Callback):
    """Stop training when the validation loss stops improving.

    Keeps a snapshot of the model weights from the epoch with the lowest
    validation loss and restores it in ``on_train_finish``.

    Args:
        patience: number of consecutive non-improving epochs after which a
            stop is requested.  Defaults to
            ``params["~early_stopping_patience"]``.
    """

    def __init__(self, patience=None) -> None:
        if patience is None:
            patience = params["~early_stopping_patience"]
        self.patience = patience
        self.min_loss = np.inf
        self.counter = 0
        self.best_state_dict = None

    def on_val_end(self, preds: torch.Tensor, gts: torch.Tensor, loss):
        pass

    def on_train_batch_end(self, preds: torch.Tensor, gts: torch.Tensor, loss):
        pass

    def on_epoch_end(self, loss, val_loss, model: torch.nn.Module) -> bool:
        """Track the best validation loss; return ``False`` once patience runs out."""
        if val_loss < self.min_loss:
            self.min_loss = val_loss
            self.counter = 0
            # state_dict() hands back references to the live parameter
            # tensors, which the optimizer keeps updating in place.  Clone
            # them so the snapshot really preserves this epoch's weights.
            self.best_state_dict = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }
        else:
            self.counter += 1

        return self.counter < self.patience

    def on_train_finish(self, model: torch.nn.Module):
        """Load the best snapshot into ``model`` (no-op if none was recorded)."""
        if self.best_state_dict is None:
            return
        model.load_state_dict(self.best_state_dict)

        
class WandbCallback(Callback):
    """Collects loss histories and prints the validation ROC AUC.

    The actual ``wandb.log`` calls are disabled in this notebook; the metric
    is printed instead and the losses are kept in plain lists.
    """

    def __init__(self) -> None:
        # Per-epoch and per-batch loss histories.  ``val_batch_losses`` is
        # created here but nothing in this class appends to it.
        self.train_epoch_losses, self.val_epoch_losses = [], []
        self.train_batch_losses, self.val_batch_losses = [], []

    def on_val_end(self, preds: np.ndarray, gts: np.ndarray, loss):
        """Print the ROC AUC of ``preds`` against ``gts``.

        Despite the annotations, both arguments must support
        ``.detach().cpu()``, i.e. they are expected to be torch tensors.
        """
        targets = gts.detach().cpu()
        scores = preds.detach().cpu()

        false_pos_rate, true_pos_rate, _ = metrics.roc_curve(targets, scores)
        roc_auc = metrics.auc(false_pos_rate, true_pos_rate)
        # wandb.log({"roc_auc": roc_auc})  -- disabled

        print(f"roc_auc: {roc_auc}")
        return True

    def on_train_batch_end(self, preds: np.ndarray, gts: np.ndarray, loss):
        """Record the loss of one training batch."""
        self.train_batch_losses.append(loss)

    def on_epoch_end(self, loss, val_loss, model: torch.nn.Module) -> bool:
        """Record the epoch losses; always asks the loop to continue."""
        self.val_epoch_losses.append(val_loss)
        self.train_epoch_losses.append(loss)
        # wandb.log({"loss": loss, "val_loss": val_loss})  -- disabled

        return True

def epoch_train(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    scheduler,
    train_loader,
    criterion,
    callbacks: "List[Callback]" = None,
):
    """Run one training epoch and return the mean batch loss.

    For every batch: forward pass, loss, backward pass, optimizer step and
    scheduler step (the scheduler advances once per batch, not per epoch).
    Each callback's ``on_train_batch_end`` is invoked after every batch.

    Args:
        model: network to train; batches are moved to the device its
            parameters live on.
        optimizer: optimizer updating ``model``.
        scheduler: learning-rate scheduler stepped after every batch.
        train_loader: iterable of ``(x, y)`` batches that supports ``len()``.
        criterion: loss function called as ``criterion(pred, target)``.
        callbacks: optional list of callbacks.

    Returns:
        Mean of the per-batch losses over the epoch.
    """
    # ``None`` instead of a shared mutable default list.
    if callbacks is None:
        callbacks = []

    # Follow the model instead of assuming a GPU is present.
    device = next(model.parameters()).device
    model.train()

    losses = []
    for batch in tqdm(train_loader, total=len(train_loader), unit=" batch"):
        batch_x = batch[0].to(torch.float32).to(device)
        batch_y = batch[1].to(torch.float32).to(device)

        optimizer.zero_grad()
        pred_y = model(batch_x)
        loss = criterion(pred_y, batch_y)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Read the scalar once; every .item() call synchronises with the device.
        loss_value = loss.item()
        losses.append(loss_value)

        for cb in callbacks:
            cb.on_train_batch_end(pred_y, batch_y, loss_value)

    return np.mean(losses)


def epoch_val(
    model: torch.nn.Module, val_loader, criterion, callbacks: "List[Callback]" = None
):
    """Evaluate ``model`` on the whole validation loader.

    Predictions from every batch are gathered (without tracking gradients),
    the loss is computed once over all of them, and each callback's
    ``on_val_end`` is invoked exactly once with the full predictions and
    targets.  With a loader that yields a single batch this is the same
    result as evaluating that batch directly.

    Args:
        model: network to evaluate; batches are moved to the device its
            parameters live on.
        val_loader: iterable of ``(x, y)`` batches.
        criterion: loss function called as ``criterion(pred, target)``.
        callbacks: optional list of callbacks.

    Returns:
        The validation loss as ``numpy.float64`` (``nan`` if the loader
        yields no batches).
    """
    # ``None`` instead of a shared mutable default list.
    if callbacks is None:
        callbacks = []

    # Follow the model instead of assuming a GPU is present.
    device = next(model.parameters()).device
    model.eval()

    pred_chunks, gt_chunks = [], []
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory, which matters when a batch spans the whole dataset.
    with torch.no_grad():
        for batch in val_loader:
            batch_x = batch[0].to(torch.float32).to(device)
            batch_y = batch[1].to(torch.float32).to(device)
            # atleast_1d guards against 0-d outputs so the chunks can be
            # concatenated.
            pred_chunks.append(torch.atleast_1d(model(batch_x)))
            gt_chunks.append(torch.atleast_1d(batch_y))

    if not pred_chunks:
        return np.float64("nan")

    all_preds = torch.cat(pred_chunks)
    all_gts = torch.cat(gt_chunks)
    loss = criterion(all_preds, all_gts).item()

    for cb in callbacks:
        cb.on_val_end(all_preds, all_gts, loss)

    return np.float64(loss)


def predict(model: torch.nn.Module, test_loader):
    """Run ``model`` over every batch of ``test_loader``.

    Batches are concatenated along the first axis and then given the same
    leading axes the single-batch implementation produced, so for a model
    that returns one value per row the result has shape ``(1, 1, n_rows)``
    and ``preds.squeeze()`` yields the flat prediction vector.

    Args:
        model: network to evaluate; batches are moved to the device its
            parameters live on.
        test_loader: iterable of ``(x, y)`` batches.

    Returns:
        ``(preds, gts)`` as numpy arrays.

    Raises:
        ValueError: if ``test_loader`` yields no batches.
    """
    # Follow the model instead of assuming a GPU is present.
    device = next(model.parameters()).device
    model.eval()

    pred_chunks, gt_chunks = [], []
    # Inference only: do not build an autograd graph.
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(torch.float32).to(device)
            batch_y = batch_y.to(torch.float32).to(device)
            pred_y = model(batch_x)

            # atleast_1d guards against 0-d outputs so the chunks can be
            # concatenated.
            pred_chunks.append(np.atleast_1d(pred_y.cpu().numpy()))
            gt_chunks.append(np.atleast_1d(batch_y.cpu().numpy()))

    if not pred_chunks:
        raise ValueError("test_loader yielded no batches")

    # Treat all batches as one big batch, then restore the historical
    # (-1, rows, cols) layout callers already squeeze.
    preds = np.concatenate(pred_chunks)[np.newaxis]
    preds = preds.reshape(-1, preds.shape[-2], preds.shape[-1])
    gts = np.concatenate(gt_chunks)[np.newaxis]
    gts = gts.reshape(-1, gts.shape[-2], gts.shape[-1])
    return preds, gts


In [None]:
# BCELoss expects probabilities; Model.forward ends in a sigmoid.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=params["~lr"])
# The schedule is sized from the number of training batches because
# epoch_train steps the scheduler once per batch.
scheduler = get_scheduler(optimizer, train_ds)
callbacks = [EarlyStopping(), WandbCallback()]

for epoch in range(params["~epochs"]):
    loss = epoch_train(
        model, optimizer, scheduler, train_ds, criterion, callbacks
    )
    val_loss = epoch_val(model, val_ds, criterion, callbacks)
    print(epoch, ": train_loss", loss, "val_loss", val_loss)

    # Every callback votes on whether to continue; a single explicit False
    # (EarlyStopping running out of patience) ends training.
    res = [c.on_epoch_end(loss, val_loss, model) for c in callbacks]
    if False in res:
        print("Early stopping")
        break
        
# Give the callbacks a final hook; EarlyStopping uses it to load its stored
# state dict back into the model.
[c.on_train_finish(model) for c in callbacks]


# Submit

In [None]:
# Predict on the test loader; gts only holds the placeholder targets.
preds, gts = predict(model, test_ds)

sub = pd.read_csv("../input/tabular-playground-series-may-2022/sample_submission.csv")
# Assign with [] rather than attribute access: attribute assignment would
# silently create a plain Python attribute (not a column) if "target" were
# ever missing from the sample file.
sub["target"] = preds.squeeze()
sub.to_csv('submission.csv', index=False)
