Here is a succinct version of a basic neural network. The architecture is currently taken from another kernel. There are so many similar notebooks that it's hard to tell where it originated.

Most of the notebooks felt long and slightly messy. So I wrote a short version of it.

I was quite disappointed by PyTorch Lightning: despite its shiny presentation and version 1.0, it's buggy under the hood and the documentation is incomplete. Something goes very wrong during training, and it's puzzling why a self-written loop performs much better.

In [None]:
import sys
sys.path.append("/kaggle/input/multilabel-stratification")  # https://github.com/trent-b/iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import torch
from torch import nn
from torch.utils import data
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pandas as pd
from pathlib import Path
from statistics import mean, stdev
from tqdm.auto import tqdm
from functools import reduce

In [None]:
# Directory that holds the competition CSV files read by the cells below.
data_dir = Path("/kaggle/input") / "lish-moa"

# Load raw data

In [None]:
# Every table is indexed by its "sig_id" column so rows can be aligned by label.

# Training inputs.
D_feats = pd.read_csv(data_dir.joinpath("train_features.csv"), index_col="sig_id")
display(D_feats.head(2))

# Training targets.
D_targets = pd.read_csv(data_dir.joinpath("train_targets_scored.csv"), index_col="sig_id")
display(D_targets.head(2))

# Test inputs.
D_feats_test = pd.read_csv(data_dir.joinpath("test_features.csv"), index_col="sig_id")

# Prepare data

In [None]:
def process(df, select_trt=True):
    """Turn a raw feature table into an all-float32 model input table.

    Args:
        df: Feature frame containing at least the columns ``cp_type``,
            ``cp_dose`` and ``cp_time``. It is not modified.
        select_trt: When true, keep only the rows whose ``cp_type`` equals
            ``"trt_cp"``; when false, keep every row.

    Returns:
        A new frame without ``cp_type``, with ``cp_dose`` encoded as
        ``D1 -> 0`` / ``D2 -> 1``, ``cp_time`` divided by its maximum over
        the kept rows, and every column cast to ``float32``.
    """
    dose_codes = {"D1": 0, "D2": 1}

    kept = df[df["cp_type"] == "trt_cp"] if select_trt else df
    kept = kept.drop(columns="cp_type")

    # `assign` overwrites the existing columns in place, so column order is kept.
    encoded = kept.assign(
        cp_dose=kept["cp_dose"].map(dose_codes),
        cp_time=kept["cp_time"] / kept["cp_time"].max(),
    )

    return encoded.astype("float32")
    
# Training inputs: every row is kept (no filtering on cp_type).
X = process(D_feats, select_trt=False)
display(X.head(2))

# Targets aligned row-by-row with X through the shared index.
Y = D_targets.loc[X.index].astype("float32")

# Test inputs: only the "trt_cp" rows are kept.
Xt = process(D_feats_test, select_trt=True)
display(Xt.head(2))

# Fold definition

In [None]:
n_splits = 5
splitter_random_seed = 123

fold_splitter = MultilabelStratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=splitter_random_seed,
)

# One fold number per row of X; -1 marks rows not yet assigned to a
# validation fold.
folds = pd.Series(-1, index=X.index)
for fold_idx, (train_idx, val_idx) in enumerate(fold_splitter.split(X, Y)):
    # The splitter yields positional indices, hence `.iloc`.
    folds.iloc[val_idx] = fold_idx

In [None]:
class FoldTensorDatasets:
    """Serve (train, validation) ``TensorDataset`` pairs for a fold assignment.

    Args:
        X: Feature frame.
        Y: Target frame indexed like ``X``.
        fold: Series indexed like ``X`` giving the validation fold of each row.
    """

    def __init__(self, X, Y, fold):
        self.X = X
        self.Y = Y
        self._fold = fold
        # Distinct fold labels, in ascending order.
        self._folds = sorted(fold.unique())

    def _as_dataset(self, row_labels):
        """Build a float ``TensorDataset`` from the rows named by *row_labels*."""
        features = torch.tensor(self.X.loc[row_labels].values, dtype=torch.float)
        targets = torch.tensor(self.Y.loc[row_labels].values, dtype=torch.float)
        return data.TensorDataset(features, targets)

    def fold(self, fold_idx):
        """Return ``(train_dataset, val_dataset)`` for validation fold *fold_idx*.

        Rows assigned to *fold_idx* form the validation set; every other row
        goes to the training set.
        """
        is_val = self._fold == fold_idx
        train_labels = self._fold[~is_val].index
        val_labels = self._fold[is_val].index
        return self._as_dataset(train_labels), self._as_dataset(val_labels)

    def folds(self):
        """Lazily yield the ``fold`` result for each fold label in ascending order."""
        yield from map(self.fold, self._folds)


# Fold-aware view over the prepared training data.
dataset = FoldTensorDatasets(X=X, Y=Y, fold=folds)

# Model definition

In [None]:
# Training schedule and batch sizes.
max_epochs = 25
train_batch_size = 128
val_batch_size = 1024

# Widths of the two hidden layers.
hidden_size1 = 200
hidden_size2 = 400

# Input and output widths, taken from the prepared data.
num_feats, num_targets = X.shape[1], Y.shape[1]
    
   
class Model(nn.Module):
    """Feed-forward network with three weight-normalised linear stages.

    Each stage is BatchNorm1d -> Dropout -> weight-normalised Linear; the two
    hidden stages are followed by ReLU, the output stage is not. Layer widths
    come from the module-level ``num_feats``, ``hidden_size1``,
    ``hidden_size2`` and ``num_targets``.
    """

    def __init__(self):
        super().__init__()

        # (input width, output width, dropout probability) for each stage.
        stages = [
            (num_feats, hidden_size1, 0.2),
            (hidden_size1, hidden_size2, 0.5),
            (hidden_size2, num_targets, 0.5),
        ]

        layers = []
        for position, (n_in, n_out, drop_p) in enumerate(stages, start=1):
            layers.append(nn.BatchNorm1d(n_in))
            layers.append(nn.Dropout(drop_p))
            layers.append(nn.utils.weight_norm(nn.Linear(n_in, n_out)))
            # No activation after the final stage: it emits raw logits.
            if position < len(stages):
                layers.append(nn.ReLU())

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (pre-sigmoid) outputs for the batch *x*."""
        logits = self.net(x)
        return logits

    def infer(self, x):
        """Return the sigmoid of the network outputs for the batch *x*."""
        return torch.sigmoid(self.net(x))
    

# Training

In [None]:
# Train one model per fold, save its weights, and record its final
# validation loss.
fold_val_losses = []
model_filenames = []
# Use the GPU when there is one; otherwise fall back to the CPU so the cell
# still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"

for fold_num, (train_data, val_data) in tqdm(enumerate(dataset.folds(), 1), total=n_splits):
    print(f"Fold {fold_num}")

    # Reshuffle the training rows every epoch so the mini-batches differ
    # between epochs; the validation order does not matter.
    train_loader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=val_batch_size)

    model = Model().to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # The scheduler is stepped once per batch, so it needs the total number
    # of optimisation steps (epochs * steps_per_epoch).
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        pct_start=0.1,
        div_factor=1e3,
        max_lr=1e-2,
        epochs=max_epochs,
        steps_per_epoch=len(train_loader),
    )

    for epoch in range(max_epochs):
        model.train()
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            out = model(x)
            train_loss = F.binary_cross_entropy_with_logits(out, y)
            train_loss.backward()
            optimizer.step()
            scheduler.step()

        # Capture the last training batch loss before the validation loop
        # runs, so the log line below really reports a training loss.
        last_train_loss = train_loss.item()

        model.eval()
        val_losses = []
        with torch.no_grad():
            for x, y in val_loader:
                x = x.to(device)
                y = y.to(device)

                out = model(x)
                val_loss = F.binary_cross_entropy_with_logits(out, y)
                val_losses.append(val_loss.item())

        mean_val_loss = mean(val_losses)
        print(f"[e{epoch:02}] Last Train Loss: {last_train_loss:.4f}, Val Loss {mean_val_loss:.4f}")

    model_filename = f"model_fold{fold_num}.pth"
    torch.save(model.state_dict(), model_filename)
    model_filenames.append(model_filename)

    # Validation loss after the final epoch of this fold.
    fold_val_losses.append(mean_val_loss)

print(f"Avg. fold val. loss: {mean(fold_val_losses):.5f}+-{stdev(fold_val_losses):.5f}")
print("TRAINING DONE")
%ls *.pth

# Predict

In [None]:
# Average the per-fold model predictions on the test set and write the
# submission file.
# Use the GPU when there is one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Model().to(device)

fold_preds = []

for model_filename in model_filenames:
    # map_location lets the checkpoint load onto whichever device is in use,
    # even if it was saved from a different one.
    model.load_state_dict(torch.load(model_filename, map_location=device))
    model.eval()

    preds = []

    # Inference needs no gradients; no_grad avoids building an autograd
    # graph for every batch.
    with torch.no_grad():
        for i in range(0, len(Xt), val_batch_size):
            X_batch = Xt.iloc[i:i + val_batch_size]

            cur_preds = model.infer(torch.tensor(X_batch.values).to(device))

            preds.append(pd.DataFrame(cur_preds.cpu().numpy(), index=X_batch.index, columns=Y.columns))

    fold_preds.append(pd.concat(preds))

# Element-wise mean of the per-fold prediction frames.
Yt = reduce(pd.DataFrame.add, fold_preds) / len(fold_preds)

# Rows of the raw test table that are absent from Xt get a prediction of 0
# for every target.
Yt = Yt.reindex(D_feats_test.index).fillna(0)
Yt.to_csv("submission.csv", float_format="%.4e")
Yt.head()

That's all folks.