https://www.kaggle.com/tmhrkt/grownet-gradient-boosting-neural-networks

In [None]:
!pip install adabelief-pytorch

In [None]:
from adabelief_pytorch import AdaBelief

In [None]:
import sys

sys.path.append("../input/iterative-stratification/iterative-stratification-master")
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import os
import copy
import random
import pickle
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import (
    StandardScaler,
    PowerTransformer,
    QuantileTransformer,
    OneHotEncoder,
)
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import time
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Directory that holds the competition CSV files.
DATADIR = "../input/lish-moa"

# Training features; X keeps only the numeric columns of that frame.
train_features = pd.read_csv(f"{DATADIR}/train_features.csv")
X = train_features.select_dtypes("number")
train_targets_nonscored = pd.read_csv(f"{DATADIR}/train_targets_nonscored.csv")
train_targets_scored = pd.read_csv(f"{DATADIR}/train_targets_scored.csv")
train_drug = pd.read_csv(f"{DATADIR}/train_drug.csv")

test_features = pd.read_csv(f"{DATADIR}/test_features.csv")
sample_submission = pd.read_csv(f"{DATADIR}/sample_submission.csv")

# Scored target column names: every column after the first one.
columns = train_targets_scored.iloc[:, 1:].columns

In [None]:
# Hyper-parameters read by the model builders, the training loop and the
# test-prediction cell.
params = {
    "batch_size": 256,
    "model": "MLP_2HL",
    "optimizer": "adabelief",
    "lr": 1e-2,
    "weight_decay": 1e-5,
    "n_folds": 5,
    "early_stopping_steps": 5,
    "hidden_size": 512,
    "boost_rate": 1.0,  # original: 1.0
    "num_nets": 20,  # Number of weak NNs. original: 40 (n_estimators?)
    "epochs_per_stage": 3,  # Number of epochs used to train the K-th weak model. original: 1
    "correct_epoch": 1,  # Number of epochs of the corrective step over all weak models. original: 1
    "model_order": "second",  # "second" (original default) or "first"; "first" is offered by the original implementation but has been seen to raise an error.
}
n_seeds = 5


# Set DEBUG = True for a quick smoke run with a smaller configuration.
DEBUG = False
if DEBUG:
    params["batch_size"] = 1024
    params["num_nets"] = 3
    params["epochs_per_stage"] = 1
    n_seeds = 2

In [None]:
# Feature column names grouped by their "g-" / "c-" prefix.
GENES = list(filter(lambda name: name.startswith("g-"), train_features.columns))
CELLS = list(filter(lambda name: name.startswith("c-"), train_features.columns))

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this call is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs, which
    # defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

# Preprocessing

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class ClippedFeatures(BaseEstimator, TransformerMixin):
    """Clip every column to quantile bounds learned in ``fit``.

    Args:
        copy: When true, ``transform`` works on a copy of its input;
            otherwise the input is clipped in place.
        high: Quantile used as the upper clipping bound.
        low: Quantile used as the lower clipping bound.
    """

    def __init__(self, copy=True, high=0.99, low=0.01):
        self.copy, self.high, self.low = copy, high, low

    def fit(self, X, y=None):
        """Store the per-column ``low``/``high`` quantiles of ``X``."""
        upper_bound = X.quantile(q=self.high)
        lower_bound = X.quantile(q=self.low)
        self.data_max_, self.data_min_ = upper_bound, lower_bound
        return self

    def transform(self, X):
        """Return ``X`` with each column clipped to the fitted bounds."""
        clipped = X.copy() if self.copy else X
        # In-place clip so that copy=False really modifies the caller's frame.
        clipped.clip(self.data_min_, self.data_max_, axis=1, inplace=True)
        return clipped

In [None]:
# Fit the quantile clipper on the numeric training columns and clip them.
clipped_features = ClippedFeatures()
X = clipped_features.fit_transform(X)

# Persist the fitted clipper; the test-prediction section reloads this file.
with open("clipped_features.pkl", "wb") as f:
    pickle.dump(clipped_features, f)

# Write the clipped values back into the training frame.
train_features[X.columns] = X

In [None]:
# Model inputs: "cp_time" plus every column from position 4 onward.
# Variant without "cp_time": train_features.columns[4:].tolist()
feature_cols = ["cp_time"] + train_features.columns[4:].tolist()
# Input width read by MLP_2HL.get_model.
params["feat_d"] = len(feature_cols)

In [None]:
# Attach the scored targets to the features by sample id.
train = pd.merge(train_features, train_targets_scored, on="sig_id")
test = test_features.copy()
# Every column of train_targets_scored, taken from the merged frame.
target = train.loc[:, train_targets_scored.columns]

In [None]:
# "cp_type" is not used as a model input.
train, test = (frame.drop("cp_type", axis=1) for frame in (train, test))

# Target names are all columns of `target` except the sample id.
target_cols = [
    name for name in target.columns.values.tolist() if name != "sig_id"
]

# CV strategy

In [None]:
import sys
sys.path.append('../input/iterativestratification')

import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelGroupStratifiedKFold(_BaseKFold):
    """K-fold splitter that keeps "regular" groups together.

    Groups whose size is 6, 12 or 18 are assigned to folds as whole groups;
    samples of all other groups are assigned to folds one by one. Both
    assignments are produced by ``MultilabelStratifiedKFold``.

    NOTE(review): ``Y`` and ``groups`` are used as pandas objects that share
    one index (``Y.groupby(groups)``, ``groups.map``) - confirm with callers.
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, Y=None, groups=None):
        """Yield, for each fold, the positional indices of its test samples."""
        # The same splitter configuration is used for groups and for samples.
        cv = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        # Partition group labels by group size.
        # NOTE(review): 6/12/18 presumably are the expected sample counts of a
        # complete group in this dataset - confirm.
        value_counts = groups.value_counts()
        regular_index = value_counts.loc[
            (value_counts == 6) | (value_counts == 12) | (value_counts == 18)
        ].index.sort_values()
        irregular_index = value_counts.loc[
            (value_counts != 6) & (value_counts != 12) & (value_counts != 18)
        ].index.sort_values()

        # Regular groups: stratify on the per-group mean of Y, one row per group.
        group_to_fold = {}
        tmp = Y.groupby(groups).mean().loc[regular_index]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            group_to_fold.update({group: fold for group in tmp.index[test]})

        # Irregular groups: stratify their samples individually, keyed by the
        # index label of each sample in Y.
        sample_to_fold = {}
        tmp = Y.loc[groups.isin(irregular_index)]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            sample_to_fold.update({sample: fold for sample in tmp.index[test]})

        # Group-level assignment first; samples left without a fold (NaN) are
        # filled from the sample-level assignment via their index label.
        folds = groups.map(group_to_fold)
        is_na = folds.isna()
        folds[is_na] = folds[is_na].index.map(sample_to_fold).values

        for i in range(self.n_splits):
            yield np.where(folds == i)[0]

# Dataset Classes

In [None]:
class MoADataset:
    """Map-style dataset that serves feature/target rows as float tensors.

    Args:
        features: 2-D array indexed as ``features[idx, :]``.
        targets: 2-D array indexed as ``targets[idx, :]``.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return ``{"x": features row, "y": targets row}`` for ``idx``."""
        x_tensor = torch.tensor(self.features[idx, :], dtype=torch.float)
        y_tensor = torch.tensor(self.targets[idx, :], dtype=torch.float)
        return {"x": x_tensor, "y": y_tensor}


class TestDataset:
    """Map-style dataset that serves feature rows only (no targets).

    Args:
        features: 2-D array indexed as ``features[idx, :]``.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return ``{"x": features row}`` for ``idx`` as a float tensor."""
        x_tensor = torch.tensor(self.features[idx, :], dtype=torch.float)
        return {"x": x_tensor}

# Dynamic Model

In [None]:
from enum import Enum


class ForwardType(Enum):
    """Identifiers for forward-pass variants.

    NOTE(review): no other code in the visible source references this enum.
    """

    SIMPLE = 0
    STACKED = 1
    CASCADE = 2
    GRADIENT = 3


class DynamicNet(object):
    """Additive ensemble of weak models with a learnable boost rate.

    Each weak model is called as ``m(x, middle_feat)`` and returns
    ``(middle_feat, prediction)``; the middle features of one model are fed
    to the next one. The ensemble output is
    ``c0 + boost_rate * sum(predictions)``.

    Args:
        c0: Tensor holding the initial (constant) prediction per target.
        lr: Initial value of the learnable boost rate.
    """

    def __init__(self, c0, lr):
        self.models = []
        self.c0 = c0
        self.lr = lr
        # Keep the boost rate on the same device as c0 instead of relying on
        # a module-level `device` global being defined before construction.
        self.boost_rate = nn.Parameter(
            torch.tensor(float(lr), requires_grad=True, device=c0.device)
        )

    def add(self, model):
        """Append a trained weak model to the ensemble."""
        self.models.append(model)

    def parameters(self):
        """Return all weak-model parameters followed by the boost rate."""
        params = []
        for m in self.models:
            params.extend(m.parameters())

        params.append(self.boost_rate)
        return params

    def zero_grad(self):
        for m in self.models:
            m.zero_grad()

    def to_cuda(self):
        for m in self.models:
            m.cuda()

    def to_eval(self):
        for m in self.models:
            m.eval()

    def to_train(self):
        for m in self.models:
            m.train(True)

    def _initial_prediction(self, x):
        """Return c0 repeated once per row of ``x`` (no model added yet)."""
        batch = x.shape[0]
        # Stays on c0's device, so this works on both CPU and GPU.
        return self.c0.detach().reshape(1, -1).repeat(batch, 1)

    def _sum_predictions(self, x):
        """Chain the weak models and return (last middle features, summed predictions)."""
        middle_feat_cum = None
        prediction = None
        for m in self.models:
            middle_feat_cum, pred = m(x, middle_feat_cum)
            # Out-of-place add: never mutates a tensor returned by a model.
            prediction = pred if prediction is None else prediction + pred
        return middle_feat_cum, prediction

    def forward(self, x):
        """Ensemble output with the weak models evaluated under ``no_grad``.

        Returns:
            ``(middle_feat, output)``; ``middle_feat`` is ``None`` while the
            ensemble is empty. The output still depends on ``boost_rate``
            through autograd unless the caller disables gradients.
        """
        if not self.models:
            return None, self._initial_prediction(x)
        with torch.no_grad():
            middle_feat_cum, prediction = self._sum_predictions(x)
        return middle_feat_cum, self.c0 + self.boost_rate * prediction

    def forward_grad(self, x):
        """Same as ``forward`` but gradients flow through every weak model."""
        if not self.models:
            return None, self._initial_prediction(x)
        middle_feat_cum, prediction = self._sum_predictions(x)
        return middle_feat_cum, self.c0 + self.boost_rate * prediction

    @classmethod
    def from_file(cls, path, builder, map_location=None):
        """Rebuild an ensemble saved by ``to_file``.

        Args:
            path: File written by ``to_file``.
            builder: Callable ``builder(stage)`` returning an uninitialised
                weak model for that stage.
            map_location: Optional device remapping forwarded to
                ``torch.load`` (e.g. ``"cpu"`` for a GPU checkpoint).
        """
        d = torch.load(path, map_location=map_location)
        net = cls(d["c0"], d["lr"])
        net.boost_rate = d["boost_rate"]
        for stage, m in enumerate(d["models"]):
            submod = builder(stage)
            submod.load_state_dict(m)
            net.add(submod)
        return net

    def to_file(self, path):
        """Save model state dicts, c0, lr and the boost rate to ``path``."""
        models = [m.state_dict() for m in self.models]
        d = {
            "models": models,
            "c0": self.c0,
            "lr": self.lr,
            "boost_rate": self.boost_rate,
        }
        torch.save(d, path)

# Weak Models

In [None]:
class MLP_2HL(nn.Module):
    """Weak learner: an MLP with two hidden layers.

    Args:
        dim_in: Width of the (possibly concatenated) input.
        dim_hidden1: Width of the first hidden layer.
        dim_hidden2: Width of the second hidden layer, which is also the
            width of the returned middle features.
        sparse: Accepted for signature compatibility; not used.
        bn: Accepted for signature compatibility; not used.
        dim_out: Number of outputs. ``None`` keeps the previous behaviour of
            using ``len(target_cols)`` from the module scope.
    """

    def __init__(
        self, dim_in, dim_hidden1, dim_hidden2, sparse=False, bn=True, dim_out=None
    ):
        super(MLP_2HL, self).__init__()
        if dim_out is None:
            # Fall back to the notebook-level target list.
            dim_out = len(target_cols)
        # Normalises the concatenation of raw input and lower-stage features.
        self.bn2 = nn.BatchNorm1d(dim_in)

        self.layer1 = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(dim_in, dim_hidden1),
            nn.ReLU(),
            nn.BatchNorm1d(dim_hidden1),
            nn.Dropout(0.4),
            nn.Linear(dim_hidden1, dim_hidden2),
        )
        self.layer2 = nn.Sequential(nn.ReLU(), nn.Linear(dim_hidden2, dim_out))

    def forward(self, x, lower_f):
        """Return ``(middle_feat, out)`` for input ``x``.

        When ``lower_f`` (the previous stage's middle features) is given it is
        concatenated to ``x`` and batch-normalised before the first layer.
        """
        if lower_f is not None:
            x = torch.cat([x, lower_f], dim=1)
            x = self.bn2(x)
        middle_feat = self.layer1(x)
        out = self.layer2(middle_feat)
        return middle_feat, out

    @classmethod
    def get_model(cls, stage, params):
        """Build the model for boosting stage ``stage``.

        Stage 0 sees only the raw features; later stages additionally receive
        the previous stage's middle features. ``params`` must provide
        ``"feat_d"`` and ``"hidden_size"``; an optional ``"num_targets"``
        sets the output width.
        """
        dim_in = params["feat_d"]
        if stage != 0:
            dim_in += params["hidden_size"]
        return cls(
            dim_in,
            params["hidden_size"],
            params["hidden_size"],
            dim_out=params.get("num_targets"),
        )

In [None]:
from torch.nn.modules.loss import _WeightedLoss


class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Args:
        weight: Optional rescaling weight forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``"mean"``, ``"sum"`` or anything else for no reduction
            (element-wise losses are returned).
        smoothing: Smoothing factor in ``[0, 1)``; targets are moved towards
            0.5 by this fraction.
    """

    def __init__(self, weight=None, reduction="mean", smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets: torch.Tensor, n_labels: int, smoothing=0.0):
        """Return ``targets * (1 - smoothing) + 0.5 * smoothing``.

        ``n_labels`` is accepted for signature compatibility and is not used.
        """
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1), self.smoothing)
        # Ask for element-wise losses; the functional default ("mean") would
        # collapse to a scalar and make self.reduction ineffective.
        loss = F.binary_cross_entropy_with_logits(
            inputs, targets, self.weight, reduction="none"
        )

        if self.reduction == "sum":
            return loss.sum()
        if self.reduction == "mean":
            return loss.mean()
        return loss

In [None]:
def get_optim(params, lr, weight_decay):
    """Return an Adam optimizer over ``params``.

    Args:
        params: Iterable of parameters to optimize.
        lr: Learning rate.
        weight_decay: L2 penalty coefficient.
    """
    return optim.Adam(params, lr=lr, weight_decay=weight_decay)


def get_optim_adabelief(params, lr, weight_decay):
    """Return an AdaBelief optimizer over ``params``.

    ``lr`` and ``weight_decay`` come from the caller; the remaining
    AdaBelief settings are fixed here.
    """
    fixed_settings = dict(
        eps=1e-16,
        betas=(0.9, 0.999),
        weight_decouple=True,
        rectify=False,
    )
    return AdaBelief(params, lr=lr, weight_decay=weight_decay, **fixed_settings)

In [None]:
def logloss(net_ensemble, test_loader):
    """Mean binary cross-entropy (on logits) of the ensemble over a loader.

    Per-batch losses are weighted by batch size, so the result is the mean
    over all samples even when the last batch is smaller than the others.

    Args:
        net_ensemble: Object whose ``forward(x)`` returns ``(_, logits)``.
        test_loader: Iterable of dicts with ``"x"`` and ``"y"`` tensors.

    Returns:
        A 0-dim tensor holding the mean loss.
    """
    # reduction="mean" by default: one scalar per batch.
    loss_f = nn.BCEWithLogitsLoss()
    weighted_loss = 0
    n_samples = 0
    for data in test_loader:
        x = data["x"].to(device)
        y = data["y"].to(device)
        batch_size = y.shape[0]
        with torch.no_grad():
            _, out = net_ensemble.forward(x)
            # Undo the per-batch mean so that batches of different sizes
            # contribute proportionally.
            weighted_loss += loss_f(out, y) * batch_size
        n_samples += batch_size

    return weighted_loss / n_samples

# Training

In [None]:
# Device name as a plain string; other cells compare it with "cuda".
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [None]:
# Initial ensemble output: log of each scored target's mean value.
c0_ = np.log(np.mean(train_targets_scored.iloc[:, 1:].values, axis=0))


def _build_weak_model(stage):
    """Instantiate the weak learner selected by params["model"] for ``stage``."""
    # Plain if-chain on purpose: class names are only resolved for the
    # selected model, so unused variants do not have to be defined.
    if params["model"] == "MLP_2HL_weight_norm":
        return MLP_2HL_weight_norm.get_model(stage, params)
    if params["model"] == "MLP_2HL_leaky_relu":
        return MLP_2HL_leaky_relu.get_model(stage, params)
    return MLP_2HL.get_model(stage, params)


def train_fn(n_seeds):
    """Train one boosted ensemble per (seed, fold) and score the OOF predictions.

    For every seed and fold, weak models are added stage by stage. Each stage
    fits a new weak model to the gradient direction of the current ensemble,
    then (from the second stage on) runs a corrective step over the whole
    ensemble. The ensemble with the best validation loss is saved to
    ``./{fold}FOLD_{seed}_.pth`` and reloaded for out-of-fold prediction.

    Side effects: writes the checkpoints above, ``counts.pkl`` and
    ``Y_pred.pkl``.

    Args:
        n_seeds: Number of seeds; OOF predictions are averaged over them.

    Returns:
        ``(score, y_pred)``: the mean per-target log loss of the OOF
        predictions and the OOF predictions as a DataFrame indexed by sig_id.
    """
    print(f"params: {params}")

    counts = np.empty((n_seeds * params["n_folds"], len(target_cols)))
    oof = np.zeros((len(train), len(target_cols)))

    for seed in tqdm(range(n_seeds)):
        seed_everything(seed)
        print("-" * 100)

        cv = MultilabelGroupStratifiedKFold(
            n_splits=params["n_folds"], random_state=seed, shuffle=True
        )
        cv_split = cv.split(
            train[feature_cols], train[target_cols], train_drug["drug_id"]
        )

        for fold, (trn_idx, val_idx) in enumerate(cv_split):
            print("=" * 25, f"fold: {fold}", "=" * 25)
            checkpoint_path = f"./{fold}FOLD_{seed}_.pth"

            x_train = train[feature_cols].iloc[trn_idx].values
            x_val = train[feature_cols].iloc[val_idx].values
            y_train = train[target_cols].iloc[trn_idx].values
            y_val = train[target_cols].iloc[val_idx].values

            # Number of positives per target in this training split.
            counts[seed * params["n_folds"] + fold] = (
                train[target_cols].iloc[trn_idx].sum()
            )

            train_ds = MoADataset(x_train, y_train)
            val_ds = MoADataset(x_val, y_val)
            train_loader = DataLoader(
                train_ds, batch_size=params["batch_size"], shuffle=True
            )
            val_loader = DataLoader(
                val_ds, batch_size=params["batch_size"], shuffle=False
            )

            best_score = np.inf
            best_stage = params["num_nets"] - 1

            c0 = torch.tensor(c0_, dtype=torch.float).to(device)
            net_ensemble = DynamicNet(c0, params["boost_rate"])
            loss_f1 = nn.MSELoss(reduction="none")
            loss_f2 = SmoothBCEwLogits(smoothing=0.001, reduction="none")

            lr = params["lr"]
            L2 = params["weight_decay"]

            early_stop = 0
            for stage in range(params["num_nets"]):
                t0 = time.time()

                model = _build_weak_model(stage)
                model.to(device)

                if params["optimizer"] == "adam":
                    optimizer = get_optim(model.parameters(), lr, L2)
                elif params["optimizer"] == "adabelief":
                    optimizer = get_optim_adabelief(model.parameters(), lr, L2)
                else:
                    # Without this, a typo would silently reuse the previous
                    # stage's corrective optimizer (or raise NameError).
                    raise ValueError(f"unknown optimizer: {params['optimizer']!r}")

                net_ensemble.to_train()  # Set the models in ensemble net to train mode
                for epoch in range(params["epochs_per_stage"]):
                    for data in train_loader:
                        x = data["x"].to(device)
                        y = data["y"].to(device)
                        middle_feat, out = net_ensemble.forward(x)
                        if params["model_order"] == "first":
                            grad_direction = y / (1.0 + torch.exp(y * out))
                            # First-order boosting has no curvature weights;
                            # the squared error is used unweighted.
                            h = 1.0
                        else:
                            h = 1 / (
                                (1 + torch.exp(y * out)) * (1 + torch.exp(-y * out))
                            )
                            grad_direction = y * (1.0 + torch.exp(-y * out))
                        _, out = model(x, middle_feat)
                        loss = loss_f1(net_ensemble.boost_rate * out, grad_direction)
                        loss = (loss * h).mean()
                        model.zero_grad()
                        loss.backward()
                        optimizer.step()

                net_ensemble.add(model)

                stage_loss = []
                lr_scaler = 2
                # fully-corrective step
                if stage != 0:
                    # Adjusting corrective step learning rate
                    if stage % 3 == 0:
                        lr /= 2
                    optimizer = get_optim(net_ensemble.parameters(), lr / lr_scaler, L2)
                    for _ in range(params["correct_epoch"]):
                        for data in train_loader:
                            x = data["x"].to(device)
                            y = data["y"].to(device)
                            _, out = net_ensemble.forward_grad(x)
                            loss = loss_f2(out, y).mean()
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()
                            stage_loss.append(loss.item())

                # Switch to eval mode BEFORE validating: otherwise dropout is
                # active and batch-norm statistics are updated with validation
                # data while the validation loss is measured.
                if device == "cuda":
                    net_ensemble.to_cuda()
                net_ensemble.to_eval()  # Set the models in ensemble net to eval mode
                sl_te = float(logloss(net_ensemble, val_loader))

                elapsed_tr = time.time() - t0
                sl = np.mean(stage_loss) if stage_loss else 0

                # .item(): boost_rate is an nn.Parameter, which does not
                # reliably support float format specs.
                boost_rate = net_ensemble.boost_rate.item()
                print(
                    f"Stage - {stage}, training time: {elapsed_tr: .1f} sec, boost rate: {boost_rate: .4f}, Training Loss: {sl: .5f}, Val Loss: {sl_te: .5f}"
                )

                # --------------------- Train ---------------------
                if sl_te < best_score:
                    best_score = sl_te
                    best_stage = stage
                    net_ensemble.to_file(checkpoint_path)
                    early_stop = 0
                else:
                    early_stop += 1

                if early_stop > params["early_stopping_steps"]:
                    print("early stopped!")
                    break

            print(f"Best validation stage: {best_stage}")

            # Reload the best checkpoint of this fold for OOF prediction.
            net_ensemble = DynamicNet.from_file(checkpoint_path, _build_weak_model)
            if device == "cuda":
                net_ensemble.to_cuda()
            net_ensemble.to_eval()

            # --------------------- PREDICTION---------------------

            preds = []
            with torch.no_grad():
                for data in val_loader:
                    x = data["x"].to(device)
                    _, pred = net_ensemble.forward(x)
                    preds.append(pred.sigmoid().detach().cpu().numpy())
            oof[val_idx, :] += np.concatenate(preds) / n_seeds

    # Keep the OOF predictions in their own frame: writing them into the
    # global `train` would replace the true targets, so a second call would
    # train on predictions.
    oof_frame = pd.DataFrame(oof, columns=target_cols)
    oof_frame.insert(0, "sig_id", train["sig_id"].values)

    val_results = (
        train_targets_scored.drop(columns=target_cols)
        .merge(oof_frame, on="sig_id", how="left")
        .fillna(0)
    )

    y_true = train_targets_scored[target_cols].values
    y_pred = val_results[target_cols].values

    score = 0
    for i in range(len(target_cols)):
        score_ = log_loss(y_true[:, i], y_pred[:, i])
        score += score_ / len(target_cols)
    print("CV log_loss ", score)

    with open("counts.pkl", "wb") as f:
        pickle.dump(counts, f)

    # Rows of y_pred follow val_results, so its sig_id column is the index
    # that matches them.
    y_pred = pd.DataFrame(y_pred, index=val_results["sig_id"], columns=target_cols)
    with open("Y_pred.pkl", "wb") as f:
        pickle.dump(y_pred, f)

    return score, y_pred

In [None]:
%%time

# Run the full seed x fold training; returns the CV score and OOF predictions.
score, y_pred = train_fn(n_seeds)

In [None]:
# Display the cross-validated log loss returned by train_fn.
score

In [None]:
# Reload the out-of-fold predictions written by train_fn.
# The pickle is produced by this notebook; do not point `path` at untrusted files.
path = r"Y_pred.pkl"
with open(path, 'rb') as f:
    Y_pred = pickle.load(f)
Y_pred

In [None]:
# Reload the per-split positive counts written by train_fn
# (one row per seed/fold combination, one column per target).
# The pickle is produced by this notebook; do not point `path` at untrusted files.
path = r"counts.pkl"
with open(path, 'rb') as f:
    counts = pickle.load(f)
print(counts.shape)
counts

# predict test

In [None]:
# Reload the raw training features; in this cell X is only used for the
# names of the numeric columns.
train_features = pd.read_csv(f"{DATADIR}/train_features.csv")
X = train_features.select_dtypes("number")

sample_submission = pd.read_csv(f"{DATADIR}/sample_submission.csv")
test_features = pd.read_csv(f"{DATADIR}/test_features.csv")

test = test_features.copy()
# Apply the clipper that was fitted on the training data and pickled earlier
# in this notebook (only load pickles from trusted sources).
with open("./clipped_features.pkl", "rb") as f:
    clipped_features = pickle.load(f)
test[X.columns] = clipped_features.transform(test[X.columns])
test

In [None]:
pd.set_option('display.max_columns', None)

x_test = test[feature_cols].values
test_ds = TestDataset(x_test)
test_loader = DataLoader(test_ds, batch_size=params["batch_size"], shuffle=False)


def _weak_model_for_stage(stage):
    """Build the weak model class selected by params["model"] for ``stage``."""
    # Class names are resolved only for the selected model.
    if params["model"] == "MLP_2HL_weight_norm":
        return MLP_2HL_weight_norm.get_model(stage, params)
    if params["model"] == "MLP_2HL_leaky_relu":
        return MLP_2HL_leaky_relu.get_model(stage, params)
    return MLP_2HL.get_model(stage, params)


# Average the sigmoid outputs of every saved (seed, fold) ensemble.
predictions = np.zeros((len(test), len(target_cols)))
for seed in tqdm(range(n_seeds)):
    seed_everything(seed)

    for fold in range(params["n_folds"]):
        net_ensemble = DynamicNet.from_file(
            f"./{fold}FOLD_{seed}_.pth", _weak_model_for_stage
        )
        if device == "cuda":
            net_ensemble.to_cuda()
        net_ensemble.to_eval()

        preds = []
        with torch.no_grad():
            for data in test_loader:
                x = data["x"].to(device)
                _, pred = net_ensemble.forward(x)
                preds.append(pred.sigmoid().detach().cpu().numpy())
        predictions += np.concatenate(preds) / (params["n_folds"] * n_seeds)

sample_submission[target_cols] = predictions

# Fixed constant for these two targets.
fixed_value_targets = ["atp-sensitive_potassium_channel_antagonist", "erbb2_inhibitor"]
sample_submission.loc[:, fixed_value_targets] = 0.000012

# Rows whose cp_type is "ctl_vehicle" are set to zero for every target.
test = test.set_index("sig_id")
sample_submission = sample_submission.set_index("sig_id")
is_control = test["cp_type"] == "ctl_vehicle"
sample_submission[is_control] = 0.0

sample_submission.to_csv("submission.csv")

display(sample_submission)