In [None]:
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet

In [None]:
import sys

sys.path.append("../input/customtabnet")

In [None]:
import sys

sys.path.append("../input/rank-gauss")

In [None]:
import sys

sys.path.append("../input/tabnet")

In [None]:
import os
import pickle

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import tqdm
from gauss_rank_scaler import GaussRankScaler
from pytorch_tabnet.tab_model import TabNetRegressor

In [None]:
# Restrict this process to GPU 0 and ask TensorFlow to grow its GPU memory
# allocation on demand instead of reserving it all up front.
# NOTE(review): torch and tensorflow are already imported in the previous cell;
# these variables only take effect if neither framework has initialised CUDA
# yet - confirm on the target runtime.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

In [None]:
def compute_row_statistics(X, prefix=""):
    """Compute per-row summary statistics of a DataFrame.

    Args:
        X: DataFrame whose rows are aggregated (``axis=1``).
        prefix: String prepended to every output column name.

    Returns:
        A new DataFrame indexed like ``X`` with the columns
        ``{prefix}mean``, ``{prefix}std``, ``{prefix}kurtosis`` and
        ``{prefix}skew`` (in that order).
    """
    # "min" and "max" are not part of the computed statistics.
    stat_names = ("mean", "std", "kurtosis", "skew")
    per_row = {f"{prefix}{name}": X.agg(name, axis=1) for name in stat_names}

    return pd.DataFrame(per_row)

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class ClippedFeatures(BaseEstimator, TransformerMixin):
    """Clip every column of a DataFrame to quantile bounds learned in ``fit``.

    Args:
        copy: When true, ``transform`` works on a copy; otherwise the frame
            passed to ``transform`` is clipped in place.
        high: Quantile used as the per-column upper bound.
        low: Quantile used as the per-column lower bound.
    """

    def __init__(self, copy=True, high=0.99, low=0.01):
        self.copy = copy
        self.high = high
        self.low = low

    def fit(self, X, y=None):
        """Store the per-column ``high`` / ``low`` quantiles of ``X``."""
        upper = X.quantile(q=self.high)
        lower = X.quantile(q=self.low)
        self.data_max_ = upper
        self.data_min_ = lower

        return self

    def transform(self, X):
        """Return ``X`` with each column clipped to the fitted bounds."""
        clipped = X.copy() if self.copy else X
        clipped.clip(
            lower=self.data_min_, upper=self.data_max_, axis=1, inplace=True
        )

        return clipped

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader


class DynamicNet(object):
    """Additive ensemble of stage models.

    The ensemble prediction is ``c0 + boost_rate * sum(stage predictions)``.
    Every stage is called as ``model(x, middle_feat)`` and must return
    ``(middle_feat, prediction)``; the intermediate features returned by one
    stage are handed to the next stage.
    """

    def __init__(self, c0, lr, device=None):
        """Create an empty ensemble.

        Args:
            c0: Initial prediction (a tensor); returned, broadcast over the
                batch, while no stage has been added.
            lr: Initial value of the learnable boost rate.
            device: Device for the boost-rate parameter. When omitted, the
                device of ``c0`` is used if ``c0`` is a tensor, otherwise CPU,
                so the class does not depend on a module-level ``device``.
        """
        if device is None:
            device = c0.device if isinstance(c0, torch.Tensor) else torch.device("cpu")

        self.models = []
        self.c0 = c0
        self.lr = lr
        self.boost_rate = nn.Parameter(
            torch.tensor(lr, requires_grad=True, device=device)
        )

    def add(self, model):
        """Append a stage model to the ensemble."""
        self.models.append(model)

    def parameters(self):
        """Return the parameters of all stages followed by the boost rate."""
        params = []

        for m in self.models:
            params.extend(m.parameters())

        params.append(self.boost_rate)

        return params

    def zero_grad(self):
        """Reset the gradients of every stage model."""
        for m in self.models:
            m.zero_grad()

    def to_cuda(self):
        """Move every stage model to the default CUDA device."""
        for m in self.models:
            m.cuda()

    def to_eval(self):
        """Switch every stage model to evaluation mode."""
        for m in self.models:
            m.eval()

    def to_train(self):
        """Switch every stage model to training mode."""
        for m in self.models:
            m.train(True)

    def _initial_prediction(self, x):
        """Broadcast ``c0`` to ``(batch, -1)`` as float32 on the device of ``x``."""
        batch = x.shape[0]
        c0 = self.c0.detach().reshape(1, -1).repeat(batch, 1)

        # Follow the input's device instead of guessing from a global flag.
        return c0.to(device=x.device, dtype=torch.float32)

    def _run_stages(self, x):
        """Chain all stages; return the last middle features and summed output."""
        middle_feat_cum = None
        prediction = None

        for m in self.models:
            middle_feat_cum, pred = m(x, middle_feat_cum)
            prediction = pred if prediction is None else prediction + pred

        return middle_feat_cum, prediction

    def forward(self, x):
        """Predict without tracking gradients through the stage models.

        Returns:
            ``(middle_feat, prediction)``; ``middle_feat`` is ``None`` while
            the ensemble has no stage.
        """
        if len(self.models) == 0:
            return None, self._initial_prediction(x)

        with torch.no_grad():
            middle_feat_cum, prediction = self._run_stages(x)

        return middle_feat_cum, self.c0 + self.boost_rate * prediction

    def forward_grad(self, x):
        """Same as ``forward`` but keeps the autograd graph of the stages."""
        if len(self.models) == 0:
            return None, self._initial_prediction(x)

        # at least one model
        middle_feat_cum, prediction = self._run_stages(x)

        return middle_feat_cum, self.c0 + self.boost_rate * prediction

    @classmethod
    def from_file(cls, path, builder, device):
        """Rebuild an ensemble saved with ``to_file``.

        Args:
            path: Checkpoint path.
            builder: Callable ``builder(stage)`` returning an uninitialised
                stage model for the given stage index.
            device: ``map_location`` used for loading; also the device of the
                boost-rate parameter created before it is overwritten.
        """
        d = torch.load(path, map_location=device)
        net = cls(d["c0"], d["lr"], device=device)
        net.boost_rate = d["boost_rate"]

        for stage, m in enumerate(d["models"]):
            submod = builder(stage)

            submod.load_state_dict(m)
            net.add(submod)

        return net

    def to_file(self, path):
        """Save stage state dicts, ``c0``, ``lr`` and the boost rate to ``path``."""
        models = [m.state_dict() for m in self.models]
        d = {
            "models": models,
            "c0": self.c0,
            "lr": self.lr,
            "boost_rate": self.boost_rate,
        }

        torch.save(d, path)

    def predict(self, X, device, batch_size=32):
        """Return sigmoid probabilities for ``X`` as one NumPy array.

        The stage models are put in eval mode and moved to ``device``; ``X`` is
        wrapped in ``TestDataset`` and processed in batches of ``batch_size``.
        """
        self.to_eval()

        for m in self.models:
            m.to(device)

        dataset = TestDataset(X)
        data_loader = DataLoader(dataset, batch_size=batch_size)

        predictions = []

        with torch.no_grad():
            for data in data_loader:
                x = data["x"].to(device)
                _, pred = self.forward(x)

                predictions.append(pred.sigmoid().detach().cpu().numpy())

        return np.concatenate(predictions)


class MLP_2HL(nn.Module):
    """Two-hidden-layer MLP used as one stage of a ``DynamicNet``.

    ``forward`` returns both the hidden representation (output of ``layer1``)
    and the final output, so the next stage can consume the hidden features.
    The ``sparse`` and ``bn`` arguments are accepted but not used.
    """

    def __init__(self, dim_in, dim_hidden, dim_out, sparse=False, bn=True):
        super().__init__()

        # Normalises the concatenation of raw input and lower-stage features.
        self.bn2 = nn.BatchNorm1d(dim_in)

        hidden_stack = [
            nn.Dropout(0.2),
            nn.Linear(dim_in, dim_hidden),
            nn.ReLU(),
            nn.BatchNorm1d(dim_hidden),
            nn.Dropout(0.4),
            nn.Linear(dim_hidden, dim_hidden),
        ]
        output_head = [
            nn.ReLU(),
            nn.Linear(dim_hidden, dim_out),
        ]

        self.layer1 = nn.Sequential(*hidden_stack)
        self.layer2 = nn.Sequential(*output_head)

    def forward(self, x, lower_f):
        """Return ``(middle_feat, out)`` for input ``x``.

        When ``lower_f`` is given it is concatenated to ``x`` along dim 1 and
        batch-normalised before entering the network.
        """
        features = x

        if lower_f is not None:
            features = self.bn2(torch.cat([x, lower_f], dim=1))

        middle_feat = self.layer1(features)

        return middle_feat, self.layer2(middle_feat)

    @classmethod
    def get_model(cls, stage, dim_in, dim_hidden, dim_out):
        """Build the model for ``stage``; later stages also take hidden features."""
        input_width = dim_in if stage == 0 else dim_in + dim_hidden

        return MLP_2HL(input_width, dim_hidden, dim_out)


class TestDataset(object):
    """Map-style dataset that serves the rows of ``X`` as float tensors."""

    def __init__(self, X):
        # Anything array-like is accepted; it is converted once up front.
        self.X = np.asarray(X)

    def __len__(self):
        """Number of rows in the wrapped array."""
        return self.X.shape[0]

    def __getitem__(self, i):
        """Return ``{"x": row_i}`` with the row converted to ``torch.float``."""
        row = self.X[i]

        return {"x": torch.tensor(row, dtype=torch.float)}

In [None]:
import numpy as np
from pytorch_tabnet.metrics import Metric


class LogitsLogLoss(Metric):
    """Mean binary log loss computed from raw (pre-sigmoid) model outputs.

    Exposed under the name ``logits_ll``; lower values are better.
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """Return the log loss of ``y_pred`` (logits) against ``y_true``.

        The sigmoid is applied to ``y_pred`` first; ``1e-15`` is added inside
        both logarithms to avoid ``log(0)``.
        """
        probabilities = 1 / (1 + np.exp(-y_pred))
        negative_term = (1 - y_true) * np.log(1 - probabilities + 1e-15)
        positive_term = y_true * np.log(probabilities + 1e-15)
        log_likelihood = negative_term + positive_term

        return -np.mean(log_likelihood)

In [None]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


def predict_with_pytorch_model(model, X, device, batch_size=32):
    """Run ``model`` over ``X`` in batches and return sigmoid probabilities.

    Args:
        model: Module mapping a float tensor batch to logits. It is switched
            to eval mode but not moved; it must already live on ``device``.
        X: Array-like of shape ``(n_samples, n_features)``.
        device: Device the input batches are moved to.
        batch_size: Number of rows per forward pass.

    Returns:
        NumPy array holding one row of probabilities per row of ``X``.
    """
    model.eval()

    dataset = TestDataset(X)
    data_loader = DataLoader(dataset, batch_size=batch_size)

    predictions = []

    with torch.no_grad():
        for data in data_loader:
            inputs = data["x"].to(device)
            outputs = model(inputs)

            # Collect inside the loop: appending after it would keep only the
            # last batch and silently drop every other row.
            predictions.append(outputs.sigmoid().detach().cpu().numpy())

    return np.concatenate(predictions)


class Model(nn.Module):
    """Three-layer weight-normalised MLP that outputs raw logits.

    Args:
        num_features: Width of the input.
        num_targets: Width of the output.
        hidden_size: Width of both hidden layers.
        rate: Dropout probability used before the 2nd and 3rd dense layers.
    """

    def __init__(self, num_features, num_targets, hidden_size, rate=0.2619422201258426):
        super().__init__()

        weight_normed = nn.utils.weight_norm

        # Block 1: normalise the input, then project to the hidden width.
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = weight_normed(nn.Linear(num_features, hidden_size))

        # Block 2: hidden -> hidden.
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(rate)
        self.dense2 = weight_normed(nn.Linear(hidden_size, hidden_size))

        # Block 3: hidden -> targets (no activation, logits are returned).
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(rate)
        self.dense3 = weight_normed(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        """Return logits of shape ``(batch, num_targets)``."""
        hidden = F.leaky_relu(self.dense1(self.batch_norm1(x)))

        hidden = self.dropout2(self.batch_norm2(hidden))
        hidden = F.leaky_relu(self.dense2(hidden))

        hidden = self.dropout3(self.batch_norm3(hidden))

        return self.dense3(hidden)

In [None]:
import tensorflow as tf
from tabnet import StackedTabNet


class StackedTabNetClassifier(tf.keras.Model):
    """``StackedTabNet`` feature extractor followed by a sigmoid output layer.

    All constructor arguments except ``num_classes`` and ``**kwargs`` are
    forwarded unchanged to ``StackedTabNet``; ``**kwargs`` goes to
    ``tf.keras.Model.__init__`` only.

    Args:
        num_classes: Number of output units of the final dense layer.
    """

    def __init__(
        self,
        num_classes,
        batch_momentum=0.98,
        epsilon=1e-05,
        feature_columns=None,
        feature_dim=64,
        norm_type="group",
        num_decision_steps=5,
        num_features=None,
        num_groups=2,
        num_layers=1,
        output_dim=64,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-05,
        virtual_batch_size=None,
        **kwargs
    ):
        super().__init__(**kwargs)

        # The sub-layers are created in this order; keep it stable so that
        # previously saved weight files continue to line up.
        self.stacked_tabnet = StackedTabNet(
            feature_columns,
            batch_momentum=batch_momentum,
            epsilon=epsilon,
            feature_dim=feature_dim,
            norm_type=norm_type,
            num_decision_steps=num_decision_steps,
            num_features=num_features,
            num_groups=num_groups,
            num_layers=num_layers,
            output_dim=output_dim,
            relaxation_factor=relaxation_factor,
            sparsity_coefficient=sparsity_coefficient,
            virtual_batch_size=virtual_batch_size,
        )

        # Independent per-class probabilities (sigmoid), no bias term.
        self.classifier = tf.keras.layers.Dense(
            num_classes, activation="sigmoid", use_bias=False
        )

    def call(self, inputs, training=None):
        """Return per-class sigmoid probabilities for ``inputs``."""
        x = self.stacked_tabnet(inputs, training=training)

        return self.classifier(x)

In [None]:
import tensorflow as tf
from tabnet import TabNet


class TabNetClassifier(tf.keras.Model):
    """``TabNet`` feature extractor followed by a sigmoid output layer.

    All constructor arguments except ``num_classes`` are forwarded to
    ``TabNet``.

    Args:
        num_classes: Number of output units of the final dense layer.
    """

    def __init__(
        self,
        num_classes,
        batch_momentum=0.98,
        epsilon=1e-05,
        feature_columns=None,
        feature_dim=64,
        norm_type="group",
        num_decision_steps=5,
        num_features=None,
        num_groups=1,
        output_dim=64,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-05,
        virtual_batch_size=None,
        **kwargs
    ):
        super().__init__(**kwargs)

        # NOTE(review): **kwargs is passed both to tf.keras.Model.__init__ above
        # and to TabNet below (unlike StackedTabNetClassifier, which only passes
        # it to the base class) - confirm that this double forwarding is intended.
        self.tabnet = TabNet(
            feature_columns,
            batch_momentum=batch_momentum,
            epsilon=epsilon,
            feature_dim=feature_dim,
            norm_type=norm_type,
            num_decision_steps=num_decision_steps,
            num_features=num_features,
            num_groups=num_groups,
            output_dim=output_dim,
            relaxation_factor=relaxation_factor,
            sparsity_coefficient=sparsity_coefficient,
            virtual_batch_size=virtual_batch_size,
            **kwargs
        )

        # Independent per-class probabilities (sigmoid), no bias term.
        self.classifier = tf.keras.layers.Dense(
            num_classes, activation="sigmoid", use_bias=False
        )

    def call(self, inputs, training=None):
        """Return per-class sigmoid probabilities for ``inputs``."""
        x = self.tabnet(inputs, training=training)

        return self.classifier(x)

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa


def gelu(x):
    """GELU activation using the tanh approximation.

    Computes ``x * 0.5 * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    """
    tanh_argument = np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))
    gate = 0.5 * (1.0 + tf.tanh(tanh_argument))

    return x * gate


def point_wise_feed_forward_network(d_model, dff):
    """Build the position-wise feed-forward block: Dense(dff, gelu) -> Dense(d_model)."""
    expand = tf.keras.layers.Dense(dff, activation=gelu)  # (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)

    return tf.keras.Sequential([expand, project])


def scaled_dot_product_attention(q, k, v, mask):
    """Compute ``softmax(q @ k^T / sqrt(depth_k)) @ v``.

    Args:
        q: Queries, shape ``(..., seq_len_q, depth)``.
        k: Keys, shape ``(..., seq_len_k, depth)``.
        v: Values, shape ``(..., seq_len_k, depth_v)``.
        mask: Optional tensor; ``mask * -1e09`` is added to the scaled scores
            before the softmax, so positions where the mask is 1 get a weight
            close to zero.

    Returns:
        ``(output, attention_weights)``.
    """
    raw_scores = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # Scale by the square root of the key depth.
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    scores = raw_scores / tf.math.sqrt(key_depth)

    if mask is not None:
        scores = scores + mask * -1e09

    # Normalise over the key axis so each query's weights sum to 1.
    attention_weights = tf.nn.softmax(scores, axis=-1)  # (..., seq_len_q, seq_len_k)

    return tf.matmul(attention_weights, v), attention_weights


class EncoderLayer(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention and a feed-forward network.

    Each of the two sub-blocks is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        d_model: Width of the layer's input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-block.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-06)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-06)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Apply the block to ``x``; ``mask`` is passed on to the attention."""
        # Self-attention: the same tensor serves as value, key and query.
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(
            out1 + ffn_output
        )  # (batch_size, input_seq_len, d_model)

        return out2


class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Total width of the projected queries, keys and values.
        num_heads: Number of heads ``d_model`` is split into.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()

        self.num_heads = num_heads
        self.d_model = d_model

        # Per-head width.
        # NOTE(review): assumes d_model is a multiple of num_heads; otherwise
        # the reshape in split_heads presumably fails - confirm.
        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to ``(batch_size, num_heads, seq_len, depth)``."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))

        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Return ``(output, attention_weights)``.

        Note the positional order of the inputs: value, key, query, mask.
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask
        )

        # Move the head axis back next to depth so the heads can be merged.
        scaled_attention = tf.transpose(
            scaled_attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(
            scaled_attention, (batch_size, -1, self.d_model)
        )  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights


class TransformerEncoder(tf.keras.layers.Layer):
    """Stack of ``EncoderLayer`` blocks behind a weight-normalised embedding.

    Args:
        num_layers: Number of encoder blocks.
        d_model: Width of the embedding and of every block.
        num_heads: Number of attention heads per block.
        dff: Hidden width of each block's feed-forward network.
        rate: Dropout rate (after the embedding and inside the blocks).
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        self.embedding = tfa.layers.WeightNormalization(
            tf.keras.layers.Dense(self.d_model)
        )

        self.enc_layers = [
            EncoderLayer(self.d_model, self.num_heads, self.dff, self.rate)
            for _ in range(self.num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(self.rate)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config().copy()

        config.update(
            {
                "d_model": self.d_model,
                "dff": self.dff,
                "num_layers": self.num_layers,
                "num_heads": self.num_heads,
                "rate": self.rate,
            }
        )

        return config

    def call(self, x, training, mask=None):
        """Embed ``x``, apply dropout and run it through every encoder block."""
        x = self.embedding(x)
        x = self.dropout(x, training=training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)


class TransformerEncoderByZhang(tf.keras.layers.Layer):
    """Stack of ``EncoderLayer`` blocks behind a plain dense embedding.

    Same structure as ``TransformerEncoder`` except that the embedding is an
    ordinary ``Dense`` layer without weight normalisation.

    Args:
        num_layers: Number of encoder blocks.
        d_model: Width of the embedding and of every block.
        num_heads: Number of attention heads per block.
        dff: Hidden width of each block's feed-forward network.
        rate: Dropout rate (after the embedding and inside the blocks).
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1, **kwargs):
        # Forward the base-layer arguments (name, dtype, trainable) so that a
        # config produced by get_config() round-trips through from_config().
        super().__init__(**kwargs)

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        self.embedding = tf.keras.layers.Dense(self.d_model)

        self.enc_layers = [
            EncoderLayer(self.d_model, self.num_heads, self.dff, self.rate)
            for _ in range(self.num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(self.rate)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config().copy()
        config.update(
            {
                "d_model": self.d_model,
                "dff": self.dff,
                "num_layers": self.num_layers,
                "num_heads": self.num_heads,
                "rate": self.rate,
            }
        )

        return config

    def call(self, x, training, mask=None):
        """Embed ``x``, apply dropout and run it through every encoder block."""
        x = self.embedding(x)
        x = self.dropout(x, training=training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE: unpickling executes code embedded in the file; only load
    # artefacts that come from a trusted source.
    with open(path, "rb") as f:
        return pickle.load(f)


# Fitted preprocessing objects and ensemble metadata.
clipped_features = _load_pickle("../input/preprocessor-fit/clipped_features.pkl")
counts = _load_pickle("../input/preprocessor-fit/counts.pkl")

# Model hyperparameters.
params_stacked_tabnet = _load_pickle(
    "../input/mlp-for-ensemble/kaggle_upload/stacked_tabnet_params.pkl"
)
params_tabnet = _load_pickle("../input/tabnetclassifier-fit/params.pkl")

# Blending weights of the ensemble members.
weights = _load_pickle(
    "../input/votingclassifier-fit-without-lightgm-and-svm/weights.pkl"
)

# thresholds = _load_pickle(
#     "../input/votingclassifier-fit-without-lightgm-and-svm/thresholds.pkl"
# )

# Preprocessing fitted by the "pytorch-mlp-tabnet-many-fe-train" run.
scalers = _load_pickle("../input/pytorch-mlp-tabnet-many-fe-train/rank_gauss.pkl")
pca_g = _load_pickle("../input/pytorch-mlp-tabnet-many-fe-train/gen_pca.pkl")
pca_c = _load_pickle("../input/pytorch-mlp-tabnet-many-fe-train/cel_pca.pkl")
selector = _load_pickle("../input/pytorch-mlp-tabnet-many-fe-train/var_thresh.pkl")

# ica_g = _load_pickle("../input/21-tabnet-fit/ica_gene.pickle")

# ica_c = _load_pickle("../input/21-tabnet-fit/ica_cell.pickle")

# scaler = _load_pickle("../input/21-tabnet-fit/rankgauss.pickle")

In [None]:
# Read the scored targets (used for their column names and mean) and the
# test features, both indexed by sample id.
index_col = "sig_id"
dtype = {"cp_type": "category", "cp_dose": "category"}

Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)
test_features = pd.read_csv(
    "../input/lish-moa/test_features.csv", index_col=index_col, dtype=dtype
)

columns = Y.columns
index = test_features.index

# Feature groups are identified by their column-name prefix.
c_prefix = "c-"
g_prefix = "g-"
c_columns = [name for name in test_features.columns if name.startswith(c_prefix)]
g_columns = [name for name in test_features.columns if name.startswith(g_prefix)]

In [None]:
# Numeric columns only: the columns read as "category" (cp_type, cp_dose) are
# excluded by select_dtypes.
X_test = test_features.select_dtypes("number").copy()
# Row statistics are computed on the unclipped values, so these two lines must
# stay before the clipping step below.
X_test_stats_c = compute_row_statistics(X_test.loc[:, c_columns], prefix=c_prefix)
X_test_stats_g = compute_row_statistics(X_test.loc[:, g_columns], prefix=g_prefix)

# Clip with the bounds stored in the unpickled ClippedFeatures instance.
# NOTE(review): if that instance was created with copy=False, X_test itself is
# modified in place here - confirm.
X_test_clipped = clipped_features.transform(X_test)

# Final feature matrix: clipped features followed by the c- and g- statistics.
X_test = pd.concat([X_test_clipped, X_test_stats_c, X_test_stats_g], axis=1)

In [None]:
# Feature pipeline for the "hirune924" model: gene columns first, then cell
# columns (this column order is what the fitted objects below receive).
X_test_hirune924 = test_features[g_columns + c_columns].copy()

# `scalers` maps each column name to its own fitted transformer
# (loaded from rank_gauss.pkl).
for column in g_columns + c_columns:
    X_test_hirune924[column] = scalers[column].transform(X_test_hirune924[[column]])

# PCA components are computed from the scaled gene and cell features separately.
X_test_pca_g = pca_g.transform(X_test_hirune924[g_columns])
X_test_pca_c = pca_c.transform(X_test_hirune924[c_columns])

# Concatenation order: scaled features, gene PCA, cell PCA.
X_test_hirune924 = np.concatenate(
    [X_test_hirune924, X_test_pca_g, X_test_pca_c], axis=1
)

# `selector` is loaded from var_thresh.pkl - presumably a fitted
# variance-threshold feature selector; verify against the training notebook.
X_test_hirune924 = selector.transform(X_test_hirune924)
X_test_hirune924 = pd.DataFrame(X_test_hirune924, index=index)

# Prepend the treatment columns and one-hot encode them.
X_test_hirune924 = pd.concat(
    [test_features[["cp_time", "cp_dose"]], X_test_hirune924], axis=1
)
# NOTE(review): the dummy columns depend on the values present in the test
# file; confirm they always match the columns the model was trained on.
X_test_hirune924 = pd.get_dummies(X_test_hirune924, columns=["cp_time", "cp_dose"])

In [None]:
# X_test_ynishi = test_features.copy()
# X_test_ynishi["cp_type"] = X_test_ynishi["cp_type"].map({"trt_cp": 0, "ctl_vehicle": 1})
# X_test_ynishi["cp_dose"] = X_test_ynishi["cp_dose"].map({"D1": 0, "D2": 1})

# vt_label = pd.read_csv("../input/98-preparation-set/vt_label.csv", index_col=0)
# vt_label = vt_label.iloc[1, 1:].dropna()
# selected_c_columns = vt_label.str.startswith(c_prefix)
# selected_c_columns = list(vt_label[selected_c_columns])
# selected_g_columns = vt_label.str.startswith(g_prefix)
# selected_g_columns = list(vt_label[selected_g_columns])

# X_test_ica_g = ica_g.transform(X_test_ynishi[selected_g_columns])
# X_test_ica_g = pd.DataFrame(X_test_ica_g, index=index)
# X_test_ica_c = ica_c.transform(X_test_ynishi[selected_c_columns])
# X_test_ica_c = pd.DataFrame(X_test_ica_c, index=index)

# X_test_stats = pd.DataFrame(index=index)

# for agg_func in ["std", "kurt", "skew"]:
#     X_test_stats[f"g-{agg_func}"] = X_test_ynishi[selected_g_columns].agg(
#         agg_func, axis=1
#     )
#     X_test_stats[f"c-{agg_func}"] = X_test_ynishi[selected_c_columns].agg(
#         agg_func, axis=1
#     )
#     X_test_stats[f"gc-{agg_func}"] = X_test_ynishi[
#         selected_g_columns + selected_c_columns
#     ].agg(agg_func, axis=1)

# X_test_scaled = scaler.transform(X_test_ynishi[selected_g_columns + selected_c_columns])
# X_test_scaled = pd.DataFrame(
#     X_test_scaled, columns=selected_g_columns + selected_c_columns, index=index
# )

# X_test_n_effective_features = (X_test_scaled[selected_g_columns].abs() > 2.0).sum(
#     axis=1
# )

# X_test_bins = pd.DataFrame(index=index)

# cell_bins = pd.read_csv(f"../input/21-tabnet-fit/cell_bins.csv", index_col=0)
# cell_bins.iloc[0] = -np.inf
# cell_bins.iloc[-1] = np.inf
# n_bins = cell_bins.shape[0] - 1

# for column in selected_c_columns:
#     X_test_bins[f"bin-{column}"] = pd.cut(
#         X_test_scaled[column], cell_bins[column], labels=range(n_bins)
#     )

# X_test_ynishi = pd.concat(
#     [
#         X_test_ynishi[["cp_type", "cp_time", "cp_dose"]],
#         X_test_scaled,
#         X_test_ica_g,
#         X_test_ica_c,
#         X_test_stats,
#         X_test_n_effective_features,
#         X_test_bins,
#     ],
#     axis=1,
# )

# X_test_ynishi = X_test_ynishi.astype("float")

In [None]:
# Subset of gene ("g-") columns plus the cell columns "c-0" .. "c-99".
# Used in the inference loop as the second input of the "rs" model
# (X_test_clipped[selected_columns]).
selected_columns = [
    "g-0",
    "g-7",
    "g-8",
    "g-10",
    "g-13",
    "g-17",
    "g-20",
    "g-22",
    "g-24",
    "g-26",
    "g-28",
    "g-29",
    "g-30",
    "g-31",
    "g-32",
    "g-34",
    "g-35",
    "g-36",
    "g-37",
    "g-38",
    "g-39",
    "g-41",
    "g-46",
    "g-48",
    "g-50",
    "g-51",
    "g-52",
    "g-55",
    "g-58",
    "g-59",
    "g-61",
    "g-62",
    "g-63",
    "g-65",
    "g-66",
    "g-67",
    "g-68",
    "g-70",
    "g-72",
    "g-74",
    "g-75",
    "g-79",
    "g-83",
    "g-84",
    "g-85",
    "g-86",
    "g-90",
    "g-91",
    "g-94",
    "g-95",
    "g-96",
    "g-97",
    "g-98",
    "g-100",
    "g-102",
    "g-105",
    "g-106",
    "g-112",
    "g-113",
    "g-114",
    "g-116",
    "g-121",
    "g-123",
    "g-126",
    "g-128",
    "g-131",
    "g-132",
    "g-134",
    "g-135",
    "g-138",
    "g-139",
    "g-140",
    "g-142",
    "g-144",
    "g-145",
    "g-146",
    "g-147",
    "g-148",
    "g-152",
    "g-155",
    "g-157",
    "g-158",
    "g-160",
    "g-163",
    "g-164",
    "g-165",
    "g-170",
    "g-173",
    "g-174",
    "g-175",
    "g-177",
    "g-178",
    "g-181",
    "g-183",
    "g-185",
    "g-186",
    "g-189",
    "g-192",
    "g-194",
    "g-195",
    "g-196",
    "g-197",
    "g-199",
    "g-201",
    "g-202",
    "g-206",
    "g-208",
    "g-210",
    "g-213",
    "g-214",
    "g-215",
    "g-220",
    "g-226",
    "g-228",
    "g-229",
    "g-235",
    "g-238",
    "g-241",
    "g-242",
    "g-243",
    "g-244",
    "g-245",
    "g-248",
    "g-250",
    "g-251",
    "g-254",
    "g-257",
    "g-259",
    "g-261",
    "g-266",
    "g-270",
    "g-271",
    "g-272",
    "g-275",
    "g-278",
    "g-282",
    "g-287",
    "g-288",
    "g-289",
    "g-291",
    "g-293",
    "g-294",
    "g-297",
    "g-298",
    "g-301",
    "g-303",
    "g-304",
    "g-306",
    "g-308",
    "g-309",
    "g-310",
    "g-311",
    "g-314",
    "g-315",
    "g-316",
    "g-317",
    "g-320",
    "g-321",
    "g-322",
    "g-327",
    "g-328",
    "g-329",
    "g-332",
    "g-334",
    "g-335",
    "g-336",
    "g-337",
    "g-339",
    "g-342",
    "g-344",
    "g-349",
    "g-350",
    "g-351",
    "g-353",
    "g-354",
    "g-355",
    "g-357",
    "g-359",
    "g-360",
    "g-364",
    "g-365",
    "g-366",
    "g-367",
    "g-368",
    "g-369",
    "g-374",
    "g-375",
    "g-377",
    "g-379",
    "g-385",
    "g-386",
    "g-390",
    "g-392",
    "g-393",
    "g-400",
    "g-402",
    "g-406",
    "g-407",
    "g-409",
    "g-410",
    "g-411",
    "g-414",
    "g-417",
    "g-418",
    "g-421",
    "g-423",
    "g-424",
    "g-427",
    "g-429",
    "g-431",
    "g-432",
    "g-433",
    "g-434",
    "g-437",
    "g-439",
    "g-440",
    "g-443",
    "g-449",
    "g-458",
    "g-459",
    "g-460",
    "g-461",
    "g-464",
    "g-467",
    "g-468",
    "g-470",
    "g-473",
    "g-477",
    "g-478",
    "g-479",
    "g-484",
    "g-485",
    "g-486",
    "g-488",
    "g-489",
    "g-491",
    "g-494",
    "g-496",
    "g-498",
    "g-500",
    "g-503",
    "g-504",
    "g-506",
    "g-508",
    "g-509",
    "g-512",
    "g-522",
    "g-529",
    "g-531",
    "g-534",
    "g-539",
    "g-541",
    "g-546",
    "g-551",
    "g-553",
    "g-554",
    "g-559",
    "g-561",
    "g-562",
    "g-565",
    "g-568",
    "g-569",
    "g-574",
    "g-577",
    "g-578",
    "g-586",
    "g-588",
    "g-590",
    "g-594",
    "g-595",
    "g-596",
    "g-597",
    "g-599",
    "g-600",
    "g-603",
    "g-607",
    "g-615",
    "g-618",
    "g-619",
    "g-620",
    "g-625",
    "g-628",
    "g-629",
    "g-632",
    "g-634",
    "g-635",
    "g-636",
    "g-638",
    "g-639",
    "g-641",
    "g-643",
    "g-644",
    "g-645",
    "g-646",
    "g-647",
    "g-648",
    "g-663",
    "g-664",
    "g-665",
    "g-668",
    "g-669",
    "g-670",
    "g-671",
    "g-672",
    "g-673",
    "g-674",
    "g-677",
    "g-678",
    "g-680",
    "g-683",
    "g-689",
    "g-691",
    "g-693",
    "g-695",
    "g-701",
    "g-702",
    "g-703",
    "g-704",
    "g-705",
    "g-706",
    "g-708",
    "g-711",
    "g-712",
    "g-720",
    "g-721",
    "g-723",
    "g-724",
    "g-726",
    "g-728",
    "g-731",
    "g-733",
    "g-738",
    "g-739",
    "g-742",
    "g-743",
    "g-744",
    "g-745",
    "g-749",
    "g-750",
    "g-752",
    "g-760",
    "g-761",
    "g-764",
    "g-766",
    "g-768",
    "g-770",
    "g-771",
    "c-0",
    "c-1",
    "c-2",
    "c-3",
    "c-4",
    "c-5",
    "c-6",
    "c-7",
    "c-8",
    "c-9",
    "c-10",
    "c-11",
    "c-12",
    "c-13",
    "c-14",
    "c-15",
    "c-16",
    "c-17",
    "c-18",
    "c-19",
    "c-20",
    "c-21",
    "c-22",
    "c-23",
    "c-24",
    "c-25",
    "c-26",
    "c-27",
    "c-28",
    "c-29",
    "c-30",
    "c-31",
    "c-32",
    "c-33",
    "c-34",
    "c-35",
    "c-36",
    "c-37",
    "c-38",
    "c-39",
    "c-40",
    "c-41",
    "c-42",
    "c-43",
    "c-44",
    "c-45",
    "c-46",
    "c-47",
    "c-48",
    "c-49",
    "c-50",
    "c-51",
    "c-52",
    "c-53",
    "c-54",
    "c-55",
    "c-56",
    "c-57",
    "c-58",
    "c-59",
    "c-60",
    "c-61",
    "c-62",
    "c-63",
    "c-64",
    "c-65",
    "c-66",
    "c-67",
    "c-68",
    "c-69",
    "c-70",
    "c-71",
    "c-72",
    "c-73",
    "c-74",
    "c-75",
    "c-76",
    "c-77",
    "c-78",
    "c-79",
    "c-80",
    "c-81",
    "c-82",
    "c-83",
    "c-84",
    "c-85",
    "c-86",
    "c-87",
    "c-88",
    "c-89",
    "c-90",
    "c-91",
    "c-92",
    "c-93",
    "c-94",
    "c-95",
    "c-96",
    "c-97",
    "c-98",
    "c-99",
]

In [None]:
# Dimensions used to size the prediction buffers and the models.
n_classes = Y.shape[1]
test_size, n_clipped_features = X_test_clipped.shape
n_features = X_test.shape[1]
n_features_hirune924 = X_test_hirune924.shape[1]

In [None]:
# hyperparameters
batch_size = 32
n_seeds = 5
n_splits = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Per-target mean of the training labels as a plain NumPy array.
mean = np.asarray(Y.mean())

In [None]:
%%time
denominator = counts.sum(axis=0)

counts /= denominator

_, n_models = weights.shape
Y_preds = np.zeros((n_models, test_size, n_classes))

for i in tqdm.trange(n_seeds):
    for j in tqdm.trange(n_splits):
        #
        # anonamename
        #
        # model = tf.keras.models.load_model(
        #     f"../input/mlp-for-ensemble/kaggle_upload/2l/model_seed_{i}_fold_{j}.h5",
        #     compile=False,
        # )

        # Y_preds[0] += counts[i * n_splits + j] * model.predict(
        #     X_test_clipped, batch_size=batch_size
        # )

        # tf.keras.backend.clear_session()

        # model = tf.keras.models.load_model(
        #     f"../input/mlp-for-ensemble/kaggle_upload/3l_v2/model_seed_{i}_fold_{j}.h5",
        #     compile=False,
        # )

        # Y_preds[1] += counts[i * n_splits + j] * model.predict(
        #     X_test_clipped, batch_size=batch_size
        # )

        # tf.keras.backend.clear_session()

        # model = tf.keras.models.load_model(
        #     f"../input/mlp-for-ensemble/kaggle_upload/4l/model_seed_{i}_fold_{j}.h5",
        #     compile=False,
        # )

        # Y_preds[2] += counts[i * n_splits + j] * model.predict(
        #     X_test_clipped, batch_size=batch_size
        # )

        # tf.keras.backend.clear_session()

        # model = tf.keras.models.load_model(
        #     f"../input/mlp-for-ensemble/kaggle_upload/5l/model_seed_{i}_fold_{j}.h5",
        #     compile=False,
        # )

        # Y_preds[3] += counts[i * n_splits + j] * model.predict(
        #     X_test_clipped, batch_size=batch_size
        # )

        # tf.keras.backend.clear_session()

        model = tf.keras.models.load_model(
            f"../input/mlp-for-ensemble/kaggle_upload/rs/model_seed_{i}_fold_{j}.h5",
            compile=False,
        )

        Y_preds[0] += counts[i * n_splits + j] * model.predict(
            [X_test_clipped, X_test_clipped[selected_columns]], batch_size=batch_size
        )

        tf.keras.backend.clear_session()

        model = StackedTabNetClassifier(
            num_classes=n_classes,
            num_features=n_clipped_features,
            **params_stacked_tabnet,
        )

        # https://stackoverflow.com/questions/63658086/tensorflow-2-0-valueerror-while-loading-weights-from-h5-file
        model(np.zeros((1, n_clipped_features)))

        model.load_weights(
            f"../input/mlp-for-ensemble/kaggle_upload/StackedTabNet/model_seed_{i}_fold_{j}.h5"
        )

        Y_preds[1] += counts[i * n_splits + j] * model.predict(
            X_test_clipped, batch_size=batch_size
        )

        tf.keras.backend.clear_session()

        # model = DynamicNet.from_file(
        #     f"../input/moa-grownet/{j}FOLD_{i}_.pth",
        #     lambda stage: MLP_2HL.get_model(
        #         stage,
        #         n_clipped_features,
        #         512,
        #         n_classes,
        #     ),
        #     device,
        # )

        # Y_preds[6] += counts[i * n_splits + j] * model.predict(
        #     X_test_clipped, device, batch_size=test_size
        # )

        # with open(
        #     f"../input/moa-lgbmclassifier-classifierchain/model_seed_{i}_fold_{j}.jlb",
        #     "rb",
        # ) as f:
        #     model = joblib.load(f)

        # Y_preds[7] += counts[i * n_splits + j] * model.predict_proba(X_test_clipped)

        # for k, column in enumerate(columns):
        #     with open(
        #         f"../input/moa-lightgbm/model_seed_{i}_fold_{j}_{column}.jlb",
        #         "rb",
        #     ) as f:
        #         model = joblib.load(f)

        #     Y_preds[2, :, k] += counts[i * n_splits + j, k] * model.predict(
        #         X_test_clipped
        #     )

        # for k, column in enumerate(columns):
        #     model_dir = {
        #         0: "../input/moa-rapids-svm-seed01",
        #         1: "../input/moa-rapids-svm-seed01",
        #         2: "../input/fork-of-moa-rapids-svm-seed23",
        #         3: "../input/fork-of-moa-rapids-svm-seed23",
        #         4: "../input/fork-of-moa-rapids-svm-seed4",
        #     }

        #     model_path = f"{model_dir[i]}/model_seed_{i}_fold_{j}_{column}.jlb"

        #     if os.path.exists(model_path):
        #         with open(model_path, "rb") as f:
        #             model = joblib.load(f)

        #         Y_preds[3, :, k] += (
        #             counts[i * n_splits + j, k]
        #             * model.predict_proba(X_test_clipped)[:, 1]
        #         )

        #     else:
        #         Y_preds[3, :, k] += counts[i * n_splits + j, k] * mean[k]

        #
        # ari hiro
        #
        # Model slot 2: Keras model from the transformer-fit dataset, one .hdf5
        # file per (i, j) pair.  The saved file refers to a custom object named
        # "TransformerEncoder", which is resolved to the TransformerEncoderByZhang
        # class at load time.
        model = tf.keras.models.load_model(
            f"../input/transformer-fit/Transformer_{i}_{j}.hdf5",
            compile=False,
            custom_objects={"TransformerEncoder": TransformerEncoderByZhang},
        )

        # Scale the prediction on the clipped features by counts[i * n_splits + j]
        # and accumulate it in place into Y_preds[2].
        Y_preds[2] += counts[i * n_splits + j] * model.predict(
            X_test_clipped, batch_size=batch_size
        )

        # Reset Keras' global state before the next model is built or loaded.
        tf.keras.backend.clear_session()

        #
        # hirune924
        #
        # Model slot 3: PyTorch `Model` re-instantiated with the hirune924 feature
        # count, n_classes targets and a hidden size of 1500, then populated from
        # the saved state dict.
        model = Model(
            num_features=n_features_hirune924, num_targets=n_classes, hidden_size=1_500
        )

        # Note the file name puts the fold index (j) before the seed index (i).
        # map_location makes torch.load place the stored tensors on `device`.
        model.load_state_dict(
            torch.load(
                f"../input/pytorch-mlp-tabnet-many-fe-train/FOLD{j}_SEED{i}.pth",
                map_location=torch.device(device),
            )
        )
        model.to(device)

        # Scale the prediction on the hirune924 features by
        # counts[i * n_splits + j] and accumulate it in place into Y_preds[3].
        # NOTE(review): batch_size=test_size presumably means the whole test set
        # goes through in a single batch -- confirm where test_size is defined.
        Y_preds[3] += counts[i * n_splits + j] * predict_with_pytorch_model(
            model, X_test_hirune924, device, batch_size=test_size
        )

        # Model slot 4: pickled model (file prefix "tabnet_") from the
        # pytorch-mlp-tabnet-many-fe-train dataset.
        # NOTE(security): pickle.load can execute arbitrary code contained in the
        # file; only load pickles from datasets you trust.
        with open(
            f"../input/pytorch-mlp-tabnet-many-fe-train/tabnet_FOLD{j}_SEED{i}.pkl",
            "rb",
        ) as f:
            model = pickle.load(f)

        # predict() receives the features as a NumPy array (.values).  Its output
        # is passed through the logistic sigmoid below, so it is presumably a raw
        # logit rather than a probability -- TODO confirm.
        Y_pred = model.predict(X_test_hirune924.values)
        Y_pred = 1.0 / (1.0 + np.exp(-Y_pred))
        Y_preds[4] += counts[i * n_splits + j] * Y_pred

        # Model slot 5: same procedure for the pickled model from the
        # pytorch-tabnet-pretraining-step3-many-fe-train dataset.
        # NOTE(security): the same pickle.load caveat applies.
        with open(
            f"../input/pytorch-tabnet-pretraining-step3-many-fe-train/tabnet_FOLD{j}_SEED{i}.pkl",
            "rb",
        ) as f:
            model = pickle.load(f)

        # Sigmoid of the raw output, scaled by counts[i * n_splits + j] and
        # accumulated in place into Y_preds[5].
        Y_pred = model.predict(X_test_hirune924.values)
        Y_pred = 1.0 / (1.0 + np.exp(-Y_pred))
        Y_preds[5] += counts[i * n_splits + j] * Y_pred

        #
        # Kon
        #
        # model = tf.keras.models.load_model(
        #     f"../input/lstmclassifier-fit/model_seed_{i}_fold_{j}.h5",
        #     compile=False,
        # )

        # Y_preds[12] += counts[i * n_splits + j] * model.predict(
        #     X_test, batch_size=batch_size
        # )

        # tf.keras.backend.clear_session()

        # model = tf.keras.models.load_model(
        #     f"../input/mlpclassifier-fit/model_seed_{i}_fold_{j}.h5",
        #     compile=False,
        # )

        # Y_preds[13] += counts[i * n_splits + j] * model.predict(
        #     X_test, batch_size=batch_size
        # )

        # tf.keras.backend.clear_session()

        # Model slot 6: Keras model from the resnetclassifier-fit dataset, one .h5
        # file per (i, j) pair.  compile=False is sufficient because the model is
        # only used for predict() here.
        model = tf.keras.models.load_model(
            f"../input/resnetclassifier-fit/model_seed_{i}_fold_{j}.h5",
            compile=False,
        )

        # Note that this model is fed X_test, not X_test_clipped.  The prediction
        # is scaled by counts[i * n_splits + j] and accumulated in place into
        # Y_preds[6].
        Y_preds[6] += counts[i * n_splits + j] * model.predict(
            X_test, batch_size=batch_size
        )

        # Reset Keras' global state before the next model is built or loaded.
        tf.keras.backend.clear_session()

        # model = TabNetClassifier(
        #     num_classes=n_classes, num_features=n_features, **params_tabnet
        # )

        # # https://stackoverflow.com/questions/63658086/tensorflow-2-0-valueerror-while-loading-weights-from-h5-file
        # model(np.zeros((1, n_features)))

        # model.load_weights(f"../input/tabnetclassifier-fit/model_seed_{i}_fold_{j}.h5")

        # Y_preds[9] += counts[i * n_splits + j] * model.predict(
        #     X_test, batch_size=batch_size
        # )

        # tf.keras.backend.clear_session()

        # model = tf.keras.models.load_model(
        #     f"../input/transformerclassifier-fit/model_seed_{i}_fold_{j}.h5",
        #     compile=False,
        #     custom_objects={"TransformerEncoder": TransformerEncoder},
        # )

        # Y_preds[16] += counts[i * n_splits + j] * model.predict(
        #     X_test, batch_size=batch_size
        # )

        # tf.keras.backend.clear_session()

        #
        # ynishi
        #
        # model = TabNetRegressor()

        # model.load_model(f"../input/21-tabnet-fit/model_seed_{i}_fold_{j}.zip")

        # Y_pred = model.predict(X_test_ynishi.values)
        # Y_pred = 1.0 / (1.0 + np.exp(-Y_pred))
        # Y_preds[16] += counts[i * n_splits + j] * Y_pred

In [None]:
# Blend the per-model predictions into one value per (test row, target column).
# Y_preds is indexed as [model slot, test row, target]; weights[i] holds the
# blending weights applied across the model slots for target i.
# NOTE(review): `weights` is defined in another cell -- assumed to be 1-D per
# target (one weight per model slot); confirm.
Y_pred = pd.DataFrame(columns=Y.columns, index=X_test.index)

for i, column in enumerate(columns):
    # Contract the model axis of the (model, row) slice for target i with its
    # weights.  Indexing with the scalar i (rather than the list [i]) drops the
    # trailing singleton axis, so a 1-D vector -- not an (n_rows, 1) matrix --
    # is assigned to the column.
    Y_pred[column] = np.tensordot(weights[i], Y_preds[:, :, i], axes=(0, 0))
    # Y_pred.loc[Y_pred[column] < 1e-04, column] = 0.0
    # Y_pred.loc[Y_pred[column] > 1.0 - 1e-04, column] = 1.0

# Rows whose cp_type is "ctl_vehicle" get all predictions forced to zero.
is_control = test_features["cp_type"] == "ctl_vehicle"
Y_pred.loc[is_control, :] = 0.0

In [None]:
# Show the blended predictions for the target columns as this cell's output.
Y_pred.loc[:, columns]

In [None]:
# Write the blended predictions for the target columns to the submission file.
# The frame's index is written as well, since to_csv is called with defaults.
submission = Y_pred[columns]
submission.to_csv("submission.csv")