In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

# The installation procedure is inspired by this great notebook, thanks for sharing it.
# https://www.kaggle.com/hamishdickson/tabnetmultitaskclassifier
!pip uninstall -y typing # this should avoid  AttributeError: type object 'Callable' has no attribute '_abc_registry'

import sys
sys.path.insert(0, "../input/tabnet-latest")

from pytorch_tabnet.multitask import TabNetMultiTaskClassifier

In [None]:
# Location of the competition data and of the individual CSV files read below.
BASE_FOLDER = Path("../input/lish-moa/")
TRAIN_FEATURES_PATH = BASE_FOLDER / "train_features.csv"
TEST_FEATURES_PATH = BASE_FOLDER / "test_features.csv"
TRAIN_TARGETS_PATH = BASE_FOLDER / "train_targets_scored.csv"
SAMPLE_SUBMISSION_PATH = BASE_FOLDER / "sample_submission.csv"
# Number of folds; there is one saved model archive per fold.
N_FOLDS = 5
MODEL_PATHS = [f"model_{fold}.zip" for fold in range(N_FOLDS)]
# Category-to-integer encodings (similar to what is done in training).
DOSE_MAPPING = {"D1": 0, "D2": 1}
TIME_MAPPING = {24: 0, 48: 1, 72: 2}


# Zip the models!

In [None]:
for fold in range(5):
    !cp -r ../input/multitargets-tabnet/tabnet_multitargets_{fold}_fold/* .
    !zip model_{fold}.zip model_params.json network.pt

In [None]:
# Load the competition tables: training targets and features, the test
# features, and the submission template. Each file is read exactly once.
train_targets_df = pd.read_csv(TRAIN_TARGETS_PATH)
train_features_df = pd.read_csv(TRAIN_FEATURES_PATH)
test_features_df = pd.read_csv(TEST_FEATURES_PATH)

sample_submission_df = pd.read_csv(SAMPLE_SUBMISSION_PATH)

In [None]:
# Control rows (cp_type == "ctl_vehicle") are excluded from the baseline
# statistics; per the original note their targets are always 0.
is_control_row = train_features_df["cp_type"] == "ctl_vehicle"
# NOTE: despite its name, this list holds the sig_ids of the *control* rows.
train_sig_ids = train_features_df.loc[is_control_row, "sig_id"].tolist()

# Per-column mean over the remaining training rows (the first column is
# skipped by position).
non_control_targets_df = train_targets_df.loc[~train_targets_df["sig_id"].isin(train_sig_ids), :]
mean_train_targets_dict = non_control_targets_df.iloc[:, 1:].mean().to_dict()

# Use those means as the default value of every row in the submission template.
for target_name, target_mean in mean_train_targets_dict.items():
    sample_submission_df.loc[:, target_name] = target_mean



In [None]:
# Model inputs: every training-feature column except the identifier and the
# treatment type, in sorted order.
feature_frame = train_features_df.drop(columns=["cp_type", "sig_id"])
FEATURES = sorted(feature_frame.columns)

# Model outputs: every scored target except the two listed here, in sorted order.
excluded_targets = {"atp-sensitive_potassium_channel_antagonist", "erbb2_inhibitor"}
target_frame = train_targets_df.drop(columns="sig_id")
TARGETS = sorted(set(target_frame.columns) - excluded_targets)

# Load the multi-targets models and predict

One model per fold. 

In [None]:
# Build the model input matrix from the non-control test rows only.
treated_mask = test_features_df["cp_type"] != "ctl_vehicle"
treated_test_df = test_features_df.loc[treated_mask]

# Identifiers of the rows that will receive model predictions.
test_sig_ids = treated_test_df["sig_id"].tolist()

# Row counts: non-control test rows vs. all test rows.
print(len(treated_test_df))
print(len(test_features_df))

# Encode the categorical columns with the config mappings, then select the
# FEATURES columns in that order and convert to a NumPy array.
X_test = (
    treated_test_df
    .drop(columns=["sig_id", "cp_type"])
    .assign(
        cp_dose=lambda df: df["cp_dose"].map(DOSE_MAPPING),
        cp_time=lambda df: df["cp_time"].map(TIME_MAPPING),
    )
    .loc[:, FEATURES]
    .values
)





In [None]:
# Collect one prediction matrix per fold model; they are averaged afterwards.
data = []
for path in MODEL_PATHS:
    model = TabNetMultiTaskClassifier()
    model.load_model(path)
    # Per the original note, `preds_mapper` is missing after loading, so a
    # placeholder mapping is attached before predicting.
    model.preds_mapper = DOSE_MAPPING

    # Stack the per-task outputs, keep index 1 of the last axis (presumably
    # the positive-class probability -- confirm against the training setup)
    # and transpose so that rows are samples and columns are tasks.
    # Clipping to [0.001, 0.999] is deliberately not applied for now since
    # training does not use it.
    y_preds = np.array(model.predict_proba(X_test))[:, :, 1].T

    # Fail early if the model output does not line up with the rows of
    # X_test and the TARGETS columns it is written into later.
    assert y_preds.shape == (len(X_test), len(TARGETS)), (
        f"{path}: unexpected prediction shape {y_preds.shape}"
    )
    # Every task must receive some probability mass (vectorized column sum
    # instead of the builtin `sum`, which loops over rows in Python).
    assert (y_preds.sum(axis=0) > 0).all(), f"{path}: a task has all-zero predictions"
    data.append(y_preds)

In [None]:
# Average the per-fold prediction matrices element-wise (axis 0 is the fold axis).
y_pred = np.mean(data, axis=0)

In [None]:
# Shape of the fold-averaged predictions; it should equal the shape of the
# submission slice displayed in the following cell.
y_pred.shape

In [None]:
# Shape of the submission block that the predictions will be written into:
# rows whose sig_id is in test_sig_ids, restricted to the TARGETS columns.
treated_rows = sample_submission_df["sig_id"].isin(test_sig_ids)
sample_submission_df.loc[treated_rows, TARGETS].shape

In [None]:
# Key the averaged predictions by sig_id so that rows are matched by
# identifier rather than by their position in the two CSV files. Building
# the frame also validates that y_pred matches test_sig_ids x TARGETS.
predictions_df = pd.DataFrame(y_pred, index=test_sig_ids, columns=TARGETS)

is_treated = sample_submission_df["sig_id"].isin(test_sig_ids)
treated_ids = sample_submission_df.loc[is_treated, "sig_id"]
sample_submission_df.loc[is_treated, TARGETS] = predictions_df.loc[treated_ids].values

# For the test, rows from the control group are set to 0 on every target
# column. Without this step they would keep the train-mean baseline that
# was written into the whole template earlier.
control_sig_ids = test_features_df.loc[test_features_df["cp_type"] == "ctl_vehicle", "sig_id"]
is_control = sample_submission_df["sig_id"].isin(control_sig_ids)
target_columns = sample_submission_df.columns.drop("sig_id")
sample_submission_df.loc[is_control, target_columns] = 0.0

In [None]:
# Per-column standard deviation of the submission values. `numeric_only=True`
# skips non-numeric columns such as the `sig_id` identifier, which would
# otherwise make the reduction raise on recent pandas versions.
sample_submission_df.std(numeric_only=True)

In [None]:
# Per-column mean of the submission values, smallest first. `numeric_only=True`
# skips non-numeric columns such as the `sig_id` identifier, which would
# otherwise make the reduction raise on recent pandas versions.
print(sample_submission_df.mean(numeric_only=True).sort_values())

# Submit 

In [None]:
# Write the completed submission frame to `submission.csv` in the working
# directory, without the DataFrame index column.
sample_submission_df.to_csv("submission.csv", index=False)