In [None]:
import gc
import re
import math
import pickle
import joblib
import warnings

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb

warnings.simplefilter('ignore')

In [None]:
import os
import random as rn
import numpy as np


def set_seed(seed=0):
    """Seed the random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and then
    seeds Python's ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int, default 0
        Value passed to every seeding call.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same order as before: the stdlib generator first, then NumPy's global one.
    for seed_fn in (rn.seed, np.random.seed):
        seed_fn(seed)

In [None]:
from sklearn.metrics import log_loss


def score(Y, Y_pred):
    """Return the mean of the per-column binary log losses.

    Parameters
    ----------
    Y : two-dimensional object supporting ``.shape`` and ``.iloc``
        True labels, one column per target.
    Y_pred : two-dimensional object supporting ``.iloc``
        Predicted probabilities, column-aligned with ``Y`` by position.

    Returns
    -------
    The ``np.mean`` of the column-wise ``log_loss`` values.
    """
    _, n_targets = Y.shape

    # labels=[0, 1] is passed explicitly for every column.
    per_target = [
        log_loss(Y.iloc[:, k], Y_pred.iloc[:, k], labels=[0, 1])
        for k in range(n_targets)
    ]

    return np.mean(per_target)

In [None]:
import sys
sys.path.append('../input/iterativestratification')

import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelGroupStratifiedKFold(_BaseKFold):
    """K-fold splitter that stratifies on multilabel targets and keeps some groups intact.

    Groups whose sample count is exactly 6, 12 or 18 ("regular" groups) are
    assigned to a fold as a whole. Samples of every other ("irregular") group
    are assigned to folds one by one, so such a group may be spread over
    several folds. Both assignments come from ``MultilabelStratifiedKFold``
    configured with this splitter's ``n_splits``, ``random_state`` and
    ``shuffle``.

    NOTE(review): ``Y`` and ``groups`` are accessed through pandas APIs
    (``groupby``, ``value_counts``, ``isin``, ``map``), so they are presumably
    a DataFrame and a Series sharing the same index -- confirm against callers.
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, Y=None, groups=None):
        """Yield, fold by fold, the positional indices of that fold's test samples.

        ``X`` is not read by this method; only ``Y`` and ``groups`` are used.
        """
        cv = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        # Number of samples per group, then the two disjoint sets of group labels.
        # NOTE(review): the meaning of the sizes 6/12/18 is not stated in this
        # file -- presumably complete replicate sets; confirm.
        value_counts = groups.value_counts()
        regular_index = value_counts.loc[
            (value_counts == 6) | (value_counts == 12) | (value_counts == 18)
        ].index.sort_values()
        irregular_index = value_counts.loc[
            (value_counts != 6) & (value_counts != 12) & (value_counts != 18)
        ].index.sort_values()

        # Regular groups: one row per group (mean target vector), split at group level.
        group_to_fold = {}
        tmp = Y.groupby(groups).mean().loc[regular_index]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            group_to_fold.update({group: fold for group in tmp.index[test]})

        # Irregular groups: split their samples individually, keyed by sample index label.
        sample_to_fold = {}
        tmp = Y.loc[groups.isin(irregular_index)]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            sample_to_fold.update({sample: fold for sample in tmp.index[test]})

        # Samples of irregular groups are NaN after the group-level map; fill
        # them from the per-sample assignment.
        folds = groups.map(group_to_fold)
        is_na = folds.isna()
        folds[is_na] = folds[is_na].index.map(sample_to_fold).values

        for i in range(self.n_splits):
            # np.where gives positions within ``folds``, not index labels.
            yield np.where(folds == i)[0]

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class ClippedFeatures(BaseEstimator, TransformerMixin):
    """Clip every column to quantile bounds learned in ``fit``.

    Parameters
    ----------
    copy : bool, default True
        When true, ``transform`` clips a copy of its input; otherwise the
        input object itself is clipped in place and returned.
    high : float, default 0.99
        Quantile used as the per-column upper bound.
    low : float, default 0.01
        Quantile used as the per-column lower bound.

    Attributes
    ----------
    data_max_, data_min_
        Results of ``X.quantile(q=high)`` and ``X.quantile(q=low)`` on the
        data passed to ``fit``.
    """

    def __init__(self, copy=True, high=0.99, low=0.01):
        self.copy = copy
        self.high = high
        self.low = low

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds from ``X``; ``y`` is ignored."""
        self.data_min_ = X.quantile(q=self.low)
        self.data_max_ = X.quantile(q=self.high)

        return self

    def transform(self, X):
        """Return ``X`` (or a copy of it) clipped to the fitted bounds."""
        clipped = X.copy() if self.copy else X

        # axis=1 lines the bound Series up with the columns.
        clipped.clip(lower=self.data_min_, upper=self.data_max_, axis=1, inplace=True)

        return clipped

In [None]:
import pandas as pd


def compute_row_statistics(X, prefix=""):
    """Compute row-wise summary statistics of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; each statistic is taken across columns (``axis=1``).
    prefix : str, default ""
        Text prepended to every output column name.

    Returns
    -------
    pandas.DataFrame
        Columns ``{prefix}mean``, ``{prefix}std``, ``{prefix}kurtosis`` and
        ``{prefix}skew``, in that order, indexed like ``X``.
    """
    # "min" and "max" are intentionally left out (they were disabled in the
    # original list as well).
    agg_names = ("mean", "std", "kurtosis", "skew")

    row_stats = {f"{prefix}{name}": X.agg(name, axis=1) for name in agg_names}

    return pd.DataFrame(row_stats)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


def display_importances(
    importance_df, png_path="feature_importance.png",
):
    """Save the feature importances as CSV and plot the top features.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Must contain the columns ``feature`` and ``importance``.
    png_path : str, default "feature_importance.png"
        Where the bar plot is saved.

    Side effects
    ------------
    Writes ``feature_importance.csv`` (all rows, sorted by descending
    importance) to the working directory and saves the figure to ``png_path``.
    """
    by_importance = importance_df.sort_values(by="importance", ascending=False)
    by_importance.to_csv("feature_importance.csv")

    # Names of the (at most) 100 features with the highest mean importance.
    mean_importance = importance_df[["feature", "importance"]].groupby("feature").mean()
    top_features = mean_importance.sort_values(by="importance", ascending=False)[:100].index

    top_rows = importance_df.loc[importance_df.feature.isin(top_features)]
    plot_data = top_rows.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 15))
    sns.barplot(x="importance", y="feature", data=plot_data)
    plt.title("LightGBM (avg over folds)")
    plt.tight_layout()
    plt.savefig(png_path)

In [None]:
# Load the training data. Every table is indexed by ``sig_id``.
dtype = {"cp_type": "category", "cp_dose": "category"}
index_col = "sig_id"

train_features = pd.read_csv(
    "../input/lish-moa/train_features.csv", dtype=dtype, index_col=index_col
)
# Keep numeric columns only; cp_type and cp_dose are read as category and so
# are excluded from the model inputs.
X = train_features.select_dtypes("number")
Y_nonscored = pd.read_csv(
    "../input/lish-moa/train_targets_nonscored.csv", index_col=index_col
)
Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)
# ``read_csv(..., squeeze=True)`` was deprecated in pandas 1.4 and removed in
# 2.0; squeezing the result afterwards yields the same Series on old and new
# pandas alike.
groups = pd.read_csv(
    "../input/lish-moa/train_drug.csv", index_col=index_col
).squeeze("columns")

# Target columns to train and evaluate (may be narrowed later in DEBUG mode).
columns = Y.columns

In [None]:
# Fit the quantile clipper on the training features, clip them, and pickle the
# fitted transformer so it can be reloaded later in the notebook.
clipped_features = ClippedFeatures()
X = clipped_features.fit_transform(X)

with open("clipped_features.pkl", "wb") as f:
    pickle.dump(clipped_features, f)
# For ensembling, the row statistics and the nonscored targets are not added.
#c_prefix = "c-"
#g_prefix = "g-"
#c_columns = X.columns.str.startswith(c_prefix)
#g_columns = X.columns.str.startswith(g_prefix)
#X_c = compute_row_statistics(X.loc[:, c_columns], prefix=c_prefix)
#X_g = compute_row_statistics(X.loc[:, g_columns], prefix=g_prefix)
#X = pd.concat([X, X_c, X_g], axis=1)

In [None]:
# Run configuration: CV layout, LightGBM hyperparameters and the DEBUG switch.
# NOTE(review): is_drug_cv is not referenced in the cells visible here -- confirm it is still needed.
is_drug_cv = True
n_splits = 5
n_seeds = 5
# LBS = 0.0008  # label smoothing did not help at all, so it is disabled
LBS = 0.0

params = {
    "objective": "binary",
    "learning_rate": 0.1,
    "num_leaves": 4,
    "max_depth": 2,
    "min_data_in_leaf": 1951,
    "feature_fraction": 0.25,
    "lambda_l1": 3.34060389323178e-07,
    "lambda_l2": 0.00012255072592595102,
}

num_boost_round = 2000
verbose_eval = 100
#num_boost_round = 1000
#verbose_eval = 50
# The early-stopping patience is tied to the logging interval.
early_stopping_rounds = verbose_eval

#DEBUG = True
DEBUG = False
if DEBUG:
    # Small smoke-test configuration: a handful of targets, fewer seeds/folds/rounds.
    columns = [
        "atp-sensitive_potassium_channel_antagonist",  # only 1 positive label
        "erbb2_inhibitor",  # only 1 positive label
        "antiarrhythmic",  # only 6 positive labels
        "aldehyde_dehydrogenase_inhibitor",  # only 7 positive labels
#        "lipase_inhibitor",  # only 12 positive labels
#        "sphingosine_receptor_agonist",  # only 25 positive labels
#        "igf-1_inhibitor",  # only 37 positive labels
#        "potassium_channel_activator",  # only 55 positive labels
#        "potassium_channel_antagonist",  # only 98 positive labels
#        "dopamine_receptor_agonist",  # only 121 positive labels
#        "nfkb_inhibitor",  # 832 positive labels
#        "cyclooxygenase_inhibitor",  # 435 positive labels
#        "dna_inhibitor",  # 402 positive labels
#        "glutamate_receptor_antagonist",  # 367 positive labels
#        "tubulin_inhibitor",  # 316 positive labels
#        "pdgfr_inhibitor",  # 297 positive labels
#        "calcium_channel_blocker",  # 281 positive labels
#        "flt3_inhibitor",  # 279 positive labels
#        "progesterone_receptor_agonist",  # 119 positive labels
        "hdac_inhibitor",  # 106 positive labels
    ]
    Y = Y[columns]
    n_seeds = 2
    # NOTE(review): LightGBM presumably treats n_estimators as an alias for the
    # number of boosting rounds, which could override num_boost_round below -- confirm.
    params["n_estimators"] = 2
    n_splits = 4
    num_boost_round = 50
    verbose_eval = 5
    early_stopping_rounds = verbose_eval
    print(f"DEBUG: {DEBUG}")

In [None]:
# Dataset dimensions used to size the prediction and importance buffers.
train_size, n_features = X.shape
n_classes_nonscored = Y_nonscored.shape[1]
n_classes = Y.shape[1]

In [None]:
%%time

# Out-of-fold training: for every seed and CV fold, train one LightGBM binary
# model per target column, accumulate the seed-averaged validation predictions
# in Y_pred, and save every model to disk.
# NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments of
# lgb.train are presumably only accepted by LightGBM < 4.0 -- confirm the pinned version.

# Each model adds gain / (n_seeds * n_splits), for every target, so this ends
# up as the sum over targets of the seed/fold-averaged gain.
f_importance = np.zeros((n_features,))
Y_pred = pd.DataFrame(
    np.zeros((train_size, n_classes)), columns=Y.columns, index=Y.index
)

counts = []
for i in tqdm(range(n_seeds)):
    set_seed(seed=i)

    cv = MultilabelGroupStratifiedKFold(n_splits=n_splits, random_state=i, shuffle=True)
    cv_split = cv.split(X, Y, groups)

    for j, (trn_idx, val_idx) in enumerate(cv_split):

        print(f"\n------------ fold:{j} ------------")

        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        Y_train_targets, Y_val_targets = Y.iloc[trn_idx], Y.iloc[val_idx]

        targets_counts = []

        # Label Smoothing. https://www.kaggle.com/gogo827jz/self-stacking-groupcv-xgboost
        # Applied to the training labels only; with LBS == 0.0 it is a no-op.
        Y_train_targets = Y_train_targets * (1 - LBS) + 0.5 * LBS

        for tar, tar_col in enumerate(Y.columns):
            Y_train, Y_val = Y_train_targets.values[:, tar], Y_val_targets.values[:, tar]

            lgb_train = lgb.Dataset(X_train, Y_train)
            lgb_eval = lgb.Dataset(X_val, Y_val, reference=lgb_train)

            model = lgb.train(
                params,
                lgb_train,
                valid_sets=[lgb_train, lgb_eval],
                verbose_eval=verbose_eval,
                num_boost_round=num_boost_round,
                early_stopping_rounds=early_stopping_rounds,
            )
            # val_idx holds row positions and Y_pred shares Y's column order, so
            # write through .iloc. The former chained form
            # Y_pred[tar_col][val_idx] += ... relied on positional fallback on a
            # label index and on writing through an intermediate Series, which
            # is silently lost under pandas copy-on-write.
            Y_pred.iloc[val_idx, tar] += (
                model.predict(X_val, num_iteration=model.best_iteration) / n_seeds
            )

            f_importance += np.array(
                model.feature_importance(importance_type="gain")
            ) / (n_seeds * n_splits)

            joblib.dump(
                model, f"model_seed_{i}_fold_{j}_{tar_col}.jlb", compress=True
            )

            # Sum of the (possibly smoothed) training labels for this target.
            targets_counts.append(Y_train.sum())

        counts.append(targets_counts)

# One row per (seed, fold), one column per target.
counts = np.array(counts)
assert counts.shape == (
    n_seeds * n_splits,
    n_classes,
), f"countsのshapeおかしい. {counts.shape}"

# Feature names are taken from the last trained model.
importance_df = pd.DataFrame(
    {"feature": model.feature_name(), "importance": f_importance}
)

# Rows whose cp_type is "ctl_vehicle" get all predictions forced to zero.
Y_pred[train_features["cp_type"] == "ctl_vehicle"] = 0.0

with open("counts.pkl", "wb") as f:
    pickle.dump(counts, f)

with open("Y_pred.pkl", "wb") as f:
    pickle.dump(Y_pred[columns], f)

In [None]:
# Out-of-fold score: mean per-target log loss over the selected target columns.
score(Y[columns], Y_pred[columns])

In [None]:
# Writes feature_importance.csv / feature_importance.png and draws the bar plot.
display_importances(importance_df)

# Platt Scaling
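Train a logistic regression model on the out-of-fold predictions to calibrate them into probabilities. (The calibration cells below are currently commented out.)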

https://www.kaggle.com/gogo827jz/kernel-logistic-regression-one-for-206-targets

In [None]:
## Calibrate the out-of-fold predictions (Y_pred) into probabilities with logistic regression
#
#X = Y_pred.copy()
#Y_cali = np.zeros((train_size, n_classes))
#Y_cali = pd.DataFrame(Y_cali, columns=Y.columns, index=Y.index)
#
#for i in tqdm(range(n_seeds)):
#    set_seed(seed=i)
#
#    cv = MultilabelGroupStratifiedKFold(n_splits=n_splits, random_state=i, shuffle=True)
#    cv_split = cv.split(X, Y, groups)
#        
#    for j, (trn_idx, val_idx) in enumerate(cv_split):
#
#        print(f"\n------------ fold:{j} ------------")
#
#        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
#        Y_train_targets, Y_val_targets = Y.iloc[trn_idx], Y.iloc[val_idx]
#        
#        # Label Smoothing. https://www.kaggle.com/gogo827jz/self-stacking-groupcv-xgboost
#        #Y_train_targets = Y_train_targets * (1 - LBS) + 0.5 * LBS
#    
#        for tar, tar_col in enumerate(Y.columns):
#            Y_train, Y_val = Y_train_targets.values[:, tar], Y_val_targets.values[:, tar]  
#            
#            if Y_train.sum() >= 1:
#                
#                model = LogisticRegression(penalty="none", max_iter=1000)
#                model.fit(X_train, Y_train)
#                
#                Y_cali[tar_col][val_idx] += model.predict_proba(X_val)[:, 1] / n_seeds
#            
#                joblib.dump(model, 
#                            f"calibrate_model_seed_{i}_fold_{j}_{tar_col}.jlb", 
#                            compress=True)
#            else:
#                Y_cali[tar_col][val_idx] = Y_pred[tar_col][val_idx]
#                
#with open("Y_pred_calibrate.pkl", "wb") as f:
#    pickle.dump(Y_cali[columns], f)

In [None]:
#score(Y[columns], Y_cali[columns])

# pkl check

In [None]:
# Sanity check: reload the pickled per-(seed, fold) training label sums and inspect them.
path = r"counts.pkl"
with open(path, 'rb') as f:
    counts = pickle.load(f)
print(counts.shape)
counts

In [None]:
# Sanity check: reload the pickled out-of-fold predictions and display them.
path = r"Y_pred.pkl"
with open(path, 'rb') as f:
    Y_pred = pickle.load(f)
Y_pred

In [None]:
#path = r"Y_pred_calibrate.pkl"
#with open(path, 'rb') as f:
#    Y_pred_calibrate = pickle.load(f)
#Y_pred_calibrate

# predict test

In [None]:
# Load the test features and clip their numeric columns with the transformer
# that was pickled earlier in this notebook.
test_features = pd.read_csv(
    "../input/lish-moa/test_features.csv", index_col=index_col, dtype=dtype
)

# The pickle is produced by this notebook itself; do not point this at untrusted files.
with open("./clipped_features.pkl", "rb") as f:
    clipped_features = pickle.load(f)

X_test = clipped_features.transform(test_features.select_dtypes("number"))
# For ensembling, the row statistics and the nonscored targets are not added.
#X_c = compute_row_statistics(X_test.loc[:, c_columns], prefix=c_prefix)
#X_g = compute_row_statistics(X_test.loc[:, g_columns], prefix=g_prefix)
#X_test = pd.concat([X_test, X_c, X_g], axis=1)

In [None]:
import glob
import os
import re

# Predict the test set with LightGBM: for each target, average the predictions
# of every saved seed/fold model belonging to that target.
Y_test_pred = pd.DataFrame(
    np.zeros((X_test.shape[0], len(columns))),
    columns=columns,
    index=test_features.index,
)
for target in columns:
    # The glob wildcard also matches models of other targets whose name merely
    # ends with "_<target>" (e.g. "potassium_channel_antagonist" would pick up
    # "atp-sensitive_potassium_channel_antagonist"), so keep only files whose
    # name is exactly model_seed_<i>_fold_<j>_<target>.jlb.
    exact_name = re.compile(rf"model_seed_\d+_fold_\d+_{re.escape(target)}\.jlb")
    model_paths = [
        candidate
        for candidate in glob.glob(f"./model_seed_*_{target}.jlb")
        if exact_name.fullmatch(os.path.basename(candidate))
    ]
    if not model_paths:
        # Without this check the column would silently stay at 0.0.
        raise FileNotFoundError(f"no saved model found for target {target!r}")
    for model_path in model_paths:
        model = joblib.load(model_path)
        Y_test_pred[target] += model.predict(X_test) / len(model_paths)
print(Y_test_pred.shape)
display(Y_test_pred)

In [None]:
## calibrate
#print("\n---------- calibrate ----------")
#X_test = Y_test_pred.copy()
#Y_test_cali = np.zeros((X_test.shape[0], len(columns)))
#Y_test_cali = pd.DataFrame(Y_test_pred, columns=columns, index=test_features.index)
#for i in range(n_seeds):
#    for j in range(n_splits):
#        for tar in range(Y.shape[1]):
#            
#            m_path = f"calibrate_model_seed_{i}_fold_{j}_{Y.columns[tar]}.jlb"
#            if os.path.exists(m_path):
#                print(m_path)
#                model = joblib.load(m_path)
#                Y_test_cali.iloc[:,tar] += model.predict_proba(X_test)[:, 1] / (n_seeds * n_splits)
#            else:
#                Y_test_cali.iloc[:,tar] = Y_test_pred.iloc[:,tar]
#                
#print(Y_test_cali.shape)
#display(Y_test_cali)