# classifierchain でラベル相関学習を強化する

In [1]:
import gc
import re
import math
import pickle
import joblib
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier

warnings.simplefilter('ignore')

In [2]:
import os
import random as rn
import numpy as np


def set_seed(seed=0):
    """Seed the global random number generators used by this notebook.

    Writes ``seed`` (as a string) to the ``PYTHONHASHSEED`` environment
    variable and seeds both the stdlib ``random`` module and NumPy's global
    generator with the same value.

    Args:
        seed: Integer seed shared by all generators. Defaults to 0.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))

    # Same seed for every generator so a run is identified by one number.
    for seeder in (rn.seed, np.random.seed):
        seeder(seed)

In [3]:
from sklearn.metrics import log_loss


def score(Y, Y_pred):
    """Return the mean of the per-column binary log losses.

    Args:
        Y: DataFrame of true 0/1 labels, one column per target.
        Y_pred: DataFrame of predicted probabilities; columns are matched to
            ``Y`` by position, not by name.

    Returns:
        The average over all columns of ``log_loss`` computed with
        ``labels=[0, 1]`` (so a column containing a single class still works).
    """
    n_targets = Y.shape[1]

    per_target = [
        log_loss(Y.iloc[:, k], Y_pred.iloc[:, k], labels=[0, 1])
        for k in range(n_targets)
    ]

    return np.mean(per_target)

In [4]:
import sys
sys.path.append('../input/iterativestratification')

import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelGroupStratifiedKFold(_BaseKFold):
    """Multilabel-stratified K-fold that keeps "regular" groups in one fold.

    Groups whose sample count is exactly 6, 12 or 18 are assigned to folds as
    whole groups, stratified on the per-group mean of ``y``. Samples that
    belong to any other group are assigned to folds individually, stratified
    on their own rows of ``y``. Both assignments use
    ``MultilabelStratifiedKFold`` with this splitter's ``n_splits``,
    ``random_state`` and ``shuffle``.

    ``y`` must be a pandas DataFrame and ``groups`` a pandas Series that share
    the same index (the implementation relies on ``groupby``, ``.loc``,
    ``value_counts`` and ``map``).
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield, for each fold, the positional indices of its test samples.

        Args:
            X: Unused; present for the scikit-learn splitter interface.
            y: DataFrame of multilabel targets indexed like ``groups``.
            groups: Series mapping each sample to its group label.

        Raises:
            ValueError: If ``y`` or ``groups`` is not provided.
        """
        if y is None or groups is None:
            raise ValueError(
                "MultilabelGroupStratifiedKFold requires both 'y' and 'groups'."
            )

        cv = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        # Split the group labels by whether the group has a "regular" size.
        value_counts = groups.value_counts()
        is_regular = value_counts.isin([6, 12, 18])
        regular_index = value_counts.index[is_regular].sort_values()
        irregular_index = value_counts.index[~is_regular].sort_values()

        # Regular groups: one row per group (mean label vector), so the whole
        # group lands in a single fold.
        # Use the `y` argument here, not a module-level `Y`, so the splitter
        # stratifies on exactly what the caller passed to `split`.
        group_to_fold = {}
        tmp = y.groupby(groups).mean().loc[regular_index]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            group_to_fold.update({group: fold for group in tmp.index[test]})

        # Irregular groups: stratify their samples row by row.
        sample_to_fold = {}
        tmp = y.loc[groups.isin(irregular_index)]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            sample_to_fold.update({sample: fold for sample in tmp.index[test]})

        # Samples of irregular groups map to NaN first and are then filled
        # from the per-sample assignment.
        folds = groups.map(group_to_fold)
        is_na = folds.isna()
        folds[is_na] = folds[is_na].index.map(sample_to_fold).values

        for i in range(self.n_splits):
            yield np.where(folds == i)[0]

In [5]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class ClippedFeatures(BaseEstimator, TransformerMixin):
    """Clip every column to quantile bounds learned from the fitting data.

    Args:
        copy: When true, ``transform`` works on a copy of its input; otherwise
            the input frame is clipped in place.
        high: Upper quantile used as the per-column maximum.
        low: Lower quantile used as the per-column minimum.
    """

    def __init__(self, copy=True, high=0.99, low=0.01):
        self.copy, self.high, self.low = copy, high, low

    def fit(self, X, y=None):
        """Store the per-column ``low`` and ``high`` quantiles of ``X``."""
        self.data_min_ = X.quantile(q=self.low)
        self.data_max_ = X.quantile(q=self.high)

        return self

    def transform(self, X):
        """Return ``X`` with each column clipped to the fitted bounds."""
        clipped = X.copy() if self.copy else X

        clipped.clip(
            lower=self.data_min_, upper=self.data_max_, axis=1, inplace=True
        )

        return clipped

In [6]:
import pandas as pd


def compute_row_statistics(X, prefix=""):
    """Compute row-wise summary statistics of ``X``.

    Args:
        X: DataFrame whose rows are summarised.
        prefix: String prepended to each output column name.

    Returns:
        DataFrame indexed like ``X`` with the columns ``{prefix}mean``,
        ``{prefix}std``, ``{prefix}kurtosis`` and ``{prefix}skew`` in that
        order. Row minimum and maximum are not computed.
    """
    stat_names = ("mean", "std", "kurtosis", "skew")

    row_stats = {
        f"{prefix}{stat}": X.agg(stat, axis=1) for stat in stat_names
    }

    return pd.DataFrame(row_stats)

In [7]:
#dtype = {"cp_type": "category", "cp_dose": "category"}
#index_col = "sig_id"
#
#train_features = pd.read_csv(
#    "../input/lish-moa/train_features.csv", dtype=dtype, index_col=index_col
#)
#X = train_features.select_dtypes("number")
#Y_nonscored = pd.read_csv(
#    "../input/lish-moa/train_targets_nonscored.csv", index_col=index_col
#)
#Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)
#groups = pd.read_csv(
#    "../input/lish-moa/train_drug.csv", index_col=index_col, squeeze=True
#)
#
#columns = Y.columns

In [8]:
# Column dtypes and index column shared by the competition CSV files.
dtype = {"cp_type": "category", "cp_dose": "category"}
index_col = "sig_id"

# Make the local `datasets` helper importable; it provides DATADIR.
sys.path.append(
    r"C:\Users\81908\jupyter_notebook\poetry_work\tfgpu\01_MoA_compe\code"
)
import datasets

DATADIR = datasets.DATADIR

train_features = pd.read_csv(
    f"{DATADIR}/train_features.csv", dtype=dtype, index_col=index_col
)
# Only the numeric columns are used as model inputs.
X = train_features.select_dtypes("number")
Y_nonscored = pd.read_csv(
    f"{DATADIR}/train_targets_nonscored.csv", index_col=index_col
)
Y = pd.read_csv(f"{DATADIR}/train_targets_scored.csv", index_col=index_col)
# `read_csv(..., squeeze=True)` was deprecated and then removed in pandas 2.0.
# Squeezing the single remaining column explicitly gives the same Series on
# both old and new pandas versions.
groups = pd.read_csv(
    f"{DATADIR}/train_drug.csv", dtype=dtype, index_col=index_col
).squeeze("columns")

# Names of the scored targets; later cells use this to select output columns.
columns = Y.columns

In [9]:
# Fit the quantile clipper on the training features and clip them.
clipped_features = ClippedFeatures()
X = clipped_features.fit_transform(X)

# Persist the fitted clipper; the test-prediction cell reloads this file.
with open("clipped_features.pkl", "wb") as f:
    pickle.dump(clipped_features, f)
# For ensembling, row statistics and nonscored targets are not added.
#c_prefix = "c-"
#g_prefix = "g-"
#c_columns = X.columns.str.startswith(c_prefix)
#g_columns = X.columns.str.startswith(g_prefix)
#X_c = compute_row_statistics(X.loc[:, c_columns], prefix=c_prefix)
#X_g = compute_row_statistics(X.loc[:, g_columns], prefix=g_prefix)
#X = pd.concat([X, X_c, X_g], axis=1)
#
#Y_nonscored = Y_nonscored.loc[:, Y_nonscored.sum(axis=0) > 0]
#Y = pd.concat([Y, Y_nonscored], axis=1)

# objective

In [10]:
def train_and_evaluate(params):
    """Train seed x fold classifier chains and score the out-of-fold predictions.

    Reads these notebook-level names: ``X``, ``Y``, ``groups``,
    ``train_features``, ``columns``, ``n_seeds``, ``n_splits``, ``n_classes``,
    ``train_size``, ``is_drug_cv`` and ``LBS``.

    Side effects: writes one ``model_seed_{i}_fold_{j}.jlb`` per seed and
    fold, plus ``counts.pkl`` and ``Y_pred.pkl``, to the working directory.

    Args:
        params: Keyword arguments forwarded to ``LGBMClassifier``.

    Returns:
        Tuple ``(oof, Y_pred)``: the mean column-wise log loss on ``columns``
        and the DataFrame of out-of-fold predictions averaged over seeds.
    """
    # Positive-label counts of each training split, one row per (seed, fold).
    counts = np.empty((n_seeds * n_splits, n_classes))

    Y_pred = np.zeros((train_size, n_classes))
    Y_pred = pd.DataFrame(Y_pred, columns=Y.columns, index=Y.index)

    for i in range(n_seeds):
        set_seed(seed=i)

        if is_drug_cv:
            cv = MultilabelGroupStratifiedKFold(n_splits=n_splits, random_state=i, shuffle=True)
            cv_split = cv.split(X, Y, groups)
        else:
            # The splitter must be bound to `cv`; it used to be created and
            # discarded, so `cv.split` below raised NameError (or silently
            # reused a splitter left over from an earlier run).
            cv = MultilabelStratifiedKFold(n_splits=n_splits, random_state=i, shuffle=True)
            cv_split = cv.split(X, Y)

        for j, (trn_idx, val_idx) in tqdm(enumerate(cv_split), total=n_splits):
            counts[i * n_splits + j] = Y.iloc[trn_idx].sum()

            X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
            Y_train, Y_val = Y.iloc[trn_idx], Y.iloc[val_idx]

            # Label Smoothing. https://www.kaggle.com/gogo827jz/self-stacking-groupcv-xgboost
            # NOTE(review): a non-zero LBS makes the targets non-binary;
            # confirm the classifier accepts that before enabling it.
            Y_train = Y_train * (1 - LBS) + 0.5 * LBS

            clf = ClassifierChain(LGBMClassifier(**params), order="random", random_state=i)
            clf.fit(X_train, Y_train)
            # 2-D array of probabilities, one column per target in Y's order.
            val_preds = clf.predict_proba(X_val)

            # Each sample is validated once per seed, so this averages seeds.
            Y_pred.iloc[val_idx] += val_preds / n_seeds

            joblib.dump(clf, f"model_seed_{i}_fold_{j}.jlb", compress=True)

    # Force every prediction for control-vehicle rows to zero.
    Y_pred[train_features["cp_type"] == "ctl_vehicle"] = 0.0

    with open("counts.pkl", "wb") as f:
        pickle.dump(counts, f)

    with open("Y_pred.pkl", "wb") as f:
        pickle.dump(Y_pred[columns], f)

    oof = score(Y[columns], Y_pred[columns])

    return oof, Y_pred

In [11]:
import optuna


def objective(trial):
    """Optuna objective: sample LightGBM parameters and return the OOF score.

    Args:
        trial: Optuna trial used to draw ``max_depth``, ``num_leaves``,
            ``feature_fraction``, ``lambda_l1`` and ``lambda_l2``.

    Returns:
        The first element returned by ``train_and_evaluate`` for the sampled
        parameters.
    """
    # Fixed settings first, then the searched ones in their sampling order.
    params = {
        "objective": "binary",
        "learning_rate": 0.1,
        "max_depth": trial.suggest_int("max_depth", 1, 4),
        "num_leaves": trial.suggest_int("num_leaves", 2, 5),
        "feature_fraction": trial.suggest_discrete_uniform(
            "feature_fraction", 0.1, 1.0, 0.05
        ),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-09, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-09, 10.0),
    }

    # Debug runs use only two boosting rounds; otherwise n_estimators is unset.
    if DEBUG:
        params["n_estimators"] = 2

    oof_score, _unused_pred = train_and_evaluate(params)

    return oof_score

In [12]:
# Run configuration read by train_and_evaluate, objective and later cells.
is_drug_cv = True
n_splits = 5
n_seeds = 1
# LBS = 0.0008  # label smoothing did not help at all, so it is disabled
LBS = 0.0

n_trials = 50
#params = {
#    "num_leaves": 2,
#    "max_depth": 1,
#    "min_data_in_leaf": 969,
#    "objective": "binary",
#    "learning_rate": 0.01,
#}

#DEBUG = True
DEBUG = False
# Debug mode: restrict the targets to a small hand-picked subset and shrink
# the number of folds and trials.
if DEBUG:
    columns = [
        "atp-sensitive_potassium_channel_antagonist",  # only 1 positive label
        "erbb2_inhibitor",  # only 1 positive label
        "antiarrhythmic",  # only 6 positive labels
        "aldehyde_dehydrogenase_inhibitor",  # only 7 positive labels
        "lipase_inhibitor",  # only 12 positive labels
        "sphingosine_receptor_agonist",  # only 25 positive labels
        "igf-1_inhibitor",  # only 37 positive labels
        "potassium_channel_activator",  # only 55 positive labels
        "potassium_channel_antagonist",  # only 98 positive labels
        "dopamine_receptor_agonist",  # only 121 positive labels
        "nfkb_inhibitor",  # 832 positive labels
        "cyclooxygenase_inhibitor",  # 435 positive labels
        "dna_inhibitor",  # 402 positive labels
        "glutamate_receptor_antagonist",  # 367 positive labels
        "tubulin_inhibitor",  # 316 positive labels
        "pdgfr_inhibitor",  # 297 positive labels
        "calcium_channel_blocker",  # 281 positive labels
        "flt3_inhibitor",  # 279 positive labels
        "progesterone_receptor_agonist",  # 119 positive labels
        "hdac_inhibitor",  # 106 positive labels
    ]
    Y = Y[columns]

    non_columns = [
        "abc_transporter_expression_enhancer",  # nonscored class
        "abl_inhibitor",  # nonscored class
    ]
    Y_nonscored = Y_nonscored[non_columns]

    # The two nonscored targets are appended after the scored ones.
    Y = pd.concat([Y, Y_nonscored], axis=1)

    n_splits = 2
    n_trials = 3
    print(f"DEBUG: {DEBUG}")

In [13]:
# Dataset dimensions; train_size and n_classes are read by train_and_evaluate.
train_size, n_features = X.shape
n_classes_nonscored = Y_nonscored.shape[1]
n_classes = Y.shape[1]

In [14]:
# Sanity check: preview the first rows of the target frame.
Y.head()

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_000779bfc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_000a6266a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_0015fd391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_001626bd3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
%%time

# Create the SQLite-backed study (or resume it if it already exists) and run
# the hyper-parameter search.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=1),
    study_name="study",
    storage="sqlite:///study.db",
    load_if_exists=True,
)
study.optimize(objective, n_trials=n_trials)

# Save the full trial history and the best parameters, then echo the latter.
study.trials_dataframe().to_csv("objective_history.csv", index=False)
with open("objective_best_params.txt", mode="w") as f:
    f.write(str(study.best_params))
print("\nstudy.best_params:", study.best_params, sep="\n")

[32m[I 2020-11-26 20:43:53,570][0m A new study created in RDB with name: study[0m
0it [00:00, ?it/s]



1it [07:55, 475.18s/it]



2it [15:55, 476.80s/it]



3it [23:36, 471.89s/it]



4it [31:14, 467.86s/it]



5it [38:42, 464.41s/it]
[32m[I 2020-11-26 21:22:36,693][0m Trial 0 finished with value: 0.03982515602229955 and parameters: {'max_depth': 2, 'num_leaves': 5, 'feature_fraction': 0.75, 'lambda_l1': 1.0026370484058346e-09, 'lambda_l2': 1.0551779964424755e-06}. Best is trial 0 with value: 0.03982515602229955.[0m
0it [00:00, ?it/s]



1it [05:26, 326.99s/it]



2it [10:52, 326.47s/it]



3it [16:30, 330.06s/it]



4it [22:14, 334.25s/it]



5it [27:54, 334.83s/it]
[32m[I 2020-11-26 21:50:31,816][0m Trial 1 finished with value: 0.01769193191017419 and parameters: {'max_depth': 4, 'num_leaves': 2, 'feature_fraction': 0.15000000000000002, 'lambda_l1': 7.287895114844981e-08, 'lambda_l2': 2.8550076115684798e-06}. Best is trial 1 with value: 0.01769193191017419.[0m
0it [00:00, ?it/s]



1it [07:25, 445.01s/it]



2it [14:47, 444.27s/it]



3it [22:18, 446.19s/it]



4it [29:40, 444.90s/it]



5it [36:59, 443.86s/it]
[32m[I 2020-11-26 22:27:32,093][0m Trial 2 finished with value: 0.02383984311235496 and parameters: {'max_depth': 3, 'num_leaves': 3, 'feature_fraction': 0.6, 'lambda_l1': 1.555769109170068e-05, 'lambda_l2': 0.007115329296351376}. Best is trial 1 with value: 0.01769193191017419.[0m
0it [00:00, ?it/s]



1it [07:11, 431.88s/it]



2it [14:21, 431.06s/it]



3it [21:36, 432.26s/it]



4it [28:58, 435.40s/it]



5it [36:12, 434.48s/it]
[32m[I 2020-11-26 23:03:45,522][0m Trial 3 finished with value: 0.017739418228634375 and parameters: {'max_depth': 4, 'num_leaves': 2, 'feature_fraction': 0.9, 'lambda_l1': 1.8787800155635633e-09, 'lambda_l2': 0.005066115674273645}. Best is trial 1 with value: 0.01769193191017419.[0m
0it [00:00, ?it/s]



1it [08:07, 487.25s/it]



2it [16:06, 484.96s/it]



3it [24:08, 483.97s/it]



4it [32:08, 482.70s/it]



5it [40:09, 481.82s/it]
[32m[I 2020-11-26 23:43:55,582][0m Trial 4 finished with value: 0.11115864534528995 and parameters: {'max_depth': 4, 'num_leaves': 5, 'feature_fraction': 0.6, 'lambda_l1': 2.534366305306395e-08, 'lambda_l2': 9.572268915979553e-08}. Best is trial 1 with value: 0.01769193191017419.[0m
0it [00:00, ?it/s]



1it [08:26, 506.98s/it]



2it [16:57, 507.97s/it]



3it [26:03, 519.45s/it]



4it [35:12, 528.17s/it]



5it [43:58, 527.78s/it]
[32m[I 2020-11-27 00:27:55,630][0m Trial 5 finished with value: 0.0252293450149319 and parameters: {'max_depth': 2, 'num_leaves': 3, 'feature_fraction': 1.0, 'lambda_l1': 1.3622028406348673e-06, 'lambda_l2': 0.008379655349166654}. Best is trial 1 with value: 0.01769193191017419.[0m
0it [00:00, ?it/s]



1it [07:06, 426.58s/it]



2it [13:58, 422.16s/it]



3it [20:50, 419.18s/it]



4it [27:55, 420.74s/it]



5it [35:03, 420.70s/it]
[32m[I 2020-11-27 01:03:00,281][0m Trial 6 finished with value: 0.018827061654657334 and parameters: {'max_depth': 2, 'num_leaves': 2, 'feature_fraction': 0.9, 'lambda_l1': 7.086668432729139e-09, 'lambda_l2': 2.457807314308485e-09}. Best is trial 1 with value: 0.01769193191017419.[0m
0it [00:00, ?it/s]



1it [08:14, 494.59s/it]



2it [16:35, 496.39s/it]



3it [24:58, 498.38s/it]



4it [33:16, 498.29s/it]



5it [41:44, 500.87s/it]
[32m[I 2020-11-27 01:44:45,819][0m Trial 7 finished with value: 0.11047485200244826 and parameters: {'max_depth': 4, 'num_leaves': 4, 'feature_fraction': 0.9, 'lambda_l1': 9.626498296920044e-09, 'lambda_l2': 1.6258341820257028e-05}. Best is trial 1 with value: 0.01769193191017419.[0m
0it [00:00, ?it/s]



1it [07:00, 420.83s/it]



2it [14:03, 421.27s/it]



3it [21:16, 424.84s/it]



4it [28:22, 425.25s/it]



5it [35:18, 423.66s/it]
[32m[I 2020-11-27 02:20:05,227][0m Trial 8 finished with value: 0.11873802314683643 and parameters: {'max_depth': 2, 'num_leaves': 3, 'feature_fraction': 0.6, 'lambda_l1': 0.008294135756246062, 'lambda_l2': 1.429408334606736e-06}. Best is trial 1 with value: 0.01769193191017419.[0m
0it [00:00, ?it/s]



1it [06:54, 414.13s/it]



2it [13:59, 417.60s/it]



3it [21:26, 426.42s/it]



4it [28:34, 426.71s/it]



5it [35:23, 424.62s/it]
[32m[I 2020-11-27 02:55:29,225][0m Trial 9 finished with value: 0.017677838142813977 and parameters: {'max_depth': 1, 'num_leaves': 4, 'feature_fraction': 0.85, 'lambda_l1': 1.523641430706564e-09, 'lambda_l2': 0.03172803306450885}. Best is trial 9 with value: 0.017677838142813977.[0m
0it [00:00, ?it/s]



1it [05:18, 318.33s/it]



2it [10:40, 319.60s/it]



3it [16:11, 322.98s/it]



4it [21:42, 325.16s/it]



5it [27:03, 324.71s/it]
[32m[I 2020-11-27 03:22:33,747][0m Trial 10 finished with value: 0.017325280949453216 and parameters: {'max_depth': 1, 'num_leaves': 4, 'feature_fraction': 0.25, 'lambda_l1': 3.244385686100423, 'lambda_l2': 2.7648193564784194}. Best is trial 10 with value: 0.017325280949453216.[0m
0it [00:00, ?it/s]



1it [05:16, 316.81s/it]



2it [10:32, 316.58s/it]



3it [15:56, 318.71s/it]



4it [21:21, 320.70s/it]



5it [26:45, 321.17s/it]
[32m[I 2020-11-27 03:49:20,566][0m Trial 11 finished with value: 0.01731449086176443 and parameters: {'max_depth': 1, 'num_leaves': 4, 'feature_fraction': 0.2, 'lambda_l1': 2.216363462404312, 'lambda_l2': 6.774157255895894}. Best is trial 11 with value: 0.01731449086176443.[0m
0it [00:00, ?it/s]



1it [05:19, 319.00s/it]



2it [10:30, 316.72s/it]



3it [15:49, 317.30s/it]



4it [21:02, 316.29s/it]



5it [26:14, 314.80s/it]
[32m[I 2020-11-27 04:15:35,562][0m Trial 12 finished with value: 0.01734036532444897 and parameters: {'max_depth': 1, 'num_leaves': 4, 'feature_fraction': 0.15000000000000002, 'lambda_l1': 4.296786641904572, 'lambda_l2': 8.876734224608294}. Best is trial 11 with value: 0.01731449086176443.[0m
0it [00:00, ?it/s]



1it [05:30, 330.36s/it]



2it [10:58, 329.58s/it]



3it [16:32, 330.94s/it]



4it [22:03, 331.06s/it]



5it [27:25, 329.00s/it]
[32m[I 2020-11-27 04:43:01,494][0m Trial 13 finished with value: 0.017426142454709866 and parameters: {'max_depth': 1, 'num_leaves': 5, 'feature_fraction': 0.35, 'lambda_l1': 4.8739796901916, 'lambda_l2': 8.950141413532025}. Best is trial 11 with value: 0.01731449086176443.[0m
0it [00:00, ?it/s]



1it [05:22, 322.40s/it]



2it [10:59, 326.95s/it]



3it [16:50, 333.97s/it]



4it [22:39, 338.61s/it]



5it [28:22, 340.48s/it]
[32m[I 2020-11-27 05:11:24,803][0m Trial 14 finished with value: 0.017476270916837938 and parameters: {'max_depth': 1, 'num_leaves': 4, 'feature_fraction': 0.35, 'lambda_l1': 0.06073854880728229, 'lambda_l2': 0.5551100414034651}. Best is trial 11 with value: 0.01731449086176443.[0m
0it [00:00, ?it/s]



1it [06:10, 370.34s/it]



2it [12:33, 374.29s/it]



3it [19:14, 382.15s/it]



4it [26:05, 390.94s/it]



5it [32:43, 392.60s/it]
[32m[I 2020-11-27 05:44:08,905][0m Trial 15 finished with value: 0.01824938937046412 and parameters: {'max_depth': 3, 'num_leaves': 4, 'feature_fraction': 0.30000000000000004, 'lambda_l1': 0.08979867455391732, 'lambda_l2': 0.36650671917347166}. Best is trial 11 with value: 0.01731449086176443.[0m
0it [00:00, ?it/s]



1it [06:03, 363.24s/it]



2it [12:00, 361.40s/it]



3it [17:35, 353.42s/it]



4it [22:52, 342.50s/it]



5it [28:02, 336.45s/it]
[32m[I 2020-11-27 06:12:12,097][0m Trial 16 finished with value: 0.01744743839173866 and parameters: {'max_depth': 1, 'num_leaves': 3, 'feature_fraction': 0.25, 'lambda_l1': 0.0009732069906542017, 'lambda_l2': 0.6740814115609671}. Best is trial 11 with value: 0.01731449086176443.[0m
0it [00:00, ?it/s]



In [None]:
# Retrain with the best searched parameters plus the same fixed settings the
# objective used; n_estimators is not set here.
params = {**study.best_params, "objective": "binary", "learning_rate": 0.1}

oof, Y_pred = train_and_evaluate(params)
print(oof)

# Platt Scaling
Train a Logistic Regression model to calibrate the results
- https://www.kaggle.com/gogo827jz/kernel-logistic-regression-one-for-206-targets

In [None]:
# Calibrate the out-of-fold predictions in Y_pred with logistic regression:
# for each target, fit a sigmoid on the raw prediction and use its output as
# the calibrated probability.

# Number of positive labels per target.
counts = np.empty((n_classes))

X_new = Y_pred.values
Y_cali = Y_pred.copy()

for tar in tqdm(range(Y.shape[1])):
    target_name = Y.columns[tar]

    targets = Y.values[:, tar]
    X_targets = X_new[:, tar]
    counts[tar] = targets.sum()

    # Targets with fewer positives than folds keep their raw predictions.
    if targets.sum() < n_splits:
        continue

    # Collect the calibrated values in a plain array and assign the column
    # once. The previous `Y_cali[col].iloc[te] += ...` was chained assignment,
    # which is not guaranteed to write back into Y_cali.
    calibrated = np.zeros((Y_cali.shape[0], ))

    skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)

    for tr, te in skf.split(targets, targets):
        x_tr, x_val = X_targets[tr].reshape(-1, 1), X_targets[te].reshape(-1, 1)
        y_tr = targets[tr]

        model = LogisticRegression(penalty="none", max_iter=1000)
        model.fit(x_tr, y_tr)
        # Every row is in exactly one validation fold, so plain assignment
        # matches the former accumulate-into-zeros behaviour.
        calibrated[te] = model.predict_proba(x_val)[:, 1]

    Y_cali[target_name] = calibrated

    # Every fold used to write to this same path, so only the last fold's
    # calibrator ever survived; saving once after the loop keeps that result.
    # NOTE(review): confirm a last-fold model is what test-time calibration
    # should use.
    joblib.dump(model, f"calibrate_model_target_{target_name}.jlb", compress=True)

with open("counts_calibrate.pkl", "wb") as f:
    pickle.dump(counts, f)

with open("Y_pred_calibrate.pkl", "wb") as f:
    pickle.dump(Y_cali[columns], f)
    

In [None]:
# Mean column-wise log loss of the calibrated predictions on `columns`.
score(Y[columns], Y_cali[columns])

# pkl check

In [None]:
# Reload the counts written by train_and_evaluate and display them.
# NOTE: this rebinds the notebook-level `counts`.
path = r"counts.pkl"
with open(path, 'rb') as f:
    counts = pickle.load(f)
counts

In [None]:
# Reload the per-target counts written by the calibration cell and display them.
# NOTE: this rebinds the notebook-level `counts`.
path = r"counts_calibrate.pkl"
with open(path, 'rb') as f:
    counts = pickle.load(f)
counts

In [None]:
# Reload the saved out-of-fold predictions and display them.
# NOTE: this rebinds the notebook-level `Y_pred` to the pickled frame.
path = r"Y_pred.pkl"
with open(path, 'rb') as f:
    Y_pred = pickle.load(f)
Y_pred

In [None]:
# Reload the saved calibrated predictions and display them.
# NOTE: this rebinds the notebook-level `Y_pred` to the calibrated frame.
path = r"Y_pred_calibrate.pkl"
with open(path, 'rb') as f:
    Y_pred = pickle.load(f)
Y_pred

# predict test

In [None]:
import glob
import pathlib


# Load the test features and keep only the numeric columns.
test_features = pd.read_csv(
    f"{DATADIR}/test_features.csv", dtype=dtype, index_col=index_col
)
X_test = test_features.select_dtypes("number")

# Apply the clipper that was fitted on the training features.
with open("./clipped_features.pkl", "rb") as f:
    clipped_features = pickle.load(f)
X_test = clipped_features.transform(X_test)
# For ensembling, row statistics and nonscored targets are not added.

# Average the LightGBM chain predictions over every saved seed/fold model,
# keeping only the first len(columns) output columns.
n_targets = len(columns)
n_models = n_seeds * n_splits
Y_test_pred = pd.DataFrame(
    np.zeros((X_test.shape[0], n_targets)),
    columns=columns,
    index=test_features.index,
)
for i in range(n_seeds):
    set_seed(seed=i)

    for j in range(n_splits):
        clf = joblib.load(f"model_seed_{i}_fold_{j}.jlb")
        fold_pred = clf.predict_proba(X_test)[:, :n_targets]
        Y_test_pred += fold_pred / n_models

print(Y_test_pred.shape)
display(Y_test_pred)

# Calibrate each target that has a saved calibration model.
# NOTE(review): unlike the out-of-fold predictions, control-vehicle rows are
# not zeroed here; confirm whether that is handled elsewhere.
model_paths = glob.glob("./calibrate_model_target_*.jlb")
for model_path in model_paths:
    stem = pathlib.Path(model_path).stem
    target = stem.replace("calibrate_model_target_", "")

    if target not in columns:
        continue

    model = joblib.load(model_path)
    X_targets = Y_test_pred.loc[:, target].values.reshape(-1, 1)
    Y_test_pred.loc[:, target] = model.predict_proba(X_targets)[:, 1]

print(Y_test_pred.shape)
display(Y_test_pred)