In [1]:
import os
import sys
import joblib
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from lightgbm import LGBMClassifier
import optuna
from tqdm import tqdm
import argparse

sys.path.append(r"C:\Users\81908\Git\iterative-stratification")
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Silence every warning category for the whole notebook session.
warnings.filterwarnings("ignore")

# Command-line interface for running this code as a script.
# NOTE: both parse_args() calls below are commented out, so this parser is
# currently built but never used; the flags are hard-coded further down.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-d",
    "--DEBUG",
    action="store_const",
    const=True,
    default=False,
    help="DEBUG flag.",
)
parser.add_argument(
    "-m", "--MODE", type=str, default="train", help="MODE flag.",
)
parser.add_argument(
    "-is_c",
    "--IS_CHAIN",
    action="store_const",
    const=True,
    default=False,
    help="ClassifierChain flag.",
)
#args = vars(parser.parse_args())
#args = vars(parser.parse_args(args=[])) # use this variant when running argparse as-is inside a notebook

# Hard-coded run flags (the commented-out lines are the alternative values).
# DEBUG: when True, the target/submission frames are cut down to a few columns.
# DEBUG = False
DEBUG = True
# MODE: "train" runs main_train(); any other value runs the Optuna search.
#MODE = "train"
MODE = "objective"
# IS_CHAIN: True -> ClassifierChain, False -> MultiOutputClassifier.
#IS_CHAIN = False
IS_CHAIN = True
# Variant that reads the flags from the parsed command-line arguments instead.
#DEBUG = args["DEBUG"]
#MODE = args["MODE"]
#IS_CHAIN = args["IS_CHAIN"]


# Directory that receives every artifact written by this notebook
# (saved models, the Optuna SQLite study, CSV/TXT outputs).
OUTDIR = r"20201029_lgb_multi"
os.makedirs(OUTDIR, exist_ok=True)

# Number of cross-validation folds.
N_SPLITS = 5

# Number of Optuna trials (5 for a quick run; 50 is the commented-out alternative).
N_TRIALS= 5
#N_TRIALS= 50

# Seeds used for seed averaging.
# SEEDS = [5, 12]
SEEDS = [0]  # a seed is specified, but the stratified K-fold splitter is run with shuffle=False

# Location of the competition CSV files (machine-specific absolute path).
# DATADIR = '/kaggle/input/lish-moa/'
DATADIR = (
    r"C:\Users\81908\jupyter_notebook\poetry_work\tf23\01_MoA_compe\input\lish-moa"
)
# Load the feature tables, the two target tables and the submission template.
train = pd.read_csv(f"{DATADIR}/train_features.csv")
test = pd.read_csv(f"{DATADIR}/test_features.csv")
train_targets = pd.read_csv(f"{DATADIR}/train_targets_scored.csv")
train_targets_nonscored = pd.read_csv(f"{DATADIR}/train_targets_nonscored.csv")
submission = pd.read_csv(f"{DATADIR}/sample_submission.csv")

# Debug mode: shrink the problem so a full run finishes quickly.
if DEBUG:
    np.random.seed(0)  # fix the random seed
    #    # randomly select 2000 rows
    #    NOTE(review): this disabled snippet refers to `train_targets_scored`,
    #    which is not defined in this notebook (the frame is `train_targets`).
    #    _ids = np.random.choice(train.index, 2000)
    #    train = train.loc[_ids].reset_index(drop=True)
    #    train_targets_scored = train_targets_scored.loc[_ids].reset_index(drop=True)

    # Restrict to a handful of classes (originally 3; only 2 targets are active now)
    _classes = [
        "sig_id",
        "5-alpha_reductase_inhibitor",
        "11-beta-hsd1_inhibitor",
        # "acat_inhibitor", # unusable: with 2000 rows every label is 0
    ]
    # Keep the id column plus the selected targets in both frames.
    train_targets = train_targets[_classes]
    submission = submission[_classes]

In [2]:
def mapping_and_filter(train, train_targets, test):
    """Encode the categorical feature columns and strip the id from the targets.

    The inputs are never modified; new frames are returned. The function is
    also idempotent, so re-running the calling notebook cell on frames that
    were already processed is a no-op instead of silently turning the encoded
    columns into NaN or failing on the missing ``sig_id`` column.

    Args:
        train: Training feature frame with ``cp_type`` and ``cp_dose`` columns.
        train_targets: Target frame, optionally still carrying ``sig_id``.
        test: Test feature frame with ``cp_type`` and ``cp_dose`` columns.

    Returns:
        Tuple ``(train, train_targets, test)`` where ``cp_type`` is encoded as
        ``trt_cp -> 0`` / ``ctl_vehicle -> 1``, ``cp_dose`` as ``D1 -> 0`` /
        ``D2 -> 1`` and ``train_targets`` has no ``sig_id`` column.
    """
    cp_type = {"trt_cp": 0, "ctl_vehicle": 1}
    cp_dose = {"D1": 0, "D2": 1}

    def _encode(column, mapping):
        # A column holding only encoded values has been processed before;
        # mapping it again would replace every value with NaN.
        if column.isin(list(mapping.values())).all():
            return column
        return column.map(mapping)

    encoded_frames = []
    for df in (train, test):
        df = df.copy()  # leave the caller's frame untouched
        df["cp_type"] = _encode(df["cp_type"], cp_type)
        df["cp_dose"] = _encode(df["cp_dose"], cp_dose)
        encoded_frames.append(df)
    train, test = encoded_frames

    ## ctl_vehicle rows always have all-zero targets, so they could be removed
    ## from the training data (currently disabled)
    #train_targets = train_targets[train["cp_type"] == 0].reset_index(drop=True)
    #train = train[train["cp_type"] == 0].reset_index(drop=True)

    # sig_id is only an identifier, not a target; tolerate it being gone already.
    train_targets = train_targets.drop(columns=["sig_id"], errors="ignore")
    return train, train_targets, test

In [3]:
def save_model(model, model_path="model/fold00.model"):
    """Serialize ``model`` to ``model_path`` with joblib (compressed).

    Args:
        model: Any object joblib can dump (here a fitted estimator).
        model_path: Destination file path. Missing parent directories are
            created; a bare file name is written to the current directory.
    """
    parent_dir = os.path.dirname(model_path)
    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(model, model_path, compress=True)


def load_model(model_path="model/fold00.model"):
    """Read back an object that was written with joblib.

    Args:
        model_path: Path of the file to load.

    Returns:
        The deserialized object.
    """
    # NOTE(review): joblib files are pickle-based; only load files you trust.
    restored = joblib.load(model_path)
    return restored

In [4]:
def mean_log_loss(y_true, y_pred):
    """Average the binary log loss over all targets of a multi-label problem.

    The loss is computed directly with NumPy, so a target column that contains
    only one class (e.g. all zeros in a small debug subset) is scored normally
    instead of raising an error.

    Args:
        y_true: 2-D array-like of 0/1 labels, shape ``(n_samples, n_targets)``.
        y_pred: 2-D array-like of predicted positive-class probabilities with
            the same shape as ``y_true``.

    Returns:
        The mean over targets of the per-target mean binary cross-entropy.
    """
    y_true = np.asarray(y_true, dtype=float)
    # Clip so that log(0) can never occur.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)
    per_sample_loss = -(
        y_true * np.log(y_pred) + (1.0 - y_true) * np.log(1.0 - y_pred)
    )
    # Mean over samples for each target first, then mean over the targets.
    per_target_loss = per_sample_loss.mean(axis=0)
    return np.mean(per_target_loss)

In [5]:
def run_multiout(params, seed):
    """Cross-validate a multi-label LightGBM model and predict the test set.

    Depending on the module-level ``IS_CHAIN`` flag, each fold fits either a
    ``ClassifierChain`` or a ``MultiOutputClassifier`` around
    ``LGBMClassifier(**params)``. When ``MODE == "train"`` every fold model is
    saved under ``OUTDIR/model``.

    Reads the module-level frames ``train``, ``train_targets`` and ``test``.

    Args:
        params: Keyword arguments forwarded to ``LGBMClassifier``.
        seed: Passed to ``ClassifierChain`` as ``random_state`` and embedded in
            the saved model file names. The fold split does not depend on it
            because the splitter is created with ``shuffle=False``.

    Returns:
        Tuple ``(oof_pred, test_pred)``: the out-of-fold positive-class
        probabilities for every training row, and the test-set probabilities
        averaged over the folds.
    """
    # categorical_cols = ["cp_type", "cp_dose"]

    X_train = train.drop(["sig_id"], axis=1)
    y_train = train_targets.copy()
    X_test = test.drop(["sig_id"], axis=1)

    # Convert to NumPy once instead of once per fold.
    X_train_values = X_train.values
    y_train_values = y_train.values

    y_preds = []
    oof_pred = np.zeros([X_train.shape[0], y_train.shape[1]])

    # shuffle=False keeps the fold assignment deterministic without a seed.
    splitter = MultilabelStratifiedKFold(n_splits=N_SPLITS, shuffle=False)
    for fold_id, (train_index, valid_index) in tqdm(
        enumerate(splitter.split(y_train, y_train)), total=N_SPLITS
    ):
        X_tr, X_val = X_train_values[train_index], X_train_values[valid_index]
        y_tr = y_train_values[train_index]

        # The paths must be plain strings: a trailing comma here would turn
        # them into 1-tuples and make save_model() fail in os.path.dirname().
        if IS_CHAIN:
            model = ClassifierChain(LGBMClassifier(**params), random_state=seed)
            model_path = f"{OUTDIR}/model/chain_fold{str(fold_id).zfill(2)}_{seed}.model"
        else:
            model = MultiOutputClassifier(LGBMClassifier(**params))
            model_path = f"{OUTDIR}/model/multi_fold{str(fold_id).zfill(2)}_{seed}.model"

        # MultiOutputClassifier / ClassifierChain apparently cannot take a
        # validation set (no eval_set / early stopping / categorical_feature).
        # https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html
        model.fit(X_tr, y_tr)

        pred_y_val = model.predict_proba(X_val)
        y_pred = model.predict_proba(X_test)

        if not IS_CHAIN:
            # MultiOutputClassifier returns one (n_samples, 2) array per
            # target; keep the positive-class column and put samples first.
            pred_y_val = np.array(pred_y_val)[:, :, 1].T
            y_pred = np.array(y_pred)[:, :, 1].T

        oof_pred[valid_index] = pred_y_val
        y_preds.append(y_pred)

        if MODE == "train":
            save_model(model, model_path=model_path)

    oof_score = mean_log_loss(y_train_values, oof_pred)
    print(f"oof_score: {oof_score}")

    return oof_pred, sum(y_preds) / len(y_preds)

In [6]:
def run_seed_avg(params, seeds=SEEDS):
    """Run the cross-validation once per seed and average the predictions.

    Args:
        params: Keyword arguments forwarded to the LightGBM classifier.
        seeds: Seeds to run; defaults to the module-level ``SEEDS``.

    Returns:
        Tuple ``(oof_avg, sub_avg)`` with the seed-averaged out-of-fold and
        test predictions.
    """
    # Running totals; starting from 0 mirrors what the built-in sum() does.
    oof_total = 0
    sub_total = 0
    for current_seed in seeds:
        print(f"\n================ seed:{current_seed} ================")
        seed_oof, seed_sub = run_multiout(params, current_seed)
        oof_total = oof_total + seed_oof
        sub_total = sub_total + seed_sub

    n_seeds = len(seeds)
    oof_avg = oof_total / n_seeds
    sub_avg = sub_total / n_seeds

    # Score the averaged out-of-fold predictions against the training targets.
    oof_score = mean_log_loss(train_targets.values, oof_avg)
    print(f"oof_score seed_avg: {oof_score}")

    return oof_avg, sub_avg

In [7]:
def submit(test_pred, test, sample_submission, train_targets):
    """Fill the submission template with predictions and write it to disk.

    Args:
        test_pred: Predicted values for the target columns of the test rows.
        test: Test feature frame; rows with ``cp_type == 1`` are zeroed out.
        sample_submission: Submission template; it is modified in place.
        train_targets: Frame whose columns name the targets to fill.

    Returns:
        The filled ``sample_submission`` frame (also saved as
        ``submission.csv`` inside ``OUTDIR``).
    """
    target_columns = train_targets.columns
    sample_submission.loc[:, target_columns] = test_pred

    # Rows whose cp_type equals 1 get every target overwritten with 0.
    zero_rows = test["cp_type"] == 1
    sample_submission.loc[zero_rows, target_columns] = 0

    output_path = f"{OUTDIR}/submission.csv"
    sample_submission.to_csv(output_path, index=False)
    return sample_submission

In [8]:
def objective(trial):
    """Optuna objective: out-of-fold mean log loss for sampled LightGBM params.

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``num_leaves`` and
            ``min_child_samples``.

    Returns:
        The out-of-fold mean log loss of a run with the first seed in ``SEEDS``.
    """
    max_depth = trial.suggest_int("max_depth", 1, 7)
    # The sampled depth bounds the number of leaves (at most 2 ** max_depth).
    num_leaves = trial.suggest_int("num_leaves", 2, 2 ** max_depth)
    # Upper bound: rows in one training split divided by the number of leaves.
    rows_per_fit = train.shape[0] * ((N_SPLITS - 1) / N_SPLITS)
    min_child_upper = max(1, int(rows_per_fit / num_leaves))
    min_child_samples = trial.suggest_int("min_child_samples", 1, min_child_upper)

    params = {
        "objective": "binary",
        "learning_rate": 0.1,
        "max_depth": max_depth,
        "num_leaves": num_leaves,
        "min_child_samples": min_child_samples,
    }

    oof, _ = run_multiout(params, SEEDS[0])
    oof_score = mean_log_loss(train_targets.values, oof)
    return np.mean(oof_score)

In [9]:
def main_train():
    """Train with a fixed parameter set and write the submission file."""
    lgb_params = dict(
        num_leaves=24,
        max_depth=5,
        objective="binary",
        learning_rate=0.01,
        n_estimators=100,
    )
    # Only the averaged test predictions are needed for the submission.
    _, test_pred = run_seed_avg(lgb_params)
    submit(test_pred, test, submission, train_targets)

In [10]:
if __name__ == "__main__":
    # Preprocess the frames once; both modes below work on the result.
    train, train_targets, test = mapping_and_filter(train, train_targets, test)

    if MODE != "train":
        # Hyper-parameter search; the study is stored in (and resumed from)
        # an SQLite file inside OUTDIR.
        study = optuna.create_study(
            study_name="study",
            storage=f"sqlite:///{OUTDIR}/study.db",
            load_if_exists=True,
        )
        study.optimize(objective, n_trials=N_TRIALS)

        # Persist the full trial history and the best parameter set.
        history = study.trials_dataframe()
        history.to_csv(f"{OUTDIR}/objective_history.csv", index=False)
        with open(f"{OUTDIR}/objective_best_params.txt", mode="w") as f:
            f.write(str(study.best_params))
        print(f"\nstudy.best_params:\n{study.best_params}")
    else:
        main_train()

[32m[I 2020-10-29 21:33:08,377][0m A new study created in RDB with name: study[0m
5it [00:14,  2.96s/it]
[32m[I 2020-10-29 21:33:23,341][0m Trial 0 finished with value: 0.0056314046261365875 and parameters: {'max_depth': 2, 'num_leaves': 2, 'min_child_samples': 4843}. Best is trial 0 with value: 0.0056314046261365875.[0m
0it [00:00, ?it/s]

oof_score: 0.0056314046261365875


5it [00:17,  3.41s/it]
[32m[I 2020-10-29 21:33:40,542][0m Trial 1 finished with value: 0.21036714371117402 and parameters: {'max_depth': 3, 'num_leaves': 7, 'min_child_samples': 12}. Best is trial 0 with value: 0.0056314046261365875.[0m
0it [00:00, ?it/s]

oof_score: 0.21036714371117402


5it [00:32,  6.45s/it]
[32m[I 2020-10-29 21:34:12,919][0m Trial 2 finished with value: 0.011451722362077789 and parameters: {'max_depth': 7, 'num_leaves': 57, 'min_child_samples': 161}. Best is trial 0 with value: 0.0056314046261365875.[0m
0it [00:00, ?it/s]

oof_score: 0.011451722362077789


5it [00:30,  6.01s/it]
[32m[I 2020-10-29 21:34:43,111][0m Trial 3 finished with value: 0.01104589681447704 and parameters: {'max_depth': 6, 'num_leaves': 53, 'min_child_samples': 193}. Best is trial 0 with value: 0.0056314046261365875.[0m
0it [00:00, ?it/s]

oof_score: 0.01104589681447704


1it [00:06,  6.59s/it]



2it [00:12,  6.52s/it]



3it [00:19,  6.47s/it]



4it [00:25,  6.45s/it]



5it [00:32,  6.41s/it]
[32m[I 2020-10-29 21:35:15,269][0m Trial 4 finished with value: 0.011273042056697276 and parameters: {'max_depth': 7, 'num_leaves': 31, 'min_child_samples': 157}. Best is trial 0 with value: 0.0056314046261365875.[0m


oof_score: 0.011273042056697276

study.best_params:
{'max_depth': 2, 'min_child_samples': 4843, 'num_leaves': 2}
