In [1]:
# Name of this notebook/experiment; reused below to build the output directory
# and the file names of the saved pickles.
NOTEBOOK_NAME = "e005_make_train"

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import seaborn as sns
import japanize_matplotlib
from tqdm.auto import tqdm

In [3]:
class Config:
    """Notebook-wide settings."""
    # Directory (relative to the notebook) where the pickled outputs are written.
    OUTPUT_DIR = f"../saved_data/{NOTEBOOK_NAME}"
    # Random seed used when sampling negative yados.
    SEED = 33


# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

In [4]:
# Training inputs: the log (session_id, yad_no per row) and the label table
# (session_id with its reserved yad_no).
train_log = pd.read_csv("../data/train_log.csv")
train_label = pd.read_csv("../data/train_label.csv")

# Test inputs. NOTE: despite its name, `test_label` holds the contents of test_session.csv.
test_log = pd.read_csv("../data/test_log.csv")
test_label = pd.read_csv("../data/test_session.csv")

# Yado master table; its yad_no and lrg_cd columns are used for negative sampling below.
yado = pd.read_csv("../data/yado.csv")

sample_submission = pd.read_csv("../data/sample_submission.csv")

# image_embeddings = pd.read_parquet("../data/image_embeddings.parquet")

In [5]:
# Preview the label table (columns: session_id, yad_no).
train_label

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,4101
1,0000ca043ed437a1472c9d1d154eb49b,8253
2,0000d4835cf113316fe447e2f80ba1c8,4863
3,0000fcda1ae1b2f431e55a7075d1f500,1652
4,000104bdffaaad1a1e0a9ebacf585f33,96
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,2259
288694,ffff2360540745117193ecadcdc06538,963
288695,ffff7fb4617164b2604aaf51c40bf82d,13719
288696,ffffcd5bc19d62cad5a3815c87818d83,10619


# 前処理

In [6]:
# Start the positive-example table from a copy of the labels, so columns added
# to train_reserved do not appear on train_label.
train_reserved = train_label.copy()

In [7]:
# Positive examples: every (session, reserved yado) row gets reserve=1.
train_reserved = train_reserved.assign(reserve=1)

In [8]:
# Yados that were viewed in the log but not reserved are later labelled 0;
# as a first step, attach each session's viewed-yado list to the label table.
def add_log_yado_list(
    train_log: pd.DataFrame, train_label: pd.DataFrame
) -> pd.DataFrame:
    """Attach each session's list of viewed yad_no to the label table.

    The lists are matched to ``train_label`` rows by ``session_id``, so the
    result does not depend on ``train_label`` being sorted by session or on it
    containing exactly the sessions present in ``train_log``.

    Args:
        train_log: Log table with ``session_id`` and ``yad_no`` columns; the
            row order within a session is preserved in the resulting lists.
        train_label: Label table with a ``session_id`` column.

    Returns:
        A new DataFrame equal to ``train_label`` plus a ``logged_yad_no_list``
        column. A session without any log rows gets an empty list. The input
        ``train_label`` is left unmodified.
    """
    yad_nos_by_session = (
        train_log.groupby("session_id")["yad_no"].apply(list).to_dict()
    )
    # Look up by key instead of assigning the grouped values positionally.
    logged_yad_no_lists = [
        yad_nos_by_session.get(session_id, [])
        for session_id in train_label["session_id"]
    ]
    return train_label.assign(
        logged_yad_no_list=pd.Series(
            logged_yad_no_lists, index=train_label.index, dtype=object
        )
    )


# Label table enriched with the per-session list of viewed yados.
train_logged = add_log_yado_list(train_log=train_log, train_label=train_label)

In [9]:
# From each session's viewed-yado list, remove the yado that was reserved and
# drop duplicates while keeping first-seen order (dict.fromkeys preserves order).
# Iterating the two columns with zip avoids the per-row Series that iterrows builds.
logged_only_yados = [
    list(
        dict.fromkeys(
            logged_yad_no
            for logged_yad_no in logged_yad_no_list
            if reserved_yad_no != logged_yad_no
        )
    )
    for logged_yad_no_list, reserved_yad_no in zip(
        train_logged["logged_yad_no_list"], train_logged["yad_no"]
    )
]

# Build train_logged_explode: one row per (session, viewed-but-not-reserved yado),
# labelled reserve=0.
train_logged["logged_only_yados"] = logged_only_yados
train_logged_explode = (
    train_logged[["session_id", "logged_only_yados"]]
    .explode("logged_only_yados")
    # explode() turns an empty list into a single NaN row; that is not a real
    # candidate, so it must not end up as a reserve=0 example.
    .dropna(subset=["logged_only_yados"])
    .assign(reserve=0)
)

  0%|          | 0/288698 [00:00<?, ?it/s]

In [10]:
# yado_noとken_cdの辞書を作成
# yado_no_ken_cd_dict = yado[["yad_no", "ken_cd"]].set_index("yad_no").to_dict()["ken_cd"]

In [11]:
# # 最後にログに登場した宿のken_cdと同じ宿の中から、ログに登場しない宿をランダムに5件抽出
# def get_last_log_same_ken_cd_random_10(
#     train_logged: pd.DataFrame, yado: pd.DataFrame
# ) -> dict:
#     outputs = {}
#     yado_only_yado_no_ken_cd = yado[["yad_no", "ken_cd"]]
#     seddion_id_logged_yad_no_list = dict(
#         zip(train_logged["session_id"], train_logged["logged_yad_no_list"])
#     )
#     for session_id, logged_yad_no in tqdm(
#         seddion_id_logged_yad_no_list.items(),
#         total=len(seddion_id_logged_yad_no_list),
#     ):
#         last_log_ken_cd = yado_no_ken_cd_dict[logged_yad_no[-1]]
#         # 最後のログと同じ県の宿で、ログに登場しない宿をランダムに10件抽出
#         random_yado_10 = (
#             yado_only_yado_no_ken_cd[
#                 (yado_only_yado_no_ken_cd["ken_cd"] == last_log_ken_cd)
#                 & (~yado_only_yado_no_ken_cd["yad_no"].isin(logged_yad_no))
#             ]
#             .sample(5, random_state=Config.SEED)["yad_no"]
#             .tolist()
#         )

#         outputs[session_id] = random_yado_10
#     return outputs


# negative_data_dict = get_last_log_same_ken_cd_random_10(train_logged, yado)

In [12]:
# Negative sampling: among the yados sharing the lrg_cd of the last yado in a
# session's log, randomly pick up to 10 that never appear in that session's log.
def get_last_log_same_lrg_cd_random_10(
    train_logged: pd.DataFrame, yado: pd.DataFrame, n_samples: int = 10
) -> dict:
    """Sample negative yados per session from the area of the last viewed yado.

    For every session, the ``lrg_cd`` of the last entry of its
    ``logged_yad_no_list`` is looked up, and up to ``n_samples`` yados with the
    same ``lrg_cd`` that do not occur anywhere in that list are drawn without
    replacement.

    All sessions draw from one generator seeded with ``Config.SEED``. Passing
    the same integer seed to every per-session draw would make sessions with
    equally sized candidate sets pick the same positions, i.e. nearly the same
    negatives for a whole area. The result is still reproducible for a fixed
    seed and input order.

    Args:
        train_logged: Frame with ``session_id`` and ``logged_yad_no_list``.
        yado: Yado master table with ``yad_no`` and ``lrg_cd`` columns.
        n_samples: Maximum number of negatives per session (default 10).

    Returns:
        Dict mapping ``session_id`` to a list of sampled ``yad_no``. The list
        is shorter than ``n_samples`` (possibly empty) when fewer candidates
        exist, and empty when the session's log list is empty.

    Raises:
        KeyError: If the last logged ``yad_no`` of a session is not in ``yado``.
    """
    # yad_no -> lrg_cd lookup.
    yado_no_lrg_cd_dict = yado.set_index("yad_no")["lrg_cd"].to_dict()
    # lrg_cd -> array of yad_no in that area, built once instead of filtering
    # the whole yado table for every session.
    yad_nos_by_lrg_cd = {
        lrg_cd: yad_nos.to_numpy()
        for lrg_cd, yad_nos in yado.groupby("lrg_cd")["yad_no"]
    }
    rng = np.random.default_rng(Config.SEED)

    outputs = {}
    seddion_id_logged_yad_no_list = dict(
        zip(train_logged["session_id"], train_logged["logged_yad_no_list"])
    )
    for session_id, logged_yad_no in tqdm(
        seddion_id_logged_yad_no_list.items(),
        total=len(seddion_id_logged_yad_no_list),
    ):
        # No log entries means there is no "last viewed" yado to anchor on.
        if not logged_yad_no:
            outputs[session_id] = []
            continue

        last_log_lrg_cd = yado_no_lrg_cd_dict[logged_yad_no[-1]]
        area_yad_nos = yad_nos_by_lrg_cd.get(last_log_lrg_cd)
        if area_yad_nos is None:
            # e.g. a missing lrg_cd: no area to sample from.
            outputs[session_id] = []
            continue

        # Same-area yados that the session never viewed.
        candidates = area_yad_nos[~np.isin(area_yad_nos, logged_yad_no)]
        n_pick = min(n_samples, len(candidates))
        if n_pick == 0:
            outputs[session_id] = []
            continue

        outputs[session_id] = rng.choice(
            candidates, size=n_pick, replace=False
        ).tolist()
    return outputs


# session_id -> list of sampled negative yad_no.
negative_data_dict = get_last_log_same_lrg_cd_random_10(
    train_logged=train_logged, yado=yado
)

  0%|          | 0/288698 [00:00<?, ?it/s]

In [17]:
# Turn {session_id: [negative yad_no, ...]} into a table with one list per session.
negative_data = pd.DataFrame(
    {
        "session_id": list(negative_data_dict.keys()),
        "negative_yados": list(negative_data_dict.values()),
    }
)

# Long format: one row per (session, sampled negative yado), labelled reserve=0.
negative_data_explode = (
    negative_data.explode("negative_yados")
    .rename(columns={"negative_yados": "yad_no"})
    # A session with no sampled negatives explodes to a single NaN row; drop it
    # so it is not stored as a reserve=0 example without a yado.
    .dropna(subset=["yad_no"])
    .assign(reserve=0)
)

In [19]:
# Save the intermediate tables as well, just in case.
for frame_to_save, pickle_path in (
    (train_reserved, f"{Config.OUTPUT_DIR}/{NOTEBOOK_NAME}_train_reserved.pkl"),
    (
        train_logged_explode,
        f"{Config.OUTPUT_DIR}/{NOTEBOOK_NAME}_train_logged_explode.pkl",
    ),
    (
        negative_data_explode,
        f"{Config.OUTPUT_DIR}/{NOTEBOOK_NAME}_negative_data_explode.pkl",
    ),
):
    frame_to_save.to_pickle(pickle_path)

In [70]:
# Concatenate the positives (reserved), the viewed-but-not-reserved candidates
# and, optionally, the sampled negatives into one training table.
def merge_train_data(
    train_reserved: pd.DataFrame,
    train_logged_explode: pd.DataFrame,
    negative_data_explode: "pd.DataFrame | None" = None,
) -> pd.DataFrame:
    """Stack the labelled example tables into a single training frame.

    Args:
        train_reserved: Positive rows; already has a ``yad_no`` column.
        train_logged_explode: Viewed-but-not-reserved rows; its
            ``logged_only_yados`` column is renamed to ``yad_no``.
        negative_data_explode: Optional sampled negatives. A
            ``negative_yados`` column, if present, is renamed to ``yad_no``;
            a frame that already uses ``yad_no`` is accepted unchanged.
            When omitted (the default) no negatives are added.

    Returns:
        The row-wise concatenation of the given frames, in the order
        positives, logged candidates, negatives, with a fresh 0..n-1 index.
    """
    frames = [
        train_reserved,
        train_logged_explode.rename(columns={"logged_only_yados": "yad_no"}),
    ]
    if negative_data_explode is not None:
        frames.append(
            negative_data_explode.rename(columns={"negative_yados": "yad_no"})
        )
    # explode() leaves repeated index values, so rebuild the index after stacking.
    return pd.concat(frames).reset_index(drop=True)


# Only the positives and the logged candidates are merged here; the sampled
# negatives (negative_data_explode) are not included.
merged_train = merge_train_data(
    train_reserved=train_reserved, train_logged_explode=train_logged_explode
)

In [71]:
# Preview the viewed-but-not-reserved candidates (reserve=0).
train_logged_explode

Unnamed: 0,session_id,logged_only_yados,reserve
0,000007603d533d30453cc45d0f3d119f,2395,0
1,0000ca043ed437a1472c9d1d154eb49b,13535,0
2,0000d4835cf113316fe447e2f80ba1c8,123,0
3,0000fcda1ae1b2f431e55a7075d1f500,8475,0
4,000104bdffaaad1a1e0a9ebacf585f33,898,0
...,...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,8140,0
288694,ffff2360540745117193ecadcdc06538,2121,0
288695,ffff7fb4617164b2604aaf51c40bf82d,7183,0
288696,ffffcd5bc19d62cad5a3815c87818d83,12230,0


In [75]:
# Add each session's full list of viewed yad_no to every row of merged_train.
logged_yad_nos_per_session = train_log.groupby("session_id")["yad_no"].apply(list)
session_id_logged_yad_no_dict = logged_yad_nos_per_session.to_dict()

session_ids = merged_train["session_id"]
merged_train["logged_yad_no_list"] = session_ids.map(session_id_logged_yad_no_dict)

In [76]:
# Save the merged training table as a pickle in the output directory.
merged_train.to_pickle(f"{Config.OUTPUT_DIR}/{NOTEBOOK_NAME}_merged_train.pkl")

In [77]:
# Display the path the merged training table was written to.
f"{Config.OUTPUT_DIR}/{NOTEBOOK_NAME}_merged_train.pkl"

'../saved_data/e005_make_train/e005_make_train_merged_train.pkl'