In [70]:
# Identifier of this notebook; it names the output directory and prefixes the saved pickle file.
NOTEBOOK_NAME = "e001_make_test"

In [71]:
import os
import pickle
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import seaborn as sns
import japanize_matplotlib
from tqdm.auto import tqdm

In [72]:
class Config:
    """Notebook-wide settings."""

    # Directory that receives this notebook's artifacts (one folder per notebook name).
    OUTPUT_DIR = f"../saved_data/{NOTEBOOK_NAME}"
    # Random seed; not referenced by any cell visible in this part of the notebook.
    SEED = 33


# Create the output directory up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

In [73]:
# Training inputs. Neither frame is referenced again in the cells visible here.
log_data = pd.read_csv("../data/train_log.csv")
session_data = pd.read_csv("../data/train_label.csv")

# Test inputs. Later cells read `session_id` / `yad_no` from the log and attach
# per-session columns to the session table.
test_log = pd.read_csv("../data/test_log.csv")
test_session = pd.read_csv("../data/test_session.csv")

# Lookup table; later cells read its `yad_no` and `lrg_cd` columns.
yado = pd.read_csv("../data/yado.csv")

# Loaded but not referenced in the cells visible here.
sample_submission = pd.read_csv("../data/sample_submission.csv")

In [74]:
# Attach each session's logged yad_no list (applied to the test data below).
def add_log_yado_list(
    log_data: pd.DataFrame, session_data: pd.DataFrame
) -> pd.DataFrame:
    """Add a ``logged_yad_no_list`` column with each session's logged yad_no values.

    The lists are matched to rows by ``session_id`` value, so the result does
    not depend on the row order of ``session_data`` and extra sessions that
    appear only in ``log_data`` are ignored.

    Args:
        log_data: Log rows with ``session_id`` and ``yad_no`` columns. Within a
            session, yad_no values keep the order in which they appear here.
        session_data: Frame with a ``session_id`` column. It is modified in
            place: the new column is written into it.

    Returns:
        The same ``session_data`` object, now carrying ``logged_yad_no_list``.

    Raises:
        ValueError: If some ``session_id`` in ``session_data`` has no rows in
            ``log_data``.
    """
    yad_no_lists_by_session = log_data.groupby("session_id")["yad_no"].apply(list)

    # Look the lists up by session_id instead of assigning positionally; a
    # positional assignment silently pairs lists with the wrong sessions
    # whenever session_data is not sorted exactly like the groupby result.
    logged_yad_no_lists = session_data["session_id"].map(yad_no_lists_by_session)

    n_missing = int(logged_yad_no_lists.isna().sum())
    if n_missing:
        raise ValueError(
            f"{n_missing} session_id value(s) in session_data have no rows in log_data"
        )

    session_data["logged_yad_no_list"] = logged_yad_no_lists
    return session_data


# Attach the logged yad_no history to every test session.
# NOTE: add_log_yado_list writes the new column into the frame it receives and
# returns that same frame, so `test_session_with_logged` and `test_session`
# refer to one object.
test_session_with_logged = add_log_yado_list(
    log_data=test_log, session_data=test_session
)

test_session_with_logged

Unnamed: 0,session_id,logged_yad_no_list
0,00001149e9c73985425197104712478c,"[3560, 1959]"
1,0000e02747d749a52b7736dfa751e258,[11984]
2,0000f17ae2628237d78d3a38b009d3be,"[757, 8922]"
3,000174a6f7a569b84c5575760d2e9664,"[13610, 12341, 13610]"
4,00017e2a527901c9c41b1acef525d016,[4621]
...,...,...
174695,fffee3199ef94b92283239cd5e3534fa,"[1997, 8336]"
174696,ffff62c6bb49bc9c0fbcf08494a4869c,[12062]
174697,ffff9a7dcc892875c7a8b821fa436228,[8989]
174698,ffffb1d30300fe17f661941fd085b04b,[6030]


In [75]:
# Get the last viewed yad_no: the final element of each session's logged list
# (assumes the log rows of a session are in viewing order).
test_session_with_logged["last_yad_no"] = test_session_with_logged[
    "logged_yad_no_list"
].map(lambda viewed_yad_nos: viewed_yad_nos[-1])

In [76]:
# Attach the lrg_cd that the last viewed yad_no belongs to.
# yad_no values missing from `yado` end up as NaN in the new column.
yad_no_lrg_cd_dict = {
    yad_no: lrg_cd for yad_no, lrg_cd in zip(yado["yad_no"], yado["lrg_cd"])
}

last_lrg_cd_by_session = test_session_with_logged["last_yad_no"].map(
    yad_no_lrg_cd_dict
)
test_session_with_logged["last_lrg_cd"] = last_lrg_cd_by_session

In [77]:
# Display the frame to check the newly added last_yad_no / last_lrg_cd columns.
test_session_with_logged

Unnamed: 0,session_id,logged_yad_no_list,last_yad_no,last_lrg_cd
0,00001149e9c73985425197104712478c,"[3560, 1959]",1959,52ca3d2824fc3cc90bd4274423badeed
1,0000e02747d749a52b7736dfa751e258,[11984],11984,2e63024b11908f3729510051a6fc7d9e
2,0000f17ae2628237d78d3a38b009d3be,"[757, 8922]",8922,dca13b5f308a0ae88ab8875a9ab56919
3,000174a6f7a569b84c5575760d2e9664,"[13610, 12341, 13610]",13610,57b6663bea1ed3527b11e80be82d5235
4,00017e2a527901c9c41b1acef525d016,[4621],4621,7763c74e2efa67a522125d9d3d7dde25
...,...,...,...,...
174695,fffee3199ef94b92283239cd5e3534fa,"[1997, 8336]",8336,da273b9909edbb8cdb40305868de155c
174696,ffff62c6bb49bc9c0fbcf08494a4869c,[12062],12062,989ce3ae2fc5f1649bd10e05917a27f8
174697,ffff9a7dcc892875c7a8b821fa436228,[8989],8989,9d6a46da05976cab8ac2b8583215c665
174698,ffffb1d30300fe17f661941fd085b04b,[6030],6030,d153c8fd78bfad6faadf8e769e5cb314


In [96]:
# Collect the yad_no values that share a given lrg_cd.
# The last viewed yad_no is NOT dropped here; the loop that calls this does it.
def add_same_lrg_cd_yad_no_list(last_lrg_cd: str, yado: pd.DataFrame) -> list:
    """Return the yad_no of every row in ``yado`` whose lrg_cd equals ``last_lrg_cd``.

    Args:
        last_lrg_cd: The lrg_cd value to match.
        yado: Frame with ``yad_no`` and ``lrg_cd`` columns.

    Returns:
        A new list of matching yad_no values in ``yado`` row order; empty when
        nothing matches (including when ``last_lrg_cd`` is NaN).
    """
    is_same_lrg_cd = yado["lrg_cd"] == last_lrg_cd
    return yado.loc[is_same_lrg_cd, "yad_no"].tolist()


# Build the candidate yad_no list for every test session:
#   1. all yad_no in the same lrg_cd as the last viewed yad_no,
#   2. plus any earlier-viewed yad_no not already included,
#   3. minus the last viewed yad_no itself.

# Build the lrg_cd -> [yad_no, ...] lookup once. Filtering the whole `yado`
# frame inside the loop repeats the same full scan for every session.
# Rows keep their `yado` order within each lrg_cd; NaN lrg_cd rows are dropped
# by groupby, which matches an equality filter never matching NaN.
yad_no_lists_by_lrg_cd = yado.groupby("lrg_cd")["yad_no"].apply(list).to_dict()

same_lrg_cd_yad_no_lists_wo_last_yad_no = []
for last_lrg_cd, logged_yad_no_list in tqdm(
    zip(
        test_session_with_logged["last_lrg_cd"],
        test_session_with_logged["logged_yad_no_list"],
    ),
    total=len(test_session_with_logged),
):
    last_yad_no = logged_yad_no_list[-1]

    # Copy the shared lookup list so per-session edits never leak into it.
    same_lrg_cd_yad_no_list = list(yad_no_lists_by_lrg_cd.get(last_lrg_cd, []))
    # Set mirror of the list for O(1) membership checks.
    seen_yad_nos = set(same_lrg_cd_yad_no_list)

    # Add earlier-viewed yad_no (everything but the last log entry) if absent.
    for logged_yad_no in logged_yad_no_list[:-1]:
        if logged_yad_no not in seen_yad_nos:
            seen_yad_nos.add(logged_yad_no)
            same_lrg_cd_yad_no_list.append(logged_yad_no)

    # The last viewed yad_no can never be the correct answer, so drop it.
    if last_yad_no in seen_yad_nos:
        same_lrg_cd_yad_no_list.remove(last_yad_no)

    same_lrg_cd_yad_no_lists_wo_last_yad_no.append(same_lrg_cd_yad_no_list)

test_session_with_logged[
    "same_lrg_cd_yad_no_wo_last_yad"
] = same_lrg_cd_yad_no_lists_wo_last_yad_no

  0%|          | 0/174700 [00:00<?, ?it/s]

In [116]:
# Expand the candidate lists so each (session, candidate yad_no) pair gets its own row.
# Index labels of the source rows are repeated on the exploded rows.
test_session_with_logged_explode = test_session_with_logged.explode(
    column="same_lrg_cd_yad_no_wo_last_yad"
)

In [118]:
# The exploded candidate column now holds one yad_no per row; name it accordingly.
test_session_with_logged_explode = test_session_with_logged_explode.rename(
    {"same_lrg_cd_yad_no_wo_last_yad": "yad_no"}, axis="columns"
)

In [120]:
# Keep only the columns that are saved; last_lrg_cd is dropped here.
test_session_with_logged_explode = test_session_with_logged_explode.loc[
    :, ["session_id", "yad_no", "logged_yad_no_list", "last_yad_no"]
]

In [122]:
# Save the exploded candidate table as a pickle under Config.OUTPUT_DIR.
test_candidates_path = (
    f"{Config.OUTPUT_DIR}/{NOTEBOOK_NAME}_test_same_lrg_cd_wo_last_yad_no.pkl"
)
test_session_with_logged_explode.to_pickle(test_candidates_path)