In [None]:
# Identifier of this notebook/experiment; Config.OUTPUT_DIR is derived from it.
NOTEBOOK_NAME = "e024_make_rulebased_feat"

In [None]:
import os


class Config:
    """Notebook-wide settings."""

    # Directory that receives the pickled feature frames written by this notebook.
    OUTPUT_DIR = f"../saved_data/{NOTEBOOK_NAME}"
    # NOTE(review): SEED and TARGET_COL are not referenced in the visible cells —
    # presumably kept for parity with sibling notebooks; confirm before removing.
    SEED = 33
    TARGET_COL = "reserve"


# Create the output directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

In [None]:
import pickle

In [None]:
# Number of folds iterated when building the out-of-fold training features.
FOLD_NUM = 5

In [None]:
# Import the required libraries and load the data
import pandas as pd
from collections import defaultdict
from heapq import heappush, heappop

yado = pd.read_csv("../data/yado.csv", dtype={"yad_no": int})  # not used in this notebook
train_log = pd.read_csv(
    "../data/train_log.csv", dtype={"session_id": str, "seq_no": int, "yad_no": int}
)
train_label = pd.read_csv(
    "../data/train_label.csv", dtype={"session_id": str, "yad_no": int}
)
test_log = pd.read_csv(
    "../data/test_log.csv", dtype={"session_id": str, "seq_no": int, "yad_no": int}
)
test_session = pd.read_csv("../data/test_session.csv", dtype={"session_id": str})

In [None]:
# Display the label table for a quick visual check.
train_label

In [None]:
# Since e016, the fold assignment is used when building the data.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open(
    "../saved_data/e016_make_train_popular_base/session_id_fold_dict.pkl", "rb"
) as f:
    session_id_fold_dict = pickle.load(f)

# Adds a "fold" column to train_label in place; a session_id that is missing
# from the mapping ends up with NaN here.
train_label["fold"] = train_label["session_id"].map(session_id_fold_dict)

In [None]:
# TODO (original author): resume revising the processing from here.
def get_rulebased_predict_train(
    train_label: pd.DataFrame, train_log: pd.DataFrame, fold: int
) -> pd.DataFrame:
    """Build out-of-fold rule-based top-10 predictions for one fold.

    Sessions whose ``fold`` differs from ``fold`` are used to count
    "last viewed yad -> reserved yad" co-occurrences; sessions whose ``fold``
    equals ``fold`` are the ones predictions are produced for.

    Ranking rule for each predicted session:
      1. If the session viewed two or more yads, the second-to-last viewed yad
         takes the first slot.
      2. The remaining slots are filled with the yads most often reserved
         after the session's last viewed yad (count descending, ties broken by
         smaller ``yad_no``).
      3. Unfilled slots stay ``0``.

    Args:
        train_label: Frame with ``session_id``, ``yad_no`` (reserved yad) and
            ``fold`` columns.
        train_log: Frame with ``session_id`` and ``yad_no`` (viewed yad)
            columns. Rows of one session are assumed to appear in viewing
            order — TODO confirm the log is sorted by ``seq_no``.
        fold: Fold id to hold out and predict for.

    Returns:
        Frame with ``session_id`` followed by ``rulebased_predict_0`` ..
        ``rulebased_predict_9``, one row per held-out session, in the order
        those sessions appear in ``train_label``.

    NOTE(review): the second-to-last viewed yad can reappear among the
    co-reservation candidates, so a row may contain a duplicate. This is kept
    as-is so the output matches ``get_rulebased_predict_test``.
    """
    top_k = 10

    # Boolean selection already yields new frames, so the inputs are never
    # mutated and no defensive full copy of the (large) log is needed.
    is_held_out = train_label["fold"] == fold
    fit_label = train_label.loc[train_label["fold"] != fold, ["session_id", "yad_no"]]
    held_out_session_ids = train_label.loc[is_held_out, "session_id"].tolist()

    fit_log = train_log[train_log["session_id"].isin(fit_label["session_id"].tolist())]
    held_out_log = train_log[train_log["session_id"].isin(held_out_session_ids)]

    # session_id -> reserved yad_no for the fitting sessions.
    reserved_by_session = dict(
        zip(fit_label["session_id"].tolist(), fit_label["yad_no"].tolist())
    )

    def _viewed_yads_by_session(session_log: pd.DataFrame) -> defaultdict:
        """Turn a long-format log into {session_id: [viewed yad_no, ...]}."""
        viewed = defaultdict(list)
        # Columns are addressed by name: positional access on an iterrows()
        # row is deprecated in recent pandas and depends on column order.
        for session_id, yad_no in zip(
            session_log["session_id"].tolist(), session_log["yad_no"].tolist()
        ):
            viewed[session_id].append(yad_no)
        return viewed

    fit_views = _viewed_yads_by_session(fit_log)
    held_out_views = _viewed_yads_by_session(held_out_log)

    # reserve_counts[v][r] := number of sessions that viewed yad v last and
    # reserved yad r.
    reserve_counts = defaultdict(lambda: defaultdict(int))
    for session_id, viewed in fit_views.items():
        reserve_counts[viewed[-1]][reserved_by_session.get(session_id, 0)] += 1

    predictions = []
    for session_id in held_out_session_ids:
        # .get avoids inserting empty entries and tolerates sessions that have
        # no log rows (they simply stay zero-padded instead of raising).
        viewed = held_out_views.get(session_id, [])
        ranked = []

        if len(viewed) > 1:
            ranked.append(viewed[-2])

        if viewed:
            candidates = reserve_counts.get(viewed[-1], {})
            by_count = sorted(candidates.items(), key=lambda item: (-item[1], item[0]))
            ranked.extend(yad_no for yad_no, _ in by_count[: top_k - len(ranked)])

        ranked.extend([0] * (top_k - len(ranked)))
        predictions.append(ranked)

    df_rulebase = pd.DataFrame(
        predictions,
        columns=[f"rulebased_predict_{rank}" for rank in range(top_k)],
    )
    # Insert plain values so the ids are matched by position, not by index.
    df_rulebase.insert(0, "session_id", held_out_session_ids)
    return df_rulebase


# Attach the rule-based predicted yads to the training data, one held-out fold at a time
rulebased_predict_dfs = []
for fold in range(FOLD_NUM):
    rulebased_predict_df = get_rulebased_predict_train(
        train_label, train_log, fold=fold
    )
    rulebased_predict_dfs.append(rulebased_predict_df)

In [None]:
# Stack the per-fold frames row-wise and renumber the index from 0.
all_rulebased_predict_df_train = pd.concat(
    rulebased_predict_dfs, axis=0, ignore_index=True
)

In [None]:
# Order rows by session_id (in place; the index is not reset afterwards).
all_rulebased_predict_df_train.sort_values("session_id", inplace=True)

In [None]:
# Sanity check: the number of distinct sessions with predictions must equal
# the number of distinct labelled sessions.
assert (
    all_rulebased_predict_df_train["session_id"].nunique()
    == train_label["session_id"].nunique()
)

In [None]:
# Persist the training-side features under this notebook's output directory.
all_rulebased_predict_df_train.to_pickle(
    f"{Config.OUTPUT_DIR}/all_rulebased_predict_df_train.pkl"
)

In [None]:
def get_rulebased_predict_test(
    train_label: pd.DataFrame,
    train_log: pd.DataFrame,
    test_session: pd.DataFrame,
    test_log: "pd.DataFrame | None" = None,
) -> pd.DataFrame:
    """Build rule-based top-10 predictions for the test sessions.

    All of ``train_label`` / ``train_log`` is used to count
    "last viewed yad -> reserved yad" co-occurrences, which are then applied
    to the sessions listed in ``test_session``.

    Ranking rule for each test session:
      1. If the session viewed two or more yads, the second-to-last viewed yad
         takes the first slot.
      2. The remaining slots are filled with the yads most often reserved
         after the session's last viewed yad (count descending, ties broken by
         smaller ``yad_no``).
      3. Unfilled slots stay ``0``.

    Args:
        train_label: Frame with ``session_id`` and ``yad_no`` (reserved yad).
        train_log: Frame with ``session_id`` and ``yad_no`` (viewed yad).
            Rows of one session are assumed to appear in viewing order —
            TODO confirm the log is sorted by ``seq_no``.
        test_session: Frame with a ``session_id`` column; one output row is
            produced per row, in the same order.
        test_log: Viewing log of the test sessions (``session_id``,
            ``yad_no``). When omitted, the notebook-level ``test_log`` variable
            is used, which is what the function implicitly did before this
            parameter existed.

    Returns:
        Frame with ``session_id`` followed by ``rulebased_predict_0`` ..
        ``rulebased_predict_9``.

    Raises:
        NameError: If ``test_log`` is not passed and no notebook-level
            ``test_log`` exists.

    NOTE(review): the second-to-last viewed yad can reappear among the
    co-reservation candidates, so a row may contain a duplicate. This is kept
    as-is so the output matches ``get_rulebased_predict_train``.
    """
    top_k = 10

    if test_log is None:
        # Backward compatibility: earlier versions silently read the global.
        # Prefer passing test_log explicitly so the function does not depend
        # on hidden notebook state.
        try:
            test_log = globals()["test_log"]
        except KeyError:
            raise NameError(
                "test_log was not passed and no global 'test_log' is defined"
            ) from None

    test_session_ids = test_session["session_id"].tolist()

    # session_id -> reserved yad_no for the training sessions.
    reserved_by_session = dict(
        zip(train_label["session_id"].tolist(), train_label["yad_no"].tolist())
    )

    def _viewed_yads_by_session(session_log: pd.DataFrame) -> defaultdict:
        """Turn a long-format log into {session_id: [viewed yad_no, ...]}."""
        viewed = defaultdict(list)
        # Columns are addressed by name: positional access on an iterrows()
        # row is deprecated in recent pandas and depends on column order.
        for session_id, yad_no in zip(
            session_log["session_id"].tolist(), session_log["yad_no"].tolist()
        ):
            viewed[session_id].append(yad_no)
        return viewed

    train_views = _viewed_yads_by_session(train_log)
    test_views = _viewed_yads_by_session(test_log)

    # reserve_counts[v][r] := number of sessions that viewed yad v last and
    # reserved yad r. A logged session without a label counts toward r = 0,
    # matching the previous defaultdict(int) lookup.
    reserve_counts = defaultdict(lambda: defaultdict(int))
    for session_id, viewed in train_views.items():
        reserve_counts[viewed[-1]][reserved_by_session.get(session_id, 0)] += 1

    predictions = []
    for session_id in test_session_ids:
        # .get avoids inserting empty entries and tolerates sessions that have
        # no log rows (they simply stay zero-padded instead of raising).
        viewed = test_views.get(session_id, [])
        ranked = []

        if len(viewed) > 1:
            ranked.append(viewed[-2])

        if viewed:
            candidates = reserve_counts.get(viewed[-1], {})
            by_count = sorted(candidates.items(), key=lambda item: (-item[1], item[0]))
            ranked.extend(yad_no for yad_no, _ in by_count[: top_k - len(ranked)])

        ranked.extend([0] * (top_k - len(ranked)))
        predictions.append(ranked)

    df_rulebase = pd.DataFrame(
        predictions,
        columns=[f"rulebased_predict_{rank}" for rank in range(top_k)],
    )
    # Insert plain values: inserting the Series would align on test_session's
    # index and yield wrong/NaN ids whenever that index is not 0..n-1.
    df_rulebase.insert(0, "session_id", test_session_ids)
    return df_rulebase


# Attach the rule-based predicted yads to the test sessions
# (the original comment said "training data", apparently copied from the cell above)
all_rulebased_predict_df_test = get_rulebased_predict_test(
    train_label, train_log, test_session
)

In [None]:
# Order rows by session_id (in place; the index is not reset afterwards).
all_rulebased_predict_df_test.sort_values("session_id", inplace=True)

In [None]:
# Sanity check: the number of distinct sessions with predictions must equal
# the number of distinct test sessions.
assert (
    all_rulebased_predict_df_test["session_id"].nunique()
    == test_session["session_id"].nunique()
)

In [None]:
# Persist the test-side features under this notebook's output directory.
all_rulebased_predict_df_test.to_pickle(
    f"{Config.OUTPUT_DIR}/all_rulebased_predict_df_test.pkl"
)