In [1]:
# Identifier of this experiment notebook; Config.OUTPUT_DIR is derived from it.
NOTEBOOK_NAME = "e039_make_associ_with_label"

In [2]:
import polars as pl
import os

In [3]:
class Config:
    """Notebook-wide settings, read as class attributes."""

    # Directory for this notebook's saved artifacts, named after NOTEBOOK_NAME.
    OUTPUT_DIR = f"../saved_data/{NOTEBOOK_NAME}"
    # Seed value; not referenced by the other cells visible here.
    SEED = 33
    # Presumably the label column name; not referenced in the visible cells, so verify.
    TARGET_COL = "reserve"


# Create the output directory up front; exist_ok makes re-running this cell safe.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

In [4]:
# Read the train and test logs from CSV. Later cells read the `session_id` and
# `yad_no` columns of the train log.
train_log = pl.read_csv("../data/train_log.csv")
test_log = pl.read_csv("../data/test_log.csv")

In [8]:
# Changed in e016: the fold assignment is now used when creating the data.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import pickle

# Load the lookup saved by e016; it is later passed to `map_dict` on the
# `session_id` column, so it is presumably a session_id -> fold mapping.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# produced by a trusted run.
with open(
    "../saved_data/e016_make_train_popular_base/session_id_fold_dict.pkl", "rb"
) as f:
    session_id_fold_dict = pickle.load(f)

In [14]:
# Translate every session_id into its fold through the lookup table, then attach
# the result to the log as a column named "fold".
tmp = train_log["session_id"].map_dict(session_id_fold_dict)
train_log = train_log.with_columns(tmp.alias("fold"))

In [None]:
# Read the training labels from CSV.
# NOTE(review): `train_label` is not referenced by the other cells visible here.
train_label = pl.read_csv("../data/train_label.csv")

In [6]:
def get_co_visit_matrix(train_log: pl.DataFrame, top_k: int = 100_000) -> pl.DataFrame:
    """Build ranked co-visitation candidates from a session log.

    Two log rows co-occur when they share a ``session_id``. For every ordered
    pair of distinct ``yad_no`` values the number of co-occurring row pairs is
    counted; the pairs are then grouped by their first element and ranked by
    that count.

    Args:
        train_log: Log frame containing at least ``session_id`` and ``yad_no``.
            Other columns are ignored. The frame itself is not modified.
        top_k: Maximum number of candidates kept per ``latest_yad_no``. The
            default equals the limit that used to be hard-coded (100,000).

    Returns:
        A frame with the columns ``latest_yad_no``, ``yad_no`` (the candidate)
        and ``co_visit_count``. Rows are ordered by ``latest_yad_no`` ascending,
        then ``co_visit_count`` descending, with ties broken by ``yad_no``
        ascending so the output is deterministic.
    """
    # Only these two columns take part in the pairing; dropping the rest first
    # keeps the self-join from duplicating every other column as `*_right`.
    visits = train_log.select(["session_id", "yad_no"])

    co_visit_counts = (
        # Pair every row with every row of the same session.
        visits.join(visits, on="session_id")
        # A yad paired with itself carries no co-visit information.
        .filter(pl.col("yad_no") != pl.col("yad_no_right"))
        # Count how often each ordered pair occurs.
        .group_by(["yad_no", "yad_no_right"])
        .count()
        # Name the first element `latest_yad_no` so the result can be joined on
        # the last-viewed yad, and expose the second element as the candidate
        # `yad_no`.
        .rename(
            {
                "yad_no": "latest_yad_no",
                "yad_no_right": "yad_no",
                "count": "co_visit_count",
            }
        )
        .select(["latest_yad_no", "yad_no", "co_visit_count"])
    )

    return (
        co_visit_counts.sort(
            ["latest_yad_no", "co_visit_count", "yad_no"],
            descending=[False, True, False],
        )
        # maintain_order keeps the groups in the sorted order instead of the
        # arbitrary order an unordered group_by may return.
        .group_by("latest_yad_no", maintain_order=True)
        .head(top_k)
    )


# Build one co-visitation candidate table per fold.
# Each fold's rows are filtered into their own name: overwriting `train_log`
# inside the loop would leave only fold 0 after the first pass, so every later
# fold would filter an already-filtered frame down to nothing.
n_folds = 5  # presumably matches the fold ids in session_id_fold_dict; verify
co_visit_candidates_by_fold = {}
for i in range(n_folds):
    fold_log = train_log.filter(pl.col("fold") == i)
    co_visit_candidates_by_fold[i] = get_co_visit_matrix(fold_log)

# Keep the name this cell has always exposed, bound to the last fold's table.
train_co_visit_matrix_top10_candidate = co_visit_candidates_by_fold[n_folds - 1]

latest_yad_no,yad_no,co_visit_count
i64,i64,u32
2,3860,2
2,3847,1
2,12162,1
2,12232,1
2,13783,1
3,10095,41
3,846,24
3,5800,10
3,7093,8
3,10211,6


In [12]:
# Persist the train and test co-visitation frames under Config.OUTPUT_DIR.
# NOTE(review): neither `train_co_visit_matrix` nor `test_co_visit_matrix` is
# assigned at module level in any cell visible here, so this cell looks like it
# would raise NameError after Restart & Run All. Confirm which frames should be
# saved.
# NOTE(review): the files are written as Parquet but carry a ".pkl" extension.
train_co_visit_matrix.write_parquet(f"{Config.OUTPUT_DIR}/train_co_visit_matrix.pkl")
test_co_visit_matrix.write_parquet(f"{Config.OUTPUT_DIR}/test_co_visit_matrix.pkl")