In [7]:
# Identifier of this notebook; Config.OUTPUT_DIR is built from it, so changing
# it changes where the artifacts are written.
NOTEBOOK_NAME = "e035_make_associ"

In [8]:
import polars as pl
import os

In [9]:
class Config:
    """Notebook-wide settings, grouped in one place."""
    # Directory (relative to the notebook) that receives this notebook's outputs.
    OUTPUT_DIR = f"../saved_data/{NOTEBOOK_NAME}"
    # Presumably an RNG seed; it is not referenced in the visible cells.
    SEED = 33
    # Presumably the name of the label column; not referenced in the visible cells.
    TARGET_COL = "reserve"


# Create the output directory up front; exist_ok=True keeps the cell re-runnable.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

In [2]:
# Load the train and test session logs. generate_co_visit_matrix relies on the
# "session_id" and "yad_no" columns being present in both frames.
train_log = pl.read_csv("../data/train_log.csv")
test_log = pl.read_csv("../data/test_log.csv")

In [10]:
def generate_co_visit_matrix(df: pl.DataFrame) -> pl.DataFrame:
    """Count how often two different hotels appear in the same session.

    The log is self-joined on ``session_id`` so that every ordered pair of
    rows sharing a session becomes one row. Pairs whose two ``yad_no`` values
    are equal are discarded, and the remaining pairs are counted.

    Args:
        df: Log frame containing at least the ``session_id`` and ``yad_no``
            columns. Any other columns are ignored.

    Returns:
        A frame with the columns ``yad_no``, ``candidate_yad_no`` and
        ``co_visit_count`` (in that order). Because the join is symmetric,
        both orderings ``(A, B)`` and ``(B, A)`` are present with the same
        count. Row order is unspecified.

    Note:
        A hotel that occurs several times within one session contributes one
        pair per occurrence, so repeated visits increase the count.
    """
    # Keep only the columns that are actually needed before the self-join;
    # otherwise every extra log column is duplicated as "<name>_right" in the
    # (much larger) pair table for no benefit.
    visits = df.select(["session_id", "yad_no"])

    # Build all co-occurring pairs within each session.
    pairs = visits.join(visits, on="session_id")

    # A hotel paired with itself carries no information; drop those rows.
    pairs = pairs.filter(pl.col("yad_no") != pl.col("yad_no_right"))

    # Count rows per ordered pair. The join key is never null in the joined
    # rows, so counting it is the same as counting rows, and it avoids the
    # deprecated GroupBy.count() while naming the output column directly.
    pair_counts = pairs.group_by(["yad_no", "yad_no_right"]).agg(
        pl.col("session_id").count().alias("co_visit_count")
    )

    # Final column naming and order.
    return pair_counts.rename({"yad_no_right": "candidate_yad_no"}).select(
        ["yad_no", "candidate_yad_no", "co_visit_count"]
    )


# Build the co-visit matrix for each split and immediately rename its columns:
# the source hotel becomes "latest_yad_no" (it will be matched against the last
# hotel viewed in a session) and the co-visited hotel becomes "yad_no", i.e.
# the candidate to recommend.
train_co_visit_matrix = generate_co_visit_matrix(train_log).rename(
    {"yad_no": "latest_yad_no", "candidate_yad_no": "yad_no"}
)
test_co_visit_matrix = generate_co_visit_matrix(test_log).rename(
    {"yad_no": "latest_yad_no", "candidate_yad_no": "yad_no"}
)
def _top_candidates_per_latest(
    co_visit_matrix: "pl.DataFrame", k: int = 10
) -> "pl.DataFrame":
    """Keep the ``k`` most frequently co-visited candidates per ``latest_yad_no``.

    Args:
        co_visit_matrix: Frame with the columns ``latest_yad_no``, ``yad_no``
            and ``co_visit_count``.
        k: Maximum number of candidate rows kept per ``latest_yad_no``.

    Returns:
        The input rows restricted to the top ``k`` per ``latest_yad_no``,
        ordered by ``latest_yad_no`` ascending, then ``co_visit_count``
        descending, then ``yad_no`` ascending.
    """
    return (
        co_visit_matrix.sort(
            # "yad_no" is a tie-breaker: without it, candidates with equal
            # counts would be kept in whatever order the upstream group_by
            # happened to emit, making the top-k selection non-reproducible.
            ["latest_yad_no", "co_visit_count", "yad_no"],
            descending=[False, True, False],
        )
        # maintain_order keeps the groups in sorted order in the result.
        .group_by("latest_yad_no", maintain_order=True)
        .head(k)
    )


train_co_visit_matrix_top10_candidate = _top_candidates_per_latest(
    train_co_visit_matrix
)
test_co_visit_matrix_top10_candidate = _top_candidates_per_latest(
    test_co_visit_matrix
)

In [12]:
# Persist the full co-visit matrices (not the top-10 subsets) to OUTPUT_DIR.
# NOTE(review): the files are written in Parquet format despite the ".pkl"
# extension, so they have to be read back with a Parquet reader, not pickle.
# The file names are left untouched because readers elsewhere may rely on them.
# NOTE(review): the *_top10_candidate frames computed earlier are not saved here.
train_co_visit_matrix.write_parquet(f"{Config.OUTPUT_DIR}/train_co_visit_matrix.pkl")
test_co_visit_matrix.write_parquet(f"{Config.OUTPUT_DIR}/test_co_visit_matrix.pkl")