In [1]:
NOTEBOOK_NAME = "e031_use_sml_cd"

In [2]:
import builtins
import types


def imports():
    for name, val in globals().items():
        # module imports
        if isinstance(val, types.ModuleType):
            yield name, val

            # functions / callables
        if hasattr(val, "__call__"):
            yield name, val


def noglobal(f):
    """
    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c
    """
    return types.FunctionType(
        f.__code__, dict(imports()), f.__name__, f.__defaults__, f.__closure__
    )

In [3]:
import os

from sklearn.model_selection import StratifiedKFold
import pickle
from typing import Literal

import pandas as pd
import numpy as np
import japanize_matplotlib

from contextlib import contextmanager
from time import time
import random
from tqdm.auto import tqdm

%matplotlib inline


# ref: Kaggleコード遺産 https://qiita.com/kaggle_grandmaster-arai-san/items/d59b2fb7142ec7e270a5 
class Timer:
    def __init__(self, logger=None, format_str="{:.3f}[s]", prefix=None, suffix=None, sep=" "):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)


def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    
# 再現性確保!
seed_everything(33)

In [4]:
INPUT_DIR = "../data"
OUTPUT_DIR = f"../saved_data/{NOTEBOOK_NAME}"

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# 学習用のログデータと正解ラベル
train_log = pd.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
train_session = pd.read_csv(os.path.join(INPUT_DIR, "train_label.csv"))

# 宿のデータ
yado = pd.read_csv(os.path.join(INPUT_DIR, "yado.csv"))

# テスト期間のログデータ
test_log = pd.read_csv(os.path.join(INPUT_DIR, "test_log.csv"))
test_session = pd.read_csv(os.path.join(INPUT_DIR, "test_session.csv"))

sample_submission = pd.read_csv(os.path.join(INPUT_DIR, "sample_submission.csv"))

In [6]:
whole_log_df = pd.concat([train_log, test_log], ignore_index=True)

In [7]:
# logの長さをtrain_sessionに付与する
session_id_cnt_dict = train_log.groupby("session_id")["yad_no"].apply(len).to_dict()
train_session["session_length"] = train_session["session_id"].map(session_id_cnt_dict)

In [8]:
# foldごとに切ってデータを用意するため、最初にfoldを付与する
# Cross Validationの際に用いるfold(分割する際のグループ番号)を追加
FOLD_NUM = 5

skf = StratifiedKFold(n_splits=FOLD_NUM)
for fold, (_, v_idx) in enumerate(
    skf.split(
        X=train_session,
        y=train_session["session_length"],
    )
):
    train_session.loc[v_idx, "fold"] = fold

train_session["fold"] = train_session["fold"].astype(int)



In [9]:
# session_idとfoldの対応を保存しておく
session_id_fold_dict = dict(zip(train_session["session_id"], train_session["fold"]))
with open(os.path.join(OUTPUT_DIR, "session_id_fold_dict.pkl"), "wb") as f:
    pickle.dump(session_id_fold_dict, f)
with open(os.path.join(OUTPUT_DIR, "session_id_fold_dict.pkl"), "rb") as f:
    session_id_fold_dict = pickle.load(f)

In [10]:
train_log = pd.merge(train_log, yado, on="yad_no", how="left")
test_log = pd.merge(test_log, yado, on="yad_no", how="left")

In [11]:
# 最も最近見たyado: latest_yad_no
def add_latest_yad_no(train_log: pd.DataFrame, train_session: pd.DataFrame):
    train_log = train_log.copy()
    train_session = train_session.copy()

    session_id_yad_no_list_dict = (
        train_log.groupby("session_id")["yad_no"].apply(list).to_dict()
    )
    train_session["logged_yad_no"] = train_session["session_id"].map(
        session_id_yad_no_list_dict
    )
    return train_session["logged_yad_no"].apply(lambda x: x[-1])


train_session["latest_yad_no"] = add_latest_yad_no(train_log, train_session)
test_session["latest_yad_no"] = add_latest_yad_no(test_log, test_session)

In [12]:
# 最も頻度の高いwid_cd: wid_cd
# 最も頻度の高いken_cd: ken_cd
# 最も頻度の高いlrg_cd: lrg_cd
# 最も頻度の高いsml_cd: sml_cd
@noglobal
def get_mode(
    train_log_with_yad: pd.DataFrame, train_session: pd.DataFrame, area: str
) -> pd.Series:
    # train_log_with_yadについて、session_idごとに最も登場したwid_cdを取得する
    session_id_mode_area_dict = (
        train_log_with_yad.groupby("session_id")[area]
        .agg(lambda x: x.value_counts().index[0])
        .to_dict()
    )
    return train_session["session_id"].map(session_id_mode_area_dict)


# train_session["wid_cd"] = get_mode(train_log, train_session, area="wid_cd")
# train_session["ken_cd"] = get_mode(train_log, train_session, area="ken_cd")
train_session["lrg_cd"] = get_mode(train_log, train_session, area="lrg_cd")
train_session["sml_cd"] = get_mode(train_log, train_session, area="sml_cd")

In [13]:
# test_session["wid_cd"] = get_mode(test_log, test_session, area="wid_cd")
# test_session["ken_cd"] = get_mode(test_log, test_session, area="ken_cd")
test_session["lrg_cd"] = get_mode(test_log, test_session, area="lrg_cd")
test_session["sml_cd"] = get_mode(test_log, test_session, area="sml_cd")

In [14]:
@noglobal
def add_popular_top_N_yad_no_train(
    train_session: pd.DataFrame, N: int = 10, FOLD_NUM: int = FOLD_NUM
):
    train_session = train_session.copy()

    for fold in range(FOLD_NUM):
        # fold以外で候補を作る
        train_session_wo_fold = train_session[train_session["fold"] != fold]
        top_10_yado = (
            train_session_wo_fold.groupby("yad_no")["yad_no"]
            .count()
            .sort_values(ascending=False)
            .head(N)
            .index.to_list()
        )
        # pandasでfoldが一致する部分に、"popular_top_10_yad_no"という列名で、top_10_yadoのリストを全て同じセルに持った列を追加する
        train_session.loc[
            train_session["fold"] == fold, "popular_top_10_yad_no"
        ] = train_session.loc[
            (train_session["fold"] == fold), "yad_no"
        ].apply(  # yad_noである必要はない、適当
            lambda x: top_10_yado
        )
    out_df = (
        train_session[["session_id", "popular_top_10_yad_no"]]
        .explode("popular_top_10_yad_no")
        .rename(columns={"popular_top_10_yad_no": "yad_no"})
    )
    return out_df


tmp = add_popular_top_N_yad_no_train(train_session)

In [15]:
@noglobal
def add_popular_top_N_yad_no_test(
    train_session: pd.DataFrame,
    test_session: pd.DataFrame,
    N: int = 10,
):
    train_session = train_session.copy()
    test_session = test_session.copy()

    top_10_yado = (
        train_session.groupby("yad_no")["yad_no"]
        .count()
        .sort_values(ascending=False)
        .head(N)
        .index.to_list()
    )
    test_session["popular_top_10_yad_no"] = test_session["session_id"].apply(
        lambda x: top_10_yado
    )

    out_df = (
        test_session[["session_id", "popular_top_10_yad_no"]]
        .explode("popular_top_10_yad_no")
        .rename(columns={"popular_top_10_yad_no": "yad_no"})
    )
    return out_df


tmp = add_popular_top_N_yad_no_test(train_session, test_session)

In [16]:
@noglobal
def add_popular_topN_yad_per_area_train(
    input_df: pd.DataFrame,
    area: Literal["wid_cd", "ken_cd", "lrg_cd", "sml_cd"],
    N: int = 10,
    FOLD_NUM: int = FOLD_NUM,
):
    input_df = input_df.copy()

    use_col_name = "popular_topN_yad_no_per_area"
    for fold in range(FOLD_NUM):
        # fold以外で候補を作る
        input_df_wo_fold = input_df[input_df["fold"] != fold]
        result = (
            input_df_wo_fold.groupby([area, "yad_no"])["yad_no"]
            .size()
            .reset_index(name="count")
        )
        sorted_result = result.sort_values([area, "count"], ascending=[True, False])
        area_top_10_yad_no_dict = (
            sorted_result.groupby(area)["yad_no"].apply(lambda x: list(x)[:N]).to_dict()
        )
        input_df.loc[input_df["fold"] == fold, use_col_name] = input_df.loc[
            (input_df["fold"] == fold), area
        ].apply(lambda x: area_top_10_yad_no_dict[x])
    out_df = (
        input_df[["session_id", use_col_name]]
        .explode(use_col_name)
        .rename(columns={use_col_name: "yad_no"})
    )
    return out_df


# 出力結果が合っているかをローカルのDataFrameで出力して確認する -> OK
tmp_wid = add_popular_topN_yad_per_area_train(train_session, area="wid_cd")

In [17]:
@noglobal
def add_popular_topN_yad_per_area_test(
    train_session: pd.DataFrame,
    test_session: pd.DataFrame,
    area: Literal["wid_cd", "ken_cd", "lrg_cd", "sml_cd"],
    N: int = 10,
):
    use_col_name = "popular_topN_yad_no_per_area"
    train_session = train_session.copy()
    test_session = test_session.copy()

    result = (
        train_session.groupby([area, "yad_no"])["yad_no"]
        .size()
        .reset_index(name="count")
    )

    sorted_result = result.sort_values([area, "count"], ascending=[True, False])

    area_top_10_yad_no_dict = (
        sorted_result.groupby(area)["yad_no"].apply(lambda x: list(x)[:N]).to_dict()
    )

    test_session[use_col_name] = test_session[area].apply(
        lambda x: area_top_10_yad_no_dict[x]
    )

    out_df = (
        test_session[["session_id", use_col_name]]
        .explode(use_col_name)
        .rename(columns={use_col_name: "yad_no"})
    )
    return out_df


tmp = add_popular_topN_yad_per_area_test(train_session, test_session, area="wid_cd")

In [18]:
train_sml = add_popular_topN_yad_per_area_train(train_session, area="sml_cd")
test_sml = add_popular_topN_yad_per_area_test(
    train_session, test_session, area="sml_cd"
)

In [20]:
train_sml.to_pickle(f"{OUTPUT_DIR}/{NOTEBOOK_NAME}_train_sml.pkl")
test_sml.to_pickle(f"{OUTPUT_DIR}/{NOTEBOOK_NAME}_test_sml.pkl")

In [53]:
@noglobal
def add_popular_next_topN_yad_train(
    input_df: pd.DataFrame, N: int = 10, FOLD_NUM: int = FOLD_NUM
):
    """
    最後に訪れたyad_noから見て、次に訪れやすいyad_noをN件追加する
    """
    input_df = input_df.copy()

    use_col_name = "popular_next_topN_yad_no"

    for fold in range(FOLD_NUM):
        # fold以外で候補を作る
        input_df_wo_fold = input_df[input_df["fold"] != fold]

        result = (
            input_df_wo_fold.groupby(["latest_yad_no", "yad_no"])["yad_no"]
            .size()
            .reset_index(name="count")
        )
        sorted_result = result.sort_values(
            ["latest_yad_no", "count"], ascending=[True, False]
        )
        latest_yad_no_popular_next_topN_yad_no_dict = (
            sorted_result.groupby("latest_yad_no")["yad_no"]
            .apply(lambda x: list(x)[:N])
            .to_dict()
        )

        input_df.loc[input_df["fold"] == fold, use_col_name] = input_df.loc[
            (input_df["fold"] == fold), "latest_yad_no"
        ].apply(
            lambda x: latest_yad_no_popular_next_topN_yad_no_dict[x]
            if x in latest_yad_no_popular_next_topN_yad_no_dict.keys()
            else []
        )
    out_df = (
        input_df[["session_id", use_col_name]]
        .explode(use_col_name)
        .rename(columns={use_col_name: "yad_no"})
    )
    return out_df


tmp = add_popular_next_topN_yad_train(train_session)

In [54]:
@noglobal
def add_popular_next_topN_yad_test(
    train_session: pd.DataFrame, test_session: pd.DataFrame, N: int = 10
):
    """
    最後に訪れたyad_noから見て、次に訪れやすいyad_noをN件追加する
    """
    train_session = train_session.copy()
    test_session = test_session.copy()

    use_col_name = "popular_next_topN_yad_no"

    result = (
        train_session.groupby(["latest_yad_no", "yad_no"])["yad_no"]
        .size()
        .reset_index(name="count")
    )

    sorted_result = result.sort_values(
        ["latest_yad_no", "count"], ascending=[True, False]
    )

    latest_yad_no_popular_next_topN_yad_no_dict = (
        sorted_result.groupby("latest_yad_no")["yad_no"]
        .apply(lambda x: list(x)[:N])
        .to_dict()
    )

    test_session[use_col_name] = test_session["latest_yad_no"].apply(
        lambda x: latest_yad_no_popular_next_topN_yad_no_dict[x]
        if x in latest_yad_no_popular_next_topN_yad_no_dict.keys()
        else []
    )

    out_df = (
        test_session[["session_id", use_col_name]]
        .explode(use_col_name)
        .rename(columns={use_col_name: "yad_no"})
    )
    return out_df


tmp = add_popular_next_topN_yad_test(train_session, test_session)

In [32]:
# TODO: train_sessionにおいて、latest_yad_noと、yad_noの違いが明確になっているかを確認
# -> OK
# TODO: add_popular_next_topN_yad_testの出力が合っているかを確認する
# -> OK

In [41]:
@noglobal
def remove_last_yad_id(log_df: pd.DataFrame, out_df: pd.DataFrame):
    """
    データの一番最後に存在するログは必ず正解ではないため、除外する
    log_dfは元々のlogデータ、out_dfはsession_id、yad_noの組み合わせをもつDataFrameであることを想定
    out_dfにsession_dfを入れないように注意
    """
    log_df = log_df.copy()
    out_df = out_df.copy()

    # セッション中一番最後の宿の組を作成
    last_yad_df = log_df.groupby("session_id").tail(1)[["session_id", "yad_no"]]

    # 最後であることがわかるようにラベル is_last を付与
    last_yad_df["is_last"] = 1

    # 引数の session - yad の組み合わせとマージ
    merged = out_df.merge(
        last_yad_df,
        on=["session_id", "yad_no"],
        how="left",
    )

    # is_last **ではない** (i.e. is_last is null) データのみに絞る
    idx_use = merged["is_last"].isnull()
    out_df = out_df[idx_use].reset_index(drop=True)

    return out_df

In [60]:
@noglobal
def create_session_train_yad_df(log_df: pd.DataFrame, session_df: pd.DataFrame):
    # 全体で人気の宿を追加
    topN_popular_yad_df = add_popular_top_N_yad_no_train(train_session=session_df, N=10)

    # wid_cdで人気の宿を追加
    popular_topN_yad_per_wid_cd = add_popular_topN_yad_per_area_train(
        input_df=session_df, area="wid_cd", N=10
    )

    # ken_cdで人気の宿を追加
    popular_topN_yad_per_ken_cd = add_popular_topN_yad_per_area_train(
        input_df=session_df, area="ken_cd", N=10
    )

    # lrg_cdで人気の宿を追加
    popular_topN_yad_per_lrg_cd = add_popular_topN_yad_per_area_train(
        input_df=session_df, area="lrg_cd", N=10
    )

    # sml_cdで人気の宿を追加
    popular_topN_yad_per_sml_cd = add_popular_topN_yad_per_area_train(
        input_df=session_df, area="sml_cd", N=10
    )

    # latest_yad_noからみて、次に訪れやすいyad_noを追加
    popular_next_topN_yad = add_popular_next_topN_yad_train(input_df=session_df, N=10)

    negative_data = pd.concat(
        [
            topN_popular_yad_df,
            popular_topN_yad_per_wid_cd,
            popular_topN_yad_per_ken_cd,
            popular_topN_yad_per_lrg_cd,
            popular_topN_yad_per_sml_cd,
            popular_next_topN_yad,
        ],
        ignore_index=True,
    )

    # 負例として付け加えたもの以外・同一ログに出現する宿を候補にいれる
    no_dup_train_log = log_df[["session_id", "yad_no"]].drop_duplicates()
    out_df = pd.concat([no_dup_train_log, negative_data], ignore_index=True)

    # 正解のデータを追加
    out_df = pd.concat([session_df, out_df], ignore_index=True)

    # 最後の宿は正解になりえないので除外
    out_df = remove_last_yad_id(log_df=log_df, out_df=out_df)

    # 正解ラベルに含まれているレコードの index を配列で取得して
    target_index = pd.merge(
        out_df.reset_index(), session_df, on=["session_id", "yad_no"], how="inner"
    )["index"].values

    # 正解Indexに含まれている場合 1 / そうでないと 0 のラベルを作成
    out_df["reserve"] = out_df.index.isin(target_index).astype(int)

    # 重複を省く
    out_df = out_df.drop_duplicates(subset=["session_id", "yad_no"], keep="first")

    # 見た目を揃えるために session / yad の順番でソートをします
    out_df = out_df.sort_values(["session_id", "yad_no"]).reset_index(drop=True)

    # 必要な列に限定
    out_df = out_df[["session_id", "yad_no", "reserve"]]

    return out_df

In [62]:
train_out = create_session_train_yad_df(log_df=train_log, label_df=train_session)

In [66]:
@noglobal
def create_session_test_yad_df(
    test_log: pd.DataFrame,
    test_session: pd.DataFrame,
    train_session: pd.DataFrame,
):
    # 全体で人気の宿を追加
    topN_popular_yad_df = add_popular_top_N_yad_no_test(
        train_session=train_session, test_session=test_session, N=10
    )

    # wid_cdで人気の宿を追加
    popular_topN_yad_per_wid_cd = add_popular_topN_yad_per_area_test(
        train_session=train_session, test_session=test_session, area="wid_cd", N=10
    )

    # ken_cdで人気の宿を追加
    popular_topN_yad_per_ken_cd = add_popular_topN_yad_per_area_test(
        train_session=train_session, test_session=test_session, area="ken_cd", N=10
    )

    # lrg_cdで人気の宿を追加
    popular_topN_yad_per_lrg_cd = add_popular_topN_yad_per_area_test(
        train_session=train_session, test_session=test_session, area="lrg_cd", N=10
    )

    # sml_cdで人気の宿を追加
    popular_topN_yad_per_sml_cd = add_popular_topN_yad_per_area_test(
        train_session=train_session, test_session=test_session, area="sml_cd", N=10
    )

    # latest_yad_noからみて、次に訪れやすいyad_noを追加
    popular_next_topN_yad = add_popular_next_topN_yad_test(
        train_session=train_session, test_session=test_session, N=10
    )

    negative_data = pd.concat(
        [
            topN_popular_yad_df,
            popular_topN_yad_per_wid_cd,
            popular_topN_yad_per_ken_cd,
            popular_topN_yad_per_lrg_cd,
            popular_topN_yad_per_sml_cd,
            popular_next_topN_yad,
        ],
        ignore_index=True,
    )

    # ランダムに付け加えたもの以外・同一ログに出現する宿を候補にいれる
    no_dup_train_log = test_log[["session_id", "yad_no"]].drop_duplicates()
    out_df = pd.concat([no_dup_train_log, negative_data], ignore_index=True)

    # testデータの一番最後に存在するログは必ず正解ではないため、除外する
    out_df = remove_last_yad_id(log_df=test_log, out_df=out_df)

    # 重複を省く
    out_df = out_df.drop_duplicates(subset=["session_id", "yad_no"], keep="first")

    # 見た目を揃えるために session / yad の順番でソートをします
    out_df = out_df.sort_values(["session_id", "yad_no"]).reset_index(drop=True)

    # 必要な列に限定
    out_df = out_df[["session_id", "yad_no"]]

    return out_df

In [67]:
test_out = create_session_test_yad_df(
    test_log=test_log, test_session=test_session, train_session=train_session
)

In [68]:
print(train_out.shape)
print(test_out.shape)

(10436811, 3)
(6337684, 2)


In [69]:
train_out.to_pickle(f"{OUTPUT_DIR}/{NOTEBOOK_NAME}_merged_train.pkl")
test_out.to_pickle(f"{OUTPUT_DIR}/{NOTEBOOK_NAME}_merged_test.pkl")