In [16]:
NOTEBOOK_NAME = "e019_use_ml_at_0"

In [17]:
# 必要ライブラリのimportとデータの読み込み
import pandas as pd
from collections import defaultdict
from heapq import heappush, heappop

yado = pd.read_csv("../data/yado.csv", dtype={"yad_no": int})  # 今回は使いません
train_log = pd.read_csv(
    "../data/train_log.csv", dtype={"session_id": str, "seq_no": int, "yad_no": int}
)
train_label = pd.read_csv(
    "../data/train_label.csv", dtype={"session_id": str, "yad_no": int}
)
test_log = pd.read_csv(
    "../data/test_log.csv", dtype={"session_id": str, "seq_no": int, "yad_no": int}
)
test_session = pd.read_csv("../data/test_session.csv", dtype={"session_id": str})

In [18]:
# データの加工

# train_logで実際に予約した宿をひけるようにしておく
map_reserved = defaultdict(int)
for idx, rec in train_label.iterrows():
    session_id, yad_no_reserved = rec
    map_reserved[session_id] = yad_no_reserved


# 縦持ちのセッションログを、session_id : [閲覧したyad_noのリスト] のdictに変換
def Make_session_list(session_log):
    map_session_yads = defaultdict(list)
    for _, row in session_log.iterrows():
        session_id = row[0]
        yad_no = row[2]
        map_session_yads[session_id].append(yad_no)
    return map_session_yads


map_session_yads_train = Make_session_list(train_log)
map_session_yads_test = Make_session_list(test_log)

# D[v][r]:= 「最後に宿vを閲覧して、宿rを予約した」セッションの件数
D = defaultdict(lambda: defaultdict(int))
for session_id, viewed_yad_no in map_session_yads_train.items():
    last_viewed = viewed_yad_no[-1]
    reserved = map_reserved[session_id]
    D[last_viewed][reserved] += 1

In [19]:
# test_logを用いて予測値を出力

# 1. 1件の宿vしか閲覧していないログの場合、D[v][r]の大きい順にrを10件出力する.
# 2. 2件以上閲覧しているログの場合、最後から2番目に閲覧している宿を1位とする. その後、2位以降について1.と同様に出力する.
test_session_number = len(test_session)
Predicted_List = [[0] * 10 for _ in range(test_session_number)]
for idx, session_id in enumerate(test_session["session_id"]):
    viewed_number = len(map_session_yads_test[session_id])
    last_viewed = map_session_yads_test[session_id][-1]
    rank = 0

    if viewed_number > 1:
        Predicted_List[idx][rank] = map_session_yads_test[session_id][-2]
        rank += 1

    sorted_yad_list = []
    for yad_no, viewed_cnt in D[last_viewed].items():
        heappush(sorted_yad_list, (-viewed_cnt, yad_no))

    while rank < 10 and sorted_yad_list:
        _, predicted_yad_no = heappop(sorted_yad_list)
        Predicted_List[idx][rank] = predicted_yad_no
        rank += 1

df_rulebase = pd.DataFrame(
    Predicted_List,
    columns=[
        "predict_0",
        "predict_1",
        "predict_2",
        "predict_3",
        "predict_4",
        "predict_5",
        "predict_6",
        "predict_7",
        "predict_8",
        "predict_9",
    ],
)

In [20]:
df_ml = pd.read_csv(
    "../sub/e017_add_same_area_flg_and_log_feat_make_model_auc0.9236_mapk0.5595.csv"
)

In [21]:
# test_sessionに対し、session_lengthを付与する
test_session_length = test_log.groupby("session_id")["yad_no"].count()
# test_session["session_length"] = (
#     test_session["session_id"].map(map_session_yads_test).map(len)
# )

In [22]:
test_session = test_session.merge(
    test_session_length.rename("session_length"), on="session_id", how="left"
)

In [25]:
df_rulebase.sum()

predict_0    1024532055
predict_1     935716711
predict_2     941523300
predict_3     933846258
predict_4     921571477
predict_5     909302571
predict_6     896081599
predict_7     879809327
predict_8     866316035
predict_9     852308474
dtype: int64

In [23]:
sub_df = df_rulebase.copy()
sub_df[(sub_df == 0)] = df_ml[(sub_df == 0)]

In [28]:
sub_df.to_csv(f"../sub/{NOTEBOOK_NAME}_rulebase_0_ml.csv", index=False)