In [32]:
NOTEBOOK_NAME = "e018_len_0_lgbm_1_rulebase"

In [33]:
# 必要ライブラリのimportとデータの読み込み
import pandas as pd
from collections import defaultdict
from heapq import heappush, heappop

yado = pd.read_csv("../data/yado.csv", dtype={"yad_no": int})  # 今回は使いません
train_log = pd.read_csv(
    "../data/train_log.csv", dtype={"session_id": str, "seq_no": int, "yad_no": int}
)
train_label = pd.read_csv(
    "../data/train_label.csv", dtype={"session_id": str, "yad_no": int}
)
test_log = pd.read_csv(
    "../data/test_log.csv", dtype={"session_id": str, "seq_no": int, "yad_no": int}
)
test_session = pd.read_csv("../data/test_session.csv", dtype={"session_id": str})

In [34]:
# データの加工

# train_logで実際に予約した宿をひけるようにしておく
map_reserved = defaultdict(int)
for idx, rec in train_label.iterrows():
    session_id, yad_no_reserved = rec
    map_reserved[session_id] = yad_no_reserved


# 縦持ちのセッションログを、session_id : [閲覧したyad_noのリスト] のdictに変換
def Make_session_list(session_log):
    map_session_yads = defaultdict(list)
    for _, row in session_log.iterrows():
        session_id = row[0]
        yad_no = row[2]
        map_session_yads[session_id].append(yad_no)
    return map_session_yads


map_session_yads_train = Make_session_list(train_log)
map_session_yads_test = Make_session_list(test_log)

# D[v][r]:= 「最後に宿vを閲覧して、宿rを予約した」セッションの件数
D = defaultdict(lambda: defaultdict(int))
for session_id, viewed_yad_no in map_session_yads_train.items():
    last_viewed = viewed_yad_no[-1]
    reserved = map_reserved[session_id]
    D[last_viewed][reserved] += 1

In [35]:
# test_logを用いて予測値を出力

# 1. 1件の宿vしか閲覧していないログの場合、D[v][r]の大きい順にrを10件出力する.
# 2. 2件以上閲覧しているログの場合、最後から2番目に閲覧している宿を1位とする. その後、2位以降について1.と同様に出力する.
test_session_number = len(test_session)
Predicted_List = [[0] * 10 for _ in range(test_session_number)]
for idx, session_id in enumerate(test_session["session_id"]):
    viewed_number = len(map_session_yads_test[session_id])
    last_viewed = map_session_yads_test[session_id][-1]
    rank = 0

    if viewed_number > 1:
        Predicted_List[idx][rank] = map_session_yads_test[session_id][-2]
        rank += 1

    sorted_yad_list = []
    for yad_no, viewed_cnt in D[last_viewed].items():
        heappush(sorted_yad_list, (-viewed_cnt, yad_no))

    while rank < 10 and sorted_yad_list:
        _, predicted_yad_no = heappop(sorted_yad_list)
        Predicted_List[idx][rank] = predicted_yad_no
        rank += 1

df_rulebase = pd.DataFrame(
    Predicted_List,
    columns=[
        "predict_0",
        "predict_1",
        "predict_2",
        "predict_3",
        "predict_4",
        "predict_5",
        "predict_6",
        "predict_7",
        "predict_8",
        "predict_9",
    ],
)

In [36]:
df_ml = pd.read_csv(
    "../sub/e017_add_same_area_flg_and_log_feat_make_model_auc0.9236_mapk0.5595.csv"
)

In [37]:
# test_sessionに対し、session_lengthを付与する
test_session_length = test_log.groupby("session_id")["yad_no"].count()
# test_session["session_length"] = (
#     test_session["session_id"].map(map_session_yads_test).map(len)
# )

In [39]:
test_session = test_session.merge(
    test_session_length.rename("session_length"), on="session_id", how="left"
)

In [41]:
# test_sessionのsession_lengthが1のデータについては、df_mlを、それ以外のデータについてはdf_rulebaseを使う
is_rulebase = test_session["session_length"] > 1
is_ml = test_session["session_length"] == 1

test_session.loc[is_rulebase, df_rulebase.columns] = df_rulebase.loc[is_rulebase]
test_session.loc[is_ml, df_ml.columns] = df_ml.loc[is_ml]

In [46]:
sub_df = test_session[df_rulebase.columns].astype(int)

In [47]:
sub_df.to_csv(f"../sub/{NOTEBOOK_NAME}_1_ml_over_2_rulebase.csv", index=False)

In [31]:
test_session

Unnamed: 0,session_id,yad_no,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,00001149e9c73985425197104712478c,0,3560.0,4714.0,2680.0,4420.0,11561.0,5466.0,9830.0,2040.0,2305.0,2811.0
1,0000e02747d749a52b7736dfa751e258,0,,,,,,,,,,
2,0000f17ae2628237d78d3a38b009d3be,0,757.0,757.0,7710.0,9190.0,10485.0,410.0,1774.0,3400.0,6721.0,6730.0
3,000174a6f7a569b84c5575760d2e9664,0,12341.0,3359.0,12341.0,5080.0,6991.0,10746.0,13521.0,277.0,1542.0,2047.0
4,00017e2a527901c9c41b1acef525d016,0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
174695,fffee3199ef94b92283239cd5e3534fa,0,1997.0,1997.0,2278.0,5744.0,7062.0,7888.0,9543.0,9743.0,10997.0,11123.0
174696,ffff62c6bb49bc9c0fbcf08494a4869c,0,,,,,,,,,,
174697,ffff9a7dcc892875c7a8b821fa436228,0,,,,,,,,,,
174698,ffffb1d30300fe17f661941fd085b04b,0,,,,,,,,,,


In [30]:
test_session.isnull().sum()

session_id         0
yad_no             0
predict_0     113940
predict_1     113940
predict_2     113940
predict_3     113940
predict_4     113940
predict_5     113940
predict_6     113940
predict_7     113940
predict_8     113940
predict_9     113940
dtype: int64