In [6]:
# Experiment identifier; Config.OUTPUT_DIR below is derived from it.
NOTEBOOK_NAME = "e033_make_feat_kyouki"

In [7]:
import os
import pandas as pd
import pickle
from tqdm.auto import tqdm
from sklearn.decomposition import TruncatedSVD

In [8]:
class Config:
    """Notebook-wide settings for this feature-building notebook."""
    # Directory that receives the pickled feature frames written at the end.
    OUTPUT_DIR = f"../saved_data/{NOTEBOOK_NAME}"
    # Seed passed to TruncatedSVD as random_state.
    SEED = 33
    # Not referenced in the visible cells; presumably the target column name — TODO confirm.
    TARGET_COL = "reserve"


# Create the output directory up front; exist_ok=True keeps the cell re-runnable.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

In [9]:
import os
import pandas as pd
import pickle
from tqdm.auto import tqdm
from sklearn.decomposition import TruncatedSVD

In [10]:
# Raw inputs: the session log and the labels.
# The functions below read `session_id` and `yad_no` from both frames.
train_log = pd.read_csv("../data/train_log.csv")
train_label = pd.read_csv("../data/train_label.csv")

In [11]:
# As of e016, folds are used when creating the data, so the fold assignment
# saved by that notebook is loaded here (used below to map session_id -> fold).
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts
# produced by this project.
with open(
    "../saved_data/e016_make_train_popular_base/session_id_fold_dict.pkl", "rb"
) as f:
    session_id_fold_dict = pickle.load(f)

In [12]:
# Attach the fold id to every labelled session (adds a column to train_label in place).
# Series.map with a dict yields NaN for session_ids missing from the dict.
train_label["fold"] = train_label["session_id"].map(session_id_fold_dict)

In [13]:
def make_kyouki_df(
    train_log: pd.DataFrame,
    train_label: pd.DataFrame,
    n_folds: int = 5,
    n_components: int = 100,
) -> pd.DataFrame:
    """Build out-of-fold co-occurrence ("kyouki") embeddings per last-viewed yad.

    For every fold ``i`` a count matrix (rows: last ``yad_no`` of the session
    log, columns: labelled ``yad_no``) is built from the labels of all *other*
    folds and reduced with TruncatedSVD, so the rows tagged ``fold == i`` never
    use fold ``i``'s own labels.

    Neither input frame is modified.

    NOTE(review): keep the matrix construction in sync with
    ``make_kyouki_df_test``; counting a different set of columns changes the
    scale of the resulting embeddings.

    Args:
        train_log: Session log with ``session_id`` and ``yad_no`` columns; the
            last row of each session defines ``latest_yad_no``.
        train_label: Labels with ``session_id``, ``yad_no`` and ``fold`` columns.
        n_folds: Number of folds; fold ids are assumed to be ``0..n_folds-1``.
        n_components: Output dimensionality of the SVD.

    Returns:
        Long DataFrame with columns ``latest_yad_no``, ``fold`` and
        ``kyouki_arr_reduced_<k>``; one row per (fold, latest_yad_no) that
        appears in that fold's training part.
    """
    # Last logged yad per session: keep the final row of each session_id.
    # Vectorized replacement for groupby(...).apply(lambda x: list(x)[-1]).
    latest_yad = train_log.drop_duplicates(subset="session_id", keep="last")[
        ["session_id", "yad_no"]
    ].rename(columns={"yad_no": "latest_yad_no"})

    # merge returns a new frame, so no defensive copies of the inputs are needed.
    labelled = train_label.merge(latest_yad, on="session_id", how="left")

    fold_frames = []
    for fold in tqdm(range(n_folds)):
        other_folds = labelled[labelled["fold"] != fold]

        # Dense (very sparse in content) co-occurrence count matrix.
        # `values` is given explicitly: without it pivot_table counts every
        # remaining column (session_id AND fold), which doubles the matrix
        # width with duplicated counts.
        cooccurrence = pd.pivot_table(
            other_folds,
            index="latest_yad_no",
            columns="yad_no",
            values="session_id",
            aggfunc="count",
        ).fillna(0)  # pairs that never occur count as 0

        # Reduce the count matrix to n_components dimensions.
        svd = TruncatedSVD(n_components=n_components, random_state=Config.SEED)
        reduced = svd.fit_transform(cooccurrence.to_numpy())

        # Re-attach the latest_yad_no index to the reduced rows.
        fold_frame = pd.DataFrame(reduced, index=cooccurrence.index).add_prefix(
            "kyouki_arr_reduced_"
        )
        fold_frame.insert(0, "fold", fold)
        fold_frames.append(fold_frame)

    return pd.concat(fold_frames).reset_index()


# Per-fold embeddings for training; train_label must already carry the "fold" column.
kyouki_df_train = make_kyouki_df(train_log, train_label)

  0%|          | 0/5 [00:00<?, ?it/s]

In [20]:
def make_kyouki_df_test(
    train_log: pd.DataFrame,
    train_label: pd.DataFrame,
    n_components: int = 100,
) -> pd.DataFrame:
    """Build co-occurrence ("kyouki") embeddings from all training labels.

    Counterpart of ``make_kyouki_df`` without any fold hold-out: one count
    matrix (rows: last ``yad_no`` of the session log, columns: labelled
    ``yad_no``) is built from every label and reduced with TruncatedSVD.
    Presumably the result is joined onto test sessions — confirm with the
    consuming notebook.

    Neither input frame is modified.

    NOTE(review): keep the matrix construction in sync with ``make_kyouki_df``;
    counting a different set of columns changes the scale of the resulting
    embeddings.

    Args:
        train_log: Session log with ``session_id`` and ``yad_no`` columns; the
            last row of each session defines ``latest_yad_no``.
        train_label: Labels with ``session_id`` and ``yad_no`` columns. Extra
            columns (e.g. ``fold``) are ignored.
        n_components: Output dimensionality of the SVD.

    Returns:
        DataFrame with columns ``latest_yad_no`` and ``kyouki_arr_reduced_<k>``;
        one row per distinct ``latest_yad_no``.
    """
    # Last logged yad per session: keep the final row of each session_id.
    # Vectorized replacement for groupby(...).apply(lambda x: list(x)[-1]).
    latest_yad = train_log.drop_duplicates(subset="session_id", keep="last")[
        ["session_id", "yad_no"]
    ].rename(columns={"yad_no": "latest_yad_no"})

    # merge returns a new frame, so no defensive copies of the inputs are needed.
    labelled = train_label.merge(latest_yad, on="session_id", how="left")

    # Dense (very sparse in content) co-occurrence count matrix.
    # `values` is given explicitly: without it pivot_table counts every
    # remaining column, so the matrix width would depend on whether the caller's
    # frame happens to carry extra columns such as "fold".
    cooccurrence = pd.pivot_table(
        labelled,
        index="latest_yad_no",
        columns="yad_no",
        values="session_id",
        aggfunc="count",
    ).fillna(0)  # pairs that never occur count as 0

    # Reduce the count matrix to n_components dimensions.
    svd = TruncatedSVD(n_components=n_components, random_state=Config.SEED)
    reduced = svd.fit_transform(cooccurrence.to_numpy())

    # Re-attach the latest_yad_no index to the reduced rows.
    kyouki_df = pd.DataFrame(reduced, index=cooccurrence.index).add_prefix(
        "kyouki_arr_reduced_"
    )
    return kyouki_df.reset_index()


# Embeddings fitted on all training labels (no fold hold-out).
kyouki_df_test = make_kyouki_df_test(train_log, train_label)

In [23]:
# Persist both feature frames under this notebook's output directory.
kyouki_df_train.to_pickle(f"{Config.OUTPUT_DIR}/kyouki_df_train.pkl")
kyouki_df_test.to_pickle(f"{Config.OUTPUT_DIR}/kyouki_df_test.pkl")