In [1]:
import numpy as np
import pandas as pd
import scipy
from sklearn.model_selection import train_test_split
from irspack import (
    df_to_sparse,
    IALSRecommender,
    Evaluator,
    ItemIDMapper
)
from tqdm.auto import tqdm

In [2]:
def union(x, y):
    """Combine two same-shaped sparse matrices as ``x + y - x.multiply(y)``.

    When every stored entry of ``x`` and ``y`` is 0 or 1, the result is the
    element-wise logical OR: an entry is 1 if it is set in either input and
    entries set in both are not double-counted.

    Parameters
    ----------
    x, y : sparse matrices supporting ``+``, ``-`` and ``.multiply``
        Must have the same shape.

    Returns
    -------
    The sparse matrix ``x + y - x.multiply(y)``.
    """
    # x.multiply(y) is the element-wise product, i.e. the overlap of the two inputs.
    return x + y - x.multiply(y)

In [3]:
# Directory the input CSV files are read from (relative path, resolved against
# the current working directory of the kernel).
DATA_DIR = "../data"

In [5]:
def _load_csv(filename):
    """Read one CSV file located directly under ``DATA_DIR`` into a DataFrame."""
    return pd.read_csv(f'{DATA_DIR}/{filename}')


# Logs and labels for the sessions that will be split into train / validation.
trainvalid_log = _load_csv('train_log.csv')
trainvalid_label = _load_csv('train_label.csv')

# Test-side inputs.
test_log = _load_csv('test_log.csv')
test_session = _load_csv('test_session.csv')

# Item table (provides the `yad_no` ids) and the submission template.
yado = _load_csv('yado.csv')
sample_submission = _load_csv('sample_submission.csv')

In [6]:
def _distinct_values(frame, column):
    """Return the distinct values of ``frame[column]`` as a list, in first-seen order."""
    return frame[column].drop_duplicates().to_list()


# List of sessions (one entry per distinct session_id in each log)
trainvalid_user_ids = _distinct_values(trainvalid_log, 'session_id')
test_user_ids = _distinct_values(test_log, 'session_id')

# List of accommodations (one entry per distinct yad_no)
item_ids = _distinct_values(yado, 'yad_no')

In [7]:
# Split at the session level (20% held out for validation) so that every row of
# a session lands on the same side; the fixed random_state makes the split repeatable.
train_user_ids, valid_user_ids = train_test_split(
    trainvalid_user_ids,
    test_size=0.2,
    random_state=0,
)


def _rows_for_sessions(frame, session_ids):
    """Return an independent copy of the rows of ``frame`` whose session_id is in ``session_ids``."""
    in_split = frame['session_id'].isin(session_ids)
    return frame.loc[in_split, :].copy()


train_log = _rows_for_sessions(trainvalid_log, train_user_ids)
valid_log = _rows_for_sessions(trainvalid_log, valid_user_ids)
train_label = _rows_for_sessions(trainvalid_label, train_user_ids)
valid_label = _rows_for_sessions(trainvalid_label, valid_user_ids)

In [8]:
def log_to_matrix(log, label, user_ids, item_ids):
    """Convert a session log (and optional labels) into three sparse matrices.

    Every matrix is produced by ``df_to_sparse`` with ``session_id`` as the user
    column, ``yad_no`` as the item column, and the given ``user_ids`` /
    ``item_ids``. Rows and columns are expected to follow the order of
    ``user_ids`` and ``item_ids`` respectively (per irspack -- confirm).

    Parameters
    ----------
    log : DataFrame with columns ``session_id``, ``seq_no`` and ``yad_no``.
    label : DataFrame with columns ``session_id`` and ``yad_no``, or ``None``.
    user_ids : session ids handed to ``df_to_sparse`` as ``user_ids``.
    item_ids : item ids handed to ``df_to_sparse`` as ``item_ids``.

    Returns
    -------
    (matrix_x, matrix_y, matrix_mask)
        ``matrix_x``    -- items that appeared in each session's log.
        ``matrix_y``    -- booked items from ``label``; ``None`` when ``label`` is ``None``.
        ``matrix_mask`` -- items excluded from recommendation: the item on the
                           row(s) carrying each session's largest ``seq_no``.
    """

    def _to_matrix(frame):
        # Only the matrix is kept; the two id lists df_to_sparse also returns are discarded.
        matrix, _, _ = df_to_sparse(
            df=frame,
            user_column='session_id',
            item_column='yad_no',
            user_ids=user_ids,
            item_ids=item_ids,
        )
        return matrix

    # Accommodations that appeared (each session/item pair counted once)
    seen_pairs = log[['session_id', 'yad_no']].drop_duplicates()
    matrix_x = _to_matrix(seen_pairs)

    # Accommodations that were booked
    matrix_y = None if label is None else _to_matrix(label)

    # Accommodations excluded from recommendation (the last one to appear in each session)
    session_last_seq_no = log.groupby('session_id')['seq_no'].transform('max')
    last_rows = log.loc[log['seq_no'] == session_last_seq_no, :]
    matrix_mask = _to_matrix(last_rows)

    return matrix_x, matrix_y, matrix_mask

In [9]:
# Build the (appeared, booked, mask) matrices for each split against the shared item list.
(train_matrix_x, train_matrix_y, train_matrix_mask) = log_to_matrix(
    train_log, train_label, train_user_ids, item_ids
)
(valid_matrix_x, valid_matrix_y, valid_matrix_mask) = log_to_matrix(
    valid_log, valid_label, valid_user_ids, item_ids
)
# No labels are passed for the test split, so test_matrix_y is None.
(test_matrix_x, test_matrix_y, test_matrix_mask) = log_to_matrix(
    test_log, None, test_user_ids, item_ids
)

In [10]:
def get_user_ids_seq_no_max_over_1_index(log, user_ids):
    """Return the positions in ``user_ids`` of sessions whose largest ``seq_no`` exceeds 0.

    Parameters
    ----------
    log : DataFrame with columns ``session_id`` and ``seq_no``.
    user_ids : sequence of session ids; returned positions index into it.

    Returns
    -------
    list of int
        Ascending 0-based positions ``i`` such that ``user_ids[i]`` has a maximum
        ``seq_no`` greater than 0 in ``log``. Ids absent from ``log`` are skipped.
    """
    max_seq_no_by_session = log.groupby('session_id')['seq_no'].max()
    multi_step_sessions = set(
        max_seq_no_by_session[max_seq_no_by_session > 0].index.to_list()
    )

    # Set membership keeps the scan over user_ids linear.
    positions = []
    for position, session_id in enumerate(user_ids):
        if session_id in multi_step_sessions:
            positions.append(position)
    return positions

In [11]:
# Row positions (within test_user_ids) of the test sessions whose largest seq_no is > 0.
# The equivalent train / validation indices are intentionally not computed here.
test_user_ids_seq_no_max_over_1_index = get_user_ids_seq_no_max_over_1_index(
    test_log, test_user_ids
)

In [12]:
# Evaluator that scores a recommender against the validation booking matrix
# (valid_matrix_y), with 'map' as the target metric at cutoff 10.
# valid_matrix_mask is supplied as masked_interactions; presumably this keeps each
# session's last-seen item out of the scored recommendations -- confirm against irspack docs.
valid_evaluator = Evaluator(
    valid_matrix_y,
    target_metric='map',
    cutoff=10,
    masked_interactions=valid_matrix_mask
)

In [13]:
# Result of hyperparameter tuning (the search was interrupted by running out of
# memory, so these values may not be optimal...).
# Unpacked as keyword arguments into IALSRecommender.
best_params = {
    'n_components': 1600,
    'alpha0': 0.001297421599991797,
    'reg': 0.010800184775061342,
    'train_epochs': 3
}

In [14]:
# Training matrix for validation: the validation sessions' appeared items come first,
# followed by the train sessions' appeared-or-booked items. Validation bookings
# (valid_matrix_y) are left out because they are what the evaluator scores against.
# NOTE(review): validation rows are stacked first, presumably so that recommender row i
# lines up with evaluator row i -- confirm.
valid_fit_matrix = scipy.sparse.vstack([
    valid_matrix_x,
    union(train_matrix_x, train_matrix_y),
])
valid_recommender = IALSRecommender(
    valid_fit_matrix,
    **best_params,
    random_seed=0,
).learn()

# Last expression of the cell: displays the validation metrics.
valid_evaluator.get_score(valid_recommender)

{'appeared_item': 11471.0,
 'entropy': 8.616186750611389,
 'gini_index': 0.695784365276243,
 'hit': 0.5771908555594042,
 'map': 0.38353862552987933,
 'n_items': 13806.0,
 'ndcg': 0.42880331398477156,
 'precision': 0.05771908555594042,
 'recall': 0.5771908555594042,
 'total_user': 57740.0,
 'valid_user': 57740.0}

In [15]:
# Number of EXTRA copies of the selected test rows appended to the training matrix.
# Together with the single copy already contained in test_matrix_x this makes those
# sessions appear 10 times in total.
TEST_EXTRA_COPIES = 9

# Test sessions whose largest seq_no is > 0; sliced once and reused for every copy.
test_matrix_x_repeated_rows = test_matrix_x[test_user_ids_seq_no_max_over_1_index, :]

test_recommender = IALSRecommender(
    scipy.sparse.vstack(
        [
            # Test sessions come first in the stacked matrix.
            test_matrix_x,
            union(train_matrix_x, train_matrix_y),
            union(valid_matrix_x, valid_matrix_y),
        ]
        # As a countermeasure against the shift between training and test data,
        # the test data is made 10x.
        + [test_matrix_x_repeated_rows] * TEST_EXTRA_COPIES
    ),
    **best_params,
    random_seed=0
).learn()

In [None]:
def get_candidates(
    recommender,
    offset,
    matrix_mask,
    user_ids,
    item_ids,
    num_candidates,
    batch_size=10000,
):
    """Produce the top-``num_candidates`` items per user as a long-format DataFrame.

    Users are processed in blocks of ``batch_size``. For each block the scores are
    fetched with ``recommender.get_score_block``, every (user, item) position that is
    non-zero in ``matrix_mask`` is set to ``-inf``, and the block is passed to
    ``ItemIDMapper.score_to_recommended_items_batch``.

    Parameters
    ----------
    recommender : object exposing ``get_score_block(begin, end)``.
        The returned block is assumed to be a dense 2-D array with one row per
        user and one column per entry of ``item_ids`` -- confirm against irspack.
    offset : int
        Added to both block boundaries before querying ``recommender``; i.e. the
        recommender row that corresponds to ``user_ids[0]``.
    matrix_mask : sparse matrix
        Row ``i`` corresponds to ``user_ids[i]``; non-zero entries are excluded.
    user_ids : sequence
        Ids written to the ``session_id`` column.
    item_ids : sequence
        Ids handed to ``ItemIDMapper``.
    num_candidates : int
        Passed as ``cutoff`` to ``score_to_recommended_items_batch``.
    batch_size : int, default 10000
        Number of users scored per block.

    Returns
    -------
    pandas.DataFrame
        Columns ``session_id``, ``rank`` (1-based), ``yad_no``, ``score``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    id_mapper = ItemIDMapper(item_ids)
    n_users = len(user_ids)

    candidates = []
    for begin in tqdm(range(0, n_users, batch_size)):
        end = min(begin + batch_size, n_users)
        score = recommender.get_score_block(begin + offset, end + offset)
        # Masked items can never be recommended: push their scores to -inf.
        score[matrix_mask[begin:end, :].nonzero()] = -np.inf
        candidates += id_mapper.score_to_recommended_items_batch(score, cutoff=num_candidates)

    # Flatten the per-user candidate lists into one record per (user, item).
    records = [
        (user_ids[user_index], rank + 1, yad_no, item_score)
        for user_index, candidates_per_user in enumerate(candidates)
        for rank, (yad_no, item_score) in enumerate(candidates_per_user)
    ]
    return pd.DataFrame(records, columns=['session_id', 'rank', 'yad_no', 'score'])