# Anime2Vec(Explicit/Implicit Feedback) + LightGBM

[前回のnotebook](https://www.guruguru.science/competitions/21/discussions/57f5ea4e-69ad-439d-bbe3-887240cf5cf2/)ではレーティングの情報を活用する場合とそうでない場合を比較しましたが、今回はそれらを組み合わせることを考えます。

[こちらのdiscussion](https://www.guruguru.science/competitions/21/discussions/d0e9e563-0910-46b9-a562-441b9c2bb843/)で紹介されているように、推薦システムにおけるユーザーからのフィードバックはExplicit FeedbackとImplicit Feedbackに分けることができます。  
今回のAnime2Vecにおいて考えると、ユーザーレーティングを使用した場合はExplicit Feedbackをモデル化し、使用しない場合はある意味でImplicit Feedbackをモデル化していると考えることができます。  

また、今回は`test.csv`が丸ごと与えられ、新規ユーザー（コールドユーザー）に対してもまとまった視聴情報、つまりImplicit Feedbackを得られているやや特殊な状況とも言えます。  
折角なのでこれを活用する方法を考えたいところです。  

## Importing necessary libraries

In [7]:
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, StratifiedKFold
from gensim.models import word2vec

import time
from contextlib import contextmanager
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import os

## Utility functions

In [8]:
# Global random seed: passed to seed_everything, Word2Vec and StratifiedKFold below.
SEED = 0

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    Args:
        name: Label shown in the printed message.

    The message ``[name] done in N s`` is printed when the block exits.
    The print sits in a ``finally`` clause so that the elapsed time is
    still reported when the wrapped block raises; the exception then
    propagates unchanged.
    """
    t0 = time.time()
    try:
        yield
    finally:
        print(f'[{name}] done in {time.time() - t0:.0f} s')

def seed_everything(seed: int):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    

# Seed the RNGs once, right after the helpers are defined, so the cells below start from a fixed state.
seed_everything(SEED)

### Word2Vecによる特徴量エンジニアリング(Explicit Feedback)

こちらは[前回のnotebook](https://www.guruguru.science/competitions/21/discussions/57f5ea4e-69ad-439d-bbe3-887240cf5cf2/)と同様に、ユーザーのレーティングを考慮し、スコアの値と同じ回数だけアニメをリストに追加します。

In [9]:
def add_w2v_features_with_score(train_df, val_df, test_df=None):
    """Attach rating-weighted Word2Vec user/item embeddings to the frames.

    A Word2Vec model is trained on ``train_df`` only. Each user contributes
    one "sentence" in which every anime appears ``score`` times, plus a
    shuffled copy of that sentence. Item vectors are the learned anime
    vectors; a user vector is the mean of the vectors of the anime that user
    has in ``train_df``.

    Args:
        train_df: Frame with ``user_id``, ``anime_id`` and ``score`` columns;
            the only frame used to fit the embeddings.
        val_df: Frame with ``user_id`` and ``anime_id`` columns to featurise.
        test_df: Optional frame with ``user_id`` and ``anime_id`` columns.

    Returns:
        ``(train_df, val_df)`` or, when ``test_df`` is given,
        ``(train_df, val_df, test_df)``. Each frame is left-merged with
        ``user_factor_0..63`` and ``item_factor_0..63`` columns, so users or
        anime that do not occur in ``train_df`` get NaN features.
    """
    embedding_dim = 64

    known_anime = train_df['anime_id'].unique().tolist()
    watched_by_user = {}
    for uid, watched in train_df.groupby('user_id')['anime_id']:
        watched_by_user[uid] = watched.tolist()

    # Score-aware sentences: ratings are 1-10, so an anime scored 5 is
    # appended 5 times and an anime scored 10 is appended 10 times.
    weighted_sentences = []
    for _, ratings in train_df.groupby('user_id'):
        sentence = []
        for anime_id, anime_score in ratings[['anime_id', 'score']].values:
            sentence.extend(anime_id for _ in range(anime_score))
        weighted_sentences.append(sentence)

    # One shuffled copy of every user's sentence.
    shuffled_sentences = []
    for sentence in weighted_sentences:
        shuffled_sentences.append(random.sample(sentence, len(sentence)))

    # Training corpus = original sentences followed by the shuffled ones.
    corpus = weighted_sentences + shuffled_sentences

    # Train Word2Vec (single worker, fixed seed).
    model = word2vec.Word2Vec(
        corpus,
        vector_size=embedding_dim,
        seed=SEED,
        min_count=1,
        workers=1,
    )
    vectors = model.wv

    # User vector = mean of the vectors of the anime the user has in train_df.
    user_vectors = {}
    for uid, watched in watched_by_user.items():
        user_vectors[uid] = np.mean([vectors[anime_id] for anime_id in watched], axis=0)

    # Item vector = the learned vector of each anime seen in train_df.
    item_vectors = {anime_id: vectors[anime_id] for anime_id in known_anime}

    # One row per key: first column is the id, the rest are the embedding dimensions.
    user_factors_df = pd.DataFrame(user_vectors).T.reset_index()
    user_factors_df.columns = ["user_id"] + [f"user_factor_{i}" for i in range(embedding_dim)]
    item_factors_df = pd.DataFrame(item_vectors).T.reset_index()
    item_factors_df.columns = ["anime_id"] + [f"item_factor_{i}" for i in range(embedding_dim)]

    def attach(frame):
        # Left joins keep every row; unknown users/anime end up with NaN features.
        with_user = frame.merge(user_factors_df, on="user_id", how="left")
        return with_user.merge(item_factors_df, on="anime_id", how="left")

    train_df = attach(train_df)
    val_df = attach(val_df)

    if test_df is None:
        return train_df, val_df

    test_df = attach(test_df)
    return train_df, val_df, test_df

### Word2Vecによる特徴量エンジニアリング(Implicit Feedback)

今回は`test.csv`が丸ごと与えられ、新規ユーザーに対しても視聴情報が得られる状況なのでこれを活用します。  
とは言っても、単にtrain/testを先に結合してからスコアの情報を使用せずにWord2Vecの学習を行うだけです。

In [10]:
def add_w2v_features_without_score(train_test_df, include_item_factors=False):
    """Attach score-free Word2Vec embeddings learned from train and test rows.

    Every user's list of anime (taken from the combined train/test frame, with
    no use of the ``score`` column) is one Word2Vec "sentence"; a shuffled copy
    of each sentence is added to the corpus. A user vector is the mean of the
    vectors of the anime that user has in ``train_test_df``.

    Args:
        train_test_df: Frame with ``user_id`` and ``anime_id`` columns.
        include_item_factors: When True, also merge the per-anime vectors as
            ``wo_score_item_factor_0..63``. Defaults to False, which keeps the
            previous output (user factors only); the earlier version computed
            the item factors but never merged them.

    Returns:
        ``train_test_df`` left-merged with ``wo_score_user_factor_0..63`` and,
        if requested, ``wo_score_item_factor_0..63``.
    """
    anime_ids = train_test_df['anime_id'].unique().tolist()
    user_anime_list_dict = {
        user_id: user_anime_ids.tolist()
        for user_id, user_anime_ids in train_test_df.groupby('user_id')['anime_id']
    }

    # One sentence per user, in groupby order; reuses the lists built above
    # instead of grouping the frame a second time.
    title_sentence_list = list(user_anime_list_dict.values())

    # One shuffled copy of every user's sentence.
    shuffled_sentence_list = [random.sample(sentence, len(sentence)) for sentence in title_sentence_list]

    # Training corpus = original sentences followed by the shuffled ones.
    train_sentence_list = title_sentence_list + shuffled_sentence_list
    # Corpus size (two sentences per user), printed as a quick sanity check.
    print(len(train_sentence_list))

    # Word2Vec parameters (single worker, fixed seed).
    vector_size = 64
    w2v_params = {
        "vector_size": vector_size,
        "seed": SEED,
        "min_count": 1,
        "workers": 1
    }

    # Train the Word2Vec model.
    model = word2vec.Word2Vec(train_sentence_list, **w2v_params)

    # User vector = mean of the vectors of the anime in the user's list.
    user_factors = {
        user_id: np.mean([model.wv[anime_id] for anime_id in user_anime_list], axis=0)
        for user_id, user_anime_list in user_anime_list_dict.items()
    }
    user_factors_df = pd.DataFrame(user_factors).T.reset_index()
    user_factors_df.columns = ["user_id"] + [f"wo_score_user_factor_{i}" for i in range(vector_size)]

    train_test_df = train_test_df.merge(user_factors_df, on="user_id", how="left")

    if include_item_factors:
        # Item vector = the learned vector of each anime; built only on request.
        item_factors = {aid: model.wv[aid] for aid in anime_ids}
        item_factors_df = pd.DataFrame(item_factors).T.reset_index()
        item_factors_df.columns = ["anime_id"] + [f"wo_score_item_factor_{i}" for i in range(vector_size)]
        train_test_df = train_test_df.merge(item_factors_df, on="anime_id", how="left")

    return train_test_df

# Training and prediction

### 学習に便利な関数

In [15]:
def load_data(data_dir='../../data'):
    """Load the train, test and sample-submission CSV files.

    Args:
        data_dir: Directory containing ``train.csv``, ``test.csv`` and
            ``sample_submission.csv``. Defaults to the location that was
            previously hard-coded.

    Returns:
        ``(train_df, test_df, submission_df)``. ``test_df`` gets a dummy
        ``score`` column of 0, and ``submission_df['score']`` is reset to 0.
    """
    train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    test_df['score'] = 0  # dummy value so train and test share the same columns

    # Initialize submission file
    submission_df = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
    submission_df['score'] = 0
    return train_df, test_df, submission_df

def stratified_and_group_kfold_split(train_df):
    """Assign a ``fold`` id to every row, mixing seen-user and unseen-user validation.

    A random 20% of the users are set aside as "unseen" users and split with
    ``GroupKFold`` on ``user_id``, so all rows of such a user share one fold.
    The rows of the remaining users are split with ``StratifiedKFold``
    stratified on ``user_id``.

    Approach taken from:
    https://www.guruguru.science/competitions/21/discussions/45ffc8a1-e37c-4b95-aac4-c4e338aa6a9b/

    Args:
        train_df: Frame with a ``user_id`` column. It is not modified; the
            earlier version left a helper ``unseen_user`` column on it.

    Returns:
        A new frame (seen-user rows first, then unseen-user rows, index reset)
        with an added ``fold`` column holding fold ids 0-4.
    """
    user_ids = train_df["user_id"]

    # Pick 20% of the users at random (uses the global `random` state).
    n_user = user_ids.nunique()
    unseen_users = random.sample(sorted(user_ids.unique()), k=n_user // 5)

    # Local boolean mask instead of a helper column, so the caller's frame stays untouched.
    is_unseen = user_ids.isin(unseen_users)
    unseen_df = train_df[is_unseen].reset_index(drop=True)
    seen_df = train_df[~is_unseen].reset_index(drop=True)

    # Remaining 80% of users: StratifiedKFold on user_id.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold_id, (_, valid_idx) in enumerate(skf.split(seen_df, seen_df["user_id"])):
        seen_df.loc[valid_idx, "fold"] = fold_id

    # Held-out 20% of users: GroupKFold keeps each user inside a single fold.
    gkf = GroupKFold(n_splits=5)
    unseen_df["fold"] = -1
    for fold_id, (_, valid_idx) in enumerate(gkf.split(unseen_df, unseen_df["user_id"], unseen_df["user_id"])):
        unseen_df.loc[valid_idx, "fold"] = fold_id

    # concat
    return pd.concat([seen_df, unseen_df], axis=0).reset_index(drop=True)


def train(train_df, original_test_df, submission_df, n_folds=5):
    """Run fold-wise LightGBM training, print RMSE scores and write the submission.

    For every fold the rating-weighted Word2Vec features are rebuilt from that
    fold's training rows only (``add_w2v_features_with_score``), a LightGBM
    regressor is fitted with early stopping, and its test predictions are
    averaged into ``submission_df['score']``. Validation rows are reported
    separately for users that also occur in the fold's training rows ("seen")
    and users that do not ("unseen").

    Args:
        train_df: Frame with ``user_id``, ``anime_id``, ``score`` and ``fold``
            columns. Modified in place: ``oof`` and ``seen`` columns are added.
        original_test_df: Test frame; a copy is featurised in every fold.
        submission_df: Frame whose ``score`` column accumulates the averaged
            test predictions. Modified in place and written to
            ``submission.csv``.
        n_folds: Number of folds; fold ids ``0 .. n_folds - 1`` are iterated
            and each fold contributes ``1 / n_folds`` of the test prediction.
            Defaults to 5, the value that was previously hard-coded.
    """
    # Float from the start: the column is filled with float predictions below,
    # and writing floats into an int column is deprecated in recent pandas.
    train_df['oof'] = 0.0
    train_df['seen'] = False

    for fold in range(n_folds):
        # Prepare the train and validation data
        trn_df = train_df[train_df['fold'] != fold].copy()
        val_df = train_df[train_df['fold'] == fold].copy()

        trn_df, val_df, test_df = add_w2v_features_with_score(trn_df, val_df, original_test_df.copy())

        # Define the features and the target
        unused_cols = ['user_id', 'anime_id', 'score', 'fold', 'oof', 'seen']
        feature_cols = [col for col in trn_df.columns if col not in unused_cols]
        target_col = 'score'

        # Prepare the LightGBM datasets
        lgb_train = lgb.Dataset(trn_df[feature_cols], trn_df[target_col])
        lgb_val = lgb.Dataset(val_df[feature_cols], val_df[target_col])

        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.1,
            # 'reg_lambda': 1.0
        }

        # Train the model
        callbacks = [
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)
        ]
        model_lgb = lgb.train(params, lgb_train, valid_sets=[
                              lgb_train, lgb_val], callbacks=callbacks, num_boost_round=10000)

        # Predict
        trn_df['preds'] = model_lgb.predict(trn_df[feature_cols], num_iteration=model_lgb.best_iteration)
        val_df['preds'] = model_lgb.predict(val_df[feature_cols], num_iteration=model_lgb.best_iteration)
        test_preds = model_lgb.predict(test_df[feature_cols], num_iteration=model_lgb.best_iteration)

        # A validation row is "seen" when its user also has rows in this fold's training data.
        train_users = trn_df['user_id'].unique()
        is_seen = val_df['user_id'].isin(train_users)
        seen_val = val_df[is_seen]
        unseen_val = val_df[~is_seen]

        # Evaluate the model
        train_score = np.sqrt(mean_squared_error(trn_df['score'], trn_df['preds']))
        seen_val_score = np.sqrt(mean_squared_error(seen_val['score'], seen_val['preds']))
        unseen_val_score = np.sqrt(mean_squared_error(unseen_val['score'], unseen_val['preds']))
        print(f'fold{fold} train RMSE: {train_score:.3f}, seen val RMSE: {seen_val_score:.3f}, unseen val RMSE: {unseen_val_score:.3f}')

        # Average the test predictions over the folds.
        submission_df['score'] += test_preds / n_folds

        # Store the out-of-fold predictions and seen flags for this fold's rows.
        train_df.loc[train_df['fold'] == fold, 'oof'] = val_df['preds'].values
        train_df.loc[train_df['fold'] == fold, 'seen'] = is_seen.values

    total_score = np.sqrt(mean_squared_error(train_df['score'], train_df['oof']))
    seen_score = np.sqrt(mean_squared_error(train_df[train_df['seen']]['score'], train_df[train_df['seen']]['oof']))
    unseen_score = np.sqrt(mean_squared_error(train_df[~train_df['seen']]['score'], train_df[~train_df['seen']]['oof']))
    print(f"Total RMSE: {total_score} | Seen RMSE: {seen_score} | Unseen RMSE: {unseen_score}")

    submission_df.to_csv('submission.csv', index=False)

# Main

In [18]:
# Load train/test and the submission template (load_data gives test a dummy score of 0).
train_df, test_df, submission_df = load_data()

# Stack train and test so the score-free Word2Vec is fitted on both.
train_test_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

train_test_df = add_w2v_features_without_score(train_test_df)
# NOTE(review): test_df is passed in the val_df position and the returned frames are
# not assigned — this looks like a quick smoke test of the function; confirm.
add_w2v_features_with_score(train_df, test_df)

In [16]:
# with timer("Load the data"):
#     train_df, test_df, submission_df = load_data()

# with timer("Stratified & Group split"):
#     train_df = stratified_and_group_kfold_split(train_df)

# with timer("add_w2v_features_without_score"):
#     # testの視聴情報も活用するため、trainとtestを結合して先に特徴量を作成
#     train_test_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
#     train_test_df = add_w2v_features_without_score(train_test_df)
# #     train_df = train_test_df[train_test_df['score'] != 0].copy().reset_index(drop=True)
# #     test_df = train_test_df[train_test_df['score'] == 0].copy().reset_index(drop=True)

# # with timer("Training and evaluation with LightGBM"):
# #     train(train_df, test_df, submission_df)

[Load the data] done in 0 s
[Stratified & Group split] done in 0 s




[add_w2v_features_without_score] done in 2 s


In [17]:
# Display the combined train/test frame with the score-free user embeddings attached.
train_test_df

Unnamed: 0,user_id,anime_id,score,fold,wo_score_user_factor_0,wo_score_user_factor_1,wo_score_user_factor_2,wo_score_user_factor_3,wo_score_user_factor_4,wo_score_user_factor_5,...,wo_score_user_factor_54,wo_score_user_factor_55,wo_score_user_factor_56,wo_score_user_factor_57,wo_score_user_factor_58,wo_score_user_factor_59,wo_score_user_factor_60,wo_score_user_factor_61,wo_score_user_factor_62,wo_score_user_factor_63
0,001a7aed2546342e2602,034eb6feb083d80751a4,9,3.0,0.207528,-0.130969,0.102914,-0.177719,-0.099937,-0.208528,...,-0.023294,0.204172,-0.021628,0.084894,0.025680,0.123446,0.351074,-0.147088,-0.232003,0.165780
1,001a7aed2546342e2602,04068820a73e52dc3b32,9,3.0,0.207528,-0.130969,0.102914,-0.177719,-0.099937,-0.208528,...,-0.023294,0.204172,-0.021628,0.084894,0.025680,0.123446,0.351074,-0.147088,-0.232003,0.165780
2,001a7aed2546342e2602,057c8610088179f68964,9,4.0,0.207528,-0.130969,0.102914,-0.177719,-0.099937,-0.208528,...,-0.023294,0.204172,-0.021628,0.084894,0.025680,0.123446,0.351074,-0.147088,-0.232003,0.165780
3,001a7aed2546342e2602,08aaefd0726338c6cda6,8,0.0,0.207528,-0.130969,0.102914,-0.177719,-0.099937,-0.208528,...,-0.023294,0.204172,-0.021628,0.084894,0.025680,0.123446,0.351074,-0.147088,-0.232003,0.165780
4,001a7aed2546342e2602,09d9688ffb425b3903b2,8,1.0,0.207528,-0.130969,0.102914,-0.177719,-0.099937,-0.208528,...,-0.023294,0.204172,-0.021628,0.084894,0.025680,0.123446,0.351074,-0.147088,-0.232003,0.165780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254072,ffe85a36cd20500faa58,f508b02efeac8ecb8cc0,0,,0.234619,-0.268356,0.334104,-0.424140,-0.196889,-0.348174,...,-0.597357,0.198136,0.352581,-0.014029,0.214613,0.124050,0.399124,0.054557,-0.253500,0.196781
254073,ffe85a36cd20500faa58,f5b8ecea3beea4b82d79,0,,0.234619,-0.268356,0.334104,-0.424140,-0.196889,-0.348174,...,-0.597357,0.198136,0.352581,-0.014029,0.214613,0.124050,0.399124,0.054557,-0.253500,0.196781
254074,ffe85a36cd20500faa58,f6c208226b6b69948053,0,,0.234619,-0.268356,0.334104,-0.424140,-0.196889,-0.348174,...,-0.597357,0.198136,0.352581,-0.014029,0.214613,0.124050,0.399124,0.054557,-0.253500,0.196781
254075,ffe85a36cd20500faa58,fe67592c312fc1e17745,0,,0.234619,-0.268356,0.334104,-0.424140,-0.196889,-0.348174,...,-0.597357,0.198136,0.352581,-0.014029,0.214613,0.124050,0.399124,0.054557,-0.253500,0.196781


Total RMSE: 1.2404953346081091  
Seen RMSE: 1.1849511964814186  
Unseen RMSE: 1.43856427010653  
となりました。  

### コールドユーザーに対するナイーブな予測（おまけ）
さて、上記のunseen userに対するスコアRMSE=1.438というのはどの程度いいのでしょうか。  
単に、未知ユーザーに対しては訓練ユーザーから求めたアニメごとの平均スコア（訓練データに現れないアニメは全体の平均スコア）で予測するナイーブな手法と比較してみます。

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the data
train = pd.read_csv('train.csv')

# Unique user IDs; the split below is made per user
users = train['user_id'].unique()

# Split the user IDs into a training set and a validation set
train_users, val_users = train_test_split(users, test_size=0.2, random_state=42)

# Build the training and validation rows from the two user sets
train_data = train[train['user_id'].isin(train_users)]
val_data = train[train['user_id'].isin(val_users)]

# Mean score of each anime, computed from the training users only
anime_mean_score = train_data.groupby('anime_id')['score'].mean()

# Predict every validation row with the training mean of its anime
val_data = val_data.copy()  # Avoid SettingWithCopyWarning
val_data['pred_score'] = val_data['anime_id'].map(anime_mean_score)

# Mean score over all training rows
global_mean_score = train_data['score'].mean()

# Fill NaN predictions (anime with no rating from a training user) with the global mean.
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: that chained in-place call does not update val_data under pandas Copy-on-Write.
val_data['pred_score'] = val_data['pred_score'].fillna(global_mean_score)

# RMSE of the baseline on the validation users
rmse = np.sqrt(mean_squared_error(val_data['score'], val_data['pred_score']))

print(rmse)

1.4414882404447242


・・・あまり変わらないような気も？