# Anime2Vec(Explicit/Implicit Feedback) + LightGBM

[前回のnotebook](https://www.guruguru.science/competitions/21/discussions/57f5ea4e-69ad-439d-bbe3-887240cf5cf2/)ではレーティングの情報を活用する場合とそうでない場合を比較しましたが、今回はそれらを組み合わせることを考えます。

[こちらのdiscussion](https://www.guruguru.science/competitions/21/discussions/d0e9e563-0910-46b9-a562-441b9c2bb843/)で紹介されているように、推薦システムにおけるユーザーからのフィードバックはExplicit FeedbackとImplicit Feedbackに分けることができます。  
今回のAnime2Vecにおいて考えると、ユーザーレーティングを使用した場合はExplicit Feedbackをモデル化し、使用しない場合はある意味でImplicit Feedbackをモデル化していると考えることができます。  

また、今回は`test.csv`が丸ごと与えられ、新規ユーザー（コールドユーザー）に対してもまとまった視聴情報、つまりImplicit Feedbackを得られているやや特殊な状況とも言えます。  
折角なのでこれを活用する方法を考えたいところです。  

## Importing necessary libraries

In [1]:
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, StratifiedKFold
from gensim.models import word2vec

import time
from contextlib import contextmanager
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import os

## Utility functions

In [2]:
# Random seed shared by seed_everything, the Word2Vec models and StratifiedKFold below.
SEED = 0

@contextmanager
def timer(name):
    """Print how long the wrapped ``with`` block took.

    Args:
        name: Label shown inside the square brackets of the printed message.

    The elapsed time is printed even when the wrapped block raises, so a
    failing step still reports how long it ran; the exception then propagates
    unchanged.
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Runs on both normal exit and exceptions thrown in at the yield point.
        print(f'[{name}] done in {time.time() - t0:.0f} s')

def seed_everything(seed: int):
    """Seed the global Python and NumPy generators and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    

# Seed the global generators once, before any of the random.sample calls further down run.
seed_everything(SEED)

### Word2Vecによる特徴量エンジニアリング(Explicit Feedback)

こちらは[前回のnotebook](https://www.guruguru.science/competitions/21/discussions/57f5ea4e-69ad-439d-bbe3-887240cf5cf2/)と同様に、ユーザーのレーティングを考慮し、スコアの値と同じ回数だけアニメを追加します。

In [3]:
def add_w2v_features_with_score(train_df, val_df, test_df=None):
    """Attach score-weighted Word2Vec user and item embeddings.

    The Word2Vec model is fitted on ``train_df`` only. Each user contributes
    one sentence of anime ids in which every anime appears ``score`` times,
    plus a shuffled copy of that sentence. A user's embedding is the mean
    vector over that user's (unweighted) anime list.

    Args:
        train_df: Frame with ``user_id``, ``anime_id`` and integer ``score``.
        val_df: Validation frame with ``user_id`` and ``anime_id``.
        test_df: Optional test frame with ``user_id`` and ``anime_id``.

    Returns:
        ``(train_df, val_df)`` or, when ``test_df`` is given,
        ``(train_df, val_df, test_df)``; each frame is left-merged with the
        ``user_factor_*`` and ``item_factor_*`` columns, so users or anime
        absent from ``train_df`` end up with missing values.
    """
    embedding_dim = 64

    # A single pass over the users collects both the plain anime list (used for
    # averaging later) and the sentence in which an anime is repeated once per
    # rating point, e.g. five times for a score of 5.
    watched_by_user = {}
    weighted_sentences = []
    for uid, user_rows in train_df.groupby('user_id'):
        watched_by_user[uid] = user_rows['anime_id'].tolist()
        weighted_sentences.append([
            anime_id
            for anime_id, anime_score in user_rows[['anime_id', 'score']].values
            for _ in range(anime_score)
        ])

    # Training corpus: the original sentences followed by one shuffled copy each.
    corpus = weighted_sentences + [
        random.sample(sentence, len(sentence)) for sentence in weighted_sentences
    ]

    w2v_model = word2vec.Word2Vec(
        corpus,
        vector_size=embedding_dim,
        seed=SEED,
        min_count=1,
        workers=1,
    )
    vectors = w2v_model.wv

    # Mean anime vector per user, and the raw vector per anime.
    user_vectors = {
        uid: np.mean([vectors[anime_id] for anime_id in watched], axis=0)
        for uid, watched in watched_by_user.items()
    }
    item_vectors = {
        anime_id: vectors[anime_id]
        for anime_id in train_df['anime_id'].unique().tolist()
    }

    # One row per id: the id column followed by the embedding components.
    user_factors_df = pd.DataFrame(user_vectors).T.reset_index()
    user_factors_df.columns = ["user_id"] + [f"user_factor_{i}" for i in range(embedding_dim)]
    item_factors_df = pd.DataFrame(item_vectors).T.reset_index()
    item_factors_df.columns = ["anime_id"] + [f"item_factor_{i}" for i in range(embedding_dim)]

    def attach(frame):
        # Left joins keep every input row, including ids unknown to the model.
        with_user = frame.merge(user_factors_df, on="user_id", how="left")
        return with_user.merge(item_factors_df, on="anime_id", how="left")

    if test_df is None:
        return attach(train_df), attach(val_df)
    return attach(train_df), attach(val_df), attach(test_df)

### Word2Vecによる特徴量エンジニアリング(Implicit Feedback)

今回は`test.csv`が丸ごと与えられ、新規ユーザーに対しても視聴情報が得られる状況なのでこれを活用します。  
とは言っても、単にtrain/testを先に結合してからスコアの情報を使用せずにWord2Vecの学習を行うだけです。

In [4]:
def add_w2v_features_without_score(train_test_df, *, add_item_factors=False):
    """Attach implicit-feedback Word2Vec embeddings (scores are ignored).

    Every user's list of anime ids forms one sentence; a shuffled copy of each
    sentence is appended before a 64-dimensional Word2Vec model is fitted. A
    user's embedding is the mean vector of the anime in that user's list.

    Args:
        train_test_df: Frame with ``user_id`` and ``anime_id`` columns
            (train and test rows concatenated by the caller).
        add_item_factors: When true, also merge the per-anime vectors as
            ``wo_score_item_factor_*`` columns. Defaults to false, in which
            case only the ``wo_score_user_factor_*`` columns are added and no
            item-level frame is built at all.

    Returns:
        ``train_test_df`` left-merged with the new feature columns.
    """
    vector_size = 64

    user_anime_list_dict = {
        user_id: user_anime_ids.tolist()
        for user_id, user_anime_ids in train_test_df.groupby('user_id')['anime_id']
    }
    # One sentence per user, in the same (sorted user_id) order as the dict.
    title_sentence_list = list(user_anime_list_dict.values())

    # Shuffled copy of every user's sentence.
    shuffled_sentence_list = [random.sample(sentence, len(sentence)) for sentence in title_sentence_list]

    # Training corpus: original sentences followed by the shuffled ones.
    train_sentence_list = title_sentence_list + shuffled_sentence_list

    w2v_params = {
        "vector_size": vector_size,
        "seed": SEED,
        "min_count": 1,
        "workers": 1
    }
    model = word2vec.Word2Vec(train_sentence_list, **w2v_params)

    # Mean anime vector per user.
    user_factors = {
        user_id: np.mean([model.wv[anime_id] for anime_id in user_anime_list], axis=0)
        for user_id, user_anime_list in user_anime_list_dict.items()
    }
    user_factors_df = pd.DataFrame(user_factors).T.reset_index()
    user_factors_df.columns = ["user_id"] + [f"wo_score_user_factor_{i}" for i in range(vector_size)]
    train_test_df = train_test_df.merge(user_factors_df, on="user_id", how="left")

    if add_item_factors:
        # Raw vector per anime; only computed when the caller asks for it.
        anime_ids = train_test_df['anime_id'].unique().tolist()
        item_factors = {aid: model.wv[aid] for aid in anime_ids}
        item_factors_df = pd.DataFrame(item_factors).T.reset_index()
        item_factors_df.columns = ["anime_id"] + [f"wo_score_item_factor_{i}" for i in range(vector_size)]
        train_test_df = train_test_df.merge(item_factors_df, on="anime_id", how="left")

    return train_test_df

# Training and prediction

### 学習に便利な関数

In [5]:
def load_data(input_dir='/workspace/input/atmaCup15_dataset'):
    """Load the train, test and sample-submission CSV files.

    Args:
        input_dir: Directory containing ``train.csv``, ``test.csv`` and
            ``sample_submission.csv``. Defaults to the path this notebook was
            originally written against.

    Returns:
        ``(train_df, test_df, submission_df)``. ``test_df`` gets a dummy
        ``score`` column of 0 and ``submission_df['score']`` is reset to 0.
    """
    train_df = pd.read_csv(os.path.join(input_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(input_dir, 'test.csv'))
    test_df['score'] = 0  # dummy target so train and test share the same columns

    # Start the submission from zero so per-fold predictions can be accumulated into it.
    submission_df = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'))
    submission_df['score'] = 0
    return train_df, test_df, submission_df

def stratified_and_group_kfold_split(train_df):
    """Assign a ``fold`` id (0-4) that mixes seen-user and unseen-user validation.

    One fifth of the users are sampled as "unseen" users and split with
    GroupKFold, so all rows of such a user fall into a single fold. The rows
    of the remaining users are split with StratifiedKFold stratified on
    ``user_id``.

    Reference:
    https://www.guruguru.science/competitions/21/discussions/45ffc8a1-e37c-4b95-aac4-c4e338aa6a9b/

    Args:
        train_df: Frame with a ``user_id`` column. It is not modified.

    Returns:
        A new frame (rows of the stratified part first, then the grouped part)
        with an added ``fold`` column.
    """
    # Sample 20% of the users to play the role of unseen users.
    n_user = train_df["user_id"].nunique()
    unseen_users = random.sample(sorted(train_df["user_id"].unique()), k=n_user // 5)

    # Keep the membership flag in a local mask instead of writing a helper
    # column into the caller's DataFrame.
    is_unseen = train_df["user_id"].isin(unseen_users)
    unseen_df = train_df[is_unseen].reset_index(drop=True)
    seen_df = train_df[~is_unseen].reset_index(drop=True)

    # Remaining 80% of the users: stratify on user_id.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold_id, (_, valid_idx) in enumerate(skf.split(seen_df, seen_df["user_id"])):
        seen_df.loc[valid_idx, "fold"] = fold_id

    # Sampled 20% of the users: group on user_id.
    gkf = GroupKFold(n_splits=5)
    unseen_df["fold"] = -1
    for fold_id, (_, valid_idx) in enumerate(gkf.split(unseen_df, unseen_df["user_id"], unseen_df["user_id"])):
        unseen_df.loc[valid_idx, "fold"] = fold_id

    return pd.concat([seen_df, unseen_df], axis=0).reset_index(drop=True)


def _rmse(y_true, y_pred):
    """Return the root mean squared error, or NaN when there is nothing to score."""
    if len(y_true) == 0:
        return float('nan')
    return np.sqrt(mean_squared_error(y_true, y_pred))


def train(train_df, original_test_df, submission_df, n_folds=5):
    """Cross-validated LightGBM training on top of per-fold Word2Vec features.

    For each fold the score-weighted Word2Vec features are rebuilt from the
    training part only, a LightGBM regressor is fitted with early stopping on
    the validation part, and the test predictions are averaged over folds.

    Args:
        train_df: Training rows with ``user_id``, ``anime_id``, ``score`` and
            a ``fold`` column. Modified in place: ``oof`` (out-of-fold
            prediction) and ``seen`` (whether the row's user also appears in
            that fold's training part) are added.
        original_test_df: Test rows; a copy is featurized in every fold.
        submission_df: Frame whose ``score`` column accumulates the averaged
            test predictions in place (assumed to start at 0 - confirm
            against the caller).
        n_folds: Number of folds to iterate over; fold ids are expected to be
            ``0 .. n_folds - 1``.

    Returns:
        ``(train_df, test_df, submission_df)`` where ``test_df`` is the test
        frame carrying the Word2Vec features of the last fold only.
    """
    # Float initial value: the out-of-fold predictions written below are
    # floats, and assigning floats into an integer column relies on silent
    # upcasting that recent pandas versions deprecate.
    train_df['oof'] = 0.0
    train_df['seen'] = False

    for fold in range(n_folds):
        # Prepare the train and validation data
        trn_df = train_df[train_df['fold'] != fold].copy()
        val_df = train_df[train_df['fold'] == fold].copy()

        trn_df, val_df, test_df = add_w2v_features_with_score(trn_df, val_df, original_test_df.copy())

        # Define the features and the target
        unused_cols = ['user_id', 'anime_id', 'score', 'fold', 'oof', 'seen']
        feature_cols = [col for col in trn_df.columns if col not in unused_cols]
        target_col = 'score'

        # Prepare the LightGBM datasets
        lgb_train = lgb.Dataset(trn_df[feature_cols], trn_df[target_col])
        lgb_val = lgb.Dataset(val_df[feature_cols], val_df[target_col])

        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.1,
        }

        # Train the model
        callbacks = [
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)
        ]
        model_lgb = lgb.train(params, lgb_train, valid_sets=[
                              lgb_train, lgb_val], callbacks=callbacks, num_boost_round=10000)

        # Predict
        trn_df['preds'] = model_lgb.predict(trn_df[feature_cols], num_iteration=model_lgb.best_iteration)
        val_df['preds'] = model_lgb.predict(val_df[feature_cols], num_iteration=model_lgb.best_iteration)
        test_preds = model_lgb.predict(test_df[feature_cols], num_iteration=model_lgb.best_iteration)

        # Split the validation rows by whether their user is also in the training part.
        train_users = trn_df['user_id'].unique()
        is_seen = val_df['user_id'].isin(train_users)
        seen_val = val_df[is_seen]
        unseen_val = val_df[~is_seen]

        # Evaluate the model
        train_score = _rmse(trn_df['score'], trn_df['preds'])
        seen_val_score = _rmse(seen_val['score'], seen_val['preds'])
        unseen_val_score = _rmse(unseen_val['score'], unseen_val['preds'])
        print(f'fold{fold} train RMSE: {train_score:.3f}, seen val RMSE: {seen_val_score:.3f}, unseen val RMSE: {unseen_val_score:.3f}')

        # Average the test predictions over the folds.
        submission_df['score'] += test_preds / n_folds

        train_df.loc[train_df['fold'] == fold, 'oof'] = val_df['preds'].values
        train_df.loc[train_df['fold'] == fold, 'seen'] = is_seen.values

    seen_rows = train_df[train_df['seen']]
    unseen_rows = train_df[~train_df['seen']]
    total_score = _rmse(train_df['score'], train_df['oof'])
    seen_score = _rmse(seen_rows['score'], seen_rows['oof'])
    unseen_score = _rmse(unseen_rows['score'], unseen_rows['oof'])
    print(f"Total RMSE: {total_score} | Seen RMSE: {seen_score} | Unseen RMSE: {unseen_score}")

    return train_df, test_df, submission_df

# Main

In [6]:
with timer("Load the data"):
    train_df, test_df, submission_df = load_data()

with timer("Stratified & Group split"):
    train_df = stratified_and_group_kfold_split(train_df)

with timer("add_w2v_features_without_score"):
    # Build the implicit-feedback features on train and test together so the
    # viewing history of the test users is used as well.
    # Rows are tagged with an explicit marker rather than being told apart by
    # the dummy score of 0, which would silently move any real rating equal to
    # 0 out of the training set.
    train_test_df = pd.concat(
        [train_df.assign(_is_test=False), test_df.assign(_is_test=True)],
        axis=0,
    ).reset_index(drop=True)
    train_test_df = add_w2v_features_without_score(train_test_df)
    # Remove the marker again so it never reaches the model as a feature.
    is_test = train_test_df.pop('_is_test').astype(bool)
    train_df = train_test_df[~is_test].copy().reset_index(drop=True)
    test_df = train_test_df[is_test].copy().reset_index(drop=True)

with timer("Training and evaluation with LightGBM"):
    trained_df, tested_df, sub_df = train(train_df, test_df, submission_df)

[Load the data] done in 0 s
[Stratified & Group split] done in 0 s




[add_w2v_features_without_score] done in 2 s
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48946
[LightGBM] [Info] Number of data points in the train set: 109120, number of used features: 192
[LightGBM] [Info] Start training from score 7.769401
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.19326	valid_1's rmse: 1.25371
[200]	training's rmse: 1.13219	valid_1's rmse: 1.23215
[300]	training's rmse: 1.09239	valid_1's rmse: 1.22745
[400]	training's rmse: 1.05656	valid_1's rmse: 1.22442
[500]	training's rmse: 1.02384	valid_1's rmse: 1.22187
[600]	training's rmse: 0.99401	valid_1's rmse: 1.22066
[700]	training's rmse: 0.966113	valid_1's rmse: 1.21998
[800]	training's rmse: 0.939603	valid_1's rmse: 1.2194
[900]	training's rmse: 0.915844	valid_1's rmse: 1.21881
[1000]	training's rmse: 0.892012	valid_1's rmse: 1.2189
Early stopping, best iteration is:
[901]	training's rmse: 0.915595	valid_1's rmse: 1.21879
fold0 tr

In [7]:
train_path = os.path.join("/workspace", "working", "anime2vec",'train_anime2vec.csv')
print(train_path)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(train_path), exist_ok=True)
trained_df.to_csv(train_path, index=False)

/workspace/working/anime2vec/train_anime2vec.csv


In [8]:
test_path = os.path.join("/workspace", "working", "anime2vec",'test_anime2vec.csv')
print(test_path)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(test_path), exist_ok=True)
tested_df.to_csv(test_path, index=False)

/workspace/working/anime2vec/test_anime2vec.csv


In [9]:
sub_path = os.path.join("/workspace", "working", "anime2vec",'submission.csv')
print(sub_path)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(sub_path), exist_ok=True)
sub_df.to_csv(sub_path, index=False)

/workspace/working/anime2vec/submission.csv


In [10]:
# Display the test frame returned by train() as the cell output.
tested_df

Unnamed: 0,user_id,anime_id,score,fold,wo_score_user_factor_0,wo_score_user_factor_1,wo_score_user_factor_2,wo_score_user_factor_3,wo_score_user_factor_4,wo_score_user_factor_5,...,item_factor_54,item_factor_55,item_factor_56,item_factor_57,item_factor_58,item_factor_59,item_factor_60,item_factor_61,item_factor_62,item_factor_63
0,0008e10fb39e55447333,04068820a73e52dc3b32,0,,0.186446,-0.056671,0.320237,-0.156542,-0.093355,-0.408961,...,-0.563877,0.207953,-1.429638,-0.585801,1.282804,-0.558889,0.049126,-0.817434,-0.213285,1.126903
1,0008e10fb39e55447333,04a3d0b122b24965e909,0,,0.186446,-0.056671,0.320237,-0.156542,-0.093355,-0.408961,...,0.545700,0.441194,-0.306278,-1.848849,-1.776242,-0.241650,0.265209,-1.163607,-0.664191,0.412600
2,0008e10fb39e55447333,1447fe1f10b59912d6a8,0,,0.186446,-0.056671,0.320237,-0.156542,-0.093355,-0.408961,...,2.392855,-0.201189,-0.178445,-0.972658,-0.744394,1.533357,0.236169,1.386148,-0.019179,-0.890275
3,0008e10fb39e55447333,2622632598c68682afd5,0,,0.186446,-0.056671,0.320237,-0.156542,-0.093355,-0.408961,...,1.502723,-0.782589,0.452808,-0.044885,-1.627218,-1.846785,-0.043139,0.572776,0.613713,1.474543
4,0008e10fb39e55447333,2701850c7216516fec46,0,,0.186446,-0.056671,0.320237,-0.156542,-0.093355,-0.408961,...,-1.116279,-1.758816,-1.034489,1.472120,-0.186479,-2.805167,-0.891380,0.698424,-0.689170,-0.770951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117671,ffe85a36cd20500faa58,f508b02efeac8ecb8cc0,0,,0.234634,-0.268351,0.334127,-0.424138,-0.196915,-0.348194,...,0.634340,1.317287,-1.368719,-1.176869,0.324899,0.304721,-2.378378,0.387789,-0.123941,-0.848298
117672,ffe85a36cd20500faa58,f5b8ecea3beea4b82d79,0,,0.234634,-0.268351,0.334127,-0.424138,-0.196915,-0.348194,...,-1.215328,-0.631272,1.369881,0.291304,0.035983,-0.146845,-0.774278,0.235659,0.720971,0.037424
117673,ffe85a36cd20500faa58,f6c208226b6b69948053,0,,0.234634,-0.268351,0.334127,-0.424138,-0.196915,-0.348194,...,0.893342,1.053601,0.918209,1.750248,-0.969310,0.745615,0.485333,-0.431630,0.505872,-0.186105
117674,ffe85a36cd20500faa58,fe67592c312fc1e17745,0,,0.234634,-0.268351,0.334127,-0.424138,-0.196915,-0.348194,...,-0.914432,0.350686,1.212794,-1.586724,-0.282101,0.451981,-0.131902,-1.268467,0.140240,1.535510
