## Importing necessary libraries

In [1]:
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, StratifiedKFold
from gensim.models import word2vec

from surprise import Dataset, Reader, SVDpp


import time
from contextlib import contextmanager
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import os

## Utility functions

In [2]:
# Global random seed: passed to seed_everything, to word2vec and to the StratifiedKFold split below.
SEED = 0

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    Args:
        name: Label shown in the printed message.

    The message ``[name] done in N s`` is printed when the block exits,
    including when it exits by raising; the exception still propagates.
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Report the elapsed time even if the timed block failed, so a crash
        # after a long computation still leaves a timing line in the output.
        print(f'[{name}] done in {time.time() - t0:.0f} s')

def seed_everything(seed: int):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all three.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    

# Seed the global RNGs once, before any random.sample / np.random use further down.
seed_everything(SEED)

In [3]:
def add_w2v_features_with_score(train_df, val_df, test_df=None):
    """Add score-weighted word2vec user/item embeddings learned from ``train_df``.

    Every user in ``train_df`` becomes one "sentence" of anime_ids in which each
    anime_id is repeated ``score`` times (scores must be integers because they
    are fed to ``range``). The training corpus is these sentences plus one
    shuffled copy of each. A user vector is the mean of the vectors of that
    user's ``train_df`` rows; an item vector is the word vector of the anime_id.

    Args:
        train_df: Frame with ``user_id``, ``anime_id`` and ``score``; the only
            data the embeddings are fitted on.
        val_df: Frame with ``user_id`` and ``anime_id`` to receive the features.
        test_df: Optional frame treated like ``val_df``.

    Returns:
        ``(train_df, val_df)`` or, when ``test_df`` is given,
        ``(train_df, val_df, test_df)``; each is a left-merged copy carrying
        ``user_factor_*`` and ``item_factor_*`` columns. Users or anime absent
        from ``train_df`` have no match in the left merge.
    """
    vector_size = 64
    anime_ids = train_df['anime_id'].unique().tolist()

    # Single pass over the per-user groups: remember the user's anime_ids and
    # build the score-weighted sentence (a score of 5 repeats the id 5 times).
    user_anime_list_dict = {}
    title_sentence_list = []
    for user_id, user_df in train_df.groupby('user_id'):
        user_anime_list_dict[user_id] = user_df['anime_id'].tolist()
        title_sentence_list.append([
            anime_id
            for anime_id, anime_score in user_df[['anime_id', 'score']].values
            for _ in range(anime_score)
        ])

    # Corpus = original sentences followed by a shuffled copy of each one.
    train_sentence_list = title_sentence_list + [
        random.sample(sentence, len(sentence)) for sentence in title_sentence_list
    ]

    # Train word2vec on the corpus.
    model = word2vec.Word2Vec(
        train_sentence_list,
        vector_size=vector_size,
        seed=SEED,
        min_count=1,
        workers=1,
    )

    def _factor_frame(vectors, key_name, prefix):
        # dict {key: vector} -> frame with one row per key and one column per dimension.
        frame = pd.DataFrame(vectors).T.reset_index()
        frame.columns = [key_name] + [f"{prefix}_{i}" for i in range(vector_size)]
        return frame

    user_vectors = {}
    for user_id, user_anime_list in user_anime_list_dict.items():
        user_vectors[user_id] = np.mean([model.wv[anime_id] for anime_id in user_anime_list], axis=0)
    item_vectors = {aid: model.wv[aid] for aid in anime_ids}

    user_factors_df = _factor_frame(user_vectors, "user_id", "user_factor")
    item_factors_df = _factor_frame(item_vectors, "anime_id", "item_factor")

    def _attach(df):
        # Left-join user factors first, then item factors.
        with_user = df.merge(user_factors_df, on="user_id", how="left")
        return with_user.merge(item_factors_df, on="anime_id", how="left")

    train_df = _attach(train_df)
    val_df = _attach(val_df)
    if test_df is None:
        return train_df, val_df
    return train_df, val_df, _attach(test_df)

In [4]:
def add_w2v_features_without_score(train_test_df):
    """Add unweighted word2vec user embeddings learned from all rows of ``train_test_df``.

    Each user becomes one "sentence" listing the anime_ids of that user's rows
    (scores are ignored). The corpus is these sentences plus one shuffled copy
    of each. A user vector is the mean of the word vectors of the user's rows.

    Args:
        train_test_df: Frame with ``user_id`` and ``anime_id`` columns.

    Returns:
        A left-merged copy of ``train_test_df`` with 64 additional
        ``wo_score_user_factor_*`` columns.
    """
    # NOTE(review): only user factors are merged. An earlier version of this
    # function also built per-item factors but never attached them to the
    # result; that dead computation was removed. Confirm whether
    # `wo_score_item_factor_*` columns were meant to be added.

    # One list of anime_ids per user; used both as the word2vec sentences and
    # for averaging the word vectors afterwards.
    anime_ids_by_user = train_test_df.groupby('user_id')['anime_id'].apply(list)
    title_sentence_list = anime_ids_by_user.tolist()

    # Shuffled copy of every user's sentence.
    shuffled_sentence_list = [random.sample(sentence, len(sentence)) for sentence in title_sentence_list]

    # Corpus = original sentences followed by the shuffled ones.
    train_sentence_list = title_sentence_list + shuffled_sentence_list

    # word2vec hyper-parameters.
    vector_size = 64
    w2v_params = {
        "vector_size": vector_size,
        "seed": SEED,
        "min_count": 1,
        "workers": 1
    }

    # Train word2vec.
    model = word2vec.Word2Vec(train_sentence_list, **w2v_params)

    # User vector = mean of the word vectors of the user's anime_ids.
    user_factors = {
        user_id: np.mean([model.wv[anime_id] for anime_id in user_anime_list], axis=0)
        for user_id, user_anime_list in anime_ids_by_user.items()
    }

    # One row per user, one column per embedding dimension.
    user_factors_df = pd.DataFrame(user_factors).T.reset_index()
    user_factors_df.columns = ["user_id"] + [f"wo_score_user_factor_{i}" for i in range(vector_size)]

    return train_test_df.merge(user_factors_df, on="user_id", how="left")

# Utils

## utils for data

In [5]:
def merge_by_anime_id(left_df, right_df):
    """Look up ``right_df`` rows for every ``anime_id`` in ``left_df``.

    Only the ``anime_id`` column of ``left_df`` takes part in the left join, and
    the key column is dropped from the result, so the output holds just the
    non-key columns of ``right_df``, one row per joined ``left_df`` row.
    """
    keys = left_df[["anime_id"]]
    joined = keys.merge(right_df, on="anime_id", how="left")
    return joined.drop(columns=["anime_id"])

## Feature functions

In [6]:
def create_anime_numeric_feature(input_df: pd.DataFrame):
    """Return the numeric anime columns for each row of ``input_df``.

    ``input_df`` is expected to be the train.csv or test.csv data. The global
    ``anime_df`` is joined on ``anime_id`` and only ``members`` is kept.
    """
    use_columns = ["members"]
    anime_columns = merge_by_anime_id(input_df, anime_df)
    return anime_columns[use_columns]

#### label encoding

In [7]:
def create_anime_genres_label_encoding(input_df: pd.DataFrame):
    """Label-encode ``anime_df["genres"]`` and look the code up for each ``input_df`` row.

    Missing genres are replaced by the string ``"nan"`` before encoding, so they
    share one label. The result has a single column, ``genres_le``.
    """
    genres_filled = anime_df["genres"].fillna("nan")
    codes = LabelEncoder().fit_transform(genres_filled)

    lookup = pd.DataFrame({"anime_id": anime_df["anime_id"]})
    lookup["genres_le"] = codes
    return merge_by_anime_id(input_df, lookup)

In [8]:
def create_anime_source_label_encoding(input_df: pd.DataFrame):
    """Label-encode ``anime_df["source"]`` and look the code up for each ``input_df`` row.

    Missing sources are replaced by the string ``"nan"`` before encoding, so they
    share one label. The result has a single column, ``source_le``.
    """
    source_filled = anime_df["source"].fillna("nan")
    codes = LabelEncoder().fit_transform(source_filled)

    lookup = pd.DataFrame({"anime_id": anime_df["anime_id"]})
    lookup["source_le"] = codes
    return merge_by_anime_id(input_df, lookup)

#### count encoding

In [9]:
# Count-encode the anime type.
def create_anime_type_count_encoding(input_df: pd.DataFrame):
    """Replace each anime's ``type`` by how often that type occurs in ``anime_df``.

    The result has a single column, ``type_count``, with one row per joined
    ``input_df`` row.
    """
    type_frequency = anime_df["type"].value_counts()
    lookup = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        "type_count": anime_df["type"].map(type_frequency),
    })
    return merge_by_anime_id(input_df, lookup)

In [10]:
def create_anime_studios_count_encoding(input_df: pd.DataFrame):
    """Replace each anime's ``studios`` value by its frequency in ``anime_df``.

    The result has a single column, ``studios_count``, with one row per joined
    ``input_df`` row.
    """
    target_col = "studios"
    # Map every value to its own column's frequency. Looking the values up in
    # the `type` frequencies (a copy-paste slip) finds no match for studio
    # names and leaves the feature almost entirely NaN.
    count = anime_df[target_col].map(anime_df[target_col].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })

    return merge_by_anime_id(input_df, encoded_df)

In [11]:
def create_anime_producers_count_encoding(input_df: pd.DataFrame):
    """Replace each anime's ``producers`` value by its frequency in ``anime_df``.

    The result has a single column, ``producers_count``, with one row per joined
    ``input_df`` row.
    """
    target_col = "producers"
    # Map every value to its own column's frequency. Looking the values up in
    # the `type` frequencies (a copy-paste slip) finds no match for producer
    # names and leaves the feature almost entirely NaN.
    count = anime_df[target_col].map(anime_df[target_col].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })

    return merge_by_anime_id(input_df, encoded_df)

In [12]:
def create_anime_animeid_count_encoding(input_df: pd.DataFrame):
    """Replace each ``anime_id`` by its frequency in ``anime_df``.

    The result has a single column, ``anime_id_count``, with one row per joined
    ``input_df`` row.
    """
    target_col = "anime_id"
    # Map every id to its own column's frequency; looking ids up in the `type`
    # frequencies (a copy-paste slip) never matches and yields NaN.
    # NOTE(review): if anime_id is unique within anime_df this feature is the
    # constant 1 — counting occurrences in the rating data may have been the
    # intent; confirm.
    count = anime_df[target_col].map(anime_df[target_col].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })

    return merge_by_anime_id(input_df, encoded_df)

#### one-hot encoding

In [13]:
# One-hot encode the anime type.
def create_anime_type_one_hot_encoding(input_df: pd.DataFrame):
    """One-hot encode ``anime_df["type"]`` and look the indicators up per ``input_df`` row.

    One 0/1 column is produced per distinct type value and named after that
    value, except that the value ``"Unknown"`` is stored as ``Unknown_type``.
    """
    type_series = anime_df["type"]

    # Build one indicator column per distinct value, in order of first appearance.
    indicator_columns = {}
    for value in type_series.unique():
        column_name = "Unknown_type" if value == "Unknown" else value
        indicator_columns[column_name] = (type_series == value).astype(int)

    out_df = pd.DataFrame(indicator_columns)
    out_df["anime_id"] = anime_df["anime_id"]
    return merge_by_anime_id(input_df, out_df)

In [14]:
# One-hot encode the anime rating.
def create_anime_rating_one_hot_encoding(input_df: pd.DataFrame):
    """One-hot encode ``anime_df["rating"]`` and look the indicators up per ``input_df`` row.

    One 0/1 column is produced per distinct rating value and named after that
    value, except that the value ``"Unknown"`` is stored as ``Unknown_rate``.
    """
    rating_series = anime_df["rating"]

    # Build one indicator column per distinct value, in order of first appearance.
    indicator_columns = {}
    for value in rating_series.unique():
        column_name = "Unknown_rate" if value == "Unknown" else value
        indicator_columns[column_name] = (rating_series == value).astype(int)

    out_df = pd.DataFrame(indicator_columns)
    out_df["anime_id"] = anime_df["anime_id"]
    return merge_by_anime_id(input_df, out_df)

In [15]:
# Run all the feature functions defined above in one go.
def create_feature(input_df):
    """Build every anime-based feature for ``input_df`` and join them column-wise.

    Args:
        input_df: Frame with an ``anime_id`` column (train or test rows).

    Returns:
        A DataFrame holding the columns produced by each feature function, in
        the order the functions are listed below.
    """
    # All feature functions share the same input / output interface, so they
    # can simply be looped over.
    functions = [
        create_anime_numeric_feature,
        # label encoding
        create_anime_genres_label_encoding,
        create_anime_source_label_encoding,
        # count encoding
        create_anime_type_count_encoding,
        create_anime_studios_count_encoding,
        create_anime_producers_count_encoding,
        create_anime_animeid_count_encoding,
        # one-hot encoding
        create_anime_type_one_hot_encoding,
        create_anime_rating_one_hot_encoding,
    ]

    # Collect the per-function frames and concatenate once at the end instead
    # of growing a DataFrame inside the loop (which re-copies all previous
    # columns on every iteration and starts from an empty frame).
    feature_frames = []
    for func in functions:
        with timer(f"create {func.__name__}"):
            _df = func(input_df)
        feature_frames.append(_df)

    return pd.concat(feature_frames, axis=1)

### 学習に便利な関数

In [16]:
def load_data():
    """Read the train, test and sample-submission CSVs.

    Returns:
        ``(train_df, test_df, submission_df)``. ``test_df`` gets a dummy
        ``score`` column of 0 and ``submission_df`` has its ``score`` reset to 0.
    """
    train_df = pd.read_csv('/workspace/input/atmaCup15_dataset/train.csv')

    # Test rows get a dummy score of 0.
    test_df = pd.read_csv('/workspace/input/atmaCup15_dataset/test.csv').assign(score=0)

    # Submission template with the score column zeroed.
    submission_df = pd.read_csv('/workspace/input/atmaCup15_dataset/sample_submission.csv').assign(score=0)

    return train_df, test_df, submission_df

def stratified_and_group_kfold_split(train_df):
    """Assign a 5-fold ``fold`` column mixing seen-user and unseen-user validation.

    A random 20% of the users are set aside as "unseen" and split with
    GroupKFold, so all rows of such a user land in the same fold. The
    remaining rows are split with StratifiedKFold stratified on ``user_id``.

    Args:
        train_df: Frame with a ``user_id`` column. It is not modified.

    Returns:
        A new frame with the remaining-user rows first, then the unseen-user
        rows, a fresh index and an added ``fold`` column.
    """
    # https://www.guruguru.science/competitions/21/discussions/45ffc8a1-e37c-4b95-aac4-c4e338aa6a9b/

    # Sample 20% of the users. A local boolean mask is used instead of writing
    # a helper column into `train_df`, so the caller's frame is left untouched.
    n_user = train_df["user_id"].nunique()
    unseen_users = random.sample(sorted(train_df["user_id"].unique()), k=n_user // 5)
    is_unseen = train_df["user_id"].isin(unseen_users)
    unseen_df = train_df[is_unseen].reset_index(drop=True)
    seen_df = train_df[~is_unseen].reset_index(drop=True)

    # Remaining 80% of users: StratifiedKFold, stratified on user_id.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold_id, (_, valid_idx) in enumerate(skf.split(seen_df, seen_df["user_id"])):
        seen_df.loc[valid_idx, "fold"] = fold_id

    # Sampled 20% of users: GroupKFold, grouped on user_id.
    gkf = GroupKFold(n_splits=5)
    unseen_df["fold"] = -1
    for fold_id, (_, valid_idx) in enumerate(gkf.split(unseen_df, unseen_df["user_id"], unseen_df["user_id"])):
        unseen_df.loc[valid_idx, "fold"] = fold_id

    return pd.concat([seen_df, unseen_df], axis=0).reset_index(drop=True)


def _rmse(y_true, y_pred):
    """Root-mean-squared error; NaN (instead of an exception) for an empty selection."""
    if len(y_true) == 0:
        return float('nan')
    return np.sqrt(mean_squared_error(y_true, y_pred))


def train(train_df, original_test_df, submission_df, n_folds=5):
    """Cross-validated LightGBM training on top of per-fold word2vec features.

    For every fold the score-weighted word2vec features are re-fitted on the
    training part only, a LightGBM regressor is trained with early stopping on
    the validation part, and the test prediction is added to
    ``submission_df['score']`` with weight ``1 / n_folds``.

    Args:
        train_df: Training rows with ``user_id``, ``anime_id``, ``score`` and
            ``fold`` columns. Modified in place: ``oof`` and ``seen`` columns
            are added.
        original_test_df: Test rows; a copy is featurised in every fold.
        submission_df: Frame whose ``score`` column accumulates the averaged
            test predictions. Modified in place.
        n_folds: Number of folds; fold labels ``0 .. n_folds - 1`` are used.

    Returns:
        ``(train_df, test_df, submission_df)`` where ``test_df`` is the
        featurised test frame of the last fold.
    """
    # Float from the start: the column is later filled with float predictions.
    train_df['oof'] = 0.0
    train_df['seen'] = False

    unused_cols = ['user_id', 'anime_id', 'score', 'fold', 'oof', 'seen']
    target_col = 'score'

    for fold in range(n_folds):
        # Prepare the train and validation data
        trn_df = train_df[train_df['fold'] != fold].copy()
        val_df = train_df[train_df['fold'] == fold].copy()

        # Embeddings are fitted on trn_df only, so validation scores do not leak in.
        trn_df, val_df, test_df = add_w2v_features_with_score(trn_df, val_df, original_test_df.copy())

        # Every column that is not an id, the target or bookkeeping is a feature.
        feature_cols = [col for col in trn_df.columns if col not in unused_cols]

        # Prepare the LightGBM datasets
        lgb_train = lgb.Dataset(trn_df[feature_cols], trn_df[target_col])
        lgb_val = lgb.Dataset(val_df[feature_cols], val_df[target_col])

        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.1,
        }

        # Train the model (callbacks are re-created per fold)
        callbacks = [
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=500)
        ]
        model_lgb = lgb.train(params,
                              lgb_train,
                              valid_sets=[lgb_train, lgb_val],
                              callbacks=callbacks,
                              num_boost_round=10000)

        # Predict
        trn_df['preds'] = model_lgb.predict(trn_df[feature_cols], num_iteration=model_lgb.best_iteration)
        val_df['preds'] = model_lgb.predict(val_df[feature_cols], num_iteration=model_lgb.best_iteration)
        test_preds = model_lgb.predict(test_df[feature_cols], num_iteration=model_lgb.best_iteration)

        # A validation row is "seen" when its user also has rows in the training part.
        train_users = trn_df['user_id'].unique()
        is_seen = val_df['user_id'].isin(train_users)
        seen_val = val_df[is_seen]
        unseen_val = val_df[~is_seen]

        # Evaluate the model
        train_score = _rmse(trn_df['score'], trn_df['preds'])
        seen_val_score = _rmse(seen_val['score'], seen_val['preds'])
        unseen_val_score = _rmse(unseen_val['score'], unseen_val['preds'])
        print(f'fold{fold} train RMSE: {train_score:.3f}, seen val RMSE: {seen_val_score:.3f}, unseen val RMSE: {unseen_val_score:.3f}')

        submission_df['score'] += test_preds / n_folds

        fold_mask = train_df['fold'] == fold
        train_df.loc[fold_mask, 'oof'] = val_df['preds'].values
        train_df.loc[fold_mask, 'seen'] = is_seen.values

    seen_rows = train_df[train_df['seen']]
    unseen_rows = train_df[~train_df['seen']]
    total_score = _rmse(train_df['score'], train_df['oof'])
    seen_score = _rmse(seen_rows['score'], seen_rows['oof'])
    unseen_score = _rmse(unseen_rows['score'], unseen_rows['oof'])
    print(f"Total RMSE: {total_score} | Seen RMSE: {seen_score} | Unseen RMSE: {unseen_score}")

    return train_df, test_df, submission_df

# Main

In [17]:
# Load the rating data and the anime metadata (anime_df is read as a global by the feature functions).
with timer("Load the data"):
    train_df, test_df, submission_df = load_data()
    anime_df = pd.read_csv("/workspace/input/atmaCup15_dataset/anime.csv")
# Add the `fold` column to the training rows.
with timer("Stratified & Group split"):
    train_df = stratified_and_group_kfold_split(train_df)

with timer("add_w2v_features_without_score"):
    # To also exploit the viewing information in test, concatenate train and test and build the features up front.
    train_test_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
    train_test_df = add_w2v_features_without_score(train_test_df)
    # Split back into train / test using the dummy score of 0 that load_data gave the test rows.
    # NOTE(review): this relies on no genuine training row having score == 0 — confirm.
    train_df = train_test_df[train_test_df['score'] != 0].copy().reset_index(drop=True)
    test_df = train_test_df[train_test_df['score'] == 0].copy().reset_index(drop=True)

# Append the anime metadata features to the training rows.
with timer("train..."):
    train_feat_df = create_feature(train_df)
    train_df = pd.concat([train_df, train_feat_df], axis=1)

# Append the same anime metadata features to the test rows.
with timer("test..."):
    test_feat_df = create_feature(test_df)
    test_df = pd.concat([test_df, test_feat_df], axis=1)

[Load the data] done in 0 s
[Stratified & Group split] done in 0 s




[add_w2v_features_without_score] done in 2 s
[create create_anime_numeric_feature] done in 0 s
[create create_anime_genres_label_encoding] done in 0 s
[create create_anime_source_label_encoding] done in 0 s
[create create_anime_type_count_encoding] done in 0 s
[create create_anime_studios_count_encoding] done in 0 s
[create create_anime_producers_count_encoding] done in 0 s
[create create_anime_animeid_count_encoding] done in 0 s
[create create_anime_type_one_hot_encoding] done in 0 s
[create create_anime_rating_one_hot_encoding] done in 0 s
[train...] done in 0 s
[create create_anime_numeric_feature] done in 0 s
[create create_anime_genres_label_encoding] done in 0 s
[create create_anime_source_label_encoding] done in 0 s
[create create_anime_type_count_encoding] done in 0 s
[create create_anime_studios_count_encoding] done in 0 s
[create create_anime_producers_count_encoding] done in 0 s
[create create_anime_animeid_count_encoding] done in 0 s
[create create_anime_type_one_hot_encodi

In [18]:
# Directory holding the competition CSV files.
INPUT_DIR = os.path.join("/workspace", "input", "atmaCup15_dataset")
class configs:
    # Plain namespace of input paths and a seed.
    INPUT_DIR = INPUT_DIR
    # NOTE(review): `NPUT_DIR` looks like a typo'd duplicate of INPUT_DIR; no visible code reads it.
    NPUT_DIR = os.path.join("/workspace", "input", "atmaCup15_dataset")
    # NOTE(review): the loading code below reads "train_stratifiedkfold.csv" directly rather than
    # this path — confirm which file name is intended.
    TRAIN_CSV = os.path.join(INPUT_DIR, "train_stratifiedgroupkfold.csv")
    ANIME_CSV = os.path.join(INPUT_DIR, "anime.csv")
    TEST_CSV = os.path.join(INPUT_DIR, "test.csv")
    SAMPLE_SUB_CSV = os.path.join(INPUT_DIR, "sample_submission.csv")
    # NOTE(review): not referenced by the visible code, which uses the module-level SEED — confirm.
    SEED = 42

# Load the dataset
# NOTE(review): this rebinds the global `train_df`, so columns added to it by
# earlier cells are not present from here on — confirm this is intended.
train_df = pd.read_csv(os.path.join(configs.INPUT_DIR, "train_stratifiedkfold.csv"))
reader = Reader(rating_scale=(1, 10))
svd_cols = ['user_id', 'anime_id', 'score']

# Fit one SVD++ model per fold, each on the rows outside that fold.
fold_ids = sorted(train_df["fold"].unique())
models = []
for fold in fold_ids:
    print("fold", fold)
    fit_df = train_df[train_df["fold"] != fold].reset_index(drop=True)
    train_data = Dataset.load_from_df(fit_df[svd_cols], reader)
    model = SVDpp()
    model.fit(train_data.build_full_trainset())
    models.append(model)

# Out-of-fold predictions: each model scores the fold it did not see.
# Models are paired with the fold labels they were trained for, rather than
# with their list position.
oof_frames = []
n_oof_rows = 0
for fold, model in zip(fold_ids, models):
    holdout_df = train_df[train_df["fold"] == fold].reset_index(drop=True)
    # Build the test set directly from the rows so predictions come back in
    # row order. Going through Dataset -> build_full_trainset -> build_testset
    # re-groups the ratings by user, which only matches the frame's row order
    # when the frame happens to be grouped by user already.
    holdout_set = list(zip(holdout_df['user_id'], holdout_df['anime_id'], holdout_df['score']))
    holdout_df["svd"] = [prediction.est for prediction in model.test(holdout_set)]
    oof_frames.append(holdout_df)
    n_oof_rows += len(holdout_df)
    if len(oof_frames) > 1:
        # Running total of out-of-fold rows collected so far.
        print(n_oof_rows)

oof_df = pd.concat(oof_frames, axis=0)

# Attach the out-of-fold SVD++ estimate to every training row.
train_df = pd.merge(train_df, oof_df[["user_id", "anime_id", "svd"]], on=["user_id", "anime_id"], how="left")

fold 0
fold 1
fold 2
fold 3
fold 4
54561
81841
109121
136401


In [20]:
test_svd = pd.read_csv('/workspace/input/atmaCup15_dataset/test.csv')
test_svd['score'] = 0  # dummy rating; only the estimate of each prediction is used

# Build the test set directly from the rows so predictions come back in row
# order. Going through Dataset -> build_full_trainset -> build_testset
# re-groups the ratings by user, which only matches the frame's row order when
# the file happens to be grouped by user already.
test_set = list(zip(test_svd['user_id'], test_svd['anime_id'], test_svd['score']))

# One prediction vector per fold model.
predictions = []
for model in models:
    predictions.append([prediction.est for prediction in model.test(test_set)])

# Average the fold models' estimates per test row.
pred_mean = np.mean(predictions, axis=0)
test_svd["svd"] = pred_mean


In [21]:
# Attach the fold-averaged SVD++ estimate to the test rows (left join on the user/anime pair).
test_df = pd.merge(test_df, test_svd[["user_id", "anime_id", "svd"]], on=["user_id", "anime_id"], how="left")

In [22]:
# Run the cross-validated LightGBM training; returns the train frame with OOF columns,
# the featurised test frame and the submission frame.
with timer("Training and evaluation with LightGBM"):
    trained_df, tested_df, sub_df = train(train_df, test_df, submission_df)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32878
[LightGBM] [Info] Number of data points in the train set: 109120, number of used features: 129
[LightGBM] [Info] Start training from score 7.768759
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[38]	training's rmse: 1.16422	valid_1's rmse: 1.19141
fold0 train RMSE: 1.164, seen val RMSE: 1.191, unseen val RMSE: 1.742
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32883
[LightGBM] [Info] Number of data points in the train set: 109121, number of used features: 129
[LightGBM] [Info] Start training from score 7.768725
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[63]	training's rmse: 1.15287	valid_1's rmse: 1.1795
fold1 train RMSE: 1.153, seen val RMSE: 1.179, unseen val RMSE: 2.470
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

In [23]:
exp_name = "exp012_anime2vec"
out_dir = os.path.join("/workspace", "working", exp_name)
os.makedirs(out_dir, exist_ok=True)

train_path = os.path.join(out_dir, 'train_anime2vec.csv')
test_path = os.path.join(out_dir, 'test_anime2vec.csv')
sub_path = os.path.join(out_dir, 'submission.csv')

# Print each destination, then write the matching frame without its index.
for frame, path in ((trained_df, train_path), (tested_df, test_path), (sub_df, sub_path)):
    print(path)
    frame.to_csv(path, index=False)

/workspace/working/exp012_anime2vec/train_anime2vec.csv
/workspace/working/exp012_anime2vec/test_anime2vec.csv
/workspace/working/exp012_anime2vec/submission.csv
