In [1]:
import os

import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import cross_validate

from sklearn.model_selection import StratifiedGroupKFold 

In [2]:
EXP_NAME = "svdpp"
class configs:
    OUTPUT_DIR = os.path.join("/workspace", "working", EXP_NAME)
    INPUT_DIR = os.path.join("/workspace", "input", "atmaCup15_dataset")
    TRAIN_CSV = os.path.join(INPUT_DIR, "train_stratifiedgroupkfold.csv")
    ANIME_CSV = os.path.join(INPUT_DIR, "anime.csv")
    TEST_CSV = os.path.join(INPUT_DIR, "test.csv")
    SAMPLE_SUB_CSV = os.path.join(INPUT_DIR, "sample_submission.csv")
    SEED = 42

In [3]:
# Load the dataset
train_df = pd.read_csv(os.path.join(configs.INPUT_DIR, "train_stratifiedgroupkfold.csv"))
reader = Reader(rating_scale=(1, 10))

models = []
for fold in sorted(train_df["fold"].unique()):
    print("fold", fold)
    train_df_ = train_df[train_df["fold"] != fold].reset_index(drop=True)
    train_data = Dataset.load_from_df(train_df_[['user_id', 'anime_id', 'score']], reader)
    model = SVDpp()
    model.fit(train_data.build_full_trainset())
    models.append(model)

fold 0
fold 1
fold 2
fold 3
fold 4


In [4]:
oof_df = pd.DataFrame()
for fold, model in enumerate(models):
    test_df_ = train_df[train_df["fold"] == fold].reset_index(drop=True)
    test_data = Dataset.load_from_df(test_df_[['user_id', 'anime_id', 'score']], reader)
    oof_pred = model.test(test_data.build_full_trainset().build_testset())
    pred = [pred.est for pred in oof_pred]
    # predictions.extend(oof_pred)
    if len(oof_df)==0:
        test_df_["svd"] = pred
        oof_df = test_df_
    else:
        test_df_["svd"] = pred
        oof_df = pd.concat([oof_df, test_df_], axis=0)
        print(len(oof_df))

53834
81742
107323
136401


In [5]:
len(train_df), len(oof_df)

(136401, 136401)

In [6]:
oof_df


Unnamed: 0,user_id,anime_id,score,fold,svd
0,019123288497ea8ee320,041995f3f394ba00c88c,8,0,8.174404
1,019123288497ea8ee320,041b0c10ba571cdea336,7,0,8.477443
2,019123288497ea8ee320,057c8610088179f68964,7,0,8.501348
3,019123288497ea8ee320,07e58cea0afab8d25b72,8,0,7.852516
4,019123288497ea8ee320,08aaefd0726338c6cda6,9,0,8.554666
...,...,...,...,...,...
29073,fe9c772c995668ea3b75,f2257ee9d3a0ea6906c7,7,4,7.999053
29074,fe9c772c995668ea3b75,f4c85e7df7a0a1ef141b,9,4,8.335970
29075,fe9c772c995668ea3b75,f81231bddc60c928c2a2,8,4,7.764472
29076,fe9c772c995668ea3b75,fc4ee74b1a05a70653c9,6,4,7.326481


In [7]:
# os.makedirs(configs.OUTPUT_DIR)
oof_df.to_csv(os.path.join(configs.OUTPUT_DIR, "train_addSVDpp_gfold.csv"), index=False)

# test

In [8]:
test_df = pd.read_csv('/workspace/input/atmaCup15_dataset/test.csv')
test_df['score'] = 0

# Convert the test dataset to the surprise format
test_set = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

predictions = []
for model in models:
    pred = model.test(test_set)
    pred_ = [pred.est for pred in pred]
    predictions.append(pred_)

pred_mean = np.mean(predictions, axis=0)
test_df["svd"] = pred_mean

In [9]:
test_df.to_csv(os.path.join(configs.OUTPUT_DIR, "test_addSVDpp_gfold.csv"), index=False)

In [10]:
test_df

Unnamed: 0,user_id,anime_id,score,svd
0,0008e10fb39e55447333,04068820a73e52dc3b32,0,8.114678
1,0008e10fb39e55447333,04a3d0b122b24965e909,0,8.587596
2,0008e10fb39e55447333,1447fe1f10b59912d6a8,0,6.618760
3,0008e10fb39e55447333,2622632598c68682afd5,0,7.646841
4,0008e10fb39e55447333,2701850c7216516fec46,0,5.870018
...,...,...,...,...
117671,ffe85a36cd20500faa58,f508b02efeac8ecb8cc0,0,8.131319
117672,ffe85a36cd20500faa58,f5b8ecea3beea4b82d79,0,7.645553
117673,ffe85a36cd20500faa58,f6c208226b6b69948053,0,7.395970
117674,ffe85a36cd20500faa58,fe67592c312fc1e17745,0,6.973164


In [11]:
from sklearn.metrics import mean_squared_error
def root_mean_squared_error(y_true, y_pred):
    """mean_squared_error の root (0.5乗)"""
    return mean_squared_error(y_true, y_pred) ** .5

In [12]:
rmse = root_mean_squared_error(oof_df["score"], oof_df["svd"])
print(rmse)

1.4472918697576835
