In [4]:
import os

import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import cross_validate

from sklearn.model_selection import StratifiedGroupKFold 

In [5]:
EXP_NAME = "svdpp"


class configs:
    """Constants for this experiment: a seed value plus input and output paths."""

    # Random seed constant.
    SEED = 42

    # Directory holding the competition data, and the files inside it.
    INPUT_DIR = os.path.join("/workspace", "input", "atmaCup15_dataset")
    # NOTE(review): the data-loading cell reads "train_stratifiedkfold.csv" from
    # INPUT_DIR rather than this path — confirm which split file is intended.
    TRAIN_CSV = os.path.join(INPUT_DIR, "train_stratifiedgroupkfold.csv")
    TEST_CSV = os.path.join(INPUT_DIR, "test.csv")
    ANIME_CSV = os.path.join(INPUT_DIR, "anime.csv")
    SAMPLE_SUB_CSV = os.path.join(INPUT_DIR, "sample_submission.csv")

    # Directory the notebook writes its CSV outputs to; named after the experiment.
    OUTPUT_DIR = os.path.join("/workspace", "working", EXP_NAME)

In [6]:
# Load the training ratings; the file is expected to carry a "fold" column.
# NOTE(review): this reads "train_stratifiedkfold.csv", whereas configs.TRAIN_CSV
# names "train_stratifiedgroupkfold.csv" — confirm which split is intended.
train_df = pd.read_csv(os.path.join(configs.INPUT_DIR, "train_stratifiedkfold.csv"))
reader = Reader(rating_scale=(1, 10))

# Fit one SVD++ model per fold, each trained on every row *outside* that fold.
# `models` is ordered by ascending fold label.
models = []
for fold in sorted(train_df["fold"].unique()):
    print("fold", fold)
    train_df_ = train_df[train_df["fold"] != fold].reset_index(drop=True)
    train_data = Dataset.load_from_df(train_df_[['user_id', 'anime_id', 'score']], reader)
    # Seed the model so Restart & Run All reproduces the same fit.
    model = SVDpp(random_state=configs.SEED)
    model.fit(train_data.build_full_trainset())
    models.append(model)

fold 0
fold 1
fold 2
fold 3
fold 4


In [7]:
# Out-of-fold predictions: score each fold's rows with the model that was trained
# without them, and store the estimate in a new "svd" column.
# `models` was built in ascending fold-label order, so pair on the labels
# themselves instead of assuming they are exactly 0..n-1.
fold_labels = sorted(train_df["fold"].unique())
assert len(fold_labels) == len(models), "expected exactly one model per fold"

oof_parts = []
rows_done = 0
for fold, model in zip(fold_labels, models):
    test_df_ = train_df[train_df["fold"] == fold].reset_index(drop=True)
    # Build the testset as (user, item, rating) tuples in DataFrame row order.
    # Round-tripping through build_full_trainset().build_testset() emits ratings
    # grouped by user, which misaligns predictions with rows whenever a user's
    # rows are not contiguous.
    testset = list(
        test_df_[['user_id', 'anime_id', 'score']].itertuples(index=False, name=None)
    )
    oof_pred = model.test(testset)
    test_df_["svd"] = [p.est for p in oof_pred]
    oof_parts.append(test_df_)
    rows_done += len(test_df_)
    if len(oof_parts) > 1:
        # Running row count, printed from the second fold onwards.
        print(rows_done)

# Concatenate once at the end instead of growing the frame inside the loop.
oof_df = pd.concat(oof_parts, axis=0) if oof_parts else pd.DataFrame()

54561
81841
109121
136401


In [8]:
# Every training row must have received exactly one out-of-fold prediction.
assert len(oof_df) == len(train_df), "OOF frame does not cover every training row"
len(train_df), len(oof_df)

(136401, 136401)

In [9]:
# Display the out-of-fold frame; "svd" holds the held-out SVD++ estimate per row.
oof_df


Unnamed: 0,user_id,anime_id,score,fold,svd
0,0008e10fb39e55447333,2290175205d55e81b197,8,0,7.800858
1,0008e10fb39e55447333,68241332330cbefa2b9e,9,0,6.984552
2,0008e10fb39e55447333,6c7ce24e0e4f56e0aac0,10,0,8.065854
3,0008e10fb39e55447333,741d2cba7471560b418a,9,0,7.216533
4,0008e10fb39e55447333,a1b5af1838c1af21fb99,2,0,4.090316
...,...,...,...,...,...
27275,ffa6ff8006f8630f3d11,35d87d3bdeed620ef335,8,4,8.141502
27276,ffa6ff8006f8630f3d11,963d6e728205d1311231,7,4,7.003046
27277,ffa6ff8006f8630f3d11,b22bd52bd814b6ed3ac4,7,4,7.592110
27278,ffa6ff8006f8630f3d11,d922517dbe7fc6d6ff64,7,4,6.778592


In [11]:
# Make sure the output directory exists before writing; exist_ok=True keeps the
# cell re-runnable when the directory is already there.
os.makedirs(configs.OUTPUT_DIR, exist_ok=True)
oof_df.to_csv(os.path.join(configs.OUTPUT_DIR, "train_addSVDpp.csv"), index=False)

# Test-set prediction

In [12]:
test_df = pd.read_csv(configs.TEST_CSV)
# The testset tuples need a rating slot; fill it with a placeholder of 0.
# The value also ends up as a "score" column in the saved CSV.
test_df['score'] = 0

# Build the testset as (user, item, rating) tuples in DataFrame row order, with
# the columns named explicitly. Round-tripping through
# build_full_trainset().build_testset() emits ratings grouped by user, which
# misaligns predictions with rows whenever a user's rows are not contiguous.
test_set = list(
    test_df[['user_id', 'anime_id', 'score']].itertuples(index=False, name=None)
)

# One prediction vector per fold model, each in test_df row order.
predictions = []
for model in models:
    pred = model.test(test_set)
    pred_ = [p.est for p in pred]
    predictions.append(pred_)

# Average the fold models' estimates row by row.
pred_mean = np.mean(predictions, axis=0)
test_df["svd"] = pred_mean

In [13]:
# Make sure the output directory exists so this cell does not depend on an
# earlier cell (or a manual step) having created it.
os.makedirs(configs.OUTPUT_DIR, exist_ok=True)
test_df.to_csv(os.path.join(configs.OUTPUT_DIR, "test_addSVDpp.csv"), index=False)

In [14]:
# Display the test frame; "score" is the placeholder 0 and "svd" is the fold-averaged estimate.
test_df

Unnamed: 0,user_id,anime_id,score,svd
0,0008e10fb39e55447333,04068820a73e52dc3b32,0,8.095059
1,0008e10fb39e55447333,04a3d0b122b24965e909,0,8.474763
2,0008e10fb39e55447333,1447fe1f10b59912d6a8,0,6.309822
3,0008e10fb39e55447333,2622632598c68682afd5,0,7.674819
4,0008e10fb39e55447333,2701850c7216516fec46,0,5.383076
...,...,...,...,...
117671,ffe85a36cd20500faa58,f508b02efeac8ecb8cc0,0,8.191629
117672,ffe85a36cd20500faa58,f5b8ecea3beea4b82d79,0,7.662358
117673,ffe85a36cd20500faa58,f6c208226b6b69948053,0,7.389410
117674,ffe85a36cd20500faa58,fe67592c312fc1e17745,0,6.965954


In [15]:
from sklearn.metrics import mean_squared_error
def root_mean_squared_error(y_true, y_pred):
    """Return the square root (power 0.5) of mean_squared_error(y_true, y_pred)."""
    mse = mean_squared_error(y_true, y_pred)
    return mse ** 0.5

In [16]:
# RMSE of the out-of-fold "svd" estimates against the true "score" column.
rmse = root_mean_squared_error(y_true=oof_df["score"], y_pred=oof_df["svd"])
print(rmse)

1.1907176103307435
