In [1]:
# pip install scikit-surprise
# In the Windows environment, it is necessary to install the 'Desktop development with C++' workload of the Visual Studio in advance.
from surprise import Reader, Dataset, KNNBaseline, SVD, NMF, SlopeOne, CoClustering
from surprise.model_selection import KFold, cross_validate
import pandas as pd
import numpy as np

## 1. 데이터 로딩 / データのロード

- Excludes unrated data (rating=-1).

In [2]:
# Anime catalogue, ordered by anime_id with a fresh 0..n-1 index.
anime_df = (
    pd.read_csv("anime.csv")
    .sort_values(by="anime_id")
    .reset_index(drop=True)
)
# Keep only explicit ratings: rows with rating == -1 are unrated and dropped.
ratings_df = (
    pd.read_csv("rating.csv")
    .loc[lambda frame: frame["rating"] != -1]
)

- If load_from_df() is used, only the rating_scale parameter is required.
- https://surprise.readthedocs.io/en/stable/dataset.html#surprise.dataset.Dataset.load_from_df

In [3]:
# Declare the rating scale as 1-10 (the only Reader setting used here).
reader = Reader(rating_scale=(1, 10))
# NOTE(review): load_from_df presumably reads the three columns positionally as
# user id, item id, rating (see the linked docs) - confirm ratings_df keeps that order.
dataset = Dataset.load_from_df(ratings_df, reader)

- build_full_trainset() uses the entire dataset as training data (no hold-out split).
- https://surprise.readthedocs.io/en/stable/dataset.html#surprise.dataset.DatasetAutoFolds.build_full_trainset

In [4]:
# No train/test split at this stage: every rating in `dataset` goes into the trainset.
trainset = dataset.build_full_trainset()

## 2. 아이템 기반 추천 / アイテムベースレコメンド

### Fit a model

- Use a simple k-NN algorithm (KNNBaseline).
- https://surprise.readthedocs.io/en/stable/knn_inspired.html#pred-package-knn-inpired
- Specifying the user_based as false will result in item-based recommendation.
- Using the "pearson_baseline" similarity measure is recommended.
- https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline

In [5]:
# Alternative kept for reference (plain cosine similarity):
# sim_options = {"name": "cosine", "user_based": False}
# user_based=False makes the similarities item-item (animation vs. animation).
# NOTE(review): min_support=5 is presumably the minimum number of common raters
# required for a similarity to be kept - confirm against the Surprise docs.
sim_options = {"name": "pearson_baseline", "min_support": 5, "user_based": False}
model = KNNBaseline(k=40, sim_options=sim_options)
# Last expression of the cell, so the fitted model's repr is displayed below.
model.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1ca4c31f588>

- Item-item similarities have been computed: the similarity matrix has one row and one column per rated animation.

In [6]:
# Number of distinct animations that appear in the filtered ratings.
len(ratings_df["anime_id"].unique())

9927

In [7]:
# Shape of the fitted similarity matrix; compare with the distinct-animation count.
model.sim.shape

(9927, 9927)

### Recommend animations similar to the animation specified

- If you want to know only IDs, you may use get_neighbors().
- Since methods use an ID system called the inner id, it is necessary to convert the raw id (=anime_id) to the inner id.
- In addition, it is necessary to return the inner id to the raw id in order to display the recommended results.
- https://surprise.readthedocs.io/en/stable/FAQ.html#raw-inner-note
- Recommending obscure animations is not useful, so a minimum threshold on `members` is applied.

In [8]:
def item_based_knn(anime_df, model, anime_name, top_n=10, members_threshold=10000):
    """Recommend the animations most similar to ``anime_name``.

    Args:
        anime_df: DataFrame with at least the columns ``anime_id``, ``name``
            and ``members``.
        model: Fitted item-based model exposing ``sim`` (similarity matrix
            indexed by inner item id) and ``trainset`` (providing
            ``to_inner_iid`` and ``to_raw_iid``).
        anime_name: Exact value of the ``name`` column to look up.
        top_n: Maximum number of recommendations to return.
        members_threshold: Minimum ``members`` value a candidate must have.

    Returns:
        DataFrame with the columns ``anime_name`` and ``similarity``, most
        similar first, containing at most ``top_n`` rows.

    Raises:
        IndexError: If ``anime_name`` does not occur in ``anime_df``.
    """
    matching_ids = anime_df.loc[anime_df["name"] == anime_name, "anime_id"].values
    if len(matching_ids) == 0:
        # Same exception type as a bare ``.values[0]`` lookup, but with a readable message.
        raise IndexError("anime_name {!r} was not found in anime_df".format(anime_name))
    # The model works with inner ids, so translate the raw anime_id first.
    anime_inner_id = model.trainset.to_inner_iid(matching_ids[0])

    sim = model.sim[anime_inner_id]
    # Descending similarity. NaN entries sort to the front after the reversal
    # and are skipped explicitly in the loop.
    sim_inner_id = sim.argsort()[::-1]

    # Build the id -> name / members lookups once instead of filtering the
    # whole DataFrame for every neighbour. The first row wins on duplicate ids.
    catalog = anime_df.drop_duplicates(subset="anime_id")
    name_by_id = dict(zip(catalog["anime_id"], catalog["name"]))
    members_by_id = dict(zip(catalog["anime_id"], catalog["members"]))

    rec_anime_names = []
    rec_anime_scores = []
    for idx in sim_inner_id:
        # Checked before appending so that top_n=0 yields an empty result.
        if len(rec_anime_names) >= top_n:
            break
        if idx == anime_inner_id or np.isnan(sim[idx]):
            continue
        raw_id = model.trainset.to_raw_iid(idx)
        # Rated animations that are missing from anime_df have no name or
        # members value; skip them instead of failing with an IndexError.
        if raw_id not in members_by_id:
            continue
        if members_by_id[raw_id] >= members_threshold:
            rec_anime_names.append(name_by_id[raw_id])
            rec_anime_scores.append(sim[idx])

    return pd.DataFrame({"anime_name": rec_anime_names, "similarity": rec_anime_scores})

- Entering "Slam Dunk" returned sports animations such as "Hajime no Ippo" (はじめの一歩).

In [9]:
# Defaults apply: top_n=10 and members_threshold=10000.
item_based_knn(anime_df, model, "Slam Dunk")

Unnamed: 0,anime_name,similarity
0,Hajime no Ippo,0.290061
1,Hunter x Hunter,0.258532
2,Initial D First Stage,0.253423
3,Monster,0.242074
4,Gintama,0.22841
5,Yuu☆Yuu☆Hakusho,0.226722
6,Initial D Fourth Stage,0.223575
7,Shonan Junai Gumi!,0.219782
8,Berserk,0.218339
9,Initial D Second Stage,0.208931


- For "K-On!" (けいおん!), other entries of the same series ranked highest.

In [10]:
# Defaults apply: top_n=10 and members_threshold=10000.
item_based_knn(anime_df, model, "K-On!")

Unnamed: 0,anime_name,similarity
0,K-On!!,0.722588
1,K-On!: Live House!,0.591782
2,K-On! Movie,0.504559
3,K-On!!: Keikaku!,0.484685
4,K-On!: Ura-On!,0.236541
5,Lucky☆Star,0.236119
6,A-Channel,0.210934
7,Shinryaku! Ika Musume,0.190845
8,Lucky☆Star: Original na Visual to Animation,0.186191
9,Kanamemo,0.171858


## 3. 사용자 기반 추천 / ユーザーベースレコメンド

### Fit a model

- This time, let's try SVD (Singular Value Decompositions).
- https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD

In [4]:
# Seed the factorization with random_state=0.
# NOTE(review): this rebinds `model`, replacing the KNNBaseline fitted earlier.
# item_based_knn reads model.sim, so those calls presumably need the KNN model -
# confirm before re-running them after this cell.
model = SVD(random_state=0)
# Last expression of the cell, so the fitted model's repr is displayed below.
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1efd5a990f0>

### Extract animations not rated by the user

- Exclude animations rated by users from the full list of animations.

In [5]:
# Example user whose unrated animations will be scored.
user_id = 12
anime_ids = ratings_df["anime_id"].unique()
# Deliberately not named `rated_by_user`: that name belongs to the helper
# function defined later in this notebook, and binding a Series to it here
# would shadow the function whenever this cell is re-run.
rated_anime_ids = ratings_df.loc[ratings_df["user_id"] == user_id, "anime_id"]
# Every rated animation minus the ones this user has already rated.
not_rated_by_user = np.setdiff1d(anime_ids, rated_anime_ids)
not_rated_by_user

array([    1,     5,     6, ..., 34349, 34367, 34475], dtype=int64)

### Construct a test set to predict the rating

- For example, for user_id=12 (uid), the rating to anime_id=1 (iid) is predicted to be 9.4 (est).
- The r_ui is the value set when creating the test set.
- When the data is split into a trainset and a testset, r_ui can be used to measure the prediction accuracy.

In [6]:
# Each testset entry is [uid, iid, r_ui]. The true rating of an unrated animation
# is unknown, so r_ui is a placeholder (10); it is echoed back on the Prediction
# but, per the original author's note, is not used and has no effect on the estimate.
testset = [[user_id, anime_id, 10] for anime_id in not_rated_by_user]
predictions = model.test(testset)
# Show a single Prediction as a sample.
predictions[0]

Prediction(uid=12, iid=1, r_ui=10, est=9.425257830181703, details={'was_impossible': False})

### Recommend animations that the specified user has never rated

- Recommend animations in order of high predicted rating.

In [7]:
def user_based_recommendation(anime_df, ratings_df, model, user_id, top_n=10, members_threshold=10000):
    """Rank the animations a user has not rated by their predicted rating.

    Candidates are the animations present in ``ratings_df``, minus those whose
    ``members`` value in ``anime_df`` is below ``members_threshold`` and minus
    those already rated by ``user_id``. Each candidate is scored through
    ``model.test`` and the ``top_n`` highest predictions are returned.

    Returns:
        DataFrame with the columns ``name`` and ``rating_prediction``, sorted
        by predicted rating in descending order.
    """
    unpopular_ids = anime_df.loc[anime_df["members"] < members_threshold, "anime_id"]
    candidate_ids = np.setdiff1d(ratings_df["anime_id"].unique(), unpopular_ids)
    seen_ids = ratings_df.loc[ratings_df["user_id"] == user_id, "anime_id"]
    unseen_ids = np.setdiff1d(candidate_ids, seen_ids)

    # The third element (10) is only a placeholder for the unknown true rating.
    predictions = model.test([[user_id, anime_id, 10] for anime_id in unseen_ids])

    # Position 1 of a prediction is the item id, position 3 the estimated rating.
    scored_pairs = [(pred[1], pred[3]) for pred in predictions]
    scored = pd.DataFrame({
        "anime_id": [item_id for item_id, _ in scored_pairs],
        "rating_prediction": [estimate for _, estimate in scored_pairs],
    })

    named = pd.merge(scored, anime_df, on="anime_id")[["name", "rating_prediction"]]
    return named.sort_values(by="rating_prediction", ascending=False)[:top_n]

- Make recommendations for user_id=12.
- First, let's check out the animations that the user rated highly.

In [8]:
def rated_by_user(anime_df, ratings_df, user_id):
    """List the animations rated by ``user_id``, highest rating first.

    Ratings are averaged per ``anime_id`` and joined to ``anime_df`` to obtain
    the title; animations without a match in ``anime_df`` are dropped by the
    inner join.

    Returns:
        DataFrame with the columns ``name`` and ``rating``.
    """
    is_target_user = ratings_df["user_id"] == user_id
    mean_ratings = (
        ratings_df.loc[is_target_user, ["anime_id", "rating"]]
        .groupby("anime_id")
        .mean()
        .reset_index()
    )
    titles = anime_df[["anime_id", "name"]]
    named = mean_ratings.merge(titles, on="anime_id")
    return named[["name", "rating"]].sort_values(by="rating", ascending=False)

In [9]:
# The result is sorted by rating (descending), so .head() shows the top five.
rated_by_user(anime_df, ratings_df, user_id=12).head()

Unnamed: 0,name,rating
11,Code Geass: Hangyaku no Lelouch R2,10
7,Death Note,10
19,Steins;Gate,10
9,Code Geass: Hangyaku no Lelouch,10
21,Shingeki no Kyojin,10


- Several "Gintama" (銀魂) entries were recommended, but the reason is unclear.

In [10]:
# Defaults apply: top_n=10 and members_threshold=10000.
user_based_recommendation(anime_df, ratings_df, model, user_id=12)

Unnamed: 0,name,rating_prediction
484,Ginga Eiyuu Densetsu,10.0
2599,Gintama°,10.0
548,Gintama,9.909467
2397,Shigatsu wa Kimi no Uso,9.894069
2015,Gintama&#039;: Enchousen,9.862311
1757,Hunter x Hunter (2011),9.855054
2802,Kimi no Na wa.,9.835394
1616,Gintama&#039;,9.834163
195,Hajime no Ippo,9.792984
21,Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...,9.790124


## 4. (부록) 모델간 비교 / （おまけ）モデルの比較

- Compare the prediction accuracy of the rating by dividing the data into training set and testing set.
- The n_splits is specified as 2 because processing takes too long on a local PC.

In [9]:
# Only two folds to keep the runtime manageable on a local PC.
# random_state=0 is passed, presumably so the split is the same on every run.
kf = KFold(n_splits=2, random_state=0)

In [6]:
# Trial run with a single algorithm; the result is a dict holding per-fold
# test_rmse, fit_time and test_time.
# NOTE(review): CoClustering() is created without random_state, and the RMSE
# shown here differs from the CoClustering row of the comparison table further
# down - results are presumably not reproducible from run to run.
out = cross_validate(CoClustering(), dataset, measures=["RMSE"], cv=kf, verbose=False)
out

{'test_rmse': array([1.22741366, 1.2335489 ]),
 'fit_time': (97.49518871307373, 99.2642011642456),
 'test_time': (48.79941368103027, 48.67022490501404)}

In [7]:
# Average every metric over the folds.
df = pd.DataFrame.from_dict(out).mean(axis=0)
# Series.append was removed in pandas 2.0; pd.concat builds the same Series
# (the fold-averaged metrics followed by the algorithm label).
df = pd.concat([df, pd.Series("CoClustering", index=["algorithm"])])
df

test_rmse         1.23048
fit_time          98.3797
test_time         48.7348
algorithm    CoClustering
dtype: object

In [10]:
# Seed every algorithm that accepts random_state so the comparison can be
# reproduced; SlopeOne takes no seed.
algorithms = {
    "SVD": SVD(random_state=0),
    "NMF": NMF(random_state=0),
    "SlopeOne": SlopeOne(),
    "CoClustering": CoClustering(random_state=0),
}
table = []
for name, algo in algorithms.items():
    out = cross_validate(algo, dataset, measures=["RMSE"], cv=kf, verbose=False)
    # One plain record per algorithm: fold-averaged metrics plus the label.
    # This avoids Series.append (removed in pandas 2.0) and keeps the metric
    # columns numeric instead of object dtype.
    record = {metric: np.mean(values) for metric, values in out.items()}
    record["algorithm"] = name
    table.append(record)

# Best (lowest) RMSE first.
result = pd.DataFrame(table).set_index("algorithm").sort_values("test_rmse")
result

Unnamed: 0_level_0,test_rmse,fit_time,test_time
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.183367,174.656221,53.847728
SlopeOne,1.203926,38.611949,934.643517
CoClustering,1.219341,102.943714,413.16963
NMF,2.227855,194.289427,93.691718
