<a href="https://colab.research.google.com/github/robert-myers/myanimelist-recommender/blob/master/notebooks/SVD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install surprise



In [0]:
import pandas as pd

from sklearn.model_selection import train_test_split

from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise import Reader
# train_test_split comes from scikit-learn (above); surprise's version is not imported.
from surprise.model_selection import cross_validate, GridSearchCV, KFold

# Notebook-wide seed; passed as `random_state=` to the scikit-learn splits below.
random_state = 8_182_868

In [3]:
# Load the ratings table and keep only the (user, item, rating) columns.
df = pd.read_csv(
    "https://s3.us-east-2.amazonaws.com/my.anime.list.sagemaker/surprise/custom_dataset.csv",
    index_col=0,
)[["username", "anime_id", "my_score"]]

# Features are the user/item identifiers; the target is the score.
X = df.drop(columns="my_score")
y = df["my_score"]

# Keep a 1% sample, stratified on the score so its distribution is preserved;
# the remaining 99% is discarded. From here on X and y refer to the sample.
X, _, y, _ = train_test_split(
    X,
    y,
    stratify=y,
    train_size=.01,
    test_size=.99,
    random_state=random_state,
)
sample = pd.concat([X, y], axis=1)

  mask |= (ar1 == a)


In [0]:
# Surprise reads the frame positionally as (user, item, rating), so the
# column order is spelled out explicitly. Ratings are declared to lie in 1..10.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(
    sample[["username", "anime_id", "my_score"]],
    reader,
)

In [5]:
%%time
cross_validate(SVD(), data, cv=2)

CPU times: user 18.1 s, sys: 92.4 ms, total: 18.2 s
Wall time: 18.2 s


{'fit_time': (7.587606191635132, 7.559513330459595),
 'test_mae': array([1.17256245, 1.16578542]),
 'test_rmse': array([1.53556118, 1.52356704]),
 'test_time': (0.8626222610473633, 0.9665155410766602)}

In [6]:
%%time
cross_validate(SVDpp(), data, cv=2)

CPU times: user 32.4 s, sys: 88 ms, total: 32.4 s
Wall time: 32.4 s


{'fit_time': (13.42878770828247, 13.746953010559082),
 'test_mae': array([1.18388135, 1.18248963]),
 'test_rmse': array([1.54040196, 1.53759737]),
 'test_time': (1.6636502742767334, 1.6989991664886475)}

In [7]:
%%time
cross_validate(NMF(), data, cv=2)

CPU times: user 26.9 s, sys: 113 ms, total: 27.1 s
Wall time: 27 s


{'fit_time': (11.913732290267944, 11.407222032546997),
 'test_mae': array([2.09566739, 2.09455034]),
 'test_rmse': array([2.49705968, 2.49679394]),
 'test_time': (0.8375627994537354, 0.880028486251831)}

In [8]:
%%time
param_grid = {'n_epochs': [20], 'lr_all': [0.005],
              'reg_all': [0.02]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=10)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.4923450940024015
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}
CPU times: user 2min 19s, sys: 278 ms, total: 2min 19s
Wall time: 2min 19s


In [9]:
# Take the SVD instance configured with the best-RMSE parameters.
algo = gs.best_estimator['rmse']

# 80/20 hold-out split of the sampled ratings, stratified on the score.
# NOTE(review): the grid search above was fitted on `data`, which was built
# from the whole sample, i.e. it also saw the rows held out here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    train_size=.8,
    test_size=.2,
    random_state=random_state,
)

# Re-assemble the training rows as (user, item, rating) and wrap them for Surprise.
train = Dataset.load_from_df(pd.concat([X_train, y_train], axis=1), reader)

# Fit on every training row (no internal folds); the fitted model is the cell output.
algo.fit(train.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2c0e0be128>

In [10]:
# Spot check: predicted score for the first held-out (user, anime) pair,
# shown next to the score that user actually gave.
(
    algo.predict(
        X_test["username"].iloc[0],
        X_test["anime_id"].iloc[0],
    ),
    y_test.iloc[0],
)

(Prediction(uid=1046549, iid=16518, r_ui=None, est=6.370700041043704, details={'was_impossible': False}),
 7)

In [0]:
# Load the pre-built one-percent train/test tables (first column is the index).
# Note that `train` is rebound here: it no longer refers to the Surprise dataset
# created earlier in the notebook.
train, test = (
    pd.read_csv(url, index_col=0)
    for url in (
        "https://s3.us-east-2.amazonaws.com/my.anime.list.sagemaker/sagemaker/train_sagemaker_one_percent.csv",
        "https://s3.us-east-2.amazonaws.com/my.anime.list.sagemaker/sagemaker/test_sagemaker_one_percent.csv",
    )
)

In [12]:
# Number of missing values in each column of the training table.
train.isna().sum(axis=0)

userID                          0
itemID                          0
my_score                        0
username                        0
user_watching                   0
user_completed                  0
user_onhold                     0
user_dropped                    0
user_plantowatch                0
user_days_spent_watching        0
gender                          0
location                        5
birth_date                      0
join_date                       0
last_online                     0
stats_mean_score                0
stats_rewatched                 0
stats_episodes                  0
title                           0
title_english               24487
title_japanese                  0
title_synonyms              35450
image_url                       0
type                            0
source                          0
episodes                        0
status                          0
airing                          0
aired_string                    0
aired         

In [13]:
# Peek at a single test row to see the available columns.
# NOTE(review): the displayed row includes user profile fields (username,
# gender, location, birth_date) — consider clearing this output before sharing.
test.head(n=1)

Unnamed: 0,userID,itemID,my_score,username,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,airing,aired_string,aired,duration,rating,score,scored_by,rank,popularity,members,favorites,background,premiered,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme,duration_min,aired_from_year
3133147,2155849,13663,7,DeathDot,0,106,2,0,5,61.1125,Male,"Bahrain, Arad",1995-12-26 00:00:00,2013-02-08 00:00:00,2018-02-10 08:16:33,7.64,2.0,3677,To LOVE-Ru Darkness,To LOVE Ru Darkness,To LOVEる -とらぶる- ダークネス,"To LOVE-Ru Trouble Darkness, To-Love-Ru Darkne...",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,False,"Oct 6, 2012 to Dec 29, 2012","{'from': '2012-10-06', 'to': '2012-12-29'}",23 min. per ep.,R+ - Mild Nudity,7.71,103748,1091.0,405,189198,1634,,Fall 2012,Unknown,"{'Adaptation': [{'mal_id': 22519, 'type': 'man...","Geneon Universal Entertainment, Magic Capsule,...",Sentai Filmworks,Xebec,"Comedy, Ecchi, Harem, Romance, School, Sci-Fi,...","['""RAKUEN PROJECT (楽園PROJECT)"" by Ray']","['""Foul Play ni Kurari (ファールプレーにくらり)"" by Kanon...",23.0,2012.0
