<a href="https://colab.research.google.com/github/robert-myers/myanimelist-recommender/blob/master/notebooks/mal_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
pip install jikanpy && pip install scikit-surprise && pip install --upgrade pandas

In [0]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import ast
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import surprise as sp

from jikanpy import Jikan
from surprise import Dataset, NormalPredictor, Reader
from surprise.model_selection import cross_validate, train_test_split

In [0]:
jikan = Jikan()

In [0]:
%%time

anime_lists_df = pd.read_csv("https://s3.us-east-2.amazonaws.com/my.anime.list.sagemaker/myanimelist/animelists_cleaned.csv",
                             parse_dates=["my_last_updated"],
                             ).drop(columns=["my_start_date", "my_finish_date"])
anime_lists_df = anime_lists_df[anime_lists_df["my_score"] != 0]
anime_lists_df = anime_lists_df[anime_lists_df["username"].notna()]

CPU times: user 52.2 s, sys: 9.92 s, total: 1min 2s
Wall time: 1min 28s


In [0]:
anime_lists_by_user = anime_lists_df.groupby("username")
anime_lists_by_anime = anime_lists_df.groupby("anime_id")

In [0]:
users_df = pd.read_csv("https://s3.amazonaws.com/my.anime.list/myanimelist/users_cleaned.csv",
                       parse_dates=["birth_date",	"join_date",	"last_online"],
                       index_col="user_id",
                       ).drop(columns=["access_rank"])

In [0]:
anime_df = pd.read_csv("https://s3.us-east-2.amazonaws.com/my.anime.list.sagemaker/myanimelist/anime_cleaned.csv",
                       index_col="anime_id",
                       )

In [0]:
def lit_eval(row):
  try:
    return ast.literal_eval(row)
  except:
    return ast.literal_eval(str(row))

anime_df["aired"] = anime_df["aired"].apply(lambda row: lit_eval(row))
anime_df["aired_from"] = anime_df["aired"].apply(lambda x: x["from"])
anime_df["aired_to"] = anime_df["aired"].apply(lambda x: x["to"])
anime_df["aired_from"] = pd.to_datetime(anime_df["aired_from"])
anime_df["aired_to"] = pd.to_datetime(anime_df["aired_to"])
# anime_df["aired_from_year"] = pd.to_datetime(anime_df["aired_from_year"], format="%Y")
anime_df.drop(columns=["aired_string", "aired", "aired_from_year"], inplace=True)

In [0]:
custom_df = pd.read_csv("https://s3.amazonaws.com/my.anime.list/surprise/custom_dataset.csv").drop(columns="Unnamed: 0")
custom_df.rename(columns={"username": "userID", "anime_id": "itemID", "my_score": "rating"}, inplace=True)
custom_df = custom_df[['userID', 'itemID', 'rating']]

In [0]:
custom_df.tail()

Unnamed: 0,userID,itemID,rating
19171945,4862000,15611,9
19171946,4862000,27815,9
19171947,299167,5945,8
19171948,263803,1316,9
19171949,48074,1744,10


## anime_lists_df

In [0]:
anime_lists_df.isna().sum()

username                      0
anime_id                      0
my_watched_episodes           0
my_score                      0
my_status                     0
my_rewatching           5059315
my_rewatching_ep              0
my_last_updated               0
my_tags                17522052
dtype: int64

In [0]:
train_data, test_data, _ = np.split(data.sample(frac=1, random_state=123), 
                                                  [int(0.95 * len(data)), int(len(data))])

In [0]:
anime_lists_df

Unnamed: 0,username,anime_id,my_watched_episodes,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,karthiga,21,586,9,1,,0,2013-03-03 10:52:53,
1,karthiga,59,26,7,2,,0,2013-03-10 13:54:51,
2,karthiga,74,26,7,2,,0,2013-04-27 16:43:35,
3,karthiga,120,26,7,2,,0,2013-03-03 10:53:57,
4,karthiga,178,26,7,2,0.0,0,2013-03-27 15:59:13,
...,...,...,...,...,...,...,...,...,...
31284025,Yokonightcore,15611,48,9,1,,0,2015-09-07 17:33:03,
31284026,Yokonightcore,27815,22,9,1,,0,2015-09-07 17:32:05,
31284027,wargod,5945,39,8,2,0.0,0,2010-03-29 04:24:12,
31284028,JMc_SetoKai_LoVe,1316,52,9,2,,0,2009-12-23 05:45:14,


In [0]:
anime_lists_by_user.get_group("hinogurl_mikha")

Unnamed: 0,username,anime_id,my_watched_episodes,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
31284029,hinogurl_mikha,1744,58,10,1,0.0,0,2008-04-05 11:36:20,


In [0]:
anime_lists_by_anime["my_score"].mean().sort_values(ascending=False).head(10)

anime_id
5114     9.217382
28977    9.177290
9969     9.136981
9253     9.129402
32281    9.122810
15335    9.094883
4181     9.085412
15417    9.072807
11061    9.053900
37179    9.000000
Name: my_score, dtype: float64

## users_df

In [0]:
custom_df["userID"].nunique()

106401

In [0]:
users_df.isna().sum()

username                    1
user_watching               0
user_completed              0
user_onhold                 0
user_dropped                0
user_plantowatch            0
user_days_spent_watching    0
gender                      0
location                    5
birth_date                  0
join_date                   0
last_online                 0
stats_mean_score            0
stats_rewatched             0
stats_episodes              0
dtype: int64

In [0]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108711 entries, 2255153 to 1289601
Data columns (total 15 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   username                  108710 non-null  object        
 1   user_watching             108711 non-null  int64         
 2   user_completed            108711 non-null  int64         
 3   user_onhold               108711 non-null  int64         
 4   user_dropped              108711 non-null  int64         
 5   user_plantowatch          108711 non-null  int64         
 6   user_days_spent_watching  108711 non-null  float64       
 7   gender                    108711 non-null  object        
 8   location                  108706 non-null  object        
 9   birth_date                108711 non-null  datetime64[ns]
 10  join_date                 108711 non-null  datetime64[ns]
 11  last_online               108711 non-null  datetime64[ns]


In [0]:
users_df

Unnamed: 0_level_0,username,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2255153,karthiga,3,49,1,0,0,55.091667,Female,"Chennai, India",1990-04-29,2013-03-03,2014-02-04 01:32:00,7.43,0.0,3391
37326,Damonashu,45,195,27,25,59,82.574306,Male,"Detroit,Michigan",1991-08-01,2008-02-13,2017-07-10 06:52:54,6.15,6.0,4903
228342,bskai,25,414,2,5,11,159.483333,Male,"Nayarit, Mexico",1990-12-14,2009-08-31,2014-05-12 16:35:00,8.27,1.0,9701
327311,terune_uzumaki,5,5,0,0,0,11.394444,Female,"Malaysia, Kuantan",1998-08-24,2010-05-10,2012-10-18 19:06:00,9.70,6.0,697
5015094,Bas_G,35,114,6,20,175,30.458333,Male,"Nijmegen, Nederland",1999-10-24,2015-11-26,2018-05-10 20:53:37,7.86,0.0,1847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
797785,isoann,14,213,11,20,136,64.372222,Male,Poland:,1997-01-13,2011-10-23,2018-02-25 01:20:27,8.03,2.0,3939
1396285,bumcakee,13,116,5,9,22,41.365972,Female,"nhollywood, california",1998-07-18,2012-07-14,2017-02-14 00:30:00,8.86,24.0,2537
2478991,Scarlet95,6,103,10,8,54,46.827083,Female,Belgium,1995-10-17,2013-04-24,2016-12-18 08:41:00,7.40,1.0,2869
3975907,Torasori,22,239,0,4,176,72.361111,Male,"Latvia, Riga",1998-11-18,2014-07-30,2018-05-24 21:34:46,8.98,47.0,4469


## anime_df

In [0]:
anime_df.isna().sum()

title                0
title_english     3230
title_japanese       5
title_synonyms    2187
image_url            2
type                 0
source               0
episodes             0
status               0
airing               0
duration             0
rating               0
score                0
scored_by            0
rank               356
popularity           0
members              0
favorites            0
background        5855
premiered         3702
broadcast         3688
related              0
producer          2266
licensor          3881
studio               0
genre                4
opening_theme        0
ending_theme         0
duration_min         0
aired_from         115
aired_to           292
dtype: int64

In [0]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6668 entries, 11013 to 36315
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   title           6668 non-null   object        
 1   title_english   3438 non-null   object        
 2   title_japanese  6663 non-null   object        
 3   title_synonyms  4481 non-null   object        
 4   image_url       6666 non-null   object        
 5   type            6668 non-null   object        
 6   source          6668 non-null   object        
 7   episodes        6668 non-null   int64         
 8   status          6668 non-null   object        
 9   airing          6668 non-null   bool          
 10  duration        6668 non-null   object        
 11  rating          6668 non-null   object        
 12  score           6668 non-null   float64       
 13  scored_by       6668 non-null   int64         
 14  rank            6312 non-null   float64       
 15 

In [0]:
anime_df.head(1)

Unnamed: 0_level_0,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,airing,duration,rating,score,scored_by,rank,popularity,members,favorites,background,premiered,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme,duration_min,aired_from,aired_to
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,False,24 min. per ep.,PG-13 - Teens 13 or older,7.63,139250,1274.0,231,283882,2809,Inu x Boku SS was licensed by Sentai Filmworks...,Winter 2012,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ...",24.0,2012-01-13,2012-03-30


In [0]:
anime_df.loc[2104]

title                                               Seto no Hanayome
title_english                                  My Bride is a Mermaid
title_japanese                                                 瀬戸の花嫁
title_synonyms                                  The Inland Sea Bride
image_url          https://myanimelist.cdn-dena.com/images/anime/...
type                                                              TV
source                                                         Manga
episodes                                                          26
status                                               Finished Airing
airing                                                         False
aired_string                              Apr 2, 2007 to Oct 1, 2007
aired                     {'from': '2007-04-02', 'to': '2007-10-01'}
duration                                             24 min. per ep.
rating                                     PG-13 - Teens 13 or older
score                             

In [0]:
anime_df["rating"].unique()

array(['PG-13 - Teens 13 or older', 'PG - Children', 'G - All Ages',
       'R+ - Mild Nudity', 'R - 17+ (violence & profanity)', 'None',
       'Rx - Hentai'], dtype=object)

In [0]:
anime_lists_by_anime.get_group(11013)["my_score"].mean()

7.642763330706593

In [0]:
for k, v in jikan.anime(2104).items():
  print(k, v)

request_hash request:anime:cf2a8fdd20221a9ad08fb1b3d28089e4af32791f
request_cached True
request_cache_expiry 46197
mal_id 2104
url https://myanimelist.net/anime/2104/Seto_no_Hanayome
image_url https://cdn.myanimelist.net/images/anime/13/58383.jpg
trailer_url https://www.youtube.com/embed/gqgk9frneoA?enablejsapi=1&wmode=opaque&autoplay=1
title Seto no Hanayome
title_english My Bride is a Mermaid
title_japanese 瀬戸の花嫁
title_synonyms ['The Inland Sea Bride']
type TV
source Manga
episodes 26
status Finished Airing
airing False
aired {'from': '2007-04-02T00:00:00+00:00', 'to': '2007-10-01T00:00:00+00:00', 'prop': {'from': {'day': 2, 'month': 4, 'year': 2007}, 'to': {'day': 1, 'month': 10, 'year': 2007}}, 'string': 'Apr 2, 2007 to Oct 1, 2007'}
duration 24 min per ep
rating PG-13 - Teens 13 or older
score 7.82
scored_by 103796
rank 819
popularity 443
members 238651
favorites 2767
synopsis During his summer vacation, middle school student Nagasumi Michishio travels to the Seto Inland Sea. One 

# custom dataset

In [0]:
sagemaker_df = custom_df.copy()
sagemaker_df.rename(columns={"rating": "my_score"}, inplace=True)

In [0]:
sagemaker_df

Unnamed: 0,userID,itemID,my_score
0,2255153,21,9
1,2255153,59,7
2,2255153,74,7
3,2255153,120,7
4,2255153,178,7
...,...,...,...
19171945,4862000,15611,9
19171946,4862000,27815,9
19171947,299167,5945,8
19171948,263803,1316,9


In [0]:
users_dct = dict(users_df.T)

In [0]:
anime_dct = dict(anime_df.T)

In [0]:
anime_df.loc[1744]

title                                  Wagamama☆Fairy Mirumo de Pon!
title_english                                          Mirmo Zibang!
title_japanese                                    わがまま☆フェアリー ミルモでポン!
title_synonyms     Milmo de Pon!, Mirmo!, Wagamama Fairy Milmo De...
image_url          https://myanimelist.cdn-dena.com/images/anime/...
type                                                              TV
source                                                         Manga
episodes                                                         172
status                                               Finished Airing
airing                                                         False
aired_string                             Apr 6, 2002 to Sep 27, 2005
aired                     {'from': '2002-04-06', 'to': '2005-09-27'}
duration                                             21 min. per ep.
rating                                                  G - All Ages
score                             

In [0]:
sagemaker_df["my_score"]

0            9
1            7
2            7
3            7
4            7
            ..
19171945     9
19171946     9
19171947     8
19171948     9
19171949    10
Name: my_score, Length: 19171950, dtype: int64

In [0]:
def make_sage(df):
    df = df.join(df.apply(lambda x: users_dct[x["userID"]].append(anime_dct[x["itemID"]]), axis=1))
    return df

X = sagemaker_df.drop(columns="my_score")
y = sagemaker_df["my_score"]

X, _, y, _ = train_test_split(X, y, stratify=y, test_size=.99, train_size=.01)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [0]:
X.nunique()

userID    65545
itemID     5383
dtype: int64

In [0]:
train_one_percent = pd.concat([X_train, y_train], axis=1)
train_one_percent = make_sage(train_one_percent)
test_one_percent = pd.concat([X_test, y_test], axis=1)
test_one_percent = make_sage(test_one_percent)

In [0]:
len(train_one_percent), len(test_one_percent)

(143789, 47930)

In [0]:
train_one_percent.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143789 entries, 7377478 to 9145034
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   userID                    143789 non-null  int64         
 1   itemID                    143789 non-null  int64         
 2   my_score                  143789 non-null  int64         
 3   username                  143789 non-null  object        
 4   user_watching             143789 non-null  int64         
 5   user_completed            143789 non-null  int64         
 6   user_onhold               143789 non-null  int64         
 7   user_dropped              143789 non-null  int64         
 8   user_plantowatch          143789 non-null  int64         
 9   user_days_spent_watching  143789 non-null  float64       
 10  gender                    143789 non-null  object        
 11  location                  143784 non-null  object        


In [0]:
train_one_percent.to_csv("train_sagemaker_one_percent.csv")

In [0]:
test_one_percent.to_csv("test_sagemaker_one_percent.csv")

In [0]:
sagemaker_df = make_sage(sagemaker_df)

In [0]:
sagemaker_df = sagemaker_df.apply(lambda x: users_df.loc[x["userID"]].append(anime_df.loc[x["itemID"]]).append(sagemaker_df.loc[x.name][["my_score"]]), axis=1)

In [0]:
users_df.loc[a_user_id].append(anime_df.loc[an_item_id]).append()

username                                                       hinogurl_mikha
user_watching                                                               1
user_completed                                                              0
user_onhold                                                                 0
user_dropped                                                                0
user_plantowatch                                                            0
user_days_spent_watching                                             0.845833
gender                                                                 Female
location                                                         makati,phil.
birth_date                                                1992-11-17 00:00:00
join_date                                                 2008-04-05 00:00:00
last_online                                               2008-04-14 04:32:00
stats_mean_score                                                

In [0]:
custom_df["rating"].mean(), custom_df["rating"].median(), custom_df["rating"].mode()

(7.591403847808908, 8.0, 0    8
 dtype: int64)

In [0]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(custom_df, reader)

In [0]:
from surprise import AlgoBase
from surprise import Dataset
from surprise.model_selection import cross_validate


class MyOwnAlgorithm(AlgoBase):

    def __init__(self):

        # Always call base method before doing anything.
        AlgoBase.__init__(self)

    def estimate(self, u, i):

        return 8


algo = MyOwnAlgorithm()


from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import KFold

# define a cross-validation iterator
kf = KFold(n_splits=10)

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.7544
RMSE: 1.7539
RMSE: 1.7544
RMSE: 1.7542
RMSE: 1.7552
RMSE: 1.7564
RMSE: 1.7542
RMSE: 1.7536
RMSE: 1.7519
RMSE: 1.7561


In [0]:
rmses = [1.7544, 1.7539, 1.7544, 1.7542, 1.7552, 1.7564, 1.7542, 1.7536, 1.7519, 1.7561]
np.mean(rmses)

1.75443

In [0]:
user_id_dct = dict(users_df["username"].dropna().reset_index().set_index("username")["user_id"])

In [0]:

from joblib import Parallel, delayed
import multiprocessing
from multiprocessing import cpu_count

def name_to_id(df):
    df.replace(user_id_dct, inplace=True)
    return df

cores = cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want
 
def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count(), verbose=multiprocessing.cpu_count())(delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst)

# custom_df["username"] = custom_df["username"].apply(lambda x: users_df[users_df["username"] == x].index[0])

# applyParallel(custom_df[:5].groupby(custom_df[:5].index), name_to_id)

In [0]:
# data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
anime_lists_df

In [0]:
%%time

custom_df[:5].replace(user_id_dct)

CPU times: user 4.7 s, sys: 88.8 ms, total: 4.79 s
Wall time: 4.79 s


Unnamed: 0,anime_id,my_score,username
0,21,9,2255153
1,59,7,2255153
2,74,7,2255153
3,120,7,2255153
4,178,7,2255153


In [0]:
users_df.loc[4862000]

In [0]:
users_df[["username"]].reset_index().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108711 entries, 0 to 108710
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   108711 non-null  int64 
 1   username  108710 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.7+ MB


In [0]:
unique_usernames = custom_df["username"].unique()
unique_usernames.

numpy.ndarray

In [0]:
custom_df["username"] = custom_df["username"].apply(lambda x: user_id_dct[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [0]:
custom_df.to_csv("custom_dataset.csv")

In [0]:
anime_lists_df[anime_lists_df["username"].notna()]

Unnamed: 0,username,anime_id,my_watched_episodes,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,karthiga,21,586,9,1,,0,2013-03-03 10:52:53,
1,karthiga,59,26,7,2,,0,2013-03-10 13:54:51,
2,karthiga,74,26,7,2,,0,2013-04-27 16:43:35,
3,karthiga,120,26,7,2,,0,2013-03-03 10:53:57,
4,karthiga,178,26,7,2,0.0,0,2013-03-27 15:59:13,
...,...,...,...,...,...,...,...,...,...
31284025,Yokonightcore,15611,48,9,1,,0,2015-09-07 17:33:03,
31284026,Yokonightcore,27815,22,9,1,,0,2015-09-07 17:32:05,
31284027,wargod,5945,39,8,2,0.0,0,2010-03-29 04:24:12,
31284028,JMc_SetoKai_LoVe,1316,52,9,2,,0,2009-12-23 05:45:14,
