In [15]:
! pip install scikit-surprise
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise import Trainset

from collections import defaultdict
from operator import itemgetter
import heapq

import os
import csv

import pandas as pd



In [5]:
# Download the (small) Movielens dataset
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip
!ls

--2023-05-24 13:38:39--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2023-05-24 13:38:41 (1.20 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  
Rukun_Data_cleaning.ipynb
Rukun_surprise_test.ipynb
SurpriseLib.ipynb
[1m[36mml-latest-small[m[m
ml-latest-small.zip
myanimelist-recommender-system.ipynb
saratsiri_anime_rec_Rukun played____.ipynb
saratsiri_anime_rec_v3.ipynb


In [3]:
#Importing the CSVs to Dataframe format
UsersDF = pd.read_csv('../raw_data/users_cleaned.csv')
AnimesDF = pd.read_csv('../raw_data/anime_cleaned.csv')
ScoresDF = pd.read_csv('../raw_data/animelists_cleaned.csv')

# Build trainset

In [27]:
ScoresDF["my_score"].value_counts()

my_score
0     12111905
8      4834595
7      4234726
9      3443674
10     2507404
6      2128502
5      1085660
4       480871
3       223202
2       130314
1       103177
Name: count, dtype: int64

In [31]:
ScoresDF.shape

(31284030, 11)

In [72]:
ScoresDF.head()

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,karthiga,21,586,0000-00-00,0000-00-00,9,1,,0,2013-03-03 10:52:53,
1,karthiga,59,26,0000-00-00,0000-00-00,7,2,,0,2013-03-10 13:54:51,
2,karthiga,74,26,0000-00-00,0000-00-00,7,2,,0,2013-04-27 16:43:35,
3,karthiga,120,26,0000-00-00,0000-00-00,7,2,,0,2013-03-03 10:53:57,
4,karthiga,178,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-03-27 15:59:13,


In [30]:
ScoresDF.groupby("anime_id").agg({"my_score":["min","median","mean","max","std"]})

Unnamed: 0_level_0,my_score,my_score,my_score,my_score,my_score
Unnamed: 0_level_1,min,median,mean,max,std
anime_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,0,8.0,5.784002,10,4.242280
5,0,8.0,6.333299,10,3.755587
6,0,7.0,5.433571,10,4.057547
7,0,5.0,3.890678,10,3.834408
8,0,5.0,3.807520,10,3.728352
...,...,...,...,...,...
37886,0,0.0,0.000000,0,0.000000
37888,0,0.0,0.000000,0,0.000000
37894,0,0.0,0.000000,0,0.000000
37895,0,0.0,0.000000,0,0.000000


In [32]:
ScoresDF_selected= ScoresDF[ScoresDF["my_score"]>0][["username","anime_id","my_score","my_last_updated"]]

In [33]:
ScoresDF_selected.shape

(19172125, 4)

In [115]:
small_fraction = 0.0001
ScoresDF_selected_small = ScoresDF_selected.sample(int(len(ScoresDF_selected)*small_fraction))
ScoresDF_selected_small.shape

(1917, 4)

# Transform dataframe to surprise trainset

In [116]:
reader = Reader(rating_scale=(0, 10))
scoredata = Dataset.load_from_df(ScoresDF_selected_small[['username', 'anime_id', 'my_score']], reader)
trainset = scoredata.build_full_trainset()

In [117]:
trainset

<surprise.trainset.Trainset at 0x111821780>

# Try KNNBasic

In [144]:
knn_predictor = KNNBasic(sim_options={
        'name': 'cosine',
        'user_based': False
        })
        
knn_predictor.fit(trainset)
similarity_matrix = knn_predictor.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [145]:
similarity_matrix


array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

# Create Movie Dictionary 

In [120]:
AnimesDF.head()

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme,duration_min,aired_from_year
0,11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ...",24.0,2012.0
1,2104,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi...",24.0,2007.0
2,5262,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Shugo Chara Ninenme, Shugo Chara! Second Year",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,51,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 101, 'type': 'manga...","TV Tokyo, Sotsu",,Satelight,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ...",24.0,2008.0
3,721,Princess Tutu,Princess Tutu,プリンセスチュチュ,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,38,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 1581, 'type': 'mang...","Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ...",16.0,2002.0
4,12365,Bakuman. 3rd Season,Bakuman.,バクマン。,Bakuman Season 3,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 9711, 'type': 'mang...","NHK, Shueisha",,J.C.Staff,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)...",24.0,2012.0


In [121]:
animeID_to_name = AnimesDF.set_index('anime_id')['title'].to_dict()
animeID_to_name

{11013: 'Inu x Boku SS',
 2104: 'Seto no Hanayome',
 5262: 'Shugo Chara!! Doki',
 721: 'Princess Tutu',
 12365: 'Bakuman. 3rd Season',
 6586: 'Yume-iro Pâtissière',
 178: 'Ultra Maniac',
 2787: 'Shakugan no Shana II (Second)',
 4477: 'Nodame Cantabile: Paris-hen',
 853: 'Ouran Koukou Host Club',
 4814: 'Junjou Romantica 2',
 7054: 'Kaichou wa Maid-sama!',
 11123: 'Sekaiichi Hatsukoi 2',
 14227: 'Tonari no Kaibutsu-kun',
 269: 'Bleach',
 59: 'Chobits',
 6045: 'Kimi ni Todoke',
 1735: 'Naruto: Shippuuden',
 210: 'Ranma ½',
 4224: 'Toradora!',
 10030: 'Bakuman. 2nd Season',
 74: 'Gakuen Alice',
 4722: 'Skip Beat!',
 14397: 'Chihayafuru 2',
 1557: 'Shounen Onmyouji',
 10800: 'Chihayafuru',
 3731: 'Itazura na Kiss',
 9513: 'Beelzebub',
 5835: 'Hanasakeru Seishounen',
 9863: 'SKET Dance',
 7817: 'B-gata H-kei',
 966: 'Crayon Shin-chan',
 120: 'Fruits Basket',
 957: 'Saiunkoku Monogatari',
 21: 'One Piece',
 1974: 'Glass no Kamen (2005)',
 857: 'Air Gear',
 1914: 'Saiunkoku Monogatari 2nd Sea

# Set variables

In [124]:
ScoresDF_selected_small.groupby("username").agg({"anime_id":"count"}).describe()

Unnamed: 0,anime_id
count,1873.0
mean,1.023492
std,0.154985
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,3.0


In [127]:
ScoresDF_selected_small.head()

Unnamed: 0,username,anime_id,my_score,my_last_updated
24627169,maniac-chan,228,10,2008-01-31 15:51:30
23556643,TsundereIdol,32864,7,2016-12-10 03:54:05
16843400,SlipperyCircles,16035,7,2013-06-29 12:42:22
26081383,Pedrosena,23673,7,2014-12-26 00:19:57
28614242,Plakkis,4280,9,2014-02-26 23:56:03


In [128]:
# Pick a random user name.
# Play around and see how the final recommendations change
# depending on the user! 1-610
test_subject = 'maniac-chan'

# Get the top K items user rated
k = 20

In [129]:
# When using Surprise, there are RAW and INNER IDs.
# Raw IDs are the IDs, strings or numbers, you use when
# creating the trainset. The raw ID will be converted to
# an unique integer Surprise can more easily manipulate
# for computations.
#
# So in order to find an user inside the trainset, you
# need to convert their RAW ID to the INNER Id. Read
# here for more info https://surprise.readthedocs.io/en/stable/FAQ.html#what-are-raw-and-inner-ids
test_subject_iid = trainset.to_inner_uid(test_subject)
test_subject_iid



0

In [89]:
test_subject_ratings = trainset.ur[test_subject_iid]
test_subject_ratings

[(0, 5.0),
 (1, 8.0),
 (3, 8.0),
 (5, 7.0),
 (6, 10.0),
 (9, 8.0),
 (11, 9.0),
 (12, 8.0),
 (19, 7.0),
 (25, 8.0),
 (27, 10.0),
 (29, 9.0),
 (30, 8.0),
 (34, 8.0),
 (37, 10.0),
 (38, 9.0),
 (41, 10.0),
 (42, 6.0),
 (43, 9.0),
 (44, 9.0),
 (45, 9.0),
 (47, 8.0),
 (49, 10.0),
 (50, 9.0),
 (52, 9.0),
 (242, 8.0),
 (56, 7.0),
 (58, 8.0),
 (685, 9.0),
 (59, 9.0),
 (687, 5.0),
 (61, 7.0),
 (62, 8.0),
 (64, 9.0),
 (691, 6.0),
 (692, 5.0),
 (246, 9.0),
 (693, 9.0),
 (695, 9.0),
 (66, 8.0),
 (67, 7.0),
 (68, 8.0),
 (69, 10.0),
 (70, 8.0),
 (247, 9.0),
 (72, 7.0),
 (700, 10.0),
 (76, 9.0),
 (703, 7.0),
 (251, 7.0),
 (252, 7.0),
 (79, 10.0),
 (256, 8.0),
 (82, 10.0),
 (709, 8.0),
 (710, 5.0),
 (714, 9.0),
 (576, 8.0),
 (718, 9.0),
 (722, 7.0),
 (728, 5.0),
 (259, 5.0),
 (730, 7.0),
 (260, 8.0),
 (261, 5.0),
 (731, 8.0),
 (262, 9.0),
 (733, 9.0),
 (735, 9.0),
 (86, 8.0),
 (87, 6.0),
 (88, 6.0),
 (89, 9.0),
 (746, 6.0),
 (268, 7.0),
 (93, 9.0),
 (273, 9.0),
 (757, 9.0),
 (758, 6.0),
 (96, 8.0),
 (7

In [130]:
# Get the top K items we rated
k_neighbors = heapq.nlargest(k, test_subject_ratings, key=lambda t: t[1])
k_neighbors

[(6, 10.0),
 (27, 10.0),
 (37, 10.0),
 (41, 10.0),
 (49, 10.0),
 (69, 10.0),
 (700, 10.0),
 (79, 10.0),
 (82, 10.0),
 (97, 10.0),
 (779, 10.0),
 (788, 10.0),
 (859, 10.0),
 (916, 10.0),
 (926, 10.0),
 (942, 10.0),
 (124, 10.0),
 (127, 10.0),
 (961, 10.0),
 (967, 10.0)]

In [131]:
test_subject_iid

0

In [132]:
itemID_tmp =6
rating_temp=10.0

In [146]:
similarity_matrix[itemID_tmp]

array([0., 0., 0., ..., 0., 0., 0.])

In [147]:
# Default dict is basically a standard dictionary,
# the difference beeing that it doesn't throw an error
# when trying to access a key which does not exist,
# instead a new entry, with that key, is created.
candidates = defaultdict(float)

for itemID, rating in k_neighbors:
    try:
      similaritities = similarity_matrix[itemID]
      for innerID, score in enumerate(similaritities):
          candidates[innerID] += score * (rating / 5.0)
    except:
      continue
  


In [148]:
similarity_matrix.compute_similarities()[1]

AttributeError: 'numpy.ndarray' object has no attribute 'compute_similarities'

In [149]:
# Utility we'll use later.
def getAnimeName(animeID):
  if int(animeID) in animeID_to_name:
    return animeID_to_name[int(animeID)]
  else:
      return ""

In [150]:
getAnimeName("5231")

'Inazuma Eleven'

In [151]:
# Build a dictionary of movies the user has watched
watched = {}
for itemID, rating in trainset.ur[test_subject_iid]:
  watched[itemID] = 1

# Add items to list of user's recommendations
# If they are similar to their favorite movies,
# AND have not already been watched.
recommendations = []

position = 0
for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
  if not itemID in watched:
    recommendations.append(getAnimeName(trainset.to_raw_iid(itemID)))
    position += 1
    if (position > 10): break # We only want top 10

for rec in recommendations:
  print("Anime: ", rec)

Anime:  Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen
Anime:  Colorful (Movie)
Anime:  Fullmetal Alchemist: Brotherhood Specials
Anime:  Isekai no Seikishi Monogatari
Anime:  Arcana Famiglia
Anime:  Shigofumi: Sorekara
Anime:  Cowboy Bebop
Anime:  Sword Art Online
Anime:  Ushio to Tora (TV)
Anime:  Mahou Shoujo Madoka★Magica Movie 3: Hangyaku no Monogatari
Anime:  Okusama ga Seitokaichou!


In [110]:
watched

{4577: 1, 4929: 1}