In [1]:
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise import Trainset

from collections import defaultdict
from operator import itemgetter
import heapq

import os
import csv

import pandas as pd



# Import CSV 

In [2]:
UsersDF = pd.read_csv('../raw_data/users_cleaned.csv')
AnimesDF = pd.read_csv('../raw_data/anime_cleaned.csv')
ScoresDF = pd.read_csv('../raw_data/animelists_cleaned.csv')

# Build trainset

In [3]:
ScoresDF["my_score"].value_counts()

0     12111905
8      4834595
7      4234726
9      3443674
10     2507404
6      2128502
5      1085660
4       480871
3       223202
2       130314
1       103177
Name: my_score, dtype: int64

In [4]:
ScoresDF.shape

(31284030, 11)

In [5]:
ScoresDF.head()

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,karthiga,21,586,0000-00-00,0000-00-00,9,1,,0,2013-03-03 10:52:53,
1,karthiga,59,26,0000-00-00,0000-00-00,7,2,,0,2013-03-10 13:54:51,
2,karthiga,74,26,0000-00-00,0000-00-00,7,2,,0,2013-04-27 16:43:35,
3,karthiga,120,26,0000-00-00,0000-00-00,7,2,,0,2013-03-03 10:53:57,
4,karthiga,178,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-03-27 15:59:13,


In [6]:
ScoresDF.groupby("anime_id").agg({"my_score":["min","median","mean","max","std"]})

Unnamed: 0_level_0,my_score,my_score,my_score,my_score,my_score
Unnamed: 0_level_1,min,median,mean,max,std
anime_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,0,8.0,5.784002,10,4.242280
5,0,8.0,6.333299,10,3.755587
6,0,7.0,5.433571,10,4.057547
7,0,5.0,3.890678,10,3.834408
8,0,5.0,3.807520,10,3.728352
...,...,...,...,...,...
37886,0,0.0,0.000000,0,0.000000
37888,0,0.0,0.000000,0,0.000000
37894,0,0.0,0.000000,0,0.000000
37895,0,0.0,0.000000,0,0.000000


In [7]:
ScoresDF_selected= ScoresDF[ScoresDF["my_score"]>0][["username","anime_id","my_score","my_last_updated"]]

In [8]:
ScoresDF_selected.shape

(19172125, 4)

In [9]:
#small_fraction = 0.0001
#ScoresDF_selected_small = ScoresDF_selected.sample(int(len(ScoresDF_selected)*small_fraction))
#ScoresDF_selected_small.shape

(1917, 4)

# Transform dataframe to surprise trainset

In [35]:
reader = Reader(rating_scale=(0, 10))
scoredata = Dataset.load_from_df(ScoresDF_selected[['username', 'anime_id', 'my_score']], reader)
trainset = scoredata.build_full_trainset()

In [71]:
print(trainset)

<surprise.trainset.Trainset object at 0x1d1624fd0>


# Try KNNBasic

In [37]:
knn_predictor = KNNBasic(sim_options={
        'name': 'cosine',
        'user_based': False
        })
        
knn_predictor.fit(trainset)
similarity_matrix = knn_predictor.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [38]:
similarity_matrix


array([[1.        , 0.9613837 , 0.97060007, ..., 0.        , 0.        ,
        0.        ],
       [0.9613837 , 1.        , 0.97409777, ..., 0.        , 0.        ,
        0.        ],
       [0.97060007, 0.97409777, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [72]:
# save numpy array as npy file
from numpy import asarray
from numpy import save

# save to npy file
save('similarity_matrix.npy', similarity_matrix)

# Create Movie Dictionary 

In [40]:
AnimesDF.head()

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme,duration_min,aired_from_year
0,11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ...",24.0,2012.0
1,2104,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi...",24.0,2007.0
2,5262,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Shugo Chara Ninenme, Shugo Chara! Second Year",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,51,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 101, 'type': 'manga...","TV Tokyo, Sotsu",,Satelight,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ...",24.0,2008.0
3,721,Princess Tutu,Princess Tutu,プリンセスチュチュ,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,38,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 1581, 'type': 'mang...","Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ...",16.0,2002.0
4,12365,Bakuman. 3rd Season,Bakuman.,バクマン。,Bakuman Season 3,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 9711, 'type': 'mang...","NHK, Shueisha",,J.C.Staff,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)...",24.0,2012.0


In [41]:
animeID_to_name = AnimesDF.set_index('anime_id')['title'].to_dict()
animeID_to_name

{11013: 'Inu x Boku SS',
 2104: 'Seto no Hanayome',
 5262: 'Shugo Chara!! Doki',
 721: 'Princess Tutu',
 12365: 'Bakuman. 3rd Season',
 6586: 'Yume-iro Pâtissière',
 178: 'Ultra Maniac',
 2787: 'Shakugan no Shana II (Second)',
 4477: 'Nodame Cantabile: Paris-hen',
 853: 'Ouran Koukou Host Club',
 4814: 'Junjou Romantica 2',
 7054: 'Kaichou wa Maid-sama!',
 11123: 'Sekaiichi Hatsukoi 2',
 14227: 'Tonari no Kaibutsu-kun',
 269: 'Bleach',
 59: 'Chobits',
 6045: 'Kimi ni Todoke',
 1735: 'Naruto: Shippuuden',
 210: 'Ranma ½',
 4224: 'Toradora!',
 10030: 'Bakuman. 2nd Season',
 74: 'Gakuen Alice',
 4722: 'Skip Beat!',
 14397: 'Chihayafuru 2',
 1557: 'Shounen Onmyouji',
 10800: 'Chihayafuru',
 3731: 'Itazura na Kiss',
 9513: 'Beelzebub',
 5835: 'Hanasakeru Seishounen',
 9863: 'SKET Dance',
 7817: 'B-gata H-kei',
 966: 'Crayon Shin-chan',
 120: 'Fruits Basket',
 957: 'Saiunkoku Monogatari',
 21: 'One Piece',
 1974: 'Glass no Kamen (2005)',
 857: 'Air Gear',
 1914: 'Saiunkoku Monogatari 2nd Sea

# Set variables

In [42]:
ScoresDF_selected.groupby("username").agg({"anime_id":"count"}).describe()

Unnamed: 0,anime_id
count,106401.0
mean,180.185807
std,214.190233
min,1.0
25%,44.0
50%,114.0
75%,237.0
max,3785.0


In [43]:
ScoresDF_selected.head()

Unnamed: 0,username,anime_id,my_score,my_last_updated
0,karthiga,21,9,2013-03-03 10:52:53
1,karthiga,59,7,2013-03-10 13:54:51
2,karthiga,74,7,2013-04-27 16:43:35
3,karthiga,120,7,2013-03-03 10:53:57
4,karthiga,178,7,2013-03-27 15:59:13


In [58]:
# Pick a random user name.
# Play around and see how the final recommendations change
# depending on the user! 1-610
test_subject = 'karthiga'

# Get the top K items user rated
k = 20

In [59]:
# When using Surprise, there are RAW and INNER IDs.
# Raw IDs are the IDs, strings or numbers, you use when
# creating the trainset. The raw ID will be converted to
# an unique integer Surprise can more easily manipulate
# for computations.
#
# So in order to find an user inside the trainset, you
# need to convert their RAW ID to the INNER Id. Read
# here for more info https://surprise.readthedocs.io/en/stable/FAQ.html#what-are-raw-and-inner-ids
test_subject_iid = trainset.to_inner_uid(test_subject)
test_subject_iid



0

In [60]:
test_subject_ratings = trainset.ur[test_subject_iid]
test_subject_ratings

[(0, 9.0),
 (1, 7.0),
 (2, 7.0),
 (3, 7.0),
 (4, 7.0),
 (5, 7.0),
 (6, 6.0),
 (7, 6.0),
 (8, 8.0),
 (9, 10.0),
 (10, 7.0),
 (11, 7.0),
 (12, 10.0),
 (13, 8.0),
 (14, 10.0),
 (15, 8.0),
 (16, 8.0),
 (17, 9.0),
 (18, 7.0),
 (19, 9.0),
 (20, 7.0),
 (21, 8.0),
 (22, 7.0),
 (23, 5.0),
 (24, 8.0),
 (25, 8.0),
 (26, 8.0),
 (27, 6.0),
 (28, 8.0),
 (29, 8.0),
 (30, 7.0),
 (31, 7.0),
 (32, 8.0),
 (33, 7.0),
 (34, 6.0),
 (35, 6.0),
 (36, 5.0),
 (37, 9.0),
 (38, 7.0),
 (39, 5.0),
 (40, 7.0),
 (41, 8.0),
 (42, 7.0),
 (43, 7.0),
 (44, 8.0),
 (45, 7.0),
 (46, 8.0),
 (47, 7.0),
 (48, 5.0),
 (49, 8.0),
 (50, 8.0),
 (51, 7.0),
 (52, 10.0)]

In [61]:
# Get the top K items we rated
k_neighbors = heapq.nlargest(k, test_subject_ratings, key=lambda t: t[1])
k_neighbors

[(9, 10.0),
 (12, 10.0),
 (14, 10.0),
 (52, 10.0),
 (0, 9.0),
 (17, 9.0),
 (19, 9.0),
 (37, 9.0),
 (8, 8.0),
 (13, 8.0),
 (15, 8.0),
 (16, 8.0),
 (21, 8.0),
 (24, 8.0),
 (25, 8.0),
 (26, 8.0),
 (28, 8.0),
 (29, 8.0),
 (32, 8.0),
 (41, 8.0)]

In [62]:
test_subject_iid

0

In [63]:
itemID_tmp =6
rating_temp=10.0

In [64]:
similarity_matrix[itemID_tmp]

array([0.96343299, 0.97550379, 0.98100599, ..., 1.        , 0.        ,
       0.        ])

In [65]:
# Default dict is basically a standard dictionary,
# the difference beeing that it doesn't throw an error
# when trying to access a key which does not exist,
# instead a new entry, with that key, is created.
candidates = defaultdict(float)

for itemID, rating in k_neighbors:
    try:
      similaritities = similarity_matrix[itemID]
      for innerID, score in enumerate(similaritities):
          candidates[innerID] += score * (rating / 5.0)
    except:
      continue
  


In [66]:
candidates

defaultdict(float,
            {0: 33.40987885632611,
             1: 33.42945676358067,
             2: 33.61365356521377,
             3: 33.49799939930693,
             4: 33.50262507392961,
             5: 33.489909183142466,
             6: 33.47566518893529,
             7: 33.511013396010405,
             8: 33.52364977769143,
             9: 33.476039597100566,
             10: 33.081571315236275,
             11: 33.60390274662238,
             12: 33.51645344720619,
             13: 33.59708356691992,
             14: 32.8749811627674,
             15: 33.604889369880055,
             16: 33.63339721787272,
             17: 33.59771221810571,
             18: 33.62696889751557,
             19: 33.394499801938814,
             20: 33.66428452427179,
             21: 33.51825786587339,
             22: 33.3997564580003,
             23: 33.244871858272866,
             24: 33.58365174239165,
             25: 33.474344025740045,
             26: 33.53215064682998,
             

In [67]:
similarity_matrix[1]

array([0.9613837 , 1.        , 0.97409777, ..., 0.        , 0.        ,
       0.        ])

In [68]:
# Utility we'll use later.
def getAnimeName(animeID):
  if int(animeID) in animeID_to_name:
    return animeID_to_name[int(animeID)]
  else:
      return ""

In [69]:
getAnimeName("5231")

'Inazuma Eleven'

In [70]:
# Build a dictionary of movies the user has watched
watched = {}
for itemID, rating in trainset.ur[test_subject_iid]:
  watched[itemID] = 1

# Add items to list of user's recommendations
# If they are similar to their favorite movies,
# AND have not already been watched.
recommendations = []

position = 0
for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
  if not itemID in watched:
    recommendations.append(getAnimeName(trainset.to_raw_iid(itemID)))
    position += 1
    if (position > 10): break # We only want top 10

for rec in recommendations:
  print("Anime: ", rec)

Anime:  Detective Conan: The Magician of Starlight
Anime:  Kaze no You ni
Anime:  Crayon Shin-chan Movie 25: Shin-chan Shuurai! Uchuujin Shiriri
Anime:  Youkai Watch: Shadow Side
Anime:  Mama wa Poyopoyo Saurus ga Osuki
Anime:  Choujikuu Seiki Orguss Memorial
Anime:  Wan Wan Celepoo Soreyuke! Tetsunoshin
Anime:  Kuruneko: Nyaalock Holmes no Bouken
Anime:  Saga-ken wo Meguru Animation (2017)
Anime:  Ring ni Kakero 1 Pilot
Anime:  Kimi no Koe wo Todoketai


In [57]:
watched

{1: 1,
 3: 1,
 5: 1,
 6: 1,
 9: 1,
 19: 1,
 22: 1,
 26: 1,
 27: 1,
 29: 1,
 34: 1,
 37: 1,
 38: 1,
 39: 1,
 40: 1,
 42: 1,
 44: 1,
 48: 1,
 49: 1,
 50: 1,
 53: 1,
 242: 1,
 56: 1,
 57: 1,
 58: 1,
 59: 1,
 687: 1,
 61: 1,
 65: 1,
 70: 1,
 247: 1,
 71: 1,
 75: 1,
 249: 1,
 250: 1,
 252: 1,
 258: 1,
 261: 1,
 85: 1,
 3343: 1,
 265: 1,
 87: 1,
 88: 1,
 271: 1,
 747: 1,
 92: 1,
 273: 1,
 94: 1,
 95: 1,
 276: 1,
 97: 1,
 764: 1,
 98: 1,
 767: 1,
 779: 1,
 281: 1,
 283: 1,
 286: 1,
 101: 1,
 788: 1,
 103: 1,
 289: 1,
 293: 1,
 105: 1,
 297: 1,
 806: 1,
 589: 1,
 816: 1,
 818: 1,
 819: 1,
 591: 1,
 299: 1,
 592: 1,
 823: 1,
 107: 1,
 3415: 1,
 3543: 1,
 108: 1,
 109: 1,
 3561: 1,
 830: 1,
 594: 1,
 840: 1,
 842: 1,
 844: 1,
 845: 1,
 847: 1,
 111: 1,
 851: 1,
 852: 1,
 3732: 1,
 3416: 1,
 596: 1,
 597: 1,
 858: 1,
 112: 1,
 860: 1,
 3733: 1,
 3734: 1,
 3647: 1,
 863: 1,
 3630: 1,
 115: 1,
 599: 1,
 875: 1,
 3417: 1,
 876: 1,
 3418: 1,
 878: 1,
 3419: 1,
 601: 1,
 883: 1,
 885: 1,
 117: 1,
 888