In [1]:
! pip install scikit-surprise
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise import Trainset

from collections import defaultdict
from operator import itemgetter
import heapq

import os
import csv

import pandas as pd



# Import CSV 

In [2]:
UsersDF = pd.read_csv('../raw_data/users_cleaned.csv')
AnimesDF = pd.read_csv('../raw_data/anime_cleaned.csv')
ScoresDF = pd.read_csv('../raw_data/animelists_cleaned.csv')

# Build trainset

In [3]:
ScoresDF["my_score"].value_counts()

my_score
0     12111905
8      4834595
7      4234726
9      3443674
10     2507404
6      2128502
5      1085660
4       480871
3       223202
2       130314
1       103177
Name: count, dtype: int64

In [4]:
ScoresDF.shape

(31284030, 11)

In [5]:
ScoresDF.head()

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,karthiga,21,586,0000-00-00,0000-00-00,9,1,,0,2013-03-03 10:52:53,
1,karthiga,59,26,0000-00-00,0000-00-00,7,2,,0,2013-03-10 13:54:51,
2,karthiga,74,26,0000-00-00,0000-00-00,7,2,,0,2013-04-27 16:43:35,
3,karthiga,120,26,0000-00-00,0000-00-00,7,2,,0,2013-03-03 10:53:57,
4,karthiga,178,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-03-27 15:59:13,


In [6]:
ScoresDF.groupby("anime_id").agg({"my_score":["min","median","mean","max","std"]})

Unnamed: 0_level_0,my_score,my_score,my_score,my_score,my_score
Unnamed: 0_level_1,min,median,mean,max,std
anime_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,0,8.0,5.784002,10,4.242280
5,0,8.0,6.333299,10,3.755587
6,0,7.0,5.433571,10,4.057547
7,0,5.0,3.890678,10,3.834408
8,0,5.0,3.807520,10,3.728352
...,...,...,...,...,...
37886,0,0.0,0.000000,0,0.000000
37888,0,0.0,0.000000,0,0.000000
37894,0,0.0,0.000000,0,0.000000
37895,0,0.0,0.000000,0,0.000000


In [7]:
ScoresDF_selected= ScoresDF[ScoresDF["my_score"]>0][["username","anime_id","my_score","my_last_updated"]]

In [8]:
ScoresDF_selected.shape

(19172125, 4)

In [9]:
small_fraction = 0.0001
ScoresDF_selected_small = ScoresDF_selected.sample(int(len(ScoresDF_selected)*small_fraction))
ScoresDF_selected_small.shape

(1917, 4)

# Transform dataframe to surprise trainset

In [10]:
reader = Reader(rating_scale=(0, 10))
scoredata = Dataset.load_from_df(ScoresDF_selected_small[['username', 'anime_id', 'my_score']], reader)
trainset = scoredata.build_full_trainset()

In [11]:
trainset

<surprise.trainset.Trainset at 0x1254a36a0>

# Try KNNBasic

In [12]:
knn_predictor = KNNBasic(sim_options={
        'name': 'cosine',
        'user_based': False
        })
        
knn_predictor.fit(trainset)
similarity_matrix = knn_predictor.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [13]:
similarity_matrix


array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [33]:
type(similarity_matrix)

numpy.ndarray

# Create Movie Dictionary 

In [14]:
AnimesDF.head()

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme,duration_min,aired_from_year
0,11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ...",24.0,2012.0
1,2104,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi...",24.0,2007.0
2,5262,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Shugo Chara Ninenme, Shugo Chara! Second Year",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,51,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 101, 'type': 'manga...","TV Tokyo, Sotsu",,Satelight,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ...",24.0,2008.0
3,721,Princess Tutu,Princess Tutu,プリンセスチュチュ,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,38,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 1581, 'type': 'mang...","Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ...",16.0,2002.0
4,12365,Bakuman. 3rd Season,Bakuman.,バクマン。,Bakuman Season 3,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 9711, 'type': 'mang...","NHK, Shueisha",,J.C.Staff,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)...",24.0,2012.0


In [15]:
animeID_to_name = AnimesDF.set_index('anime_id')['title'].to_dict()
animeID_to_name

{11013: 'Inu x Boku SS',
 2104: 'Seto no Hanayome',
 5262: 'Shugo Chara!! Doki',
 721: 'Princess Tutu',
 12365: 'Bakuman. 3rd Season',
 6586: 'Yume-iro Pâtissière',
 178: 'Ultra Maniac',
 2787: 'Shakugan no Shana II (Second)',
 4477: 'Nodame Cantabile: Paris-hen',
 853: 'Ouran Koukou Host Club',
 4814: 'Junjou Romantica 2',
 7054: 'Kaichou wa Maid-sama!',
 11123: 'Sekaiichi Hatsukoi 2',
 14227: 'Tonari no Kaibutsu-kun',
 269: 'Bleach',
 59: 'Chobits',
 6045: 'Kimi ni Todoke',
 1735: 'Naruto: Shippuuden',
 210: 'Ranma ½',
 4224: 'Toradora!',
 10030: 'Bakuman. 2nd Season',
 74: 'Gakuen Alice',
 4722: 'Skip Beat!',
 14397: 'Chihayafuru 2',
 1557: 'Shounen Onmyouji',
 10800: 'Chihayafuru',
 3731: 'Itazura na Kiss',
 9513: 'Beelzebub',
 5835: 'Hanasakeru Seishounen',
 9863: 'SKET Dance',
 7817: 'B-gata H-kei',
 966: 'Crayon Shin-chan',
 120: 'Fruits Basket',
 957: 'Saiunkoku Monogatari',
 21: 'One Piece',
 1974: 'Glass no Kamen (2005)',
 857: 'Air Gear',
 1914: 'Saiunkoku Monogatari 2nd Sea

# Set variables

In [16]:
ScoresDF_selected_small.groupby("username").agg({"anime_id":"count"}).describe()

Unnamed: 0,anime_id
count,1887.0
mean,1.015898
std,0.129284
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,3.0


In [17]:
ScoresDF_selected_small.head()

Unnamed: 0,username,anime_id,my_score,my_last_updated
24957819,dirudie30,437,8,2008-08-04 22:41:49
20303854,Tesic,1571,9,2011-06-12 09:27:45
22448655,Yaboku,32013,7,2017-01-17 16:54:36
24174443,KuroxNeko,304,9,2007-12-10 08:29:39
20870216,AzudeaKyrios,23281,8,2014-12-23 04:59:41


In [20]:
# Pick a random user name.
# Play around and see how the final recommendations change
# depending on the user! 1-610
test_subject = 'Tesic'

# Get the top K items user rated
k = 20

In [21]:
# When using Surprise, there are RAW and INNER IDs.
# Raw IDs are the IDs, strings or numbers, you use when
# creating the trainset. The raw ID will be converted to
# an unique integer Surprise can more easily manipulate
# for computations.
#
# So in order to find an user inside the trainset, you
# need to convert their RAW ID to the INNER Id. Read
# here for more info https://surprise.readthedocs.io/en/stable/FAQ.html#what-are-raw-and-inner-ids
test_subject_iid = trainset.to_inner_uid(test_subject)
test_subject_iid



1

In [22]:
test_subject_ratings = trainset.ur[test_subject_iid]
test_subject_ratings

[(1, 9.0)]

In [23]:
# Get the top K items we rated
k_neighbors = heapq.nlargest(k, test_subject_ratings, key=lambda t: t[1])
k_neighbors

[(1, 9.0)]

In [24]:
test_subject_iid

1

In [25]:
itemID_tmp =6
rating_temp=10.0

In [26]:
similarity_matrix[itemID_tmp]

array([0., 0., 0., ..., 0., 0., 0.])

In [27]:
# Default dict is basically a standard dictionary,
# the difference beeing that it doesn't throw an error
# when trying to access a key which does not exist,
# instead a new entry, with that key, is created.
candidates = defaultdict(float)

for itemID, rating in k_neighbors:
    try:
      similaritities = similarity_matrix[itemID]
      for innerID, score in enumerate(similaritities):
          candidates[innerID] += score * (rating / 5.0)
    except:
      continue
  


In [32]:
candidates

defaultdict(float,
            {0: 0.0,
             1: 1.8,
             2: 0.0,
             3: 0.0,
             4: 0.0,
             5: 0.0,
             6: 0.0,
             7: 0.0,
             8: 0.0,
             9: 0.0,
             10: 0.0,
             11: 0.0,
             12: 0.0,
             13: 0.0,
             14: 0.0,
             15: 0.0,
             16: 0.0,
             17: 0.0,
             18: 0.0,
             19: 0.0,
             20: 0.0,
             21: 0.0,
             22: 0.0,
             23: 0.0,
             24: 0.0,
             25: 0.0,
             26: 0.0,
             27: 0.0,
             28: 0.0,
             29: 0.0,
             30: 0.0,
             31: 0.0,
             32: 0.0,
             33: 0.0,
             34: 0.0,
             35: 0.0,
             36: 0.0,
             37: 0.0,
             38: 0.0,
             39: 0.0,
             40: 0.0,
             41: 0.0,
             42: 0.0,
             43: 0.0,
             44: 0.0,
 

In [28]:
similarity_matrix[1]

array([0., 1., 0., ..., 0., 0., 0.])

In [29]:
# Utility we'll use later.
def getAnimeName(animeID):
  if int(animeID) in animeID_to_name:
    return animeID_to_name[int(animeID)]
  else:
      return ""

In [30]:
getAnimeName("5231")

'Inazuma Eleven'

In [31]:
# Build a dictionary of movies the user has watched
watched = {}
for itemID, rating in trainset.ur[test_subject_iid]:
  watched[itemID] = 1

# Add items to list of user's recommendations
# If they are similar to their favorite movies,
# AND have not already been watched.
recommendations = []

position = 0
for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
  if not itemID in watched:
    recommendations.append(getAnimeName(trainset.to_raw_iid(itemID)))
    position += 1
    if (position > 10): break # We only want top 10

for rec in recommendations:
  print("Anime: ", rec)

Anime:  Perfect Blue
Anime:  Oshiete! Galko-chan
Anime:  Aa! Megami-sama! Movie
Anime:  Psycho-Pass 2
Anime:  Re:Zero kara Hajimeru Isekai Seikatsu
Anime:  Appleseed
Anime:  Dragon Ball Z Movie 08: Moetsukiro!! Nessen, Ressen, Chougekisen
Anime:  Gekkan Shoujo Nozaki-kun
Anime:  Motto To LOVE-Ru
Anime:  Shokugeki no Souma OVA
Anime:  Astarotte no Omocha!


In [None]:
watched