Here I want to make a simple recommender system to find the similarity between shows, users and to help me predict whether a user will enjoy a particular anime.

In [21]:
# Import relevant libraries 

import pandas as pd
import numpy as np
import scipy as sp
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

In [56]:
anime = pd.read_csv('animes.csv').drop(columns=['synopsis','link'])
rating = pd.read_csv('reviews.csv').drop(columns=['anime_uid','text','link','scores'])

In [57]:
anime.head()

Unnamed: 0,uid,title,genre,aired,episodes,members,popularity,ranked,score,img_url
0,28891,Haikyuu!! Second Season,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...
1,23273,Shigatsu wa Kimi no Uso,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...
2,34599,Made in Abyss,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...
3,5114,Fullmetal Alchemist: Brotherhood,"['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...
4,31758,Kizumonogatari III: Reiketsu-hen,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...


In [58]:
rating.head()

Unnamed: 0,uid,profile,score
0,255938,DesolatePsyche,8
1,259117,baekbeans,10
2,253664,skrn,7
3,8254,edgewalker00,9
4,291149,aManOfCulture99,10


Join the two dataframes on the anime_id columns

In [71]:
# Merging anime_show and rating dataframe with inner join
merged = rating.merge(anime, on = 'uid', suffixes= ['_user', ''], how='inner')
# Renaming column
merged.rename(columns = {'uid':'anime_id'}, inplace = True)
merged.rename(columns = {'score_user':'user_rating'}, inplace = True)
# Dropping extra column
print(merged.shape)
merged.head()

(12783, 12)


Unnamed: 0,anime_id,profile,user_rating,title,genre,aired,episodes,members,popularity,ranked,score,img_url
0,29323,Slushpuppy282,7,"Oyaji no, Imo no Kamisama.",['Slice of Life'],"Dec 31, 2014",1.0,360,11732,8664.0,5.9,https://cdn.myanimelist.net/images/anime/2/705...
1,29323,Slushpuppy282,7,"Oyaji no, Imo no Kamisama.",['Slice of Life'],"Dec 31, 2014",1.0,360,11732,8664.0,5.9,https://cdn.myanimelist.net/images/anime/2/705...
2,30968,ParaParaJMo,9,Kokoro no Catchball,"['Kids', 'Sports']",2005,1.0,100,15323,12764.0,6.7,https://cdn.myanimelist.net/images/anime/2/745...
3,30968,ParaParaJMo,9,Kokoro no Catchball,"['Kids', 'Sports']",2005,1.0,100,15323,12764.0,6.7,https://cdn.myanimelist.net/images/anime/2/745...
4,38440,Jolon,9,Shikizakura,"['Action', 'Sci-Fi', 'Drama']",2021 to ?,12.0,1419,8859,,,https://cdn.myanimelist.net/images/anime/1203/...


In [72]:
merged2 = merged.dropna()
print(merged2.shape)
merged2.head()

(10636, 12)


Unnamed: 0,anime_id,profile,user_rating,title,genre,aired,episodes,members,popularity,ranked,score,img_url
0,29323,Slushpuppy282,7,"Oyaji no, Imo no Kamisama.",['Slice of Life'],"Dec 31, 2014",1.0,360,11732,8664.0,5.9,https://cdn.myanimelist.net/images/anime/2/705...
1,29323,Slushpuppy282,7,"Oyaji no, Imo no Kamisama.",['Slice of Life'],"Dec 31, 2014",1.0,360,11732,8664.0,5.9,https://cdn.myanimelist.net/images/anime/2/705...
2,30968,ParaParaJMo,9,Kokoro no Catchball,"['Kids', 'Sports']",2005,1.0,100,15323,12764.0,6.7,https://cdn.myanimelist.net/images/anime/2/745...
3,30968,ParaParaJMo,9,Kokoro no Catchball,"['Kids', 'Sports']",2005,1.0,100,15323,12764.0,6.7,https://cdn.myanimelist.net/images/anime/2/745...
8,27501,Mimi_Taylor,9,Sore Ike! Anpanman: Anpanman to Tanoshii Nakam...,"['Kids', 'Adventure', 'Fantasy']","Jul 24, 1999",1.0,171,13933,13953.0,7.1,https://cdn.myanimelist.net/images/anime/9/666...


In [8]:
# For computing reasons I'm limiting the dataframe length to 50,000 users

# merged2=merged2[['user_id', 'name', 'user_rating']]
merged_sub= merged2[merged2.user_id <= 50000]
merged_sub.head()

Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
1,3,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5,20,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
5,21,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
6,28,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
7,34,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


Pivoting table of users on one axis and tv show names along the other. This helps us in defining the similarity between users and shows to better predict who will like what.

In [73]:
piv = merged_sub.pivot_table(index=['user_id'], columns=['name'], values='user_rating',fill_value=0) #

In [74]:
print(piv.shape)
piv.head()

(46792, 2922)


name,.hack//Roots,.hack//Sign,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,12-sai.: Chicchana Mune no Tokimeki 2nd Season,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,Zombie-Loan,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,7,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Transposing the Matrix (eg. pivoted table)

In [75]:
X = piv.T
X.shape

(2922, 46792)

Decomposing the Matrix

In [76]:
# Truncates matrix in 12 Synthetic features
SVD = TruncatedSVD(n_components=12,random_state=12)

resultant_matrix = SVD.fit_transform(X)

resultant_matrix.shape

(2922, 12)

Generating a Correlation Matrix

In [77]:
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape


(2922, 2922)

In [78]:
anime_titles = piv.columns
anime_list = list(anime_titles)

naruto_corr = anime_list.index('Naruto')
naruto_corr

1780

In [79]:
corr_naruto = corr_mat[1780]
corr_naruto.shape

(2922,)

Recommending top 10% of Hightly Correlated Anime

In [80]:
list(anime_titles[(corr_naruto<1.0)&(corr_naruto>0.9)])

['Bleach',
 'Hajime no Ippo',
 'Hajime no Ippo: New Challenger',
 'Hajime no Ippo: Rising',
 'Shaman King']

Top 15.0 Percentile Correlational match for Anime Naruto


['Bleach',
 'Death Note',
 'Dragon Ball Kai',
 'Fairy Tail',
 'Fullmetal Alchemist: Brotherhood',
 'Giant Killing',
 'Hajime no Ippo',
 'Hajime no Ippo: New Challenger',
 'Hajime no Ippo: Rising',
 'Hunter x Hunter',
 'Hunter x Hunter (2011)',
 'Initial D Fifth Stage',
 'Initial D Final Stage',
 'Major S1',
 'Major S3',
 'Major S4',
 'Major S5',
 'Major S6',
 'One Outs',
 'Rainbow: Nisha Rokubou no Shichinin',
 'Shaman King',
 'Shijou Saikyou no Deshi Kenichi',
 'Slam Dunk']

In [36]:
# Note: As we are subtracting the mean from each rating to standardize
# all users with only one rating or who had rated everything the same will be dropped

# Normalize the values
piv_norm = piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)


# Drop all columns containing only zeros representing users who did not rate
piv_norm.fillna(0, inplace=True)
piv_norm = piv_norm.T
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]
print(piv_norm.shape)
piv_norm.head()

(2922, 46792)


user_id,1,2,3,5,7,8,9,10,11,12,...,49991,49992,49993,49994,49995,49996,49997,49998,49999,50000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,-0.001369,-0.000342,-0.017659,-0.045414,-0.057769,-0.003422,-0.000342,-0.000958,-0.014853,-0.005784,...,-0.009343,-0.019781,-0.034497,-0.029535,-0.005818,-0.008453,-0.019986,-0.007453,-0.004997,-0.007392
.hack//Sign,-0.001369,-0.000342,-0.017659,-0.045414,-0.057769,-0.003422,-0.000342,-0.000958,-0.014853,-0.005784,...,-0.009343,-0.019781,-0.034497,-0.029535,-0.005818,-0.008453,-0.019986,-0.007453,-0.004997,-0.007392
009-1,-0.001369,-0.000342,-0.017659,-0.045414,-0.057769,-0.003422,-0.000342,-0.000958,-0.014853,-0.005784,...,-0.009343,-0.019781,-0.034497,-0.029535,-0.005818,-0.008453,-0.019986,-0.007453,-0.004997,-0.007392
07-Ghost,-0.001369,-0.000342,-0.017659,-0.045414,-0.057769,-0.003422,-0.000342,-0.000958,-0.014853,-0.005784,...,-0.009343,-0.019781,0.665503,-0.029535,-0.005818,-0.008453,-0.019986,-0.007453,-0.004997,-0.007392
11eyes,-0.001369,-0.000342,-0.017659,-0.045414,-0.057769,-0.003422,-0.000342,-0.000958,-0.014853,-0.005784,...,-0.009343,-0.019781,-0.034497,-0.029535,-0.005818,-0.008453,-0.019986,-0.007453,-0.004997,-0.007392


In [11]:
# Our data needs to be in a sparse matrix format to be read by the following functions

piv_sparse = sp.sparse.csr_matrix(piv_norm.values)

These matrices show us the computed cosine similarity values 
between each user/user array pair and item/item array pair.

In [12]:
item_similarity = cosine_similarity(piv_sparse)
user_similarity = cosine_similarity(piv_sparse.T)

In [13]:
# Inserting the similarity matricies into dataframe objects

item_sim_df = pd.DataFrame(item_similarity, index = piv_norm.index, columns = piv_norm.index)
user_sim_df = pd.DataFrame(user_similarity, index = piv_norm.columns, columns = piv_norm.columns)

In [14]:
# This function will return the top 10 shows with the highest cosine similarity value

def top_animes(anime_title):
    count = 1
    print(f'Similar shows to {anime_title} include:\n')
    for item in item_sim_df.sort_values(by = anime_name, ascending = False).index[1:11]:
        print(f'No. {count}: {item}')
        count +=1

In [15]:
# This function will return the top 5 users with the highest similarity value 

def top_users(user_x):
    
    if user not in piv_norm.columns:
        return(f'No data available on user {user_x}')
    
    print('Most Similar Users:\n')
    sim_values = user_sim_df.sort_values(by=user_x, ascending=False).loc[:,user_x].tolist()[1:11]
    sim_users = user_sim_df.sort_values(by=user_x, ascending=False).index[1:11]
    zipped = zip(sim_users, sim_values,)
    for user_x, sim in zipped:
        print('User #{0}, Similarity value: {1:.2f}'.format(user_x, sim))

In [16]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list

def similar_user_recs(user):
    
    if user not in piv_norm.columns:
        return(f'No data available on user {user}')
    
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = piv_norm.loc[:, i].max()
        best.append(piv_norm[piv_norm.loc[:, i]==max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]

In [23]:
# This function calculates the weighted average of similar users
# to determine a potential rating for an input user and show

def predicted_rating(anime_name, user_x):
    sim_users = user_sim_df.sort_values(by=user_x, ascending=False).index[1:1000]
    user_values = user_sim_df.sort_values(by=user_x, ascending=False).loc[:,user].tolist()[1:1000]
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_users):
        rating = piv.loc[i, anime_name]
        similarity = user_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
    return print(f"The predicted score for user_id: {user_x} is {sum(rating_list)/sum(weight_list)}")

In [31]:
# Function to list every show watched by user_id 
def watchlist_by_user(user_x):
    return piv.T[piv.loc[user_x,:]>0].index.tolist()

In [18]:
top_animes('Naruto')

Similar shows to Naruto include:

No. 1: Bleach
No. 2: Dragon Ball GT
No. 3: Dragon Ball Z
No. 4: Fairy Tail
No. 5: Dragon Ball
No. 6: Pokemon
No. 7: InuYasha
No. 8: Highschool of the Dead
No. 9: Yu☆Gi☆Oh! Duel Monsters
No. 10: Sword Art Online


In [19]:
top_users(3)

Most Similar Users:

User #32218, Similarity value: 0.43
User #11631, Similarity value: 0.42
User #48978, Similarity value: 0.38
User #36430, Similarity value: 0.37
User #2986, Similarity value: 0.37
User #31541, Similarity value: 0.37
User #43242, Similarity value: 0.37
User #2411, Similarity value: 0.36
User #18079, Similarity value: 0.36
User #3681, Similarity value: 0.36


In [20]:
similar_user_recs(3)

[('Shingeki no Kyojin', 6),
 ('Death Note', 3),
 ('Sword Art Online', 3),
 ('Fullmetal Alchemist: Brotherhood', 3),
 ('Clannad: After Story', 3)]

In [24]:
predicted_rating('Naruto', 3)

8.083576090963145

Below we'll see how the predict_rating function performs compared to the observed rated values for user 3.

In [None]:
# Creates a list of every show watched by user_id

watched = watchlist_by_user(3)
# Showing watchlist 
watchlist_by_user(3)

In [35]:
# Make a list of the squared errors between actual and predicted value

errors = []
for i in watched:
    actual=piv.loc[3, i]
    predicted = predicted_rating(i, 3)
    errors.append((actual-predicted)**2)

KeyboardInterrupt: 

In [None]:
# This is the average squared error for user 3
np.mean(errors)