Here I want to make a simple recommender system to find the similarity between shows, users and to help me predict whether a user will enjoy a particular anime.

In [66]:
# Import relevant libraries 

import pandas as pd
import numpy as np
import scipy as sp
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
import operator
%matplotlib inline


In [2]:
# Loading CVS files as Dataframes
anime = pd.read_csv('myAnimelist-No_Hentai 2.csv')
rating = pd.read_csv('rating.csv')

Replacing -1 value with a null value

In [3]:
rating.rating.replace({-1: np.nan}, inplace = True)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [4]:
rating.head()
print(rating.shape)

(7813737, 3)


Join the two dataframes on the anime_id columns

In [5]:
# For this analysis I'm only interest in finding recommendations for the TV category

anime_show = anime[anime['type']=='TV']
anime_show.head()
print(anime_show.shape)

(4103, 14)


In [6]:
# Merging anime_show and rating dataframe with inner join
merged = rating.merge(anime_show, left_on = 'anime_id', right_on = 'animeID', suffixes= ['_user', ''], how='inner')
# Renaming column
merged.rename(columns = {'rating_user':'user_rating'}, inplace = True)
# Dropping extra column
merged = merged.drop(columns="animeID")
print(merged.shape)
merged.head()

(5185071, 16)


Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
0,1,20,,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
1,3,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5,20,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
3,6,20,,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
4,10,20,,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


In [7]:
merged2 = merged.dropna()
print(merged2.shape)
merged2.head()

(4284319, 16)


Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
1,3,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5,20,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
5,21,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
6,28,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
7,34,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


In [16]:
# For computing reasons I'm limiting the dataframe length to 25,000 users

merged2=merged2[['user_id', 'name', 'user_rating']]
merged_sub= merged2[merged2.user_id <= 25000]
merged_sub.head()

Unnamed: 0,user_id,name,user_rating
1,3,Naruto,8.0
2,5,Naruto,6.0
5,21,Naruto,8.0
6,28,Naruto,9.0
7,34,Naruto,9.0


Pivoting table of users on one axis and tv show names along the other. This helps us in defining the similarity between users and shows to better predict who will like what.

In [17]:
piv = merged_sub.pivot_table(index=['user_id'], columns=['name'], values='user_rating',fill_value=0)

In [18]:
print(piv.shape)
piv.head()

(23408, 2811)


name,.hack//Roots,.hack//Sign,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,12-sai.: Chicchana Mune no Tokimeki 2nd Season,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,Zombie-Loan,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,7,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Transposing the Matrix (eg. pivoted table)

In [19]:
# Note: As we are subtracting the mean from each rating to standardize
# all users with only one rating or who had rated everything the same will be dropped

# Normalize the values
piv_norm = piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
# # Drop all columns containing only zeros representing users who did not rate
piv_norm.fillna(0, inplace=True)


# Transposing the Matrix
piv_norm = piv_norm.T
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

print(piv_norm.shape)
piv_norm.head()

(2811, 23408)


user_id,1,2,3,5,7,8,9,10,11,12,...,24990,24991,24992,24993,24994,24995,24996,24997,24999,25000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,-0.001423,-0.000356,-0.018356,-0.047207,-0.06005,-0.003557,-0.000356,-0.000996,-0.015439,-0.006012,...,-0.009,-0.028104,-0.052437,-0.01608,-0.000356,-0.048061,-0.000996,-0.017538,-0.003415,-0.002953
.hack//Sign,-0.001423,-0.000356,-0.018356,-0.047207,-0.06005,-0.003557,-0.000356,-0.000996,-0.015439,-0.006012,...,-0.009,-0.028104,-0.052437,-0.01608,-0.000356,-0.048061,-0.000996,-0.017538,-0.003415,-0.002953
009-1,-0.001423,-0.000356,-0.018356,-0.047207,-0.06005,-0.003557,-0.000356,-0.000996,-0.015439,-0.006012,...,-0.009,-0.028104,-0.052437,-0.01608,-0.000356,-0.048061,-0.000996,-0.017538,-0.003415,-0.002953
07-Ghost,-0.001423,-0.000356,-0.018356,-0.047207,-0.06005,-0.003557,-0.000356,-0.000996,-0.015439,-0.006012,...,-0.009,-0.028104,-0.052437,0.98392,-0.000356,-0.048061,-0.000996,-0.017538,-0.003415,-0.002953
11eyes,-0.001423,-0.000356,-0.018356,-0.047207,-0.06005,-0.003557,-0.000356,-0.000996,-0.015439,-0.006012,...,-0.009,-0.028104,-0.052437,-0.01608,-0.000356,0.351939,-0.000996,-0.017538,-0.003415,-0.002953


In [20]:
# Our data needs to be in a sparse matrix format to be read by the following functions
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)
print(piv_sparse.shape)
piv_sparse

(2811, 23408)


<2811x23408 sparse matrix of type '<class 'numpy.float64'>'
	with 65799888 stored elements in Compressed Sparse Row format>

In [21]:
# Matrix showing the computed cosine similarity of anime to anime 
anime_similarity = cosine_similarity(piv_sparse)
# print(anime_similarity.shape)

In [22]:
# Matrix showing the computed cosine similarity of user to user 
user_similarity = cosine_similarity(piv_sparse.T)
# print(user_similarity.shape)

In [23]:
# Inserting the 2 similarity matricies into separate dataframe objects

# Item to Item cosine similarity 
anime_sim_df = pd.DataFrame(anime_similarity, index = piv_norm.index, columns = piv_norm.index)
# User to User cosine similarity 
user_sim_df = pd.DataFrame(user_similarity, index = piv_norm.columns, columns = piv_norm.columns)

In [24]:
# This function will return the top 10 shows with the highest cosine similarity value

def top_animes(anime_title):
    count = 1
    print(f'Top 10 similar Anime shows compared to {anime_title}:\n')
    for anime in anime_sim_df.sort_values(by = anime_title, ascending = False).index[1:11]: 
        print(f'No. {count}: {anime}')
        count +=1

In [25]:
# This function will return the top 5 users with the highest similarity value 

def top_users(user):
    
    if user not in piv_norm.columns:
        return(f'No data available on user {user}')
    
    print(f'Top 10 similar Users compared to {user}:\n')
    sim_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:11]
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    zipped = zip(sim_users, sim_values,)
    for user, sim in zipped:
        print('User #{0}, Similarity value: {1:.2f}'.format(user, sim))

In [26]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list

def similar_user_recs(user):
    
    if user not in piv_norm.columns:
        return(f'No data available on user {user}')
    
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = piv_norm.loc[:, i].max()
        best.append(piv_norm[piv_norm.loc[:, i]==max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]

In [39]:
# This function calculates the weighted average of similar users
# to determine a potential rating for an input user and show

def predicted_rating(anime_name, user):
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:1000]
    user_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:1000]
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_users):
        rating = piv.loc[i, anime_name]
        similarity = user_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
    return sum(rating_list)/sum(weight_list)

In [29]:
top_animes('Naruto')

Top 10 similar Anime shows compared to Naruto:

No. 1: Death Note
No. 2: Fullmetal Alchemist
No. 3: Fullmetal Alchemist: Brotherhood
No. 4: Sword Art Online
No. 5: Bleach
No. 6: Shingeki no Kyojin
No. 7: Code Geass: Hangyaku no Lelouch
No. 8: Dragon Ball Z
No. 9: Ao no Exorcist
No. 10: Code Geass: Hangyaku no Lelouch R2


In [30]:
top_users(3)

Top 10 similar Users compared to 3:

User #1918, Similarity value: 0.57
User #13920, Similarity value: 0.53
User #12887, Similarity value: 0.52
User #3376, Similarity value: 0.51
User #12905, Similarity value: 0.51
User #19885, Similarity value: 0.49
User #14741, Similarity value: 0.49
User #20655, Similarity value: 0.49
User #9615, Similarity value: 0.49
User #21233, Similarity value: 0.48


In [31]:
similar_user_recs(3)

[('Death Note', 6),
 ('Fullmetal Alchemist: Brotherhood', 5),
 ('Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.', 3),
 ('Dragon Ball Z', 3),
 ('Hunter x Hunter (2011)', 3)]

In [32]:
predicted_rating('Naruto', 3)

The predicted score for user_id: 3 is 5.8888152927874655


In [40]:
# Function to list every show watched by user_id 
def watchlist_of_user(user):
    return piv.T[piv.loc[user,:]>0].index.tolist()

In [67]:
# Make a list of the mean squared errors between actual and predicted value
def sq_errors(anime_title,user):
    watchlist_of_user(user)
    errors = []
    for anime_title in watchlist_of_user(user):
        actual=piv.loc[user, anime_title]
        predicted = predicted_rating(anime_title, user)
        errors.append((actual-predicted)**2)
#         RMSE = np.sqrt(((predicted - actual) ** 2).mean())
        MSE = np.mean((errors))
        RMSE = sqrt(MSE)
    print(f'The MSE:{MSE}')
    print(f'The RMSE:{RMSE}')

In [None]:
sq_errors('Naruto',3)