In this notebook I build a simple recommender system that measures the similarity between shows and between users, and uses those similarities to help predict whether a user will enjoy a particular anime.

In [1]:
# Import relevant libraries 

import pandas as pd
import numpy as np
import scipy as sp
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

In [2]:
# Locations of the two source CSV files.
# NOTE: the ratings path is an absolute, machine-specific location; adjust it
# when running this notebook on another machine.
ANIME_CSV = 'myAnimelist-No_Hentai 2.csv'
RATING_CSV = '/Users/SDMAN/Documents/Flatiron/Projects/rating.csv'

anime = pd.read_csv(ANIME_CSV)
rating = pd.read_csv(RATING_CSV)

Replace the -1 placeholder ratings with null values (NaN)

In [3]:
# Turn the -1 placeholder into NaN so those rows can be dropped later.
# The result is assigned back to the column on purpose: calling
# `rating.rating.replace(..., inplace=True)` mutates an intermediate Series,
# which under pandas Copy-on-Write never updates the `rating` frame itself.
rating['rating'] = rating['rating'].replace({-1: np.nan})
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [4]:
# Preview the first five rows of the ratings table.
rating.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


Join the two dataframes on their anime ID columns (`anime_id` in `rating`, `animeID` in `anime`)

In [5]:
# This analysis is only interested in recommendations within the TV category,
# so keep just the rows whose `type` is 'TV'.
anime_show = anime.loc[anime['type'].eq('TV')]
anime_show.head()

Unnamed: 0,animeID,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
0,1,Cowboy Bebop,TV,Original,26,0:24:00,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460
2,6,Trigun,TV,Manga,26,0:24:00,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...",PG-13 - Teens 13 or older,8.3,212537,255,146,408548,10432
3,7,Witch Hunter Robin,TV,Original,26,0:25:00,"['Action', 'Magic', 'Police', 'Supernatural', ...",PG-13 - Teens 13 or older,7.33,32837,2371,1171,79397,537
4,8,Bouken Ou Beet,TV,Manga,52,0:23:00,"['Adventure', 'Fantasy', 'Shounen', 'Supernatu...",PG - Children,7.03,4894,3544,3704,11708,14
5,16,Hachimitsu to Clover,TV,Manga,24,0:23:00,"['Comedy', 'Drama', 'Josei', 'Romance', 'Slice...",PG-13 - Teens 13 or older,8.12,57065,419,536,172274,3752


In [6]:
# Inner-join the ratings onto the TV shows by anime id.
# Both frames carry a `rating` column: the one coming from `rating` receives the
# `_user` suffix and is then renamed to `user_rating`. The `animeID` key
# duplicates `anime_id` after the join, so it is removed.
merged = (
    rating
    .merge(
        anime_show,
        how='inner',
        left_on='anime_id',
        right_on='animeID',
        suffixes=['_user', ''],
    )
    .rename(columns={'rating_user': 'user_rating'})
    .drop(columns="animeID")
)
print(merged.shape)
merged.head()

(5185071, 16)


Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
0,1,20,,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
1,3,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5,20,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
3,6,20,,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
4,10,20,,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


In [7]:
# Keep only the rows that carry an actual user rating (and a show name to
# pivot on). A bare `dropna()` would additionally throw away valid ratings for
# any show that happens to have a missing value in an unrelated metadata
# column, even though only `user_id`, `name` and `user_rating` are used below.
merged2 = merged.dropna(subset=['user_rating', 'name'])
print(merged2.shape)
merged2.head()

(4284319, 16)


Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
1,3,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5,20,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
5,21,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
6,28,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
7,34,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


Pivot the table so that users run along one axis and TV show names along the other. This layout lets us measure the similarity between users and between shows, to better predict who will like what.

In [8]:
# User x show matrix of ratings; user/show pairs without a rating are filled with 0.
piv = merged2.pivot_table(
    values='user_rating',
    index=['user_id'],
    columns=['name'],
    fill_value=0,
)

In [9]:
# Report the (users, shows) dimensions, then preview the first rows.
print(piv.shape)
piv.head(n=5)

(68840, 2968)


name,.hack//Roots,.hack//Sign,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,12-sai.: Chicchana Mune no Tokimeki 2nd Season,2020 Nyeon Ujuui Wonder Kiddy,21 Emon,3 Choume no Tama: Uchi no Tama Shirimasenka?,...,Zombie-Loan,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,7,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# (Currently disabled) To reduce the computing cost, the data can be limited to user_ids up to 50,000

# merged2=merged2[['user_id', 'name', 'user_rating']]
# merged_sub= merged2[merged2.user_id <= 50000]
# merged_sub.head()

Normalizing and transposing the matrix (i.e. the pivoted table)

In [11]:
# Normalize every user's row: subtract the user's mean rating and divide by the
# user's rating range (max - min). Unrated shows are stored as 0 in `piv`, so
# they take part in the mean, min and max.
# Whole-frame arithmetic is used instead of a Python-level `apply(axis=1)` that
# would call a lambda once per user row.
user_mean = piv.mean(axis=1)
user_range = piv.max(axis=1) - piv.min(axis=1)
piv_norm = piv.sub(user_mean, axis=0).div(user_range, axis=0)

# A user whose row is constant (every entry identical) has a zero range, which
# produces 0/0 = NaN; turn those entries into 0 so the user can be removed below.
piv_norm = piv_norm.fillna(0)

# Transpose the matrix: shows become rows, users become columns
piv_norm = piv_norm.T
# Drop every user column that contains only zeros
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

print(piv_norm.shape)
piv_norm.head()

(2968, 68840)


user_id,1,2,3,5,7,8,9,10,11,12,...,73505,73506,73507,73508,73510,73511,73512,73513,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,-0.001348,-0.000337,-0.017385,-0.04471,-0.056873,-0.003369,-0.000337,-0.000943,-0.014623,-0.005694,...,-0.000636,-0.005627,-0.036287,-0.000337,-0.021867,-0.003976,-0.002864,-0.005559,-0.037264,-0.000674
.hack//Sign,-0.001348,-0.000337,-0.017385,-0.04471,-0.056873,-0.003369,-0.000337,-0.000943,-0.014623,-0.005694,...,-0.000636,-0.005627,-0.036287,-0.000337,-0.021867,-0.003976,-0.002864,-0.005559,-0.037264,-0.000674
009-1,-0.001348,-0.000337,-0.017385,-0.04471,-0.056873,-0.003369,-0.000337,-0.000943,-0.014623,-0.005694,...,-0.000636,-0.005627,-0.036287,-0.000337,-0.021867,-0.003976,-0.002864,-0.005559,-0.037264,-0.000674
07-Ghost,-0.001348,-0.000337,-0.017385,-0.04471,-0.056873,-0.003369,-0.000337,-0.000943,-0.014623,-0.005694,...,-0.000636,-0.005627,-0.036287,-0.000337,-0.021867,-0.003976,-0.002864,-0.005559,-0.037264,-0.000674
11eyes,-0.001348,-0.000337,-0.017385,-0.04471,-0.056873,-0.003369,-0.000337,-0.000943,-0.014623,-0.005694,...,-0.000636,-0.005627,0.263713,-0.000337,-0.021867,-0.003976,-0.002864,-0.005559,-0.037264,-0.000674


In [13]:
# Store the normalized show x user matrix in SciPy CSR format for the
# similarity computations that follow.
# NOTE(review): after mean-centering nearly every cell is non-zero (see the
# stored-element count in the output), so CSR saves little memory here.
piv_sparse = sp.sparse.csr_matrix(piv_norm.to_numpy())
print(piv_sparse.shape)
piv_sparse

(2968, 68840)


<2968x68840 sparse matrix of type '<class 'numpy.float64'>'
	with 204317120 stored elements in Compressed Sparse Row format>

In [14]:
# Anime-to-anime cosine similarity: every row of `piv_sparse` is one show,
# so the result is a square (shows x shows) dense array.
anime_similarity = cosine_similarity(piv_sparse, dense_output=True)
print(anime_similarity.shape)

NameError: name 'item_similarity' is not defined

In [None]:
# User-to-user cosine similarity: transpose first so that every row is one user.
# NOTE(review): the result is a dense (users x users) array, which is very
# large for the number of users shown above.
user_similarity = cosine_similarity(piv_sparse.transpose())
print(user_similarity.shape)

In [None]:
# Wrap the two similarity matrices in labelled DataFrames.

# Anime-to-anime cosine similarity, labelled by show name on both axes
anime_sim_df = pd.DataFrame(
    data=anime_similarity,
    index=piv_norm.index,
    columns=piv_norm.index,
)
# User-to-user cosine similarity, labelled by user_id on both axes
user_sim_df = pd.DataFrame(
    data=user_similarity,
    index=piv_norm.columns,
    columns=piv_norm.columns,
)

In [None]:
# This function prints the (up to) 10 shows with the highest cosine similarity value

def top_animes(anime_title):
    """Print the shows most similar to ``anime_title``.

    Reads the module-level ``anime_sim_df`` (show x show cosine similarity).

    Parameters
    ----------
    anime_title : str
        Name of the show, as used in the labels of ``anime_sim_df``.

    Returns
    -------
    str or None
        A "no data" message when the title is unknown; otherwise ``None``
        (the ranking is printed).
    """
    if anime_title not in anime_sim_df.columns:
        return f'No data available on anime {anime_title}'

    print(f'Top 10 similar Anime shows compared to {anime_title}:\n')
    # Remove the show itself by label instead of assuming it sorts first:
    # ties with other shows could otherwise push it out of position 0.
    ranked = (
        anime_sim_df.loc[:, anime_title]
        .drop(labels=anime_title)
        .sort_values(ascending=False)
        .head(10)
    )
    for count, similar_title in enumerate(ranked.index, start=1):
        print(f'No. {count}: {similar_title}')

In [None]:
# This function prints the (up to) 10 users with the highest similarity value

def top_users(user):
    """Print the users most similar to ``user`` with their similarity values.

    Reads the module-level ``piv_norm`` (to check the user is known) and
    ``user_sim_df`` (user x user cosine similarity).

    Parameters
    ----------
    user : hashable
        A user_id, as used in the column labels of ``piv_norm``.

    Returns
    -------
    str or None
        A "no data" message when the user is unknown; otherwise ``None``
        (the ranking is printed).
    """
    if user not in piv_norm.columns:
        return f'No data available on user {user}'

    print(f'Top 10 similar Users compared to {user}:\n')
    # Sort only this user's column rather than the whole user x user frame,
    # and remove the user by label instead of relying on them sorting first.
    neighbours = (
        user_sim_df.loc[:, user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .head(10)
    )
    for other_user, sim in neighbours.items():
        print('User #{0}, Similarity value: {1:.2f}'.format(other_user, sim))

In [None]:
# This function collects the highest rated show(s) of each of the most similar users
# and returns the show names along with how often each one appears

def similar_user_recs(user):
    """Return the shows that most often top the lists of similar users.

    For each of the (up to) 10 users most similar to ``user``, the show(s)
    holding that user's maximum normalized score in ``piv_norm`` are collected
    and counted.

    Parameters
    ----------
    user : hashable
        A user_id, as used in the column labels of ``piv_norm``.

    Returns
    -------
    str or list of (name, count) tuples
        A "no data" message when the user is unknown; otherwise at most five
        ``(show name, frequency)`` pairs, most frequent first.
    """
    if user not in piv_norm.columns:
        return f'No data available on user {user}'

    # Sort only this user's column rather than the whole user x user frame,
    # and remove the user by label instead of relying on them sorting first.
    sim_users = (
        user_sim_df.loc[:, user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .index[:10]
    )

    most_common = {}
    for sim_user in sim_users:
        user_scores = piv_norm.loc[:, sim_user]
        # Every show tied for this user's maximum score counts once.
        top_titles = user_scores[user_scores == user_scores.max()].index
        for title in top_titles:
            most_common[title] = most_common.get(title, 0) + 1

    # sorted() is stable, so equally frequent shows keep first-seen order.
    ranked = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:5]

In [None]:
# This function calculates the similarity-weighted average rating of similar users
# to determine a potential rating for an input user and show

def predicted_rating(anime_name, user, n_neighbors=999):
    """Print a predicted rating of ``anime_name`` for ``user``.

    The prediction is the similarity-weighted average of the ratings that the
    ``n_neighbors`` most similar users gave to the show. Reads the module-level
    ``user_sim_df`` (user x user similarity) and ``piv`` (user x show ratings).

    Parameters
    ----------
    anime_name : str
        Show name, as used in the column labels of ``piv``.
    user : hashable
        A user_id, as used in the labels of ``user_sim_df``.
    n_neighbors : int, default 999
        How many of the most similar users to consider (the previous
        hard-coded ``[1:1000]`` slice also covered 999 users).

    Returns
    -------
    str or None
        A message when the user/show is unknown or no neighbour rated the
        show; otherwise ``None`` (the prediction is printed).
    """
    if user not in user_sim_df.columns:
        return f'No data available on user {user}'
    if anime_name not in piv.columns:
        return f'No data available on anime {anime_name}'

    # Sort only this user's column rather than the whole user x user frame,
    # and remove the user by label instead of relying on them sorting first.
    weights = (
        user_sim_df.loc[:, user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .iloc[:n_neighbors]
    )
    ratings = piv.loc[weights.index, anime_name]

    # `piv` stores "not rated" as 0 (pivot fill_value), not NaN, so unrated
    # neighbours must be excluded explicitly or they drag the average to 0.
    # Non-positive similarities are skipped as well: they are not valid
    # weights for a weighted average.
    usable = (ratings > 0) & (weights > 0)
    total_weight = weights[usable].sum()
    if total_weight == 0:
        return f'Not enough ratings to predict a score for user_id: {user}'

    predicted = float((ratings[usable] * weights[usable]).sum() / total_weight)
    print(f"The predicted score for user_id: {user} is {predicted}")

In [None]:
# List every show that a given user_id has rated (rating above 0)
def watchlist_of_user(user):
    """Return the names of all shows whose rating by ``user`` in ``piv`` is above 0."""
    user_ratings = piv.loc[user, :]
    rated = user_ratings[user_ratings > 0]
    return rated.index.tolist()

In [None]:
# Example: shows most similar to Naruto
top_animes(anime_title='Naruto')

In [None]:
# Example: users most similar to user_id 3
top_users(user=3)

In [None]:
# Example: shows most often top-rated by the users similar to user_id 3
similar_user_recs(user=3)

In [None]:
# Example: predicted Naruto rating for user_id 3
predicted_rating(anime_name='Naruto', user=3)