In this notebook I build a simple recommender system that measures the similarity between shows and between users, and uses those similarities to help predict whether a user will enjoy a particular anime.

In [None]:
# Import relevant libraries 

import pandas as pd
import numpy as np
import scipy as sp
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

In [None]:
# Load the two source tables: the anime catalogue and the per-user ratings.
# NOTE(review): the ratings path is absolute and machine-specific; consider a
# configurable data directory so the notebook runs on other machines.
anime = pd.read_csv(filepath_or_buffer='myAnimelist-No_Hentai 2.csv')
rating = pd.read_csv(filepath_or_buffer='/Users/SDMAN/Documents/Flatiron/Projects/rating.csv')

Replace the -1 placeholder in the `rating` column with a null value (NaN)

In [None]:
# Replace the -1 placeholder in the `rating` column with NaN.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `rating.rating`: that chained in-place form
# mutates an intermediate Series, is deprecated in recent pandas, and leaves
# `rating` unchanged when Copy-on-Write is enabled.
rating['rating'] = rating['rating'].replace(-1, np.nan)
rating.head()

In [None]:
# Preview the first rows of `rating` (same call as the last line of the previous cell).
rating.head()

Filter the anime table down to TV shows, then join it with the ratings on the anime id (`anime_id` in `rating`, `animeID` in `anime`)

In [None]:
# Only the TV category is of interest for these recommendations,
# so keep just the catalogue rows whose `type` is 'TV'.

anime_show = anime.loc[anime['type'].eq('TV')]
anime_show.head()

In [None]:
# Inner-join the ratings with the TV-show catalogue, then tidy the result:
#   * a column name present in both frames gets the '_user' suffix on the ratings side,
#     and 'rating_user' is renamed to 'user_rating';
#   * 'animeID' is dropped because it duplicates the 'anime_id' join key.
merged = (
    rating
    .merge(
        anime_show,
        how='inner',
        left_on='anime_id',
        right_on='animeID',
        suffixes=['_user', ''],
    )
    .rename(columns={'rating_user': 'user_rating'})
    .drop(columns="animeID")
)
print(merged.shape)
merged.head()

In [None]:
# Keep only complete rows: a row is removed if ANY of its columns is null.
# NOTE(review): this also discards rows whose only missing value is in a
# catalogue column; confirm that is intended rather than dropping on
# 'user_rating' alone.
merged2 = merged.dropna(axis='index', how='any')
print(merged2.shape)
merged2.head()

Pivot the ratings into a table with users on one axis and TV show names on the other. This lets us measure the similarity between users and between shows, so we can better predict who will like what.

In [None]:
# Build the user x show rating matrix: one row per user_id, one column per show name.
# Repeated (user, show) pairs are averaged (pivot_table's default aggregation),
# and combinations with no rating are filled with 0.
piv = merged2.pivot_table(
    values='user_rating',
    index=['user_id'],
    columns=['name'],
    aggfunc='mean',
    fill_value=0,
)

In [None]:
# Show the pivot's dimensions as (rows, columns) = (users, shows), then preview its first rows.
print(piv.shape)
piv.head()

In [None]:
# (Currently disabled) For computing reasons, uncomment the lines below to limit the data to users with user_id <= 50,000

# merged2=merged2[['user_id', 'name', 'user_rating']]
# merged_sub= merged2[merged2.user_id <= 50000]
# merged_sub.head()

Normalizing and transposing the matrix (i.e. the pivoted table)

In [None]:
# Normalize each user's row of `piv`: subtract the row mean and divide by the
# row range (max - min). This is done with whole-frame arithmetic rather than a
# row-wise `apply`, which runs a Python function once per user.
# Note: `piv` was built with fill_value=0, so the zeros standing in for unrated
# shows take part in each row's mean, min and max.
user_mean = piv.mean(axis=1)
user_range = piv.max(axis=1) - piv.min(axis=1)
piv_norm = piv.sub(user_mean, axis=0).div(user_range, axis=0)

# A user whose row is constant (every entry identical) has a zero range, so the
# normalization yields 0/0 = NaN for the whole row; set those entries to 0.
piv_norm = piv_norm.fillna(0)

# Transposing the Matrix: shows become rows, users become columns
piv_norm = piv_norm.T

# Drop the all-zero columns, i.e. the constant-row users zeroed out above
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

print(piv_norm.shape)
piv_norm.head()

In [None]:
# Convert the normalized show x user matrix to a SciPy CSR sparse matrix
# before computing the cosine similarities below.
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)
print(piv_sparse.shape)
# A csr_matrix has no `.head()` method (calling it raises AttributeError);
# display the matrix summary (shape, dtype, stored elements) instead.
piv_sparse

In [None]:
# Matrix of cosine similarities between shows (each row of piv_sparse is one show).
# The result is named `item_similarity`, the name the print below and the later
# DataFrame construction both use; the original assigned it to `anime_similarity`,
# which left `item_similarity` undefined (NameError).
item_similarity = cosine_similarity(piv_sparse)
print(item_similarity.shape)

In [None]:
# Cosine similarity between users: transpose the show x user matrix so that
# each row passed to cosine_similarity is one user.
# NOTE(review): the result is a dense users x users array; check it fits in memory.
user_similarity = cosine_similarity(piv_sparse.transpose())
print(user_similarity.shape)

In [None]:
# Wrap the two similarity matrices in labelled DataFrames so they can be
# looked up by show name / user id.

# Show-to-show cosine similarity, labelled on both axes by show name
item_sim_df = pd.DataFrame(
    data=item_similarity,
    index=piv_norm.index,
    columns=piv_norm.index,
)
# User-to-user cosine similarity, labelled on both axes by user id
user_sim_df = pd.DataFrame(
    data=user_similarity,
    index=piv_norm.columns,
    columns=piv_norm.columns,
)

In [None]:
# This function prints the 10 shows with the highest cosine similarity to the given show

def top_animes(anime_title, n=10, sim_df=None):
    """Print the shows most similar to ``anime_title``.

    Parameters
    ----------
    anime_title : str
        Name of the show to compare against; must be a column of ``sim_df``.
    n : int, default 10
        Number of similar shows to print.
    sim_df : pandas.DataFrame, optional
        Square show-to-show similarity table labelled by show name on both
        axes. Defaults to the notebook-level ``item_sim_df``.

    Returns
    -------
    None when the list is printed, or a message string when ``anime_title``
    is not present in ``sim_df`` (same convention as ``top_users``).
    """
    if sim_df is None:
        sim_df = item_sim_df

    if anime_title not in sim_df.columns:
        return f'No data available on anime {anime_title}'

    # Remove the show itself by label instead of assuming it sorts into
    # position 0 (another show with an identical similarity could tie with it).
    ranked = (
        sim_df[anime_title]
        .drop(labels=anime_title, errors='ignore')
        .sort_values(ascending=False)
    )

    print(f'Top {n} similar Anime shows compared to {anime_title}:\n')
    for count, item in enumerate(ranked.index[:n], start=1):
        print(f'No. {count}: {item}')

In [None]:
# This function prints the 10 users with the highest similarity value to the given user

def top_users(user, n=10, sim_df=None):
    """Print the users most similar to ``user`` with their similarity values.

    Parameters
    ----------
    user : hashable
        User id to compare against; must be a column of ``sim_df``.
    n : int, default 10
        Number of similar users to print.
    sim_df : pandas.DataFrame, optional
        Square user-to-user similarity table labelled by user id on both
        axes. Defaults to the notebook-level ``user_sim_df``.

    Returns
    -------
    None when the list is printed, or a message string when ``user`` is not
    present in ``sim_df``.
    """
    if sim_df is None:
        sim_df = user_sim_df

    if user not in sim_df.columns:
        return f'No data available on user {user}'

    # Sort once, and remove the user's own entry by label rather than relying
    # on it sorting into position 0.
    neighbours = (
        sim_df[user]
        .drop(labels=user, errors='ignore')
        .sort_values(ascending=False)
        .head(n)
    )

    # The header now names the user; the original printed "compared to :".
    print(f'Top {n} similar Users compared to User #{user}:\n')
    # A distinct loop name avoids shadowing the `user` parameter.
    for other_user, sim in neighbours.items():
        print('User #{0}, Similarity value: {1:.2f}'.format(other_user, sim))

In [None]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list

def similar_user_recs(user, n_similar=10, top_n=5, sim_df=None, norm_df=None):
    """Recommend shows by counting the favourites of the most similar users.

    For each of the ``n_similar`` users most similar to ``user``, collect the
    show(s) holding that user's maximum normalized score, then count how many
    of those users share each show.

    Parameters
    ----------
    user : hashable
        User id to build recommendations for; must be a column of ``sim_df``.
    n_similar : int, default 10
        Number of most-similar users to consult.
    top_n : int, default 5
        Number of (show, count) pairs to return.
    sim_df : pandas.DataFrame, optional
        Square user-to-user similarity table. Defaults to the notebook-level
        ``user_sim_df``.
    norm_df : pandas.DataFrame, optional
        Normalized scores with shows as rows and users as columns. Defaults
        to the notebook-level ``piv_norm``.

    Returns
    -------
    list of (show, count) tuples sorted by descending count, or a message
    string when ``user`` is not present in ``sim_df``.
    """
    if sim_df is None:
        sim_df = user_sim_df
    if norm_df is None:
        norm_df = piv_norm

    if user not in sim_df.columns:
        return f'No data available on user {user}'

    # Remove the user by label instead of assuming they sort into position 0.
    neighbours = (
        sim_df[user]
        .drop(labels=user, errors='ignore')
        .sort_values(ascending=False)
        .index[:n_similar]
    )

    most_common = {}
    for neighbour in neighbours:
        scores = norm_df[neighbour]
        # Every show tied at this neighbour's maximum score counts as a favourite.
        for title in scores[scores == scores.max()].index:
            most_common[title] = most_common.get(title, 0) + 1

    # sorted() is stable, so shows with equal counts keep first-seen order.
    ranked = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:top_n]

In [None]:
# This function calculates the weighted average of similar users
# to determine a potential rating for an input user and show

def predicted_rating(anime_name, user, n_similar=999, sim_df=None, ratings=None):
    """Print a predicted score for ``user`` on ``anime_name``.

    The prediction is the similarity-weighted average of the ratings that the
    ``n_similar`` most similar users gave to ``anime_name``.

    Parameters
    ----------
    anime_name : str
        Show to predict a score for; must be a column of ``ratings``.
    user : hashable
        User id to predict for; must be a column of ``sim_df``.
    n_similar : int, default 999
        Number of most-similar users to consult.
    sim_df : pandas.DataFrame, optional
        Square user-to-user similarity table. Defaults to the notebook-level
        ``user_sim_df``.
    ratings : pandas.DataFrame, optional
        Rating table with users as rows and shows as columns. Defaults to the
        notebook-level ``piv``.

    Returns
    -------
    None after printing the prediction (or a "no ratings" notice), or a
    message string when ``user`` or ``anime_name`` is unknown.
    """
    if sim_df is None:
        sim_df = user_sim_df
    if ratings is None:
        ratings = piv

    if user not in sim_df.columns:
        return f'No data available on user {user}'
    if anime_name not in ratings.columns:
        return f'No data available on anime {anime_name}'

    # Sort once and remove the user by label rather than by assumed position.
    neighbours = (
        sim_df[user]
        .drop(labels=user, errors='ignore')
        .sort_values(ascending=False)
        .head(n_similar)
    )
    # Keep positive similarities only: negative weights can cancel the
    # denominator and push a weighted average outside the rating range.
    neighbours = neighbours[neighbours > 0]

    neighbour_ratings = ratings[anime_name].reindex(neighbours.index)
    # The pivot table is zero-filled, so an unrated show appears as 0, not NaN.
    # Both are treated as "no rating"; the original NaN-only check never fired
    # and counted every unrated show as a score of 0.
    has_rating = neighbour_ratings.notna() & (neighbour_ratings > 0)
    weights = neighbours[has_rating]
    total_weight = weights.sum()

    if total_weight == 0:
        # No similar user has rated this show; avoid dividing by zero.
        print(f"No ratings from similar users are available to predict a score for user_id: {user}")
        return None

    score = float((neighbour_ratings[has_rating] * weights).sum() / total_weight)
    return print(f"The predicted score for user_id: {user} is {score}")

In [None]:
# Function to list every show watched by user_id 
def watchlist_of_user(user, ratings=None):
    """Return the names of every show ``user`` has a rating above 0 for.

    Parameters
    ----------
    user : hashable
        User id; must be a row label of ``ratings`` (KeyError otherwise).
    ratings : pandas.DataFrame, optional
        Rating table with users as rows and shows as columns. Defaults to the
        notebook-level ``piv``.

    Returns
    -------
    list
        Show names, in column order, whose rating for ``user`` is > 0.
    """
    if ratings is None:
        ratings = piv
    # Select the single user's row and mask it directly; the original
    # transposed the entire matrix on every call just to apply this mask.
    user_ratings = ratings.loc[user, :]
    return user_ratings[user_ratings > 0].index.tolist()

In [None]:
# Example: list the shows most similar to 'Naruto'
top_animes('Naruto')

In [None]:
# Example: list the users most similar to user 3
top_users(3)

In [None]:
# Example: shows most often top-rated by the users most similar to user 3
similar_user_recs(3)

In [None]:
# Example: predict user 3's score for 'Naruto'
predicted_rating('Naruto', 3)