Here I want to make a simple recommender system to find the similarity between shows, users and to help me predict whether a user will enjoy a particular anime.

In [129]:
# Import relevant libraries 

import pandas as pd
import numpy as np
import scipy as sp
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

In [130]:
anime = pd.read_csv('myAnimelist-No_Hentai 2.csv')
rating = pd.read_csv('/Users/SDMAN/Documents/Flatiron/Projects/rating.csv')

Replacing -1 value with a null value

In [141]:
rating.rating.replace({-1: np.nan}, inplace = True)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [142]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


Join the two dataframes on the anime_id columns

In [143]:
# For this analysis I'm only interest in finding recommendations for the TV category

anime_show = anime[anime['type']=='TV']
anime_show.head()

Unnamed: 0,animeID,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
0,1,Cowboy Bebop,TV,Original,26,0:24:00,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460
2,6,Trigun,TV,Manga,26,0:24:00,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...",PG-13 - Teens 13 or older,8.3,212537,255,146,408548,10432
3,7,Witch Hunter Robin,TV,Original,26,0:25:00,"['Action', 'Magic', 'Police', 'Supernatural', ...",PG-13 - Teens 13 or older,7.33,32837,2371,1171,79397,537
4,8,Bouken Ou Beet,TV,Manga,52,0:23:00,"['Adventure', 'Fantasy', 'Shounen', 'Supernatu...",PG - Children,7.03,4894,3544,3704,11708,14
5,16,Hachimitsu to Clover,TV,Manga,24,0:23:00,"['Comedy', 'Drama', 'Josei', 'Romance', 'Slice...",PG-13 - Teens 13 or older,8.12,57065,419,536,172274,3752


In [144]:
# Merging anime_show and rating dataframe with inner join
merged = rating.merge(anime_show, left_on = 'anime_id', right_on = 'animeID', suffixes= ['_user', ''], how='inner')
# Renaming column
merged.rename(columns = {'rating_user':'user_rating'}, inplace = True)
# Dropping extra column
merged = merged.drop(columns="animeID")
print(merged.shape)
merged.head()

(5185071, 16)


Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
0,1,20,,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
1,3,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5,20,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
3,6,20,,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
4,10,20,,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


In [145]:
merged2 = merged.dropna()
print(merged2.shape)
merged2.head()

(4284319, 16)


Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
1,3,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5,20,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
5,21,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
6,28,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
7,34,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


Pivoting table of users on one axis and tv show names along the other. This helps us in defining the similarity between users and shows to better predict who will like what.

In [156]:
piv = merged.pivot_table(index=['user_id'], columns=['name'], values='user_rating',fill_value=0)

In [157]:
print(piv.shape)
piv.head()

(68840, 2968)


name,.hack//Roots,.hack//Sign,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,12-sai.: Chicchana Mune no Tokimeki 2nd Season,2020 Nyeon Ujuui Wonder Kiddy,21 Emon,3 Choume no Tama: Uchi no Tama Shirimasenka?,...,Zombie-Loan,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,7,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [158]:
# For computing reasons I'm limiting the dataframe length to 50,000 users

# merged2=merged2[['user_id', 'name', 'user_rating']]
# merged_sub= merged2[merged2.user_id <= 50000]
# merged_sub.head()

Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
1,3,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5,20,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
5,21,20,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
6,28,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
7,34,20,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


Transposing the Matrix (eg. pivoted table)

In [159]:
X = piv.T
X.shape

(2968, 68840)

Decomposing the Matrix - Lowering the total features

In [161]:
# Truncates matrix in 12 Synthetic features
SVD = TruncatedSVD(n_components=12,random_state=12)

resultant_matrix = SVD.fit_transform(X)

resultant_matrix.shape

(2968, 12)

Generating a Correlation Matrix

In [162]:
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape


(2968, 2968)

In [164]:
anime_titles = piv.columns
anime_list = list(anime_titles)
naruto_corr = anime_list.index('naruto'.title())
print(naruto_corr)

1808


In [165]:
corr_naruto = corr_mat[int(naruto_corr)]
corr_naruto.shape

(2968,)

Recommending top 10% of Hightly Correlated Anime

In [206]:
list(anime_titles[(corr_naruto<1.0)&(corr_naruto>0.9)])

['Bleach', 'Ghost in the Shell: Nyuumon Arise', 'Shaman King']

Function to recommend the top correlated Anime, (top inputted percentage)

In [266]:
def rec_anime(x,float):
    x_corr = anime_list.index(f'{x.title()}')
    x_corr
    corr_x = corr_mat[x_corr]
    print(f'{100-int(float*100)}% Correlational match to Anime: {x.title()}\n')
    corr_anime = list(anime_titles[(corr_x < 1.0) & (corr_x > (1.0-float))])
    count = 1
    for item in corr_anime:
        print(f'No. {count}: {item}')
        count +=1



In [267]:
rec_anime('naruto',.1)

90% Correlational match to Anime: Naruto

No. 1: Bleach
No. 2: Ghost in the Shell: Nyuumon Arise
No. 3: Shaman King


In [268]:
rec_anime('naruto',0.2)

80% Correlational match to Anime: Naruto

No. 1: Afro Samurai
No. 2: Air Gear
No. 3: Ao no Exorcist
No. 4: Blade
No. 5: Bleach
No. 6: Blue Dragon
No. 7: Cybersix
No. 8: Deadman Wonderland
No. 9: Death Note
No. 10: Devil May Cry
No. 11: Dragon Ball
No. 12: Dragon Ball GT
No. 13: Dragon Ball Kai
No. 14: Dragon Ball Z
No. 15: Fairy Tail
No. 16: Fullmetal Alchemist
No. 17: Fullmetal Alchemist: Brotherhood
No. 18: Gantz
No. 19: Ghost in the Shell: Nyuumon Arise
No. 20: Ghost in the Shell: Stand Alone Complex - Tachikoma na Hibi (TV)
No. 21: Ginga Densetsu Weed
No. 22: Hellsing
No. 23: Highschool of the Dead
No. 24: Hokuto no Ken: Raoh Gaiden Ten no Haoh
No. 25: Hunter x Hunter
No. 26: IGPX: Immortal Grand Prix (2005)
No. 27: IGPX: Immortal Grand Prix (2005) 2nd Season
No. 28: InuYasha
No. 29: InuYasha: Kanketsu-hen
No. 30: Juusen Battle Monsuno
No. 31: Konjiki no Gash Bell!!
No. 32: Maou Dante
No. 33: Metal Fight Beyblade
No. 34: Oban Star-Racers
No. 35: Plawres Sanshirou
No. 36: Pokemon
No

In [241]:

item_similarity = cosine_similarity(corr_mat)
user_similarity = cosine_similarity(corr_mat.T)

In [243]:
# Inserting the similarity matricies into dataframe objects

item_sim_df = pd.DataFrame(item_similarity, index = corr_mat.index, columns = corr_mat.index)
user_sim_df = pd.DataFrame(user_similarity, index = corr_mat.columns, columns = corr_mat.columns)

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [224]:
# This function will return the top 5 users with the highest similarity value 

def top_users(user):
    
    if user not in piv.columns:
        return(f'No data available on user {user}')
    
    print('Most Similar Users:\n')
    sim_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:11]
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    zipped = zip(sim_users, sim_values,)
    for user, sim in zipped:
        print('User #{0}, Similarity value: {1:.2f}'.format(user, sim))

In [225]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list

def similar_user_recs(user):
    
    if user not in piv.columns:
        return(f'No data available on user {user}')
    
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = piv.loc[:, i].max()
        best.append(piv[piv.loc[:, i]==max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]

In [226]:
# This function calculates the weighted average of similar users
# to determine a potential rating for an input user and show

def predicted_rating(anime_name, user):
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:1000]
    user_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:1000]
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_users):
        rating = piv.loc[i, anime_name]
        similarity = user_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
    return print(f"The predicted score for user_id: {user} is {sum(rating_list)/sum(weight_list)}")

In [230]:
# Function to list every show watched by user_id 
def watchlist_by_user(user):
    return piv.T[piv.loc[user,:]>0].index.tolist()

In [231]:
top_animes('Naruto')

Similar shows to Naruto include:

No. 1: Bleach
No. 2: Dragon Ball GT
No. 3: Dragon Ball Z
No. 4: Fairy Tail
No. 5: Dragon Ball
No. 6: Pokemon
No. 7: InuYasha
No. 8: Highschool of the Dead
No. 9: Yu☆Gi☆Oh! Duel Monsters
No. 10: Sword Art Online


In [232]:
top_users(3)

'No data available on user 3'

In [233]:
similar_user_recs(3)

'No data available on user 3'

In [234]:
predicted_rating('Naruto', 3)

KeyboardInterrupt: 

Below we'll see how the predict_rating function performs compared to the observed rated values for user 3.

In [None]:
# Creates a list of every show watched by user_id

watched = watchlist_by_user(3)
# Showing watchlist 
watchlist_by_user(3)

In [None]:
# This function will return the top 10 shows with the highest cosine similarity value

def top_animes(anime_title):
    count = 1
    print(f'Similar shows to {anime_title} include:\n')
    for item in item_sim_df.sort_values(by = anime_name, ascending = False).index[1:11]:
        print(f'No. {count}: {item}')
        count +=1

In [None]:
watchlist_by_user(3)