In [1]:
#import packages that needed

import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
% matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#import the dataset used in this project

anime = pd.read_csv('/content/drive/My Drive/My Mini Projects/Recommender System/anime.csv')
rate = pd.read_csv('/content/drive/My Drive/My Mini Projects/Recommender System/rating.csv', sep = ';', error_bad_lines = False, encoding = 'latin-1')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
print(anime.shape)
print(rate.shape)

(12294, 7)
(1048575, 3)


# **Exploring Anime Dataset**

In [5]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
# some attributes dont seem to be required for this analysis, so can be dropped off

anime.drop(['type', 'episodes', 'rating', 'members'], axis = 1, inplace = True)

In [7]:
anime.dtypes

anime_id     int64
name        object
genre       object
dtype: object

In [8]:
#Every genre is separated by a ',' so we simply have to call the split function on ','

anime['genre'] = anime.genre.str.split(',')
anime.head()

Unnamed: 0,anime_id,name,genre
0,32281,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]"
1,5114,Fullmetal Alchemist: Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic,..."
2,28977,Gintama°,"[Action, Comedy, Historical, Parody, Samur..."
3,9253,Steins;Gate,"[Sci-Fi, Thriller]"
4,9969,Gintama&#039;,"[Action, Comedy, Historical, Parody, Samur..."


In [9]:
print(type(anime.genre))
anime.genre.isnull().sum()

<class 'pandas.core.series.Series'>


62

In [10]:
#drop null rows because the data is still enough

anime = anime.dropna(axis=0)

In [11]:
#use one-hot-encoding to store every different genre in columns that contain either 1 or 0
#1 shows that the movie has that genre, 0 otherwise

#copying original dataset to new dataset
animeWithGenres = anime.copy()

#For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index,row in anime.iterrows():
  for genre in row['genre']:
    animeWithGenres.at[index,genre] = 1
  
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
animeWithGenres = animeWithGenres.fillna(0)

animeWithGenres.head()

Unnamed: 0,anime_id,name,genre,Drama,Romance,School,Supernatural,Action,Adventure,Drama.1,...,Supernatural.1,Samurai,Super Power,Vampire,Space,Hentai,Yaoi,Hentai.1,Yuri,Yaoi.1
0,32281,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5114,Fullmetal Alchemist: Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic,...",0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28977,Gintama°,"[Action, Comedy, Historical, Parody, Samur...",0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9253,Steins;Gate,"[Sci-Fi, Thriller]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9969,Gintama&#039;,"[Action, Comedy, Historical, Parody, Samur...",0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **Exploring Rate Dataset**

In [12]:
rate.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


# **Collaborative Filtering Based Recommender System**

In [28]:
#user input about anime that he has watched and how he rates the anime

userInput = [
            {'name':'Fullmetal Alchemist: Brotherhood', 'rating':5},
            {'name':'Gintama', 'rating':3.5},
            {'name':'Koe no Katachi', 'rating':2},
            {'name':'Bakemonogatari', 'rating':5},
            {'name':'Nodame Cantabile Finale', 'rating':4.5}
         ] 
inputAnime = pd.DataFrame(userInput)
inputAnime = inputAnime.join(anime['anime_id'])
inputAnime

Unnamed: 0,name,rating,anime_id
0,Fullmetal Alchemist: Brotherhood,5.0,32281
1,Gintama,3.5,5114
2,Koe no Katachi,2.0,28977
3,Bakemonogatari,5.0,9253
4,Nodame Cantabile Finale,4.5,9969


In [29]:
#Filtering out users that have watched anime that the input has watched and storing it
userSubset = rate[rate['anime_id'].isin(inputAnime['anime_id'].tolist())]
userSubset.head(10)

Unnamed: 0,user_id,anime_id,rating
183,3,5114,10
516,5,9253,9
525,5,9969,9
1165,10,5114,10
1166,10,9253,-1
1250,11,5114,8
1259,11,9253,7
1295,12,5114,9
1301,12,9253,10
1400,13,9253,-1


In [30]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['user_id'])

In [31]:
userSubsetGroup.get_group(46)

Unnamed: 0,user_id,anime_id,rating
4466,46,5114,9
4511,46,9253,9
4521,46,9969,10
4680,46,28977,10


In [32]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [33]:
userSubsetGroup[0:3]

[(1309,         user_id  anime_id  rating
  129155     1309      5114       8
  129181     1309      9253       8
  129183     1309      9969       9
  129332     1309     28977      10
  129367     1309     32281       9), (1522,         user_id  anime_id  rating
  151408     1522      5114       9
  151480     1522      9253       9
  151504     1522      9969       9
  151854     1522     28977       9
  151904     1522     32281       8), (2428,         user_id  anime_id  rating
  235810     2428      5114      10
  235820     2428      9253      10
  235824     2428      9969       8
  235886     2428     28977       8
  235894     2428     32281      10)]

# **Similarity of users to input user**

In [34]:
userSubsetGroup = userSubsetGroup[0:1000]

In [35]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='anime_id')
    inputAnime = inputAnime.sort_values(by='anime_id')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputAnime[inputAnime['anime_id'].isin(group['anime_id'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [37]:
pearsonCorrelationDict.items()

dict_items([(1309, -0.5860090386731182), (1522, -0.4385290096535115), (2428, 0.537086155529574), (2489, -0.8951435925492954), (2820, -0.6020183016345595), (2951, 0.8204126541423654), (3643, 0), (4437, -0.41602514716892186), (6253, 0), (6714, -0.3516054232038709), (7448, 0), (9842, -0.1703748109239964), (39, -0.12598815766974242), (43, 0.0), (46, -0.4364357804719848), (274, 0), (296, 0.9177105704032728), (342, 0), (392, -0.629940788348712), (593, 0), (670, 0.1259881576697424), (750, -0.629940788348712), (784, -0.7106690545187014), (956, -0.4364357804719848), (1019, 0), (1069, 0), (1116, -0.9759000729485332), (1176, -0.4364357804719848), (1237, -0.1259881576697424), (1344, 0.19738550848793068), (1435, -0.4364357804719848), (1494, -0.629940788348712), (1530, 0.8819171036881969), (1551, -0.3779644730092272), (1623, -0.9869275424396534), (1749, -0.592156525463792), (1842, 0), (1889, 0), (1938, 0.1259881576697424), (2141, -0.19738550848793068), (2143, 0.2182178902359924), (2185, 0.1259881576

In [38]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['user_id'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head(10)

Unnamed: 0,similarityIndex,user_id
0,-0.586009,1309
1,-0.438529,1522
2,0.537086,2428
3,-0.895144,2489
4,-0.602018,2820
5,0.820413,2951
6,0.0,3643
7,-0.416025,4437
8,0.0,6253
9,-0.351605,6714


In [39]:
#the Top X similar users to input user

topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:500]
topUsers.head(10)

Unnamed: 0,similarityIndex,user_id
395,1.0,10070
507,1.0,652
857,1.0,2646
871,1.0,2741
676,1.0,1638
680,1.0,1664
866,1.0,2712
864,1.0,2698
862,1.0,2678
858,1.0,2660


In [40]:
#rating of selected users to all animes

topUsersRating = topUsers.merge(rate, left_on='user_id', right_on='user_id', how='inner')
topUsersRating.head(10)

Unnamed: 0,similarityIndex,user_id,anime_id,rating
0,1.0,10070,20,8
1,1.0,10070,30,9
2,1.0,10070,31,7
3,1.0,10070,32,9
4,1.0,10070,199,8
5,1.0,10070,223,8
6,1.0,10070,225,5
7,1.0,10070,263,9
8,1.0,10070,264,9
9,1.0,10070,265,9


In [41]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head(10)

Unnamed: 0,similarityIndex,user_id,anime_id,rating,weightedRating
0,1.0,10070,20,8,8.0
1,1.0,10070,30,9,9.0
2,1.0,10070,31,7,7.0
3,1.0,10070,32,9,9.0
4,1.0,10070,199,8,8.0
5,1.0,10070,223,8,8.0
6,1.0,10070,225,5,5.0
7,1.0,10070,263,9,9.0
8,1.0,10070,264,9,9.0
9,1.0,10070,265,9,9.0


In [42]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('anime_id').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,84.111998,663.679087
5,28.219414,193.669311
6,45.867622,343.46918
7,7.553543,43.301166
8,1.67364,12.471406


In [43]:
#Creates an empty dataframe
recommendation = pd.DataFrame()

#Now we take the weighted average
recommendation['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation['anime_id'] = tempTopUsersRating.index
recommendation.head(10)

Unnamed: 0_level_0,weighted average recommendation score,anime_id
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7.890421,1
5,6.862981,5
6,7.488271,6
7,5.732564,7
8,7.451668,8
15,8.150916,15
16,8.261242,16
17,7.214498,17
18,7.720939,18
19,7.776454,19


In [44]:
# recommendation for the input user
anime.loc[anime['anime_id'].isin(recommendation.head(10)['anime_id'].tolist())]

Unnamed: 0,anime_id,name,genre
22,1,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, Sci-Fi,..."
38,19,Monster,"[Drama, Horror, Mystery, Police, Psycholog..."
152,5,Cowboy Bebop: Tengoku no Tobira,"[Action, Drama, Mystery, Sci-Fi, Space]"
214,6,Trigun,"[Action, Comedy, Sci-Fi]"
263,18,Initial D Fourth Stage,"[Action, Cars, Drama, Seinen, Sports]"
325,16,Hachimitsu to Clover,"[Comedy, Drama, Josei, Romance]"
433,15,Eyeshield 21,"[Action, Comedy, Shounen, Sports]"
976,17,Hungry Heart: Wild Striker,"[Comedy, Shounen, Slice of Life, Sports]"
2095,7,Witch Hunter Robin,"[Action, Drama, Magic, Mystery, Police, S..."
3159,8,Beet the Vandel Buster,"[Adventure, Fantasy, Shounen, Supernatural]"
