In [1]:
#loading relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn.linear_model import LinearRegression

In [2]:
#loading the data
ratings = pd.read_csv('rating.csv')
anime = pd.read_csv('anime.csv')
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [3]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
#for computational efficiency
ratings_short = ratings.iloc[:999999,:]

#999 denotes an anime watched by the user but not rated
ratings_short['rating'] = ratings_short.apply(lambda x: 999 if x['rating'] == -1 else x['rating'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [5]:
ratings_pivot = ratings_short.pivot_table(index = 'user_id', columns = 'anime_id', values = 'rating')
#NaN indicates that the anime was not watched by the concerned user
ratings_pivot.head()

anime_id,1,5,6,7,8,15,16,17,18,19,...,34048,34085,34103,34107,34136,34173,34240,34283,34324,34325
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,999.0,,,,,,,,...,,,,,,,,,,
5,,,8.0,,,6.0,,6.0,6.0,,...,,,,,,,,,,


In [6]:
#calculating the mean user ratings ignoring the 999 values
def avg_rating(x):
    y = x[x!=999]
    return y.mean()    

In [7]:
#series to store the mean rating of each user
mean_rating = ratings_pivot.apply(avg_rating,axis=1)
mean_rating.head()

user_id
1    10.000000
2    10.000000
3     7.565217
4          NaN
5     4.355120
dtype: float64

In [8]:
#centering the user ratings to 0 by subtracting each rating by the mean user rating
ratings_centered = ratings_pivot.sub(mean_rating, axis=0)
ratings_centered = ratings_centered.fillna(0)

#preparing the dataframe for cosine similarity; '999' would distort similarity values
ratings_centered = ratings_centered.replace(999,0)
ratings_centered.head()

anime_id,1,5,6,7,8,15,16,17,18,19,...,34048,34085,34103,34107,34136,34173,34240,34283,34324,34325
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,3.64488,0.0,0.0,1.64488,0.0,1.64488,1.64488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#calculating cosine similarities of each user to all others
ratings_cosine = pd.DataFrame(cosine_similarity(ratings_centered), index = ratings_centered.index, columns = ratings_centered.index)
ratings_cosine.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,9349,9350,9351,9352,9353,9354,9355,9356,9357,9358
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,5.6e-05,0.0,0.025241,0.0,-0.029132,0.010576,0.0,0.073262,...,-0.033453,-0.024801,0.08217651,0.00012,0.146199,0.0,0.059098,0.0,0.109911,0.0
2,0.0,1.0,0.0,0.0,-0.000178,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-5.3e-05,0.0,-0.000364,0.0,0.0,0.0
3,5.6e-05,0.0,1.0,0.0,1.2e-05,0.0,0.044174,0.000232,0.0,0.001692,...,0.000412,0.000708,-0.0002889745,-6.6e-05,0.000388,0.0,0.000148,0.0,0.000187,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.025241,-0.000178,1.2e-05,0.0,1.0,0.0,0.011805,0.000235,0.0,0.000943,...,0.001102,-0.034778,-7.864336e-07,0.00075,0.056053,-0.000843,-0.001162,0.0,-0.000282,0.0


In [10]:
#collaborative filtering: recommend movies that were rated highest by similar users
def get_recommendations(y):
    global anime
    #series to store 10 most similar users
    sim_users = ratings_cosine.iloc[y].sort_values(ascending=False).head(10)
    
    #creating dataframe containing movies not yet watched by user 
    anime_unwatched =  ratings_short[(ratings_short.user_id!=y) & (ratings_short.rating!=999)]
    
    #filtering dataframe to only include data about 10 most similar users
    anime_unwatched = anime_unwatched[anime_unwatched['user_id'].isin(sim_users.index)]
    
    #including anime that were watched by 3 or more users 
    anime_by_viewers = anime_unwatched.groupby('anime_id')['user_id'].count()
    pop_anime = anime_by_viewers[anime_by_viewers>=3]
    anime_unwatched = anime_unwatched[anime_unwatched.anime_id.isin(pop_anime.index)]
    
    #fetching names of the animes
    anime_names = anime_unwatched.merge(anime, left_on = 'anime_id', right_on = 'anime_id')
    
    #returning animes with highest mean rating given by similar users
    return anime_names.groupby(['anime_id','name'])[['rating_x']].mean().sort_values(by = 'rating_x',ascending=False).head(10)

In [11]:
get_recommendations(2345)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating_x
anime_id,name,Unnamed: 2_level_1
13601,Psycho-Pass,10.0
2904,Code Geass: Hangyaku no Lelouch R2,9.666667
5114,Fullmetal Alchemist: Brotherhood,9.666667
16498,Shingeki no Kyojin,9.666667
1535,Death Note,9.5
11757,Sword Art Online,9.4
1575,Code Geass: Hangyaku no Lelouch,9.25
6547,Angel Beats!,8.25
20,Naruto,8.0
6880,Deadman Wonderland,7.8
