In [45]:
#loading relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn.linear_model import LinearRegression

In [46]:
#loading the data
ratings = pd.read_csv('rating.csv')
anime = pd.read_csv('anime.csv')

In [47]:
#for computational efficiency
ratings_short = ratings.iloc[:999999,:]

#999 denotes an anime watched by the user but not rated
ratings_short['rating'] = ratings_short.apply(lambda x: 999 if x['rating'] == -1 else x['rating'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [48]:
ratings_pivot = ratings_short.pivot_table(index = 'user_id', columns = 'anime_id', values = 'rating')

In [49]:
#calculating the mean user ratings ignoring the 999 values
def avg_rating(x):
    y = x[x!=999]
    return y.mean()    

In [50]:
#series to store the mean rating of each user
mean_rating = ratings_pivot.apply(avg_rating,axis=1)

In [51]:
#centering the user ratings to 0 by subtracting each rating by the mean user rating
ratings_centered = ratings_pivot.sub(mean_rating, axis=0)
ratings_centered = ratings_centered.fillna(0)

#preparing the dataframe for cosine similarity; '999' would distort similarity values
ratings_centered = ratings_centered.replace(999,0)

In [91]:
#calculating cosine similarities of each user to all others
ratings_cosine = pd.DataFrame(cosine_similarity(ratings_centered), index = ratings_centered.index, columns = ratings_centered.index)

In [180]:
#collaborative filtering: recommend movies that were rated highest by similar users
def get_recommendations(y):
    global anime
    #series to store 10 most similar users
    sim_users = ratings_cosine.iloc[y].sort_values(ascending=False).head(10)
    
    #creating dataframe containing movies not yet watched by user 
    anime_unwatched =  ratings_short[(ratings_short.user_id!=y) & (ratings_short.rating!=999)]
    
    #filtering dataframe to only include data about 10 most similar users
    anime_unwatched = anime_unwatched[anime_unwatched['user_id'].isin(sim_users.index)]
    
    #including anime that were watched by 3 or more users 
    anime_by_viewers = anime_unwatched.groupby('anime_id')['user_id'].count()
    pop_anime = anime_by_viewers[anime_by_viewers>=3]
    anime_unwatched = anime_unwatched[anime_unwatched.anime_id.isin(pop_anime.index)]
    
    #fetching names of the animes
    anime_names = anime_unwatched.merge(anime, left_on = 'anime_id', right_on = 'anime_id')
    
    #returning animes with highest mean rating given by similar users
    return anime_names.groupby(['anime_id','name'])[['rating_x']].mean().sort_values(by = 'rating_x',ascending=False).head(10)

In [184]:
get_recommendations(2345)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating_x
anime_id,name,Unnamed: 2_level_1
13601,Psycho-Pass,10.0
2904,Code Geass: Hangyaku no Lelouch R2,9.666667
5114,Fullmetal Alchemist: Brotherhood,9.666667
16498,Shingeki no Kyojin,9.666667
1535,Death Note,9.5
11757,Sword Art Online,9.4
1575,Code Geass: Hangyaku no Lelouch,9.25
6547,Angel Beats!,8.25
20,Naruto,8.0
6880,Deadman Wonderland,7.8
