# Anime Recommender System  
<font size="3">@Cicily Wu</font>

![](https://image.slidesharecdn.com/anime-recommendation-1-180121040742/95/anime-recommendation-big-data-certification6-1-638.jpg?cb=1516507702)

# 1. Problem Statement

Recommender systems aim to predict users’ interests and recommend product items that quite likely are interesting for them. They are among the most powerful machine learning systems that online retailers implement in order to drive sales.

Data required for recommender systems stems from explicit user ratings after watching a movie or listening to a song, from implicit search engine queries and purchase histories, or from other knowledge about the users/items themselves.

In this project, user data from MyAnimeList is used to build an anime recommender system based on users' viewing and rating history. Several approaches are implemented.

# 2. Dataset Description

This data set contains information on user preference data from 73,516 users on 12,294 anime. Each user is able to add anime to their completed list and give it a rating and this data set is a compilation of those ratings.

Anime.csv:

anime_id - myanimelist.net's unique id identifying an anime.  
name - full name of anime.  
genre - comma separated list of genres for this anime.  
type - movie, TV, OVA, etc.  
episodes - how many episodes in this show. (1 if movie).  
rating - average rating out of 10 for this anime.  
members - number of community members that are in this anime's "group".

Rating.csv:

user_id - non identifiable randomly generated user id.  
anime_id - the anime that this user has rated.  
rating - rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).  

# 3. Build Recommender System

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import os
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the per-anime metadata and the per-user ratings from the Kaggle input directory.
anime = pd.read_csv('/kaggle/input/anime-recommendations-database/anime.csv')
rating = pd.read_csv('/kaggle/input/anime-recommendations-database/rating.csv')

<font size="3"><font color='purple'>This is a helper function I found online to reduce memory usage when working in Kaggle kernels.</font></font>

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are narrowed to int8/int16/int32/int64 and float columns
    to float16/float32/float64, chosen from each column's min and max. The
    range checks are inclusive, so a column spanning exactly -128..127
    becomes int8.

    Note: float16 keeps only about three significant decimal digits, so
    float values are rounded when a column is narrowed that far.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print the final memory usage and the reduction.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest integer type whose (inclusive) range covers the column.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
anime=reduce_mem_usage(anime)
rating=reduce_mem_usage(rating)

In [None]:
anime.head()

In [None]:
anime.shape

<font size="3"><font color='purple'>To make the computation faster and easier, I keep only half of the anime dataset. I choose which half by the number of group members: an anime with more members is more popular, so I keep it.</font></font>

In [None]:
# Keep only the more popular half of the catalogue: anime whose member count is
# strictly above the median and which have an average rating.
median_members = np.median(anime['members'])
print(median_members)
# Filter and drop in one expression so a fresh frame is bound to `anime`;
# an in-place dropna on a filtered slice triggers pandas' chained-assignment warning.
anime = anime[anime['members'] > median_members].dropna(axis=0, how='any', subset=['rating'])

In [None]:
rating.head()

In [None]:
rating.nunique()

In [None]:
# A rating of -1 means "watched but not rated"; treat it as missing.
rating['rating'] = rating['rating'].replace(-1, np.nan)

# Number of rows (rated or not) each user has in the rating table.
rows_per_user = rating['user_id'].value_counts()
# Drop users with fewer than 150 entries to decrease the dataset size.
rows_per_user = rows_per_user[rows_per_user >= 150]

# Build the frame explicitly instead of renaming the output of
# value_counts().to_frame(): the name of that column differs between pandas
# versions, so attribute access such as `user.user_id` is not portable.
user = pd.DataFrame(
    {
        'count': rows_per_user.to_numpy(),
        'user_id': rows_per_user.index.to_numpy(),
    },
    index=pd.Index(rows_per_user.index.to_numpy()),
)

In [None]:
user.head()

In [None]:
# Attach every rating row to the retained users (one row per user/anime pair).
rating = user.merge(rating, on='user_id', how='left')
users = rating[['user_id', 'anime_id', 'rating']].reset_index()
anime = anime.reset_index()

# Join anime metadata with user ratings. Both sides carry an 'index' and a
# 'rating' column, so the merge suffixes them with _x (anime) and _y (users).
df = (
    anime.merge(users, on='anime_id', how='inner')
    .drop(columns=['index_x', 'index_y'])
    .rename(columns={'rating_x': 'anime_rating', 'rating_y': 'user_rating'})
)
df.head(10)

In [None]:
# Number of rows each anime has in df, i.e. how many retained users list it.
rows_per_anime = df['anime_id'].value_counts()
# Only keep anime with more than 300 such rows.
rows_per_anime = rows_per_anime[rows_per_anime > 300]

# Build the frame explicitly: the column name produced by
# value_counts().to_frame() depends on the pandas version, so renaming
# 'anime_id' -> 'count_ratings' is not portable.
rating_counts = pd.DataFrame(
    {
        'count_ratings': rows_per_anime.to_numpy(),
        'anime_id': rows_per_anime.index.to_numpy(),
    },
    index=pd.Index(rows_per_anime.index.to_numpy()),
)
rating_counts.head()

In [None]:
# Restrict df to the frequently listed anime, then pivot into a
# user x anime matrix of ratings (NaN where a user has no rating).
df1 = rating_counts.merge(df, on='anime_id', how='left')
df_p = pd.pivot_table(df1, values='user_rating', index='user_id', columns='anime_id')
print('Shape User-Movie-Matrix:\t{}'.format(df_p.shape))
df_p.sample(3)

<font size="3"><font color='purple'>The user-anime matrix is obtained. I will fill NaN value with 0 later.</font>

## 3.1 Weighted Average rating
This is a general method that recommends the same highly rated anime to every user. The idea is to compute a new score for each anime from both its number of raters and its average rating, which makes more sense than ranking by the average rating alone.

In [None]:
# One row per anime title, keeping only rows where all scoring inputs are present.
required_columns = ['user_rating', 'anime_rating', 'members']
df2 = df1.dropna(subset=required_columns).drop_duplicates(subset='name')
df2.head()

In [None]:
def weighted_rating(x, m=300, C=None):
    """Return the weighted rating ``v/(v+m) * R + m/(m+v) * C``.

    The score pulls an anime's own average rating ``R`` towards the overall
    mean ``C``; the fewer ratings ``v`` it has relative to ``m``, the stronger
    the pull.

    Args:
        x: A row (or a whole DataFrame) providing 'count_ratings' (v) and
            'anime_rating' (R).
        m: Weight given to the overall mean, expressed as a number of ratings.
        C: Overall mean rating. When omitted, it is computed from
            ``df2.anime_rating``.

    Returns:
        The weighted rating: a scalar for a single row, a Series for a frame.
    """
    if C is None:
        C = df2.anime_rating.mean()
    v = x['count_ratings']
    R = x['anime_rating']
    return (v / (v + m) * R) + (m / (m + v) * C)

In [None]:
# weighted_rating uses only element-wise arithmetic, so the whole frame can be
# scored in one vectorised call. The row-by-row apply re-evaluated the global
# mean rating once per row.
df2['wr'] = weighted_rating(df2)
df2 = df2.sort_values(by='wr', ascending=False)
df2.head(10)

<font size="3"><font color='purple'>These are the 10 anime with the highest weighted average ratings. We would recommend anime to users based on this list. This method only considers the ratings themselves and ignores personal taste, so it is a good baseline but a limited one.</font></font>

## 3.2 User-rating Based Using Cosine Similarity
Cosine similarity is the measure of similarity between two vectors, by computing the cosine of the angle between two vectors projected into multidimensional space. It can be applied to items available on a dataset to compute similarity to one another via keywords or other metrics.

<img src="https://www.oreilly.com/library/view/statistics-for-machine/9781788295758/assets/2b4a7a82-ad4c-4b2a-b808-e423a334de6f.png" width="400px">

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
df_p=df_p.fillna(0)
user_similarity = cosine_similarity(df_p) #ger similarity matrix for users
user_similarity.shape

In [None]:
item_similarity = cosine_similarity(df_p.T)#get similarity matrix for animes
item_similarity.shape

<font size="3"><font color='purple'>The anime similarity table is shown below. We have 2826 anime in total in this recommender system.</font></font>

In [None]:
item_sim_df = pd.DataFrame(item_similarity, index = df_p.columns, columns = df_p.columns)
item_sim_df.head(3) #show similarity matrix for animes

<font size="3"><font color='purple'>The user similarity table is shown below. We have 15711 users in total in this recommender system.</font></font>

In [None]:
user_sim_df = pd.DataFrame(user_similarity, index = df_p.index, columns = df_p.index)
user_sim_df.head(3) #show similarity matrix forusers

#### Similar Users Based 
**Use Case 1: If I am the user 73, get the number of users who are similar to me.**

In [None]:
def similar_users(user):
    """Print the five users most similar to ``user`` according to ``user_sim_df``.

    Args:
        user: A user_id present in the index of ``df_p``.

    Returns:
        ``None`` after printing, or a message string when ``user`` is not in
        ``df_p``.
    """
    if user not in df_p.index:
        return 'No data available on user {}'.format(user)

    print('Most Similar Users:\n')
    # Exclude the user by label rather than by skipping position 0: when another
    # user has the same similarity (1.0), the user is not guaranteed to sort first.
    neighbours = user_sim_df[user].drop(labels=user).nlargest(5)
    for other_user, sim in neighbours.items():
        print('User #{0}, Similarity value: {1:.2f}'.format(other_user, sim))

In [None]:
similar_users(3) 

In [None]:
similar_users(73)

#### Similar Animes Based 
**Use Case 2: If I like the anime 19, get me more animes I might be interested in.**

In [None]:
def similar_animes(anime):
    """Print the five anime most similar to ``anime`` according to ``item_sim_df``.

    Args:
        anime: An anime_id present in the columns of ``df_p``.

    Returns:
        ``None`` after printing, or a message string when ``anime`` is not in
        ``df_p``.
    """
    if anime not in df_p.columns:
        return 'No anime called {}'.format(anime)

    print('Most Similar Animes:\n')
    # Exclude the anime by label rather than by skipping position 0: with tied
    # similarity values the anime itself is not guaranteed to sort first.
    neighbours = item_sim_df[anime].drop(labels=anime).nlargest(5)
    for other_anime, sim in neighbours.items():
        print('Anime #{0}, Similarity value: {1:.2f}'.format(other_anime, sim))

In [None]:
similar_animes(19)

<font size="3"><font color='purple'>After sorting the similarity table, the functions show only the users/anime with the highest similarity values, which is basically how this system works. I built another version that uses anime names, which may be easier to check.</font></font>

In [None]:
# Lookup table of anime_id -> name, ordered by anime_id.
anime_id_name_match = df1[['anime_id', 'name']].drop_duplicates().sort_values(by='anime_id')

# Relabel the similarity table through an explicit id -> name mapping. Assigning
# the 'name' column positionally is only correct if it has exactly the same
# length and order as the table's axes.
id_to_name = anime_id_name_match.drop_duplicates(subset='anime_id').set_index('anime_id')['name']
item_sim_df_name = item_sim_df.copy()
item_sim_df_name.index = item_sim_df.index.map(id_to_name).rename('name')
item_sim_df_name.columns = item_sim_df.columns.map(id_to_name).rename('name')
item_sim_df_name.head(3)

In [None]:
def similar_animes_name(anime_name):
    """Print the ten anime most similar to ``anime_name`` by rating similarity.

    Args:
        anime_name: Exact title, as used in the labels of ``item_sim_df_name``.

    Returns:
        ``None`` after printing, or a message string when the title is not in
        ``item_sim_df_name``.
    """
    if anime_name not in item_sim_df_name.columns:
        return 'No anime called {}'.format(anime_name)

    print('Similar shows to {} include:\n'.format(anime_name))
    # Exclude the title by label rather than by skipping position 0: with tied
    # similarity values the title itself is not guaranteed to sort first.
    ranked = item_sim_df_name[anime_name].drop(labels=anime_name).nlargest(10)
    for count, item in enumerate(ranked.index, start=1):
        print('No. {}: {}'.format(count, item))

In [None]:
import re
#This function is to find exact anime name by inputing in keywords
def find_real_name(x):
    """Return the rows of ``df1`` (one per title) whose name matches ``x``.

    Args:
        x: Keyword to search for. It is interpreted as a regular expression
            and matched case-insensitively anywhere in the name.

    Returns:
        A DataFrame with the first ``df1`` row of every matching title.
    """
    df1_anime = df1.drop_duplicates(subset='name')
    # na=False makes rows without a name count as "no match"; otherwise the mask
    # contains NaN and boolean indexing raises.
    matches = df1_anime['name'].str.contains(x, case=False, na=False)
    return df1_anime[matches]

<font size="3"><font color='purple'>This function is actually very useful when you can only remember the key words of the anime name. You can find the full accurate name with this function and input the correct name to find similar animes recommended to you.</font>

**Use Case 3: I recall an interesting anime with "ping" in its name. Can you find it and give me other recommendations?**

In [None]:
find_real_name('ping')

In [None]:
similar_animes_name('Ping Pong The Animation')

#### User with the highest similarity
**Use Case 4: I am user 3324. I want to find the one user who is most like me and what they are watching.**

In [None]:
def user_like_me(user):
    """Print the top-rated anime of the user most similar to ``user``.

    The most similar other user is looked up in ``user_sim_df``. Every anime
    that this neighbour gave their highest rating to is printed, one list of
    matching names per anime id.

    Args:
        user: A user_id present in the index of ``df_p``.

    Returns:
        ``None`` after printing, or a message string when ``user`` is not in
        ``df_p`` or there is no other user to compare with.
    """
    if user not in df_p.index:
        return 'No data available on user {}'.format(user)

    # Similarity to every *other* user; the user's own entry is removed by label.
    others = user_sim_df[user].drop(labels=user)
    if others.empty:
        return 'No data available on user {}'.format(user)
    nearest_user = others.idxmax()

    # The neighbour's row of ratings and the ids holding the maximum value
    # (there may be more than one).
    s1 = df_p.loc[nearest_user, :]
    s1_argmax = s1[s1 == s1.max()].index.tolist()

    animes = [
        anime_id_name_match[anime_id_name_match.anime_id == i]['name'].tolist()
        for i in s1_argmax
    ]
    print('The user like you the most is also watching:')
    print(*animes, sep='\n')

In [None]:
user_like_me(3324)

#### Predict ratings
**Use Case 5: I am user5. I want to find out how will I like the famous anime "Cowboy Bebop".**

In [None]:
def predicted_rating(anime_name, user):
    """Predict ``user``'s rating of ``anime_name`` from similar users' ratings.

    The prediction is the similarity-weighted mean of the ratings that the
    499 most similar other users gave to the anime. Cells that are NaN or 0
    are treated as "not rated" and skipped: the rating matrix is zero-filled
    before the similarities are computed, so averaging those cells in would
    drag every prediction towards 0.

    Args:
        anime_name: Exact title as listed in ``anime_id_name_match``.
        user: A user_id present in ``user_sim_df``.

    Returns:
        The predicted rating, or NaN when none of the neighbours rated the
        anime.

    Raises:
        KeyError: If ``anime_name`` or ``user`` is unknown.
    """
    name_rows = anime_id_name_match[anime_id_name_match['name'] == anime_name]
    if name_rows.empty:
        raise KeyError(anime_name)
    anime_id = name_rows['anime_id'].iloc[0]

    # Sort once and remove the user by label (not by position) before taking
    # the neighbours.
    neighbours = user_sim_df[user].drop(labels=user).nlargest(499)
    similarities = neighbours.to_numpy(dtype=float)
    ratings = df_p.loc[neighbours.index, anime_id].to_numpy(dtype=float)

    rated = ~np.isnan(ratings) & (ratings != 0)
    total_weight = similarities[rated].sum()
    if total_weight == 0:
        return np.nan
    return np.dot(ratings[rated], similarities[rated]) / total_weight

<font size="3"><font color='purple'>To keep the calculation time short, I only use the ratings of roughly the 500 most similar users to make the prediction.</font></font>

In [None]:
predicted_rating('Cowboy Bebop', 5)

## 3.3 Content Based Using Cosine Similarity
The dataset does not have a description for each anime, so I will only use "genre" and "type" for this content-based recommendation. We will still use cosine similarity.

In [None]:
df['genre_and_type']=df['genre']+','+df['type']
df_anime_name_match=df[['anime_id','name','genre_and_type']].drop_duplicates()
df_anime_name_match.head()

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import seaborn as sns
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string
import nltk
import matplotlib.pyplot as plt
pd.set_option("display.max_colwidth", 200)
import spacy
import gensim
from gensim import corpora
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim
%matplotlib inline

import itertools
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:
# Words to be removed from the vocabulary before vectorising.
# Two entries were corrupted: a missing comma fused 'ours' and 'ourselves'
# into 'oursourselves', and 'yourself' was stored as the literal "s'yourself'".
blockwords = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at','since','paid','don','doesn','close',
 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'doing', "don't", 'down', 'during',
 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
 "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'like',
 'than', 'that',"that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", 'also','can','could','should',
 "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'not','bit','much',
 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where','within','quite','really','just','together',
 "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",'will', 'with', "won't", 'would', "wouldn't", 'hole','furniture','put',
 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 'drawer','sure',
 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd','nightstand','nightstands','night',
 '4th', '5th', '6th', '7th', '8th', '9th', '10th']

In [None]:
df_anime_name_match['genre_and_type']=df_anime_name_match['genre_and_type'].apply(str)

In [None]:
stop_words = set(stopwords.words('english'))                      # set of all stop words
lem=WordNetLemmatizer()
#p=inflect.engine()

def process(comment):
    """Normalise a text: lower-case, tokenise, filter, lemmatise and re-join.

    Tokens that appear in ``blockwords`` or consist only of digits are dropped,
    the remaining tokens are lemmatised with ``lem`` and joined with single
    spaces, and finally parentheses and apostrophes are stripped.
    """
    kept = []
    for token in nltk.word_tokenize(comment.lower()):
        if token in blockwords or token.isdigit():
            continue
        kept.append(lem.lemmatize(token))

    cleaned = ' '.join(kept)
    for unwanted in ('(', ')', "'"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

def num_words(sent):
    """Return how many tokens ``nltk.word_tokenize`` finds in ``sent``."""
    return len(nltk.word_tokenize(sent))

df_anime_name_match['Cleaned_g_t']=df_anime_name_match['genre_and_type'].apply(process)
df_anime_name_match['Unclean_len']=df_anime_name_match['genre_and_type'].apply(num_words)                     # word length of uncleaned comments
df_anime_name_match['Clean_len']=df_anime_name_match.Cleaned_g_t.apply(num_words)               # word length of cleaned comments
df_anime_name_match['percentage reduction']=(df_anime_name_match['Unclean_len']-df_anime_name_match['Clean_len'])/df_anime_name_match['Unclean_len']*100 # percentage of reduction

In [None]:
text= " ".join(df_anime_name_match['Cleaned_g_t'])
# Display the generated image:
wordcloud = WordCloud(max_font_size=35, max_words=40, background_color="white",collocations=False).generate(text)
plt.figure(figsize=(8,6))
plt.imshow(wordcloud, interpolation="gaussian")
plt.title('Top words in anime descriptions',size=19)
plt.axis("off")
plt.show()

In [None]:
tf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tf.fit_transform(df_anime_name_match['Cleaned_g_t'])

In [None]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape[0]

In [None]:
tf_sim=pd.DataFrame(data=cosine_sim) 
tf_sim.index = df_anime_name_match['name']
tf_sim.columns = df_anime_name_match['name']
tf_sim.head()

<font size="3"><font color='purple'>Below is the similarity based on application of TF-IDF model.</font>

#### Content-based animes 
**Use Case 6: I like the anime with the "doro" in the name. Recommend me more animes with the similar genre/type.**

In [None]:
def similar_animes_content_based(anime):
    """Print the ten anime whose genre/type text is most similar to ``anime``.

    Args:
        anime: Exact title, as used in the labels of ``tf_sim``.

    Returns:
        ``None`` after printing, or a message string when the title is not in
        ``tf_sim``.
    """
    if anime not in tf_sim.columns:
        return 'No anime called {}'.format(anime)

    print('Most Similar Animes:\n')
    # Exclude the title by label rather than by skipping position 0: when other
    # titles tie at the top similarity value, the title itself is not
    # guaranteed to sort first.
    neighbours = tf_sim[anime].drop(labels=anime).nlargest(10)
    for other_anime, sim in neighbours.items():
        print('{0}, {1:.2f}'.format(other_anime, sim))

In [None]:
find_real_name('doro')

In [None]:
similar_animes_content_based('Dororon Enma-kun Meeramera')

# 4. User Group Clustering

<font size="3"><font color='purple'>Principal component analysis(PCA) is being used. First, I will decrease the dimensions to 3 and see how it goes.</font>

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=3)
pca.fit(df_p)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)

<font size="3"><font color='purple'>The contributions of these three components are not very distinct. The first component seems to stand out, while I cannot see a big difference between the other two.</font></font>

In [None]:
pca_df_p = pca.transform(df_p)
pca_df_p = pd.DataFrame(pca_df_p)
pca_df_p.head(2)

In [None]:
cluster_3 = pd.DataFrame(pca_df_p[[0,1,2]])

In [None]:
plt.rcParams['figure.figsize'] = (10, 6)


fig = plt.figure()
# Create the 3D axes through the figure: a bare Axes3D(fig) is not attached to
# the figure on current Matplotlib releases, which leaves the plot empty.
ax = fig.add_subplot(projection='3d')
ax.scatter(cluster_3[0],cluster_3[1],cluster_3[2])

plt.title('Data Distribution PCA in 3D', fontsize=20)
plt.show()

<font size="3"><font color='purple'>I want to choose the best k, i.e. how many groups we should separate these users into.  
The first method I use is the "elbow" method. It is called "elbow" because the curve is shaped like an arm and we look for the elbow joint; the k at that joint is the best choice. The indicator here is the SSE, the sum of squared errors. The SSE decreases sharply as k grows, but beyond a certain k it keeps decreasing only slowly. That k is what we are looking for.</font></font>

In [None]:
from sklearn.cluster import KMeans
 
# Choose k with the elbow method on the SSE.
X = range(1,9)
SSE = []  # sum of squared errors (inertia) for each k
for k in X:
    estimator = KMeans(n_clusters=k)  # build the clusterer
    estimator.fit(pca_df_p[[0,1,2]])
    SSE.append(estimator.inertia_)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X,SSE,'o-')


<font size="3"><font color='purple'>With the "elbow" method, 2 groups is clearly the best choice.  
I also tried a second method, "silhouette scores", where we only need to find the k with the highest score in the figure. As the result below shows, 2 is still the best.</font></font>

In [None]:
from sklearn.metrics import silhouette_score
Scores = []  # silhouette coefficient for each k
X = range(2, 14)
pca_points = np.array(pca_df_p[[0,1,2]])  # the three PCA components as a plain array
for k in X:
    estimator = KMeans(n_clusters=k)  # build the k-means model
    estimator.fit(pca_points)
    Scores.append(silhouette_score(pca_points, estimator.labels_, metric='euclidean'))
plt.xlabel('k')
plt.ylabel('Silhouette Coefficient')
plt.plot(X, Scores, 'o-')
plt.show()

<font size="3"><font color='purple'>I'll just divide all users into 2 groups.</font></font>

In [None]:
clusterer = KMeans(n_clusters=2,random_state=30).fit(cluster_3)
centers = clusterer.cluster_centers_
c_preds = clusterer.predict(cluster_3)

In [None]:
fig = plt.figure()
# Create the 3D axes through the figure: a bare Axes3D(fig) is not attached to
# the figure on current Matplotlib releases, which leaves the plot empty.
ax = fig.add_subplot(projection='3d')
ax.scatter(cluster_3[0],cluster_3[1],cluster_3[2], c = c_preds)
plt.title('Data points in 3D PCA axis', fontsize=20)
plt.show()

In [None]:
fig = plt.figure(figsize=(10,8))
# 2D view on the first two principal components, coloured by cluster.
# plt.scatter's third positional argument is the marker size, so the third
# component must not be passed positionally here.
plt.scatter(cluster_3[0], cluster_3[1], c=c_preds)
for center in centers:
    # Cluster centre in the same (component 0, component 1) plane as the points.
    plt.plot(center[0], center[1], 'o', markersize=8, color='red', alpha=1)

plt.xlabel('x_values')
plt.ylabel('y_values')

plt.title('Data points in 2D PCA axis', fontsize=20)
plt.show()

In [None]:
df_p_anime=df_p.columns.tolist()
df_p_anime = pd.DataFrame (df_p_anime,columns=['anime_id'])
df_p_anime=pd.merge(df_p_anime,df1,on='anime_id',how='left')#2826 animes name match
df_p_anime=df_p_anime[['anime_id','name','anime_rating']].drop_duplicates()
df_p_name=df_p.copy()
df_p_name.columns = df_p_anime['name']#2826 animes
df_p_name['cluster'] = c_preds
df_p_name.head()

In [None]:
group1 = df_p_name[df_p_name['cluster']==0]
group2 = df_p_name[df_p_name['cluster']==1]

In [None]:
group1_mean=group1.mean().to_frame()
group1_mean.head(10)

In [None]:
group2_mean=group2.mean().to_frame()
group2_mean.head(10)

In [None]:
c=df_p_name.reset_index()
c=c[['user_id','cluster']]
df1_c=pd.merge(df1,c,on='user_id',how='left')
df1_c=df1_c.dropna(subset=['cluster'])
df1_c['cluster']=df1_c['cluster'].apply(int)
df1_c.head()

In [None]:
df1_c0 = df1_c[df1_c['cluster']==0]
df1_c1 = df1_c[df1_c['cluster']==1]

In [None]:
df1_c0['members'].mean()

In [None]:
df1_c1['members'].mean()

In [None]:
df1_c0['user_rating'].mean()

In [None]:
df1_c1['user_rating'].mean()