In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

In [None]:
# Load the anime metadata table (read with the 'latin' codec) and preview it.
anime = pd.read_csv(
    '../input/anime-recommendations-database/Anime_data.csv',
    encoding='latin',
)

print('anime (shape):', anime.shape)
anime.head()

In [None]:
# Show title, rating, producer and studio for every row whose Type is 'Movie'.
is_movie = anime['Type'] == 'Movie'
display(anime.loc[is_movie, ['Title', 'Rating', 'Producer', 'Studio']])

In [None]:
def text_cleaning(text):
    """Normalise a title by removing HTML-entity residue and punctuation.

    Steps, in order:
      1. Drop the entities ``&quot;`` and ``&#039;``, turn ``&amp;`` into
         ``and``, and drop the literal prefix ``.hack//``.
      2. Remove every character found in ``string.punctuation``.
      3. Remove the mis-decoded degree sign ``Â°``.

    Parameters
    ----------
    text : str
        Raw title. Non-string values (e.g. NaN/None) are returned unchanged
        instead of raising.

    Returns
    -------
    str
        The cleaned title.
    """
    if not isinstance(text, str):
        return text

    # The entities and the '.hack//' prefix are themselves made of punctuation
    # characters, so they must be handled BEFORE punctuation is stripped;
    # otherwise '&#039;' is left behind as '039' and '&amp;' as 'amp'.
    # Plain str.replace is used so that '.' is matched literally.
    for token, replacement in (
        ('&quot;', ''),
        ('&#039;', ''),
        ('&amp;', 'and'),
        ('.hack//', ''),
    ):
        text = text.replace(token, replacement)

    # Strip all ASCII punctuation in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Neither character of 'Â°' is ASCII punctuation, so remove it explicitly.
    text = text.replace('Â°', '')

    return text

# Clean every title in place; later lookups by title use these cleaned strings.
anime['Title'] = anime['Title'].apply(text_cleaning)


In [None]:
# Number of missing values per column.
anime.isnull().sum()

In [None]:
# Summary statistics from DataFrame.describe() with its default settings.
anime.describe()

We rank titles with IMDB's weighted rating (WR), which is given as:

$$WR = \frac{v}{v+m}\,R + \frac{m}{v+m}\,C$$

where $v$ is the number of votes for the title, $m$ is the minimum number of votes required to be listed in the chart, $R$ is the average rating of the title, and $C$ is the mean rating across the whole dataset.

In [None]:
# C: mean of the 'Rating' column over all rows (default value of weighted_rating's C).
C = anime['Rating'].mean()
C

In [None]:
# Distribution of vote counts, used to pick the vote threshold below.
anime['ScoredBy'].describe()

In [None]:
# m: 85th percentile of 'ScoredBy'; rows with at least this many votes qualify for the chart.
m = anime['ScoredBy'].quantile(0.85)
m

In [None]:
# Keep only rows with at least m votes. Filter first, then copy the (smaller)
# result: this avoids duplicating the whole frame and guarantees q_animes is an
# independent frame, so adding columns to it later cannot touch `anime`.
q_animes = anime.loc[anime['ScoredBy'] >= m].copy()
q_animes.shape

In [None]:
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating for ``x``.

    ``x`` must support ``x['ScoredBy']`` (vote count) and ``x['Rating']``
    (average rating). ``m`` is the vote threshold and ``C`` the overall mean
    rating; both default to the values computed when this cell is run.
    """
    votes = x['ScoredBy']
    average = x['Rating']
    total = votes + m
    # votes/(votes+m) weights the item's own rating, m/(votes+m) weights the global mean.
    return (votes / total * average) + (m / total * C)

In [None]:
# weighted_rating only does arithmetic on x['ScoredBy'] and x['Rating'], so it
# can be evaluated once on whole columns instead of once per row via
# apply(axis=1); the resulting values are the same.
q_animes['Score'] = weighted_rating(q_animes)

In [None]:
# Rank qualified titles by weighted score (best first) and show the top 15.
q_animes = q_animes.sort_values('Score', ascending=False)
q_animes[['Title', 'ScoredBy', 'Rating', 'Score']].head(15)

In [None]:
plt.figure(figsize=(12, 3), dpi=100)

# The ten titles with the highest weighted score.
best_score = q_animes.sort_values(by=['Score'], ascending=False)[:10]

# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` as a
# positional argument, so the positional form raises a TypeError there.
g = sns.barplot(x=best_score["Title"], y=best_score['Score'], palette="spring_r")
plt.ylabel("Score", color = 'b')
plt.xticks(rotation=45, horizontalalignment='right', color = 'b')
plt.title('Really good animes', fontweight='bold', fontsize=15, color = 'b');

In [None]:
# Tabular view of the plotted top ten, indexed by title.
best_scores = best_score[['Score','Title','Genre', 'Studio', 'Type']].set_index('Title')
display(best_scores)

## Recommendation System

### (1) Content Based filtering

In [None]:
# How many rows have no synopsis.
anime['Synopsis'].isnull().sum()

In [None]:
# Replace missing synopses with an empty string so the column contains no NaN.
anime['Synopsis'] = anime['Synopsis'].fillna('')

### Now we'll compute Term Frequency-Inverse Document Frequency (TF-IDF) vectors for each Synopsis.

TfidfVectorizer L2-normalises each row by default, so the dot product of two rows is already their cosine similarity. Therefore, we will use sklearn's linear_kernel() instead of cosine_similarity(), since it skips the redundant normalisation and is faster.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser that ignores scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# One sparse TF-IDF row per synopsis.
tfidf_matrix = tfidf.fit_transform(anime['Synopsis'])

# (number of titles, vocabulary size)
tfidf_matrix.shape

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all synopsis vectors: cosine_sim[i, j] scores title i against title j.
# NOTE(review): this is a full titles x titles matrix, so memory grows quadratically with the dataset.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

In [None]:
# Reverse lookup: title -> row label in `anime`.
indices = pd.Series(anime.index, index=anime['Title'])
# Series.drop_duplicates() compares the *values* (row labels, which are already
# unique), so it never removed repeated titles. Drop repeated index labels
# instead, keeping the first occurrence, so indices[title] is always a scalar.
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=15):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` Series
        (title -> row position). Raises ``KeyError`` if it is absent.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose rows and columns follow the row order
        of ``anime``. Defaults to the matrix that exists when this cell is run.
    top_n : int, default 15
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``anime['Title']`` entries of the ``top_n`` most similar rows, most
        similar first. The queried row itself is never included.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every row position with its similarity to the queried row, leaving
    # out the queried row explicitly. Dropping "whatever ranks first" is wrong
    # when another row ties with it (e.g. identical synopses).
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]

    # Highest similarity first; sorted() is stable, so ties keep row order.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:top_n]

    movie_indices = [i[0] for i in sim_scores]

    return anime['Title'].iloc[movie_indices]

In [None]:
# Recommendations using the default similarity matrix (synopsis TF-IDF).
get_recommendations('Sen to Chihiro no Kamikakushi')

In [None]:
# Second example with the default (synopsis TF-IDF) similarity matrix.
get_recommendations('Koe no Katachi')

### Genres, Producer and Studio Based Recommender
It goes without saying that the quality of our recommender would be increased with the usage of better metadata. That is exactly what we are going to do in this section. We are going to build a recommender based on the following metadata: the producer, related genres and the studio.

In [None]:
# Peek at the raw metadata columns before parsing them.
anime.head(2)

In [None]:
# Metadata columns used for the second recommender.
features = ['Genre','Producer', 'Studio']

# Missing values per metadata column.
print(anime[features].isnull().sum())

In [None]:
# Fill missing metadata with the text of an empty list, so that the
# literal_eval step that follows turns it into [].
anime[features] = anime[features].fillna('[]')

In [None]:
from ast import literal_eval

# Parse the stringified lists (e.g. "['a', 'b']") into real Python lists.
# Values that are no longer strings are left untouched, so re-running this
# cell after a successful run does not raise on already-parsed lists.
for feature in features:
    anime[feature] = anime[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [None]:
# Distinct values of the Type column.
anime.Type.unique()

In [None]:
def clean_data(x):
    """Lower-case ``x`` and remove its spaces.

    A list is processed element by element and returned as a new list; a
    string is processed directly; any other value yields ``""``.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ""

In [None]:
# From here on `features` also includes 'Type'.
features = ['Genre','Producer', 'Studio', 'Type']

# Normalise every metadata column (lower-case, spaces removed) so that
# multi-word names become single tokens.
for feature in features:
    anime[feature] = anime[feature].apply(clean_data)

In [None]:
# Check the cleaned metadata columns.
anime.head(2)

In [None]:
# Current list of metadata columns.
features

In [None]:
def create_soup(x):
    """Build the space-separated text "soup" for one row.

    Order: genres, type, producers, studios, synopsis, studios again.
    The studio tokens therefore appear twice in the result.
    """
    studios = " ".join(x['Studio'])
    parts = [
        " ".join(x['Genre']),
        x['Type'],
        " ".join(x['Producer']),
        studios,
        x['Synopsis'],
        studios,
    ]
    return " ".join(parts)

In [None]:
# One combined metadata + synopsis string per row.
anime['soup'] = anime.apply(create_soup, axis=1)

In [None]:
# Inspect the generated soup strings.
anime['soup']

The next steps are the same as what we did with our plot description (synopsis) based recommender. One important difference is that we use CountVectorizer() instead of TF-IDF. This is because we do not want to down-weight a producer or studio just because it appears in relatively more titles; that would not make much intuitive sense.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw token counts (no IDF down-weighting) over the soup strings.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(anime['soup'])

from sklearn.metrics.pairwise import cosine_similarity

# Counts are not normalised, so cosine_similarity (not a plain dot product) is used here.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# drop=True renumbers the rows without inserting the old index as a new
# column, so re-running this cell does not keep adding columns to `anime`.
anime = anime.reset_index(drop=True)
# Title -> row position lookup; keep only the first row of any repeated title
# so that indices[title] is always a single position.
indices = pd.Series(anime.index, index=anime['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Inspect the title -> row position lookup.
indices

In [None]:
# Recommendations from the metadata + synopsis similarity matrix.
get_recommendations('Cowboy Bebop', cosine_sim2)

In [None]:
# Same title as in the synopsis-only example, now with cosine_sim2 for comparison.
get_recommendations('Sen to Chihiro no Kamikakushi', cosine_sim2)

In [None]:
# Another example using the metadata + synopsis similarity matrix.
get_recommendations('Mirai no Mirai', cosine_sim2)

In [None]:
# Show the metadata row(s) of the title used in the examples above.
is_selected = anime['Title'] == 'Sen to Chihiro no Kamikakushi'
display(anime.loc[is_selected, ['Title', 'Rating', 'Producer', 'Studio']])

### (2) Collaborative filtering using k-Nearest Neighbors (kNN)

In [None]:
# Load the user ratings table (read with the 'latin' codec) and preview it.
rating = pd.read_csv(
    '../input/anime-recommendations-database/rating.csv',
    encoding='latin',
)

print('rating (shape):', rating.shape)

rating.head(5)

In [None]:
# Reminder of the anime table's current shape and columns before joining with ratings.
print('anime (shape):', anime.shape)

anime.head(2)

In [None]:
# Rename the rating columns by position.
# NOTE(review): assumes the file has exactly three columns in the order user, anime, rating - confirm.
rating.columns = ['User_id', 'Anime_id', 'Rating']

# Missing values per column in the anime table.
anime.isnull().sum()

In [None]:
# Treat a missing 'Rating' as 0; all other columns are left as they are.
anime = anime.fillna({'Rating': 0})

Remove anime that received only a few ratings and users who gave only a few ratings.

In [None]:
# Number of ratings each anime received, as a two-column frame
# (Anime_id, Rating_count).
anime_rating = (
    rating.groupby('Anime_id')['Rating']
    .count()
    .reset_index(name='Rating_count')
)
anime_rating

In [None]:
# Distribution of ratings-per-anime, used to choose the cut-off below.
anime_rating['Rating_count'].describe()

In [None]:
# Keep anime with more than 50 ratings.
final_anime = anime_rating[anime_rating['Rating_count']>50]
final_anime.shape

In [None]:
# Number of ratings each user gave, as a two-column frame
# (User_id, Rating_count).
user_rating = (
    rating.groupby('User_id')['Rating']
    .count()
    .reset_index(name='Rating_count')
)
user_rating

In [None]:
# Distribution of ratings-per-user, used to choose the cut-off below.
user_rating['Rating_count'].describe()

In [None]:
# Keep users who gave more than 80 ratings.
final_user = user_rating[user_rating['Rating_count']>80]
final_user.shape

Let's create a dataset containing only the popular anime and the users who gave many ratings.

In [None]:
# Ratings restricted to the retained anime ...
final_anime_dt = rating[rating['Anime_id'].isin(final_anime['Anime_id'])]
# ... and, of those, only the rows from the retained users.
final_dt = final_anime_dt[final_anime_dt['User_id'].isin(final_user['User_id'])]
final_dt.head()

### Construct Rating Matrix
We will construct a matrix with anime ids as the row index and user ids as the columns, and then convert the rating matrix to a CSR (compressed sparse row) matrix to save memory.

In [None]:
# Anime x user matrix of ratings. pivot_table aggregates repeated
# (anime, user) pairs with its default aggfunc (mean); pairs with no rating become 0.
rating_matrix = final_dt.pivot_table(index='Anime_id',columns='User_id',values='Rating').fillna(0)

print(rating_matrix.shape)
rating_matrix.head()

In [None]:
from scipy.sparse import csr_matrix
# Sparse copy of the rating matrix; row order matches rating_matrix.
csr_rating_matrix =  csr_matrix(rating_matrix.values)

print(csr_rating_matrix)

### Fit the matrix into k-Nearest Neighbors (kNN)
We will also use cosine similarity as the metric for the algorithm.

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour search over anime rows using cosine distance.
recommender = NearestNeighbors(metric='cosine')

recommender.fit(csr_rating_matrix)

In [None]:
# Row(s) of the anime table for the title we want recommendations for.
user_anime = anime[anime['Title']=='Sen to Chihiro no Kamikakushi']
user_anime

In [None]:
# Anime_id of the selected title. `user_anime` is a filtered frame, so take the
# first element explicitly: calling int() directly on a Series is deprecated
# in recent pandas.
user_anime_id = int(user_anime['Anime_id'].iloc[0])

# Row position of that anime inside rating_matrix.
# This index is from the rating matrix, not from the anime dataset!
# Index.get_loc raises a KeyError naming the id if the anime is not in the
# rating matrix (e.g. it was filtered out for having too few ratings).
user_anime_index = rating_matrix.index.get_loc(user_anime_id)

print('rating natrix index:', user_anime_index)

# getting the ratings based on the index
user_anime_ratings = rating_matrix.iloc[user_anime_index]
user_anime_ratings

In [None]:
# kneighbors expects a 2-D array of samples, so reshape the single rating
# vector into an array with exactly one row.

user_anime_ratings_reshaped = user_anime_ratings.values.reshape(1,-1)
user_anime_ratings_reshaped

In [None]:
# the ratings will be plotted and will return 11 indices and distances of nearest neighbors
distances, indices = recommender.kneighbors(user_anime_ratings_reshaped,n_neighbors=16)

print(distances)

In [None]:
print(indices)

In [None]:
# the returned indices will be used to get anime id(index) on rating matrix, 
# excluding the first element since the first nearest neighbor is itself

nearest_neighbors_indices = rating_matrix.iloc[indices[0]].index[1:]

In [None]:
# Attach the anime metadata to each neighbour id; a left join keeps every
# neighbour even when it has no matching row in `anime`.
nearest_neighbors = pd.DataFrame({'Anime_id': nearest_neighbors_indices})
nearest_neighbors.merge(anime, on='Anime_id', how='left')