In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.mode.chained_assignment = None  # default='warn'

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the anime dataset and preview it: the first rows, then summary statistics.
df = pd.read_csv("../input/anime-dataset/anime.csv", engine='python')
display(df.head())
# Rich display renders the summary as a table, like the head() preview;
# print() would fall back to a plain-text dump.
display(df.describe())

I'll be using IMDB's weighted rating (wr):

![](https://image.ibb.co/jYWZp9/wr.png)

* v is the number of votes for the movie;
* m is the minimum votes required to be listed in the chart;
* R is the average rating of the movie; And
* C is the mean vote across the whole report

This is a good way to avoid over-weighting anime that were released recently and have a high rating but only a few votes.
   

In [None]:
# C: mean rating over every anime in the dataset (the "C" of the weighted rating).
# m: minimum number of votes required, taken as the 85th percentile of 'votes'.
C = df['rating'].mean()
m = df['votes'].quantile(0.85)
# '.2f' gives two decimal places (a bare '.2' would mean two significant digits).
print('Mean rating {:.2f}, quantity of votes needed to stay {:.0f}'.format(C, m))


In [None]:
# Keep only the anime with at least m votes. The explicit copy makes df2 an
# independent frame, so adding columns to it never writes into (or warns about)
# a slice of df.
df2 = df.loc[df['votes'] >= m].copy()
# Compare the sizes before and after the filter.
print(df.shape)
print(df2.shape)

Make a function to obtain the score of each anime

In [None]:
def weight_rating(x, m=m, C=C):
    """Weighted rating of one anime.

    x is a row providing 'votes' (v) and 'rating' (R); m is the minimum number
    of votes and C the mean rating. Returns v/(v+m) * R + m/(v+m) * C.
    """
    votes, rating = x['votes'], x['rating']
    total = votes + m

    return (votes / total * rating) + (m / total * C)

In [None]:
# Compute the weighted rating row by row (axis=1 hands each row to weight_rating)
# and store it in a new 'score' column.
df2['score'] = df2.apply(weight_rating, axis=1)

Based on this type of rating, the number of anime decreased a lot, but the main idea is to keep the rating as fair as possible so that we (first) recommend the best anime.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter


# Ten best anime by weighted score, highest first.
best_score = df2.sort_values(by=['score'], ascending=False).head(10)

# One colour per bar: green for the top-ranked bar, blue for the other nine.
color_map = ['#5DF13C'] + ['#3CB7F1'] * 9

plt.figure(figsize=(12, 3), dpi=100)
g = plt.bar(best_score["title"], best_score['score'], color=color_map)
plt.ylabel("Score", color='green')
plt.xticks(rotation=45, horizontalalignment='right')
plt.title('Really good animes', fontweight='bold', fontsize=15);


Just to check, let's do the same but with all the ratings in the original data.

In [None]:
# Top ten anime by raw rating, with no minimum-votes filter applied.
best_rating_not_filter = df.sort_values(by=['rating'], ascending=False)[:10]


plt.figure(figsize=(12, 3), dpi=100)
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form fails there.
g = sns.barplot(x=best_rating_not_filter["title"], y=best_rating_not_filter['rating'], palette="Oranges_r")
plt.ylabel("Rating", color='orange', fontweight='bold')
plt.xlabel("")
# Rotate the title labels so the long names do not overlap.
g.set_xticklabels(g.get_xticklabels(), rotation=45,  horizontalalignment='right')
plt.title('Really good recent rated animes', fontweight='bold', fontsize=15);

**So, if a friend tells you "*hey, I wanna watch an anime, any recommendations?*" you could say, yes:**

In [None]:
# Recommendation table: one row per top-scored anime, indexed by its title.
best_scores = best_score.set_index('title')[['score', 'watched', 'studios']]
display(best_scores)

But maybe your friend hasn't watched much anime and doesn't want to look at your list, so he could just end up starting an anime and dropping it. Let's plot the most dropped anime so that (if possible) he can avoid them at the beginning of his anime career.

In [None]:
# Rank by how many users dropped each anime, ties broken by score; both descending.
dropped = df2.sort_values(by=['dropped', 'score'], ascending=False)
top_dropped = dropped.head(5)

# Five pink bars, with the third one highlighted in green.
color_map = ['#f59dd0'] * 5
color_map[2] = '#5DF13C'

plt.figure(figsize=(12, 3), dpi=100)
plt.barh(top_dropped['title'], top_dropped['score'], align='center', color=color_map)
plt.ylabel('Animes')
plt.xlabel('Scores')
plt.title('Most dropped animes');



In [None]:
# Show the first ten rows of `dropped` with score, drop count and studios, indexed by title.
display(dropped[['score','title','dropped', 'studios']].set_index('title').head(10))

# These are very good anime, but they are also dropped very frequently.

# Recommendation System

Now you need a recommendation system for people who have already watched anime. First we are going to use the description of each anime to build a matrix of Term Frequency-Inverse Document Frequency (TF-IDF) values.

In [None]:
# Use all the data (df, not the vote-filtered df2) and preview the descriptions.

df['description'].head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing descriptions become empty strings so every row can be vectorised.
df['description'] = df['description'].fillna('')

# Fit the vectoriser (English stop words ignored) and turn the descriptions into
# a Term Frequency-Inverse Document Frequency (TF-IDF) matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['description'])

# (number of anime, number of distinct terms)
tfidf_matrix.shape



31.384 words were used to describe the 14.578 animes. Next, compute the [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity)


In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pass tfidf_matrix into a linear kernel (pairwise dot products of its rows) to get
# the similarity matrix. This equals the cosine similarity provided the rows are
# L2-normalised -- presumably TfidfVectorizer's default; confirm.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [None]:
# Create a lookup from anime title to its row label in df (df.index).
indice = pd.Series(df.index, index=df['title'])
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it would never remove a repeated title. De-duplicate on
# the index instead, keeping the first occurrence, so that indice[title]
# always yields a single label rather than a Series.
indice = indice[~indice.index.duplicated(keep='first')]

display(indice.head())
print("A Silent Voice is in index: ", indice['A Silent Voice'])

In [None]:
#define a function to pass the anime and return the recommendations

def recommendation(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the anime most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up in the global `indice` Series.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix. Row i is assumed to line up with row i of
        the global `df` (TODO confirm whenever `df` is filtered or re-indexed).
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` values of df['title'], most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in `indice`.
    """
    idx = indice[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence so `idx` is always a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every row with its similarity to the requested anime, leaving the
    # anime itself out explicitly: rows tied with it (e.g. identical text) can
    # sort ahead of it, so dropping "whatever ranks first" is not reliable.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    # Most similar first; the sort is stable, so ties keep their row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the best matches, used to fetch their titles from df.
    anime_index = [i for i, _ in sim_scores[:top_n]]
    return df['title'].iloc[anime_index]
    
    

In [None]:
# Recommendations from the default (description-based) similarity matrix.
recommendation('Attack on Titan 3rd Season: Part II')

In [None]:
# Same lookup for another title, again using the description-based similarity.
recommendation('One-Punch Man')

Good, but this system matches similar words and structure in the description. That's why we get all the anime from the same franchise first, and also why we get anime from the Saitama Prefecture in the case of One-Punch Man. It's not a bad idea to watch everything a franchise has to offer, but we want something more sophisticated: similar anime based on the studio, the content warnings and the tags.

# Recommendation System Part.2

In [None]:
# Keep a copy of the 'studios' column as it is right now; it is used later to explore the data.
df['copy_studios'] = df['studios']

# Columns that feed the second, metadata-based recommender.
features = ['studios','contentWarn', 'tags']

# Count the missing values in each feature column.
print(df[features].isna().sum())

In [None]:
from ast import literal_eval


def _parse_list_literal(value):
    """Turn the text form of a Python list (e.g. "['a', 'b']") into a real list.

    Anything that is not a string -- a list produced by an earlier run of this
    cell, or a missing value -- is returned untouched, so the cell can be
    re-run without literal_eval raising on already-parsed data.
    """
    return literal_eval(value) if isinstance(value, str) else value


# Parse every feature column from its string form into Python objects.
for feature in features:
    df[feature] = df[feature].apply(_parse_list_literal)

In [None]:
#make a function to prepare the data
def clean_data(x):
    """Normalise a feature value by removing spaces and lower-casing it.

    A list is cleaned element by element, a string is cleaned directly, and
    any other value becomes an empty string.
    """
    def _squash(text):
        # "Studio Ghibli" -> "studioghibli"
        return text.replace(" ", "").lower()

    if isinstance(x, list):
        return list(map(_squash, x))
    if isinstance(x, str):
        return _squash(x)
    return ""

In [None]:
# Apply clean_data to every value of each feature column.
for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [None]:
#create a function to put all the words in one 'soup'
def soup(x):
    """Join the studios, content warnings and tags of one row into a single
    space-separated string."""
    parts = (" ".join(x[column]) for column in ('studios', 'contentWarn', 'tags'))
    return " ".join(parts)

# Build the soup for every row (axis=1 hands each row to soup).
df['soup'] = df.apply(soup, axis=1)
    

In [None]:
# Inspect one example soup: the entry labelled 10.
print(df['soup'][10])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorise the soup with CountVectorizer, skipping English stop words.
count = CountVectorizer(stop_words='english')

# Fit on the soup column and transform it into the count matrix.
count_matrix = count.fit_transform(df['soup'])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix from count_matrix.
# Unlike TF-IDF, the count matrix does not down-weight tags that appear often,
# which is the better choice in this case.


cosine_2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
#create an index Series to pass the anime
# drop=True discards the old index instead of inserting it as a new column.
# Without it every run of this cell adds another column to df, so the cell is
# not idempotent.
df = df.reset_index(drop=True)
indice_2 = pd.Series(df.index, index=df['title'])
# NOTE(review): recommendation() looks titles up in `indice`; nothing visible
# in this notebook reads indice_2.

In [None]:
# Get the recommendations again, this time from cosine_2
# (the similarity built from studios, content warnings and tags).
recommendation('One-Punch Man', cosine_2)

In [None]:
# Metadata-based recommendations (cosine_2) for another title.
recommendation('Paprika', cosine_2)

In [None]:
# Columns worth comparing between Paprika and other anime.
info_columns = ['title', 'mediaType', 'eps', 'duration', 'studios', 'tags', 'contentWarn', 'rating']

display(df.loc[df['title'] == 'Paprika', info_columns])
# Hard-coded row positions -- presumably picked from the recommendation output above; confirm.
display(df[info_columns].iloc[[5453, 6113, 3394, 3877]])

# **Pretty good anime to watch if you like Paprika. Now we can extract some info from the data.**

In [None]:
# Get rid of the anime that haven't been released yet (start year after 2020).
# Rows with a missing startYr fail the comparison and are dropped as well.
# The explicit copy makes df an independent frame, so columns added to it
# later are not written into a slice of the previous frame.
df = df.loc[df['startYr'] <= 2020].copy()

In [None]:
# Decade edges 1910, 1920, ..., 2020 and one "start-end" label per interval.
bins = list(range(1910, 2021, 10))
labels = ["{}-{}".format(start, start + 10) for start in range(1910, 2020, 10)]

# Assign each anime to a decade according to its start year.
df['decade_of_released'] = pd.cut(df['startYr'], bins=bins, labels=labels)


In [None]:
# Number of TV anime (non-null titles) in each decade.
tv_data = (
    df.loc[df['mediaType'] == 'TV']
    .groupby('decade_of_released')['title']
    .count()
)


In [None]:

# Plot the number of anime per decade, split by media type.

plt.figure(figsize=(12, 4), dpi=120)
sns.set_style("ticks")

splot = sns.countplot(x='decade_of_released', hue='mediaType', data=df)

# Label the bars whose height matches one of the per-decade TV counts.
# NOTE(review): the match is by height only, so a bar of another media type
# that happens to have the same count gets a label too.
for p in splot.patches:
    if p.get_height() in tv_data.values:
        splot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), 
                       ha = 'right', va = 'center', xytext = (0, 4), textcoords = 'offset points', color='blue')
        
plt.ylabel("Amount", color='orange', fontweight='bold')
plt.xlabel("Decade of Released", color='orange', fontweight='bold', labelpad=15)
plt.legend(loc='upper left', facecolor='#42d3ff', framealpha=1)
sns.despine()
plt.show();

In [None]:
# Distribution of ratings within each decade, drawn as a violin plot.
plt.figure(figsize=(12, 4), dpi=120)
scatter = sns.violinplot(data=df, x='decade_of_released', y='rating')

plt.ylabel("Rating", color='orange', fontweight='bold')
plt.xlabel("Decade of Released", color='orange', fontweight='bold', labelpad=15)
plt.title('Decade and ratings', fontweight='bold');

In [None]:
# Distribution of ratings for each media type, drawn as a violin plot.
plt.figure(figsize=(12, 4), dpi=120)
scatter = sns.violinplot(data=df, x='mediaType', y='rating')
plt.ylabel("Rating", color='orange', fontweight='bold')
plt.xlabel("Media Type", color='orange', fontweight='bold', labelpad=15)
plt.title('Media Type', fontweight='bold');

In [None]:
# White violins show the rating distribution per decade; the strip plot drawn
# afterwards on the same figure adds the individual anime, coloured by media type.
plt.figure(figsize=(18, 4), dpi=120)
sns.violinplot(data=df, x='decade_of_released', y='rating', color="white")
# dodge=True separates the media types side by side; the low alpha keeps
# overlapping points readable.
sns.stripplot(data=df, x='decade_of_released', y='rating', hue='mediaType', jitter=True,
                   dodge=True, 
                   marker='o', 
                   alpha=0.2)



plt.ylabel("Rating", color='orange', fontweight='bold')
plt.xlabel("Decade of Released", color='orange', fontweight='bold', labelpad=15)
plt.title('Decade, Ratings and Media Type', fontweight='bold');

It's been a long way from 1910 to 2020, starting with just a few movies and ending up with a really good amount of material to watch. Take a look at how Web went from just 2 little dots in 1990-2000 to a large, thick line in 2000-2010, and it hasn't "eaten" the other types, because there are still Movies, TV Specials, OVAs, and DVDs.

# Data Analysis
Now, let's take a look at the studios.

In [None]:
# Fill the missing ratings with zeros.
# NOTE(review): any statistic computed on 'rating' after this point treats
# unrated titles as rated 0 -- confirm that this is intended.
df['rating'] = df['rating'].fillna(0)


In [None]:
#The Copy Studios is not a list, but a string, so we need to clean that.
# Remove the list punctuation as literal characters; regex=False keeps '[' and
# ']' from being read as regular-expression syntax whatever the pandas default is.
for token in ('[', ']', "'"):
    df['copy_studios'] = df['copy_studios'].str.replace(token, '', regex=False)

# Split on commas and strip each name. The text form of a list usually
# separates items with ", ", so without the strip a studio that is not listed
# first would be stored as " Name" and counted separately from "Name".
# Values that did not split into a list (e.g. missing values) are left as they are.
df['copy_studios'] = df['copy_studios'].str.split(",").map(
    lambda names: [name.strip() for name in names] if isinstance(names, list) else names
)


In [None]:
# Per-studio tally: the number of productions and the rating of each one.
cnt = {}

for _, row in df.iterrows():
    for studio in row['copy_studios']:
        # The first sighting of a studio creates its record; every sighting
        # (the first included) adds one production and one rating.
        stats = cnt.setdefault(studio, {'productions': 0, 'rating': []})
        stats['productions'] += 1
        stats['rating'].append(float(row['rating']))
            



In [None]:
import numpy as np
# Replace each studio's list of ratings with its mean, rounded to two decimals.
for stats in cnt.values():
    stats['rating'] = round(np.mean(stats['rating']), 2)
    

In [None]:
# Turn the dict into a data frame: orient='index' makes one row per key of cnt
# (a studio), with its inner keys ('productions', 'rating') as columns.
studios = pd.DataFrame.from_dict(cnt, orient='index')


In [None]:
# Let's see the most prolific studios: top 20 by number of productions,
# ties broken by mean rating (both descending).
# The explicit copy makes the result an independent frame, so adding a column
# to it later does not write into a slice of the sorted frame.
more_productive_st = studios.sort_values(by=['productions', 'rating'], ascending = [False, False])[:20].copy()
more_productive_st

In [None]:
# Plot the results.

# Expose the studio names (the index) as a regular column so they can be used as x.
more_productive_st['studios'] = more_productive_st.index

sns.set_style(style="whitegrid")
plt.figure(figsize=(12, 6), dpi=100)


# Point size encodes the mean rating. [1:] skips the first (most productive) row.
# NOTE(review): presumably that row is the empty studio name produced by anime
# with no studio listed -- confirm before relying on the slice.
gx2 = sns.scatterplot(x='studios', y="productions",data= more_productive_st[1:], size='rating', sizes=(20, 200), color="skyblue")


plt.xticks(rotation=45, horizontalalignment='right')
plt.xlabel('Studios', fontweight='bold', labelpad=10, color='green')
plt.ylabel('Productions 1917-2020', fontweight='bold', labelpad=5, color='green')
plt.title('Studios, Productions and Ratings', fontweight='bold', color='darkblue', fontsize=13)


plt.show()

**That's all. If you have any recommendations to improve the notebook, please leave a comment, and
*Thanks!!***