In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Global matplotlib settings: request the SimHei sans-serif font and draw
# minus signs with the plain hyphen rather than the Unicode minus glyph.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [None]:
# Read the anime catalogue and the user ratings, then preview the first rows
# of the catalogue.
anime, rating = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/anime-recommendations-database/anime.csv",
        "../input/anime-recommendations-database/rating.csv",
    )
)
anime.head()

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
anime.info()

genre, type and rating all have missing values.

`anime_id` is only an identifier and has no numerical meaning, so its type is changed to `str`.

In [None]:
# Store the identifier as text so it is not treated as a numeric column,
# then show the resulting dtype.
anime = anime.astype({'anime_id': 'str'})
anime.dtypes['anime_id']

Because `episodes` holds numeric values, its type is changed to `int`. Rows whose episode count is 'Unknown' are removed first.

In [None]:
# Rows whose episode count is the placeholder string 'Unknown'.
anime.loc[anime['episodes'].eq('Unknown')]

In [None]:
# Drop rows whose episode count is the placeholder 'Unknown', then cast the
# column to int and show the resulting dtype.
# .copy() makes the filtered frame an independent object, so the column
# assignment below is not an assignment on a slice of the original frame
# (which raises SettingWithCopyWarning).
anime=anime[anime['episodes']!='Unknown'].copy()
anime['episodes']=anime['episodes'].astype('int')
anime['episodes'].dtype

# Descriptive analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
anime.describe()

In [None]:
# Number of anime for each value of the `type` column.
anime['type'].value_counts()

In [None]:
# Pie chart of the share of anime in each type.
type_counts = anime['type'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Anime distribution by type', fontsize=20)
_, label_texts, pct_texts = ax.pie(
    type_counts.values,
    labels=type_counts.index,
    autopct='%.1f%%',
    startangle=-20,
)
# Percentages are drawn inside the wedges: enlarge them and make them white.
plt.setp(pct_texts, size=15, color='w')
# Category labels sit outside the wedges: only enlarge them.
plt.setp(label_texts, size=15)
fig.legend()
plt.show()

TV, OVA, and Movie account for a large share of the anime.

In [None]:
# Bar chart of the mean rating for each anime type, with the value printed
# just above each bar.
mean_rating_by_type = anime.groupby('type')['rating'].mean()

fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Average rating of type', fontsize=20)
ax.bar(mean_rating_by_type.index, mean_rating_by_type.values, color='g', alpha=0.5)
for type_name, mean_rating in zip(mean_rating_by_type.index, mean_rating_by_type.values):
    ax.text(type_name, mean_rating + 0.1, '%.2f' % mean_rating,
            ha='center', va='bottom', fontsize=12)
ax.tick_params(labelsize=14)
plt.show()

TV has the highest score, with an average score close to 7.

In [None]:
def count_genre(f):
    """Return the number of genres listed for each row of ``f``.

    Parameters
    ----------
    f : pandas.DataFrame
        Frame with a ``genre`` column whose entries are comma-separated genre
        names; missing entries are allowed. The frame is not modified.

    Returns
    -------
    list of int
        One entry per row, in row order: the number of comma-separated genres,
        or ``-1`` when the genre is missing.
    """
    # Missing values are detected with pd.isna rather than by casting the
    # column to str: the cast turns NaN into the string 'nan', which never
    # equals 'NaN', so missing genres used to be counted as a single genre.
    # Not writing the cast back into ``f`` also keeps NaN intact for later
    # steps. The literal 'NaN' is still treated as missing, as before.
    return [
        -1 if (pd.isna(genre) or genre == 'NaN') else len(str(genre).split(','))
        for genre in f['genre']
    ]

# Attach the per-row genre count as a new column and preview the result.
anime = anime.assign(count_genre=count_genre(anime))
anime.head()

In [None]:
# Bar chart: how many anime have each number of genres, with the count
# printed above each bar.
genre_count_freq = anime['count_genre'].value_counts()

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(genre_count_freq.index, genre_count_freq.values, color='g', alpha=0.5)
for n_genres, n_anime in zip(genre_count_freq.index, genre_count_freq.values):
    ax.text(n_genres, n_anime + 0.1, n_anime, ha='center', va='bottom', fontsize=12)
ax.set_title('Anime distribution by genre', fontsize=20)
ax.tick_params(labelsize=14)
plt.show()

There are more animations of a single genre.

In [None]:
# Line plot of the mean rating for each number of genres, with the value
# printed just below each point.
mean_rating_by_count = anime.groupby('count_genre')['rating'].mean()

fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(mean_rating_by_count.index, mean_rating_by_count.values, 'r')
ax.tick_params(labelsize=14)
ax.set_title('Average rating by genre', fontsize=20)
for n_genres, mean_rating in zip(mean_rating_by_count.index, mean_rating_by_count.values):
    ax.text(n_genres, mean_rating - 0.1, '%.2f' % mean_rating,
            ha='center', va='bottom', fontsize=12)
plt.show()

As the number of genres increases, the average rating tends to be higher.

In [None]:
import collections

In [None]:
# Count how many anime carry each genre. `genre` is a comma-separated list.
# Each name is stripped of surrounding whitespace so that a list written with
# a space after the comma (e.g. "Action, Comedy") does not produce " Comedy"
# and "Comedy" as two different genres.
genre_count=collections.defaultdict(int)
for genres in anime['genre']:
    if not isinstance(genres,str):
        # Missing genre (NaN): nothing to count.
        continue
    # dict.fromkeys de-duplicates while keeping first-seen order, so a genre
    # listed twice for one anime is counted once.
    for genre1 in dict.fromkeys(name.strip() for name in genres.split(',')):
        if genre1:  # skip empty tokens left by stray commas
            genre_count[genre1]+=1
# Show (genre, count) pairs, most frequent first.
sorted(genre_count.items(),key=lambda x:x[1], reverse=True)

In [None]:
# Two-column frame (genre name, number of anime) limited to the 10 most
# frequent genres.
df = (
    pd.DataFrame({
        'genre1': list(genre_count.keys()),
        'count1': list(genre_count.values()),
    })
    .sort_values(by='count1', ascending=False)
    .head(10)
)
df

In [None]:
# Horizontal bar chart of the 10 most frequent genres, with the count printed
# at the end of each bar.
fig, ax = plt.subplots(figsize=(14, 10))
ax.barh(df['genre1'], df['count1'], color='g', alpha=0.5)
ax.set_title('TOP10 of Anime by genre', fontsize=20)
ax.tick_params(labelsize=14)
for genre_name, n_anime in zip(df['genre1'], df['count1']):
    ax.text(n_anime + 0.1, genre_name, n_anime, fontsize=12)
plt.show()

The most common genres are Action and Comedy.

In [None]:
# Scatter plot of rating against member count; the x axis is logarithmic.
fig, ax = plt.subplots(figsize=(14, 8))
ax.scatter(anime['members'], anime['rating'], color='c')
ax.set_xscale('log')
ax.set_title('rating VS members', fontsize=20)
ax.tick_params(labelsize=14)
ax.set_xlabel('members', fontsize=14)
ax.set_ylabel('rating', fontsize=14)
plt.show()

As the number of members of an anime increases, its rating tends to increase.

In [None]:
%pylab inline

In [None]:
# Pairwise scatter plots of episodes, members and rating, with a kernel
# density estimate on the diagonal.
data = anime.loc[:, ['episodes', 'members', 'rating']]
result = pd.plotting.scatter_matrix(
    data,
    figsize=(15, 15),
    alpha=0.3,
    color='k',
    diagonal='kde',
)

In [None]:
import seaborn as sns

# Heatmap of the absolute pairwise correlations between the columns of `data`
# (the sign of each correlation is discarded).
corr = data.corr().abs()

fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(
    corr,
    vmin=0,
    vmax=1,
    annot=True,
    annot_kws={'size': 13, 'weight': 'bold'},
    linewidths=0.05,
    ax=ax,
)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

Anime ratings roughly follow a normal distribution, and the number of members is somewhat correlated with the rating.

# Anime recommendation system

Calculate the weight coefficient

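Calculate the weighted score using the third quartile of `members` as the threshold $m$: $\text{score} = \frac{v}{v+m}R + \frac{m}{v+m}C$, where $v$ is the anime's member count, $R$ is its rating, and $C$ is the mean rating over all anime.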

In [None]:
# Weighted score: blend each anime's own rating with the overall mean rating.
# The own rating gets weight v/(v+m) and the mean gets m/(v+m), where v is
# the anime's member count and m is the 75th percentile of member counts.
members_q3 = anime['members'].quantile(0.75)
overall_mean_rating = anime['rating'].mean()
members_plus_q3 = anime['members'] + members_q3
anime['score'] = (
    anime['members'] / members_plus_q3 * anime['rating']
    + members_q3 / members_plus_q3 * overall_mean_rating
).round(2)
anime.head(10)

In [None]:
# The 20 anime with the highest weighted score.
anime.sort_values('score', ascending=False).head(20)

This is the list of top 20 anime based on weighted rating calculation.