In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import re
from collections import Counter, defaultdict

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
from wordcloud import WordCloud

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Restrict the load to the columns this notebook works with.
usecols = [
    'MAL_ID', 'Name', 'Japanese name', 'Score',
    'Genres', 'Type', 'Aired', 'Premiered',
]
anime = pd.read_csv(
    '../input/anime-recommendation-database-2020/anime.csv',
    usecols=usecols,
)

# Pre-processing

In [None]:
# (rows, columns) of the loaded frame.
anime.shape

In [None]:
# Preview the first rows of the loaded frame.
anime.head()

In [None]:
# Column dtypes and non-null counts.
anime.info()

In [None]:
# Frequency of each distinct 'Aired' value.
anime['Aired'].value_counts()

In [None]:
def extract_year(series):
    """Return the first four-digit number found in an ``Aired`` value as an int.

    Parameters
    ----------
    series : object
        A single cell of the ``Aired`` column (despite the name, a scalar,
        not a pandas Series). Normally a string; non-string cells such as
        NaN are tolerated.

    Returns
    -------
    int
        The first run of four digits in the text, or ``0`` when the value
        is not a string or contains no four-digit run (this covers the
        ``'Unknown'`` placeholder).
    """
    # Missing cells arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(series, str):
        return 0
    m = re.search(r'[0-9]{4}', series)
    # re.search returns None when nothing matches, so guard before .group().
    return int(m.group()) if m else 0
    
# Derive an integer Year column from the 'Aired' text; 'Unknown' air dates become Year 0.
anime['Year'] = anime['Aired'].map(extract_year).astype(int)

In [None]:
# Frequency of each distinct 'Premiered' value.
anime['Premiered'].value_counts()

In [None]:
# Frequency of each distinct 'Type' value.
anime['Type'].value_counts()

* Only anime whose `Type` is `TV` have `Premiered` information.

In [None]:
# Show the first three rows whose Type is 'Movie'.
anime[anime['Type'] == 'Movie'].head(3)

In [None]:
# NOTE(review): 17562 and 12817 are hard-coded — presumably the total row count and the
# number of 'Unknown' Premiered values; confirm and derive them from `anime` instead.
print(17562 - 12817)
# Count Premiered values that begin with "<Season> <4-digit year>"
# (str.match anchors only at the start of the string).
anime['Premiered'].str.match('(Spring|Summer|Fall|Winter) [0-9]{4}').sum()

In [None]:
# Split Premiered on spaces; the first piece is kept as the season.
# Values without a space keep their whole text as the "season".
premiered_parts = anime['Premiered'].str.split(pat=' ', expand=True)
anime['Season'] = premiered_parts[0]
anime['Season'].value_counts().reset_index()

In [None]:
# Frequency of each distinct 'Score' value.
anime['Score'].value_counts()

In [None]:
# Swap the 'Unknown' placeholder for 0, then make the column numeric.
score_with_placeholder_zeroed = anime['Score'].replace('Unknown', 0)
anime['Score'] = score_with_placeholder_zeroed.astype(float)

In [None]:
def split_labels(series):
    """Split a comma-separated label string into a list of clean labels.

    Parameters
    ----------
    series : str or list
        A single cell of the ``Genres`` column (despite the name, one value,
        not a pandas Series). A list or tuple is accepted as well, so applying
        the function again to an already-converted column does not fail.

    Returns
    -------
    list of str
        Labels with surrounding whitespace removed. Empty labels and the
        ``"Unknown"`` placeholder are dropped. Input that is neither a string
        nor a list/tuple (e.g. NaN) yields an empty list.
    """
    if isinstance(series, str):
        labels = series.split(",")
    elif isinstance(series, (list, tuple)):
        labels = series
    else:
        return []
    # Strip before comparing: splitting on "," alone leaves " Unknown" and
    # " Adventure" with a leading space, so the placeholder check would miss.
    cleaned = (label.strip() for label in labels)
    return [label for label in cleaned if label and label != "Unknown"]

# Replace each comma-separated Genres string with a list of labels.
anime["Genres"] = anime["Genres"].map(split_labels)

# EDA

## How many anime aired in each year

In [None]:
# Count MAL_ID within each Year group, then turn the Year index back into a column.
titles_per_year = anime.groupby('Year')['MAL_ID'].count()
anime_year = titles_per_year.reset_index()
anime_year.head()

In [None]:
# Year 0 is the placeholder for titles without a usable air date, so leave it out of the trend.
# Filtering on the value (instead of dropping row label 0) stays correct even when
# no placeholder row exists and label 0 would be the earliest real year.
tmp = anime_year[anime_year['Year'] != 0].sort_values('Year')

plt.plot(tmp['Year'], tmp['MAL_ID'])
plt.xlim(1910, 2020)
plt.xlabel('Year')
plt.ylabel('Number of anime')
plt.title('Anime per year')
plt.show()

## Which years the top 100 anime come from

In [None]:
# Rank every title by Score (highest first) and keep the first 100 rows.
ranked_by_score = anime.sort_values('Score', ascending=False)
anime_top100 = ranked_by_score.head(100)
anime_top100.head(3)

In [None]:
# Spot-check: the top-100 titles whose Year is 2011.
is_2011 = anime_top100['Year'] == 2011
anime_top100.loc[is_2011]

In [None]:
# Number of top-100 titles per Year, then the five years with the most entries.
top100_per_year = anime_top100.groupby('Year')['MAL_ID'].count()
anime_top100_year = top100_per_year.reset_index()
anime_top100_year.sort_values('MAL_ID', ascending=False).head(5)

In [None]:
# Major ticks: every 5 units on the Year axis, every 1 unit on the count axis.
x_major_locator = MultipleLocator(5)
y_major_locator = MultipleLocator(1)

fig, ax = plt.subplots()
ax.plot(anime_top100_year['Year'], anime_top100_year['MAL_ID'])
ax.xaxis.set_major_locator(x_major_locator)
ax.yaxis.set_major_locator(y_major_locator)
ax.set_xlim(1980, 2020)
ax.grid(color='r', linestyle='dotted', linewidth=1)
plt.show()

# Genres of the top 100 anime

In [None]:
# Count how often each genre label occurs across every row of `anime`.
# strip() guards against whitespace left around labels by the comma split.
# NOTE(review): this uses the full `anime` frame, not `anime_top100` —
# confirm that is the intended population.
all_genres = Counter(
    genre.strip()
    for genres in anime['Genres']
    for genre in genres
)

# Word size in the cloud follows the genre's frequency.
genres_cloud = WordCloud(width=1200, height=800, background_color='white', colormap='gnuplot').generate_from_frequencies(all_genres)

plt.imshow(genres_cloud, interpolation='bilinear')
plt.axis('off')
# Explicit show() so the cell does not echo the return value of plt.axis('off').
plt.show()