### Snippet of the code used for the following presentation:
https://github.com/pmandrik/AnimeRecommendationSystem2020/raw/78ec99295bc1fdc87f9e1fb9bff487ce630f84b2/pmandrik_anime_I.pdf

### The full version of the code is (or will be) available at:
https://github.com/pmandrik/AnimeRecommendationSystem2020

In [None]:
import pandas
from matplotlib import pyplot as plt
from matplotlib import colors
from collections import defaultdict
import numpy as np

###################################### Load "anime.csv" and list its columns
# Every column is read as a string (dtype=str), so numeric fields have to be
# converted explicitly wherever they are used.
anime_csv_path = "../input/anime-recommendation-database-2020/anime.csv"
all_data = pandas.read_csv(anime_csv_path, dtype=str)
print("anime.csv feathes = ", all_data.columns.tolist())

In [None]:
################### get genres
# Number of anime carrying each genre tag; the 'Genders' column holds a
# comma-separated list of tags per anime.
Genders = defaultdict(int)
for genre_field in all_data['Genders']:
  for genre in genre_field.split(','):
    Genders[genre.strip()] += 1

print("anime genres = ", list(Genders))

################### get episodes
# Number of anime per episode count; "Unknown" counts are booked under key 0.
Episodes = defaultdict(int)
for episode_field in all_data['Episodes']:
  n_episodes = 0 if episode_field == "Unknown" else int(episode_field.strip())
  Episodes[n_episodes] += 1

print("anime Episodes = ", list(Episodes))
#for val in sorted(dict(Episodes).keys()):  print(val, Episodes[val] )

################### get durations
# Number of anime per raw 'Duration' string (the strings are used as they are).
Durations = defaultdict(int)
for duration in all_data['Duration']:
  Durations[duration] += 1

print("anime Durations = ", list(Durations))

################### get Types
# Number of anime per (whitespace-stripped) 'Type' value.
Types = defaultdict(int)
for type_field in all_data['Type']:
  Types[type_field.strip()] += 1
print("anime types = ", list(Types))

In [None]:
################### create release date
# Derive 'Year' and 'Month' columns from the whitespace-separated tokens of the
# free-text 'Aired' column:
#   * year  = first token made of exactly four digits
#   * month = first three characters of the first non-numeric token that has at
#             least three characters, starts with an upper-case letter and is
#             not the literal 'Unknown'
# Either value falls back to 'Unknown' when no token qualifies.
years  = []
months = []
for aired in all_data['Aired']:
  tokens = aired.split()

  year = next(
    (tok for tok in tokens if tok.isdigit() and len(tok) == 4),
    'Unknown',
  )
  month = next(
    (tok[:3] for tok in tokens
     if not tok.isdigit() and len(tok) >= 3 and tok[0].isupper() and tok != 'Unknown'),
    'Unknown',
  )

  years.append(year)
  months.append(month)

all_data['Year'] = years
all_data['Month'] = months

In [None]:
###################################### Plots
# One horizontal box plot per categorical feature: the distribution of the
# MyAnimeList score for every category of that feature, ordered by mean score.
#
# Side effect relied upon by the cells below: all_data[f] is converted from a
# ', '-separated string into a list of categories.
score_var  = 'Score'
for f in ['Genders', 'Type', 'Source', 'Rating'] :
  # Split only values that are still strings, so that re-running this cell keeps
  # the already-split lists intact (a second `.str.split` on list values would
  # replace them with missing values and break every later loop over them).
  all_data[f] = all_data[f].apply(
    lambda value: value.split(', ') if isinstance(value, str) else value
  )

  # category -> list of scores of all anime tagged with that category
  datas = defaultdict(list)
  for score, categories in zip( all_data[score_var], all_data[f] ):
    if score == "Unknown":
      continue
    # `category` instead of `type`: a module-level loop variable named `type`
    # would shadow the builtin for the rest of the notebook.
    for category in categories:
      datas[ category ].append( float(score) )

  # ascending mean score; every list holds at least one score, so len() > 0
  sorted_datas = sorted(datas.items(), key=lambda item : sum(item[1])/len(item[1]) )

  fig, ax = plt.subplots()
  ax.set_title( "Genres" if f == "Genders" else f )
  if f == "Genders" :
    # many genres -> double the figure height so the tick labels stay readable
    fig.set_figheight( 2*fig.get_figheight() )
  ax.boxplot(
    [scores for _, scores in sorted_datas],
    vert=False,
    flierprops=dict(markerfacecolor='g', marker='D'),
  )
  ax.set_yticklabels( [category for category, _ in sorted_datas] )
  ax.set_xlabel('MyAnimelist Score')
  plt.show()

In [None]:
################### Stackplots per Year
# For every categorical feature, draw a stack plot of the fraction of anime per
# category as a function of the release year. Only anime with a known score and
# a known release year are counted; years outside `plot_years` are ignored.
#
# Expects all_data[f] to hold lists of categories, and all_data['Year'] to hold
# year strings or "Unknown".
plot_years = list(range(1980, 2021))   # x axis of every stack plot
max_shown  = 18                        # categories drawn individually; the rest is merged into "..."
stack_titles = {
  "Genders" :    "Anime Genre",
  "Rating" :     "Anime Age rating",
  # NOTE(review): 'Year_class' and 'Month' are not part of the feature list
  # below; the titles are kept from the original code.
  'Year_class' : "Anime Release year",
  'Month' :      "Anime Release month",
}

for f in ['Genders', 'Type', 'Source', 'Rating'] :
  # counts[category][year] = number of counted anime of that category and year
  counts = defaultdict(lambda: defaultdict(int))
  for score, categories, year in zip( all_data[score_var], all_data[f], all_data["Year"] ):
    if year == "Unknown" or score == "Unknown":
      continue
    for category in categories :
      counts[ category ][ year ] += 1

  # category -> counts aligned with plot_years (insertion order, hence
  # reproducible from run to run, unlike iteration over a set of strings)
  x_data = {
    category : [ per_year.get(str(y), 0) for y in plot_years ]
    for category, per_year in counts.items()
  }
  if not x_data :
    continue   # nothing to draw for this feature

  # most frequent categories first
  sorted_datas = sorted(x_data.items(), key=lambda item : -sum(item[1]) )

  # Normalise every year to 1. A year without any counted anime keeps all of
  # its fractions at 0 instead of raising ZeroDivisionError.
  year_totals = [ sum(column) for column in zip(*(values for _, values in sorted_datas)) ]
  sorted_datas = [
    ( category, [ n / total if total else 0.0 for n, total in zip(values, year_totals) ] )
    for category, values in sorted_datas
  ]

  # Merge everything beyond the `max_shown` most frequent categories.
  if len(sorted_datas) > max_shown:
    others = [ sum(column) for column in zip(*(values for _, values in sorted_datas[max_shown:])) ]
    sorted_datas = sorted_datas[:max_shown] + [ ("...", others) ]

  fig, ax = plt.subplots()
  ax.stackplot(
    plot_years,
    [ values for _, values in sorted_datas ],
    labels=[ category for category, _ in sorted_datas ],
  )
  ax.set_xlabel('Year')
  # legend outside of the axes, on the right-hand side
  plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
  fig.subplots_adjust(right=0.5)
  fig.set_figwidth( 2*fig.get_figwidth() )

  ax.set_title( stack_titles.get(f, "Anime " + f) )
  ax.set_ylabel('Fraction of Anime')

In [None]:
  ################### check number of 'Episodes'
  # Sort every anime into one episode-count region and flag it in a dedicated
  # 0/1 column 'Episodes_<upper bound>'. Then draw: a histogram of the episode
  # counts, a box plot of the score per region and a pie chart of the number
  # of scored anime per region.
  N_episodes_regions = [1, 5, 10, 20, 30, 45, 70, 100, 9000]   # inclusive upper bounds
  for upper in N_episodes_regions:
    all_data['Episodes_' + str(upper)] = 0

  episodes = []
  for row in all_data.index:
    raw_count = all_data.at[row, 'Episodes']
    if raw_count == "Unknown":
      continue   # stays in no region and out of the histogram
    eps = int(raw_count)

    # flag the first region whose upper bound is not exceeded (none above 9000)
    region = next((bound for bound in N_episodes_regions if eps <= bound), None)
    if region is not None:
      all_data.at[row, 'Episodes_' + str(region)] = 1

    # the histogram only covers anime with 2 to 100 episodes
    if 2 <= eps <= 100:
      episodes.append(eps)

  fig, ax = plt.subplots(tight_layout=True)
  hist = ax.hist(episodes, bins=50, facecolor='g')
  ax.set_xlabel('Number of series (>1)')
  ax.set_ylabel('Number of Anime')

  # "<previous bound>-<bound>" -> score strings of the anime flagged for that
  # region, leaving out anime whose score is "Unknown"
  datas = defaultdict(list)
  has_score = all_data["Score"] != "Unknown"
  for lower, upper in zip([0] + N_episodes_regions, N_episodes_regions):
    in_region = all_data['Episodes_' + str(upper)] == 1
    datas[str(lower) + "-" + str(upper)] = all_data.loc[in_region & has_score, "Score"].tolist()
  # print( datas )

  # order the regions by their lower bound
  sorted_datas = sorted(datas.items(), key=lambda item: int(item[0].split("-")[0]))

  fig, ax = plt.subplots()
  ax.set_title("Anime Number of episodes")
  ax.boxplot(
    [[float(score) for score in scores] for _, scores in sorted_datas],
    vert=False,
    flierprops=dict(markerfacecolor='g', marker='D'),
  )
  ax.set_yticklabels([label for label, _ in sorted_datas])
  ax.set_xlabel('MyAnimelist Score')

  # pie
  fig, ax = plt.subplots()

  data_x = [len(scores) for _, scores in sorted_datas]
  data_y = [label for label, _ in sorted_datas]

  patches, texts, autotexts = ax.pie(data_x, labels = data_y, autopct='%d%%')
  # the title given here is replaced by the plt.title call at the end
  ax.set(aspect="equal", title="Fraction of Anime films")
  plt.setp(autotexts, size=12, weight="bold", color="white")

  plt.title("% of Anime per Number of episodes")

In [None]:
  # Add number of voters
  def get_voters(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10):
    """Return the sum of ten vote counts as an int.

    Every argument is converted with float(); the literal "Unknown"
    contributes nothing. The float total is truncated by int().
    """
    total = 0
    for count in (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10):
      if count != "Unknown":
        total += float(count)
    return int(total)
  # 'N_votes' = total number of votes of an anime, i.e. the sum of its
  # 'Score-1' ... 'Score-10' columns ("Unknown" entries count as 0).
  all_data["N_votes"] = all_data.apply(
    lambda x: get_voters(*[ x['Score-' + str(k)] for k in range(1, 11) ]),
    axis=1,
  )

  # Donut charts: share of all user votes per category of a feature.
  # Expects all_data[f] to hold lists of categories.

  # Number of most-voted categories that get their own wedge; all remaining
  # ones are merged into a single '...' wedge. Features not listed are drawn
  # in full.
  top_categories = {'Source': 9, 'Genders': 16, 'Year_class': 8}
  pie_titles = {
    "Genders" :    "% of User Votes per Anime Genre",
    "Rating" :     "% of User Votes per Anime Age rating",
    'Year_class' : "% of User Votes per Anime Release year",
    'Month' :      "% of User Votes per Anime Release month",
  }
  unlabeled   = ("Unknown", "Music")              # wedges drawn without an annotation
  label_shift = {"Rx - Hentai": 0.1, "OVA": 0.1}  # extra vertical offset of some annotations

  for f in ['Genders', 'Type', 'Source', 'Rating'] :
    # category -> total votes of all anime tagged with that category
    votes_per_category = defaultdict(int)
    for n_votes, categories in zip( all_data["N_votes"], all_data[f] ):
      # `category` instead of `type`, so the builtin is not shadowed
      for category in categories :
        votes_per_category[ category ] += int(n_votes)

    # ascending by number of votes
    ranked = sorted(votes_per_category.items(), key=lambda item : item[1] )
    data_y = [ category for category, _ in ranked ]
    data_x = [ total for _, total in ranked ]

    keep = top_categories.get(f)
    if keep is not None and len(data_x) > keep:
      # The remainder covers exactly the categories that are not kept. The
      # original summed data_x[:-18] for 'Genders' while keeping only 16, which
      # silently dropped the 17th and 18th categories from the chart.
      data_x = data_x[-keep:] + [ sum(data_x[:-keep]) ]
      data_y = data_y[-keep:] + [ '...' ]

    data_x.reverse()
    data_y.reverse()

    # Draw the even-indexed entries first, then the odd-indexed ones
    # (presumably to spread wedges of similar size around the ring — TODO confirm).
    data_x = data_x[::2] + data_x[1::2]
    data_y = data_y[::2] + data_y[1::2]

    fig, ax = plt.subplots()
    wedges, texts, autotexts = ax.pie(
      data_x,
      wedgeprops=dict(width=0.5),
      startangle=-40,
      autopct='%d%%',
      pctdistance=0.80,
    )

    bbox_props = dict(boxstyle="square,pad=0.2", fc="w", ec="k", lw=0.72)
    kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")

    for label, wedge in zip(data_y, wedges):
      if label in unlabeled :
        continue
      # direction of the wedge centre
      ang = (wedge.theta2 - wedge.theta1)/2. + wedge.theta1
      y = np.sin(np.deg2rad(ang))
      x = np.cos(np.deg2rad(ang))
      # explicit comparison instead of a {-1, 1} lookup, which raises KeyError for x == 0
      horizontalalignment = "left" if x >= 0 else "right"
      kw["arrowprops"].update({"connectionstyle": "angle,angleA=0,angleB={}".format(ang)})
      ax.annotate(
        label,
        xy=(x, y),
        xytext=(1.35*np.sign(x), 1.1*y + label_shift.get(label, 0)),
        horizontalalignment=horizontalalignment,
        **kw
      )

    plt.setp(autotexts, size=10, weight="bold", color="black")
    ax.set_aspect(aspect=1.25)

    plt.title( pie_titles.get(f, "% of User Votes per Anime " + f) )