In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint as pp
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer

plt.style.use('seaborn')

In [2]:
df = pd.read_json('data/anime_content.json', lines=True)

In [3]:
df.head()

Unnamed: 0,_id,errors,data
0,{'$oid': '5ed06d2b64d930385059a26d'},"[{'message': 'Not Found.', 'status': 404, 'loc...",{'Media': None}
1,{'$oid': '5ed06d3564d930385059a26e'},,"{'Media': {'id': 10161, 'title': {'romaji': 'N..."
2,{'$oid': '5ed06d3f64d930385059a26f'},,"{'Media': {'id': 99726, 'title': {'romaji': 'N..."
3,{'$oid': '5ed06d4964d930385059a270'},,"{'Media': {'id': 98526, 'title': {'romaji': 'R..."
4,{'$oid': '5ed06d5364d930385059a271'},,"{'Media': {'id': 101240, 'title': {'romaji': '..."


In [4]:
# Some api calls returned None
no_media = []
for row in df['data']:
    if row['Media'] == None:
        no_media.append(True)
    else:
        no_media.append(False)

In [5]:
len(df[no_media])

48

In [6]:
df = df[~np.array(no_media)]

In [7]:
df[~np.array(df['errors'].isna())]

Unnamed: 0,_id,errors,data


In [8]:
# No remaining rows contain errors. Column is removed.
df = df[['_id', 'data']]

In [9]:
df.head()

Unnamed: 0,_id,data
1,{'$oid': '5ed06d3564d930385059a26e'},"{'Media': {'id': 10161, 'title': {'romaji': 'N..."
2,{'$oid': '5ed06d3f64d930385059a26f'},"{'Media': {'id': 99726, 'title': {'romaji': 'N..."
3,{'$oid': '5ed06d4964d930385059a270'},"{'Media': {'id': 98526, 'title': {'romaji': 'R..."
4,{'$oid': '5ed06d5364d930385059a271'},"{'Media': {'id': 101240, 'title': {'romaji': '..."
5,{'$oid': '5ed06d5e64d930385059a272'},"{'Media': {'id': 966, 'title': {'romaji': 'Cra..."


In [10]:
df.tail()

Unnamed: 0,_id,data
14473,{'$oid': '5ed2ab4364d930385059daf6'},"{'Media': {'id': 99916, 'title': {'romaji': 'A..."
14474,{'$oid': '5ed2ab4d64d930385059daf7'},"{'Media': {'id': 101283, 'title': {'romaji': '..."
14475,{'$oid': '5ed2ab5864d930385059daf8'},"{'Media': {'id': 101633, 'title': {'romaji': '..."
14476,{'$oid': '5ed2ab6264d930385059daf9'},"{'Media': {'id': 21742, 'title': {'romaji': 'K..."
14477,{'$oid': '5ed2ab6c64d930385059dafa'},"{'Media': {'id': 101089, 'title': {'romaji': '..."


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14430 entries, 1 to 14477
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   _id     14430 non-null  object
 1   data    14430 non-null  object
dtypes: object(2)
memory usage: 338.2+ KB


In [12]:
df['data'][1]['Media']

{'id': 10161,
 'title': {'romaji': 'No.6',
  'english': 'No.6',
  'native': 'NO.6 ナンバー・シックス',
  'userPreferred': 'No.6'},
 'startDate': {'year': 2011, 'month': 7, 'day': 8},
 'endDate': {'year': 2011, 'month': 9, 'day': 16},
 'season': 'SUMMER',
 'seasonYear': 2011,
 'type': 'ANIME',
 'format': 'TV',
 'status': 'FINISHED',
 'episodes': 11,
 'duration': 23,
 'chapters': None,
 'volumes': None,
 'isAdult': False,
 'genres': ['Action', 'Sci-Fi', 'Drama'],
 'tags': [{'name': 'Dystopian', 'rank': 92, 'category': 'Setting-Time'},
  {'name': "Boys' Love", 'rank': 70, 'category': 'Theme-Romance'},
  {'name': 'Shoujo', 'rank': 70, 'category': 'Demographic'},
  {'name': 'Cyberpunk', 'rank': 66, 'category': 'Theme-Sci-Fi'},
  {'name': 'Crossdressing', 'rank': 50, 'category': 'Cast-Traits'},
  {'name': 'Yandere', 'rank': 44, 'category': 'Cast-Traits'}],
 'isLicensed': True,
 'averageScore': 71,
 'popularity': 27608,
 'source': 'OTHER',
 'countryOfOrigin': 'JP',
 'staff': {'edges': [{'id': 12992,
 

In [13]:
# for show in df['data'][:5]:
#     print('anilist_id', show['Media']['id'])
#     print('title_romaji', show['Media']['title']['romaji'])
#     print('title_english', show['Media']['title']['english'])
#     print('title_native', show['Media']['title']['native'])
#     print('title_userPreferred', show['Media']['title']['userPreferred'])
#     print('start_date', pd.to_datetime(str(show['Media']['startDate']['year']) + '-' +
#                          str(show['Media']['startDate']['month']) + '-' +
#                          str(show['Media']['startDate']['day'])).date())
#     print('end_date', pd.to_datetime(str(show['Media']['endDate']['year']) + '-' +
#                          str(show['Media']['endDate']['month']) + '-' +
#                          str(show['Media']['endDate']['day'])).date())
#     print('season', show['Media']['season'])
#     print('type', show['Media']['type'])
#     print('format', show['Media']['format'])
#     print('status', show['Media']['status'])
#     print('episodes', show['Media']['episodes'])
#     print('duration', show['Media']['duration'])
#     print('chapters', show['Media']['chapters'])
#     print('volumes', show['Media']['volumes'])
#     print('isAdult', show['Media']['isAdult'])
#     print('genres', show['Media']['genres'])
#     print('type', show['Media']['tags'])

    
#     print('------------------')

In [14]:
media_df = pd.DataFrame(list(df['data']))

In [15]:
media_df.head()

Unnamed: 0,Media
0,"{'id': 10161, 'title': {'romaji': 'No.6', 'eng..."
1,"{'id': 99726, 'title': {'romaji': 'Net-juu no ..."
2,"{'id': 98526, 'title': {'romaji': 'Robomasters..."
3,"{'id': 101240, 'title': {'romaji': 'Dokidoki L..."
4,"{'id': 966, 'title': {'romaji': 'Crayon Shin-c..."


In [16]:
media_df = pd.DataFrame(list(media_df['Media']))

In [17]:
media_df.head(3)

Unnamed: 0,id,title,startDate,endDate,season,seasonYear,type,format,status,episodes,...,genres,tags,isLicensed,averageScore,popularity,source,countryOfOrigin,staff,studios,characters
0,10161,"{'romaji': 'No.6', 'english': 'No.6', 'native'...","{'year': 2011, 'month': 7, 'day': 8}","{'year': 2011, 'month': 9, 'day': 16}",SUMMER,2011.0,ANIME,TV,FINISHED,11.0,...,"[Action, Sci-Fi, Drama]","[{'name': 'Dystopian', 'rank': 92, 'category':...",True,71.0,27608,OTHER,JP,"{'edges': [{'id': 12992, 'node': {'id': 101012...","{'edges': [{'id': 11804, 'node': {'id': 4, 'na...","{'edges': [{'id': 4370, 'role': 'SUPPORTING', ..."
1,99726,"{'romaji': 'Net-juu no Susume', 'english': 'Re...","{'year': 2017, 'month': 10, 'day': 6}","{'year': 2017, 'month': 12, 'day': 8}",FALL,2017.0,ANIME,TV,FINISHED,10.0,...,"[Comedy, Fantasy, Romance, Adventure, Slice of...","[{'name': 'Video Games', 'rank': 95, 'category...",True,75.0,41540,MANGA,JP,"{'edges': [{'id': 99978, 'node': {'id': 110641...","{'edges': [{'id': 17713, 'node': {'id': 6101, ...","{'edges': [{'id': 149462, 'role': 'MAIN', 'nod..."
2,98526,"{'romaji': 'Robomasters The Animated Series', ...","{'year': 2017, 'month': 10, 'day': 13}","{'year': 2017, 'month': 11, 'day': 17}",FALL,2017.0,ANIME,TV,FINISHED,6.0,...,"[Action, Drama]","[{'name': 'Real Robot', 'rank': 60, 'category'...",True,58.0,1901,ORIGINAL,JP,"{'edges': [{'id': 93476, 'node': {'id': 107517...","{'edges': [{'id': 17375, 'node': {'id': 6154, ...",{'edges': []}


In [18]:
genres_df = pd.DataFrame(list(media_df['genres']))
genres_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Action,Sci-Fi,Drama,,,,
1,Comedy,Fantasy,Romance,Adventure,Slice of Life,,
2,Action,Drama,,,,,
3,Hentai,,,,,,
4,Comedy,Ecchi,Slice of Life,,,,


In [19]:
# this project does not aim to be a hentai recommender
hentai_idxs = []
for idx, row in genres_df.iterrows():
    if 'Hentai' in row.values:
        hentai_idxs.append(idx)

In [20]:
len(hentai_idxs)

1181

In [21]:
media_df = media_df.drop(hentai_idxs)
media_df.head()

Unnamed: 0,id,title,startDate,endDate,season,seasonYear,type,format,status,episodes,...,genres,tags,isLicensed,averageScore,popularity,source,countryOfOrigin,staff,studios,characters
0,10161,"{'romaji': 'No.6', 'english': 'No.6', 'native'...","{'year': 2011, 'month': 7, 'day': 8}","{'year': 2011, 'month': 9, 'day': 16}",SUMMER,2011.0,ANIME,TV,FINISHED,11.0,...,"[Action, Sci-Fi, Drama]","[{'name': 'Dystopian', 'rank': 92, 'category':...",True,71.0,27608,OTHER,JP,"{'edges': [{'id': 12992, 'node': {'id': 101012...","{'edges': [{'id': 11804, 'node': {'id': 4, 'na...","{'edges': [{'id': 4370, 'role': 'SUPPORTING', ..."
1,99726,"{'romaji': 'Net-juu no Susume', 'english': 'Re...","{'year': 2017, 'month': 10, 'day': 6}","{'year': 2017, 'month': 12, 'day': 8}",FALL,2017.0,ANIME,TV,FINISHED,10.0,...,"[Comedy, Fantasy, Romance, Adventure, Slice of...","[{'name': 'Video Games', 'rank': 95, 'category...",True,75.0,41540,MANGA,JP,"{'edges': [{'id': 99978, 'node': {'id': 110641...","{'edges': [{'id': 17713, 'node': {'id': 6101, ...","{'edges': [{'id': 149462, 'role': 'MAIN', 'nod..."
2,98526,"{'romaji': 'Robomasters The Animated Series', ...","{'year': 2017, 'month': 10, 'day': 13}","{'year': 2017, 'month': 11, 'day': 17}",FALL,2017.0,ANIME,TV,FINISHED,6.0,...,"[Action, Drama]","[{'name': 'Real Robot', 'rank': 60, 'category'...",True,58.0,1901,ORIGINAL,JP,"{'edges': [{'id': 93476, 'node': {'id': 107517...","{'edges': [{'id': 17375, 'node': {'id': 6154, ...",{'edges': []}
4,966,"{'romaji': 'Crayon Shin-chan', 'english': 'Shi...","{'year': 1992, 'month': 4, 'day': 13}","{'year': None, 'month': None, 'day': None}",SPRING,1992.0,ANIME,TV,RELEASING,,...,"[Comedy, Ecchi, Slice of Life]","[{'name': 'Episodic', 'rank': 85, 'category': ...",True,73.0,4681,MANGA,JP,"{'edges': [{'id': 966, 'node': {'id': 95148, '...","{'edges': [{'id': 2601, 'node': {'id': 55, 'na...","{'edges': [{'id': 2281, 'role': 'SUPPORTING', ..."
5,4876,"{'romaji': 'Hashire Melos!', 'english': 'Run M...","{'year': 1992, 'month': 6, 'day': 25}","{'year': 1992, 'month': 6, 'day': 25}",SUMMER,1992.0,ANIME,MOVIE,FINISHED,1.0,...,"[Action, Adventure, Drama]","[{'name': 'Classic Literature', 'rank': 70, 'c...",True,61.0,921,OTHER,JP,"{'edges': [{'id': 5907, 'node': {'id': 97762, ...","{'edges': [{'id': 7865, 'node': {'id': 141, 'n...","{'edges': [{'id': 2540, 'role': 'MAIN', 'node'..."


In [22]:
media_df['id']

0         10161
1         99726
2         98526
4           966
5          4876
          ...  
14425     99916
14426    101283
14427    101633
14428     21742
14429    101089
Name: id, Length: 13249, dtype: int64

In [23]:
len(media_df['id'].unique())

9832

While retrieving shows from the AniList API, different search terms sometimes returned the same shows. Duplicates need to be dropped.

In [24]:
media_df = media_df.drop_duplicates(subset='id')

In [41]:
# Every row has a unique id
len(media_df['id']) == len(media_df['id'].unique())

True

In [44]:
media_df.set_index('id', inplace=True)

In [46]:
media_df.head()

Unnamed: 0_level_0,title,startDate,endDate,season,seasonYear,type,format,status,episodes,duration,...,genres,tags,isLicensed,averageScore,popularity,source,countryOfOrigin,staff,studios,characters
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,"{'romaji': 'No.6', 'english': 'No.6', 'native'...","{'year': 2011, 'month': 7, 'day': 8}","{'year': 2011, 'month': 9, 'day': 16}",SUMMER,2011.0,ANIME,TV,FINISHED,11.0,23.0,...,"[action, sci_fi, drama]","[{'name': 'Dystopian', 'rank': 92, 'category':...",True,71.0,27608,OTHER,JP,"{'edges': [{'id': 12992, 'node': {'id': 101012...","{'edges': [{'id': 11804, 'node': {'id': 4, 'na...","{'edges': [{'id': 4370, 'role': 'SUPPORTING', ..."
99726,"{'romaji': 'Net-juu no Susume', 'english': 'Re...","{'year': 2017, 'month': 10, 'day': 6}","{'year': 2017, 'month': 12, 'day': 8}",FALL,2017.0,ANIME,TV,FINISHED,10.0,24.0,...,"[comedy, fantasy, romance, adventure, slice_of...","[{'name': 'Video Games', 'rank': 95, 'category...",True,75.0,41540,MANGA,JP,"{'edges': [{'id': 99978, 'node': {'id': 110641...","{'edges': [{'id': 17713, 'node': {'id': 6101, ...","{'edges': [{'id': 149462, 'role': 'MAIN', 'nod..."
98526,"{'romaji': 'Robomasters The Animated Series', ...","{'year': 2017, 'month': 10, 'day': 13}","{'year': 2017, 'month': 11, 'day': 17}",FALL,2017.0,ANIME,TV,FINISHED,6.0,25.0,...,"[action, drama]","[{'name': 'Real Robot', 'rank': 60, 'category'...",True,58.0,1901,ORIGINAL,JP,"{'edges': [{'id': 93476, 'node': {'id': 107517...","{'edges': [{'id': 17375, 'node': {'id': 6154, ...",{'edges': []}
966,"{'romaji': 'Crayon Shin-chan', 'english': 'Shi...","{'year': 1992, 'month': 4, 'day': 13}","{'year': None, 'month': None, 'day': None}",SPRING,1992.0,ANIME,TV,RELEASING,,21.0,...,"[comedy, ecchi, slice_of_life]","[{'name': 'Episodic', 'rank': 85, 'category': ...",True,73.0,4681,MANGA,JP,"{'edges': [{'id': 966, 'node': {'id': 95148, '...","{'edges': [{'id': 2601, 'node': {'id': 55, 'na...","{'edges': [{'id': 2281, 'role': 'SUPPORTING', ..."
4876,"{'romaji': 'Hashire Melos!', 'english': 'Run M...","{'year': 1992, 'month': 6, 'day': 25}","{'year': 1992, 'month': 6, 'day': 25}",SUMMER,1992.0,ANIME,MOVIE,FINISHED,1.0,110.0,...,"[action, adventure, drama]","[{'name': 'Classic Literature', 'rank': 70, 'c...",True,61.0,921,OTHER,JP,"{'edges': [{'id': 5907, 'node': {'id': 97762, ...","{'edges': [{'id': 7865, 'node': {'id': 141, 'n...","{'edges': [{'id': 2540, 'role': 'MAIN', 'node'..."


In [61]:
genres_df = pd.DataFrame(media_df['genres'])
genres_df.head()

Unnamed: 0_level_0,genres
id,Unnamed: 1_level_1
10161,"[action, sci_fi, drama]"
99726,"[comedy, fantasy, romance, adventure, slice_of..."
98526,"[action, drama]"
966,"[comedy, ecchi, slice_of_life]"
4876,"[action, adventure, drama]"


In [62]:
for row in genres_df['genres']:
    for idx, genre in enumerate(row):
        row[idx] = (genre.replace(' ', '_')).lower()
        row[idx] = row[idx].replace('-', '_')

In [63]:
list(genres_df['genres'])

[['action', 'sci_fi', 'drama'],
 ['comedy', 'fantasy', 'romance', 'adventure', 'slice_of_life'],
 ['action', 'drama'],
 ['comedy', 'ecchi', 'slice_of_life'],
 ['action', 'adventure', 'drama'],
 ['action', 'adventure', 'drama', 'fantasy', 'comedy'],
 ['comedy', 'romance', 'supernatural'],
 ['romance', 'drama', 'supernatural'],
 ['action', 'comedy', 'sci_fi', 'drama'],
 ['drama', 'sci_fi', 'thriller'],
 ['psychological', 'sci_fi', 'thriller', 'drama'],
 ['action', 'adventure', 'fantasy'],
 ['action', 'comedy', 'sci_fi', 'drama'],
 ['drama', 'sci_fi'],
 ['drama', 'slice_of_life'],
 ['action', 'comedy', 'sci_fi', 'drama'],
 ['action', 'comedy', 'sci_fi'],
 ['drama', 'romance', 'slice_of_life'],
 ['action', 'comedy', 'sci_fi', 'drama'],
 ['comedy', 'drama', 'fantasy', 'slice_of_life'],
 ['drama', 'romance', 'slice_of_life', 'supernatural'],
 ['comedy', 'drama', 'sports', 'action'],
 ['action', 'drama', 'mecha', 'sci_fi', 'thriller'],
 ['adventure', 'drama', 'romance', 'supernatural', 'fanta

In [64]:
[' '.join(row) for row in genres_df['genres']]

['action sci_fi drama',
 'comedy fantasy romance adventure slice_of_life',
 'action drama',
 'comedy ecchi slice_of_life',
 'action adventure drama',
 'action adventure drama fantasy comedy',
 'comedy romance supernatural',
 'romance drama supernatural',
 'action comedy sci_fi drama',
 'drama sci_fi thriller',
 'psychological sci_fi thriller drama',
 'action adventure fantasy',
 'action comedy sci_fi drama',
 'drama sci_fi',
 'drama slice_of_life',
 'action comedy sci_fi drama',
 'action comedy sci_fi',
 'drama romance slice_of_life',
 'action comedy sci_fi drama',
 'comedy drama fantasy slice_of_life',
 'drama romance slice_of_life supernatural',
 'comedy drama sports action',
 'action drama mecha sci_fi thriller',
 'adventure drama romance supernatural fantasy',
 'adventure fantasy drama sci_fi mystery',
 'drama music romance slice_of_life',
 'drama',
 'comedy ecchi romance',
 'action mystery supernatural ecchi psychological drama',
 'action comedy sci_fi drama',
 'adventure mystery 

In [65]:
vectorizer = CountVectorizer()

In [66]:
vectorizer.fit_transform(list([' '.join(row) for row in genres_df['genres']])).todense()

matrix([[1, 0, 0, ..., 0, 0, 0],
        [0, 1, 1, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [67]:
vectorizer.get_feature_names()

['action',
 'adventure',
 'comedy',
 'drama',
 'ecchi',
 'fantasy',
 'horror',
 'mahou_shoujo',
 'mecha',
 'music',
 'mystery',
 'psychological',
 'romance',
 'sci_fi',
 'slice_of_life',
 'sports',
 'supernatural',
 'thriller']

In [68]:
genres_item_matrix = pd.DataFrame(vectorizer.fit_transform(list([' '.join(row) for row in genres_df['genres']])).todense(), 
                                  columns=vectorizer.get_feature_names(), 
                                  index=media_df.index)

In [69]:
genres_item_matrix

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,mystery,psychological,romance,sci_fi,slice_of_life,sports,supernatural,thriller
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10161,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
99726,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0
98526,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
966,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
4876,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99586,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
99916,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
101283,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0
101633,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0


## That's a beautiful matrix!

In [71]:
tags_df = pd.DataFrame(list(media_df['tags']), index=media_df.index)
tags_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,"{'name': 'Dystopian', 'rank': 92, 'category': ...","{'name': 'Boys' Love', 'rank': 70, 'category':...","{'name': 'Shoujo', 'rank': 70, 'category': 'De...","{'name': 'Cyberpunk', 'rank': 66, 'category': ...","{'name': 'Crossdressing', 'rank': 50, 'categor...","{'name': 'Yandere', 'rank': 44, 'category': 'C...",,,,,...,,,,,,,,,,
99726,"{'name': 'Video Games', 'rank': 95, 'category'...","{'name': 'Virtual World', 'rank': 92, 'categor...","{'name': 'Hikikomori', 'rank': 88, 'category':...","{'name': 'Primarily Adult Cast', 'rank': 83, '...","{'name': 'Female Protagonist', 'rank': 80, 'ca...","{'name': 'Parody', 'rank': 70, 'category': 'Th...","{'name': 'Gender Bending', 'rank': 55, 'catego...","{'name': 'Reverse Harem', 'rank': 20, 'categor...",,,...,,,,,,,,,,
98526,"{'name': 'Real Robot', 'rank': 60, 'category':...","{'name': 'Robots', 'rank': 60, 'category': 'Ca...","{'name': 'College', 'rank': 20, 'category': 'S...",,,,,,,,...,,,,,,,,,,
966,"{'name': 'Episodic', 'rank': 85, 'category': '...","{'name': 'Family Life', 'rank': 73, 'category'...","{'name': 'Kids', 'rank': 72, 'category': 'Demo...","{'name': 'School', 'rank': 66, 'category': 'Se...","{'name': 'Male Protagonist', 'rank': 60, 'cate...","{'name': 'Shounen', 'rank': 52, 'category': 'D...",,,,,...,,,,,,,,,,
4876,"{'name': 'Classic Literature', 'rank': 70, 'ca...","{'name': 'Historical', 'rank': 70, 'category':...","{'name': 'Male Protagonist', 'rank': 66, 'cate...","{'name': 'Foreign', 'rank': 66, 'category': 'S...","{'name': 'Politics', 'rank': 60, 'category': '...","{'name': 'Tragedy', 'rank': 20, 'category': 'T...","{'name': 'Swordplay', 'rank': 20, 'category': ...","{'name': 'Gore', 'rank': 20, 'category': 'Them...",,,...,,,,,,,,,,


In [72]:
tag_categories = {}
for idx, row in tags_df.iterrows():
    for tag in row:
        if tag == None:
            continue
        tag_categories.setdefault(tag['category'], set()).add(tag['name'])

In [73]:
pp.pprint(tag_categories)

{'': {'Reformation'},
 'Cast-Main Cast': {'Anti-Hero',
                    'Ensemble Cast',
                    'Female Protagonist',
                    'Male Protagonist',
                    'Office Lady',
                    'Primarily Adult Cast',
                    'Primarily Child Cast',
                    'Primarily Female Cast',
                    'Primarily Male Cast'},
 'Cast-Traits': {'Age Regression',
                 'Agender',
                 'Aliens',
                 'Amnesia',
                 'Artificial Intelligence',
                 'Asexual',
                 'Centaur',
                 'Chuunibyou',
                 'Cosplay',
                 'Crossdressing',
                 'Delinquents',
                 'Demons',
                 'Detective',
                 'Dinosaurs',
                 'Dissociative Identities',
                 'Dragons',
                 'Dullahan',
                 'Elf',
                 'Ghost',
                 'Goblin',
      

In [74]:
staff_df = pd.DataFrame(media_df['staff'])
staff_df.head()

Unnamed: 0_level_0,staff
id,Unnamed: 1_level_1
10161,"{'edges': [{'id': 12992, 'node': {'id': 101012..."
99726,"{'edges': [{'id': 99978, 'node': {'id': 110641..."
98526,"{'edges': [{'id': 93476, 'node': {'id': 107517..."
966,"{'edges': [{'id': 966, 'node': {'id': 95148, '..."
4876,"{'edges': [{'id': 5907, 'node': {'id': 97762, ..."


In [None]:
staff_df = staff_df['staff'].apply(pd.Series)

In [None]:
staff_df = staff_df['edges'].apply(pd.Series)
staff_df.head(3)

In [None]:
studios_df = pd.DataFrame(media_df['studios'])
studios_df.head()

In [None]:
studios_df = studios_df['studios'].apply(pd.Series)
studios_df.head(3)

In [None]:
studios_df = studios_df['edges'].apply(pd.Series)
studios_df.head(3)

In [None]:
characters_df = pd.DataFrame(media_df['characters'])
characters_df.head()

In [None]:
characters_df = characters_df['characters'].apply(pd.Series)
characters_df.head()

In [None]:
characters_df = characters_df['edges'].apply(pd.Series)
characters_df.head()