In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint as pp
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from difflib import get_close_matches

plt.style.use('seaborn')

In [2]:
df = pd.read_json('data/anime_content.json', lines=True)

In [3]:
df.head()

Unnamed: 0,_id,errors,data
0,{'$oid': '5ed06d2b64d930385059a26d'},"[{'message': 'Not Found.', 'status': 404, 'loc...",{'Media': None}
1,{'$oid': '5ed06d3564d930385059a26e'},,"{'Media': {'id': 10161, 'title': {'romaji': 'N..."
2,{'$oid': '5ed06d3f64d930385059a26f'},,"{'Media': {'id': 99726, 'title': {'romaji': 'N..."
3,{'$oid': '5ed06d4964d930385059a270'},,"{'Media': {'id': 98526, 'title': {'romaji': 'R..."
4,{'$oid': '5ed06d5364d930385059a271'},,"{'Media': {'id': 101240, 'title': {'romaji': '..."


In [4]:
# Some api calls returned None
no_media = []
for row in df['data']:
    if row['Media'] == None:
        no_media.append(True)
    else:
        no_media.append(False)

In [5]:
len(df[no_media])

48

In [6]:
df = df[~np.array(no_media)]

In [7]:
df[~np.array(df['errors'].isna())]

Unnamed: 0,_id,errors,data


In [8]:
# No remaining rows contain errors. Column is removed.
df = df[['_id', 'data']]

In [9]:
df.head()

Unnamed: 0,_id,data
1,{'$oid': '5ed06d3564d930385059a26e'},"{'Media': {'id': 10161, 'title': {'romaji': 'N..."
2,{'$oid': '5ed06d3f64d930385059a26f'},"{'Media': {'id': 99726, 'title': {'romaji': 'N..."
3,{'$oid': '5ed06d4964d930385059a270'},"{'Media': {'id': 98526, 'title': {'romaji': 'R..."
4,{'$oid': '5ed06d5364d930385059a271'},"{'Media': {'id': 101240, 'title': {'romaji': '..."
5,{'$oid': '5ed06d5e64d930385059a272'},"{'Media': {'id': 966, 'title': {'romaji': 'Cra..."


In [10]:
df.tail()

Unnamed: 0,_id,data
14473,{'$oid': '5ed2ab4364d930385059daf6'},"{'Media': {'id': 99916, 'title': {'romaji': 'A..."
14474,{'$oid': '5ed2ab4d64d930385059daf7'},"{'Media': {'id': 101283, 'title': {'romaji': '..."
14475,{'$oid': '5ed2ab5864d930385059daf8'},"{'Media': {'id': 101633, 'title': {'romaji': '..."
14476,{'$oid': '5ed2ab6264d930385059daf9'},"{'Media': {'id': 21742, 'title': {'romaji': 'K..."
14477,{'$oid': '5ed2ab6c64d930385059dafa'},"{'Media': {'id': 101089, 'title': {'romaji': '..."


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14430 entries, 1 to 14477
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   _id     14430 non-null  object
 1   data    14430 non-null  object
dtypes: object(2)
memory usage: 338.2+ KB


In [12]:
# here's an example of what the data looks like for each anime
df['data'][1]['Media']

{'id': 10161,
 'title': {'romaji': 'No.6',
  'english': 'No.6',
  'native': 'NO.6 ナンバー・シックス',
  'userPreferred': 'No.6'},
 'startDate': {'year': 2011, 'month': 7, 'day': 8},
 'endDate': {'year': 2011, 'month': 9, 'day': 16},
 'season': 'SUMMER',
 'seasonYear': 2011,
 'type': 'ANIME',
 'format': 'TV',
 'status': 'FINISHED',
 'episodes': 11,
 'duration': 23,
 'chapters': None,
 'volumes': None,
 'isAdult': False,
 'genres': ['Action', 'Sci-Fi', 'Drama'],
 'tags': [{'name': 'Dystopian', 'rank': 92, 'category': 'Setting-Time'},
  {'name': "Boys' Love", 'rank': 70, 'category': 'Theme-Romance'},
  {'name': 'Shoujo', 'rank': 70, 'category': 'Demographic'},
  {'name': 'Cyberpunk', 'rank': 66, 'category': 'Theme-Sci-Fi'},
  {'name': 'Crossdressing', 'rank': 50, 'category': 'Cast-Traits'},
  {'name': 'Yandere', 'rank': 44, 'category': 'Cast-Traits'}],
 'isLicensed': True,
 'averageScore': 71,
 'popularity': 27608,
 'source': 'OTHER',
 'countryOfOrigin': 'JP',
 'staff': {'edges': [{'id': 12992,
 

In [13]:
media_df = pd.DataFrame(list(df['data']))

In [14]:
media_df.head()

Unnamed: 0,Media
0,"{'id': 10161, 'title': {'romaji': 'No.6', 'eng..."
1,"{'id': 99726, 'title': {'romaji': 'Net-juu no ..."
2,"{'id': 98526, 'title': {'romaji': 'Robomasters..."
3,"{'id': 101240, 'title': {'romaji': 'Dokidoki L..."
4,"{'id': 966, 'title': {'romaji': 'Crayon Shin-c..."


In [15]:
media_df = pd.DataFrame(list(media_df['Media']))

In [16]:
media_df.head()

Unnamed: 0,id,title,startDate,endDate,season,seasonYear,type,format,status,episodes,...,genres,tags,isLicensed,averageScore,popularity,source,countryOfOrigin,staff,studios,characters
0,10161,"{'romaji': 'No.6', 'english': 'No.6', 'native'...","{'year': 2011, 'month': 7, 'day': 8}","{'year': 2011, 'month': 9, 'day': 16}",SUMMER,2011.0,ANIME,TV,FINISHED,11.0,...,"[Action, Sci-Fi, Drama]","[{'name': 'Dystopian', 'rank': 92, 'category':...",True,71.0,27608,OTHER,JP,"{'edges': [{'id': 12992, 'node': {'id': 101012...","{'edges': [{'id': 11804, 'node': {'id': 4, 'na...","{'edges': [{'id': 4370, 'role': 'SUPPORTING', ..."
1,99726,"{'romaji': 'Net-juu no Susume', 'english': 'Re...","{'year': 2017, 'month': 10, 'day': 6}","{'year': 2017, 'month': 12, 'day': 8}",FALL,2017.0,ANIME,TV,FINISHED,10.0,...,"[Comedy, Fantasy, Romance, Adventure, Slice of...","[{'name': 'Video Games', 'rank': 95, 'category...",True,75.0,41540,MANGA,JP,"{'edges': [{'id': 99978, 'node': {'id': 110641...","{'edges': [{'id': 17713, 'node': {'id': 6101, ...","{'edges': [{'id': 149462, 'role': 'MAIN', 'nod..."
2,98526,"{'romaji': 'Robomasters The Animated Series', ...","{'year': 2017, 'month': 10, 'day': 13}","{'year': 2017, 'month': 11, 'day': 17}",FALL,2017.0,ANIME,TV,FINISHED,6.0,...,"[Action, Drama]","[{'name': 'Real Robot', 'rank': 60, 'category'...",True,58.0,1901,ORIGINAL,JP,"{'edges': [{'id': 93476, 'node': {'id': 107517...","{'edges': [{'id': 17375, 'node': {'id': 6154, ...",{'edges': []}
3,101240,"{'romaji': 'Dokidoki Little Ooya-san', 'englis...","{'year': 2018, 'month': 5, 'day': 25}","{'year': 2019, 'month': 12, 'day': 27}",SPRING,2018.0,ANIME,OVA,FINISHED,6.0,...,[Hentai],"[{'name': 'Age Gap', 'rank': 86, 'category': '...",True,67.0,1228,VIDEO_GAME,JP,{'edges': []},"{'edges': [{'id': 18136, 'node': {'id': 6155, ...","{'edges': [{'id': 151698, 'role': 'MAIN', 'nod..."
4,966,"{'romaji': 'Crayon Shin-chan', 'english': 'Shi...","{'year': 1992, 'month': 4, 'day': 13}","{'year': None, 'month': None, 'day': None}",SPRING,1992.0,ANIME,TV,RELEASING,,...,"[Comedy, Ecchi, Slice of Life]","[{'name': 'Episodic', 'rank': 85, 'category': ...",True,73.0,4681,MANGA,JP,"{'edges': [{'id': 966, 'node': {'id': 95148, '...","{'edges': [{'id': 2601, 'node': {'id': 55, 'na...","{'edges': [{'id': 2281, 'role': 'SUPPORTING', ..."


In [17]:
genres_df = pd.DataFrame(list(media_df['genres']))
genres_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Action,Sci-Fi,Drama,,,,
1,Comedy,Fantasy,Romance,Adventure,Slice of Life,,
2,Action,Drama,,,,,
3,Hentai,,,,,,
4,Comedy,Ecchi,Slice of Life,,,,


In [18]:
# this project does not aim to be a hentai recommender
hentai_idxs = []
for idx, row in genres_df.iterrows():
    if 'Hentai' in row.values:
        hentai_idxs.append(idx)

In [19]:
print('There are', len(hentai_idxs), '"anime" labeled as hentai.')
print('All will be removed.')

There are 1181 "anime" labeled as hentai.
All will be removed.


In [20]:
media_df = media_df.drop(hentai_idxs)
media_df.head()

Unnamed: 0,id,title,startDate,endDate,season,seasonYear,type,format,status,episodes,...,genres,tags,isLicensed,averageScore,popularity,source,countryOfOrigin,staff,studios,characters
0,10161,"{'romaji': 'No.6', 'english': 'No.6', 'native'...","{'year': 2011, 'month': 7, 'day': 8}","{'year': 2011, 'month': 9, 'day': 16}",SUMMER,2011.0,ANIME,TV,FINISHED,11.0,...,"[Action, Sci-Fi, Drama]","[{'name': 'Dystopian', 'rank': 92, 'category':...",True,71.0,27608,OTHER,JP,"{'edges': [{'id': 12992, 'node': {'id': 101012...","{'edges': [{'id': 11804, 'node': {'id': 4, 'na...","{'edges': [{'id': 4370, 'role': 'SUPPORTING', ..."
1,99726,"{'romaji': 'Net-juu no Susume', 'english': 'Re...","{'year': 2017, 'month': 10, 'day': 6}","{'year': 2017, 'month': 12, 'day': 8}",FALL,2017.0,ANIME,TV,FINISHED,10.0,...,"[Comedy, Fantasy, Romance, Adventure, Slice of...","[{'name': 'Video Games', 'rank': 95, 'category...",True,75.0,41540,MANGA,JP,"{'edges': [{'id': 99978, 'node': {'id': 110641...","{'edges': [{'id': 17713, 'node': {'id': 6101, ...","{'edges': [{'id': 149462, 'role': 'MAIN', 'nod..."
2,98526,"{'romaji': 'Robomasters The Animated Series', ...","{'year': 2017, 'month': 10, 'day': 13}","{'year': 2017, 'month': 11, 'day': 17}",FALL,2017.0,ANIME,TV,FINISHED,6.0,...,"[Action, Drama]","[{'name': 'Real Robot', 'rank': 60, 'category'...",True,58.0,1901,ORIGINAL,JP,"{'edges': [{'id': 93476, 'node': {'id': 107517...","{'edges': [{'id': 17375, 'node': {'id': 6154, ...",{'edges': []}
4,966,"{'romaji': 'Crayon Shin-chan', 'english': 'Shi...","{'year': 1992, 'month': 4, 'day': 13}","{'year': None, 'month': None, 'day': None}",SPRING,1992.0,ANIME,TV,RELEASING,,...,"[Comedy, Ecchi, Slice of Life]","[{'name': 'Episodic', 'rank': 85, 'category': ...",True,73.0,4681,MANGA,JP,"{'edges': [{'id': 966, 'node': {'id': 95148, '...","{'edges': [{'id': 2601, 'node': {'id': 55, 'na...","{'edges': [{'id': 2281, 'role': 'SUPPORTING', ..."
5,4876,"{'romaji': 'Hashire Melos!', 'english': 'Run M...","{'year': 1992, 'month': 6, 'day': 25}","{'year': 1992, 'month': 6, 'day': 25}",SUMMER,1992.0,ANIME,MOVIE,FINISHED,1.0,...,"[Action, Adventure, Drama]","[{'name': 'Classic Literature', 'rank': 70, 'c...",True,61.0,921,OTHER,JP,"{'edges': [{'id': 5907, 'node': {'id': 97762, ...","{'edges': [{'id': 7865, 'node': {'id': 141, 'n...","{'edges': [{'id': 2540, 'role': 'MAIN', 'node'..."


In [21]:
media_df['id']

0         10161
1         99726
2         98526
4           966
5          4876
          ...  
14425     99916
14426    101283
14427    101633
14428     21742
14429    101089
Name: id, Length: 13249, dtype: int64

In [22]:
len(media_df['id'].unique())

9832

While retrieving shows from the AniList API, different search terms sometimes returned the same shows. Duplicates need to be dropped.

In [23]:
media_df = media_df.drop_duplicates(subset='id')

In [24]:
# Every row has a unique id
len(media_df['id']) == len(media_df['id'].unique())

True

In [25]:
media_df.set_index('id', inplace=True)

In [26]:
media_df.head()

Unnamed: 0_level_0,title,startDate,endDate,season,seasonYear,type,format,status,episodes,duration,...,genres,tags,isLicensed,averageScore,popularity,source,countryOfOrigin,staff,studios,characters
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,"{'romaji': 'No.6', 'english': 'No.6', 'native'...","{'year': 2011, 'month': 7, 'day': 8}","{'year': 2011, 'month': 9, 'day': 16}",SUMMER,2011.0,ANIME,TV,FINISHED,11.0,23.0,...,"[Action, Sci-Fi, Drama]","[{'name': 'Dystopian', 'rank': 92, 'category':...",True,71.0,27608,OTHER,JP,"{'edges': [{'id': 12992, 'node': {'id': 101012...","{'edges': [{'id': 11804, 'node': {'id': 4, 'na...","{'edges': [{'id': 4370, 'role': 'SUPPORTING', ..."
99726,"{'romaji': 'Net-juu no Susume', 'english': 'Re...","{'year': 2017, 'month': 10, 'day': 6}","{'year': 2017, 'month': 12, 'day': 8}",FALL,2017.0,ANIME,TV,FINISHED,10.0,24.0,...,"[Comedy, Fantasy, Romance, Adventure, Slice of...","[{'name': 'Video Games', 'rank': 95, 'category...",True,75.0,41540,MANGA,JP,"{'edges': [{'id': 99978, 'node': {'id': 110641...","{'edges': [{'id': 17713, 'node': {'id': 6101, ...","{'edges': [{'id': 149462, 'role': 'MAIN', 'nod..."
98526,"{'romaji': 'Robomasters The Animated Series', ...","{'year': 2017, 'month': 10, 'day': 13}","{'year': 2017, 'month': 11, 'day': 17}",FALL,2017.0,ANIME,TV,FINISHED,6.0,25.0,...,"[Action, Drama]","[{'name': 'Real Robot', 'rank': 60, 'category'...",True,58.0,1901,ORIGINAL,JP,"{'edges': [{'id': 93476, 'node': {'id': 107517...","{'edges': [{'id': 17375, 'node': {'id': 6154, ...",{'edges': []}
966,"{'romaji': 'Crayon Shin-chan', 'english': 'Shi...","{'year': 1992, 'month': 4, 'day': 13}","{'year': None, 'month': None, 'day': None}",SPRING,1992.0,ANIME,TV,RELEASING,,21.0,...,"[Comedy, Ecchi, Slice of Life]","[{'name': 'Episodic', 'rank': 85, 'category': ...",True,73.0,4681,MANGA,JP,"{'edges': [{'id': 966, 'node': {'id': 95148, '...","{'edges': [{'id': 2601, 'node': {'id': 55, 'na...","{'edges': [{'id': 2281, 'role': 'SUPPORTING', ..."
4876,"{'romaji': 'Hashire Melos!', 'english': 'Run M...","{'year': 1992, 'month': 6, 'day': 25}","{'year': 1992, 'month': 6, 'day': 25}",SUMMER,1992.0,ANIME,MOVIE,FINISHED,1.0,110.0,...,"[Action, Adventure, Drama]","[{'name': 'Classic Literature', 'rank': 70, 'c...",True,61.0,921,OTHER,JP,"{'edges': [{'id': 5907, 'node': {'id': 97762, ...","{'edges': [{'id': 7865, 'node': {'id': 141, 'n...","{'edges': [{'id': 2540, 'role': 'MAIN', 'node'..."


<b>Next we need to create content item matricies from the above dataframe</b>




## Title


---

In [27]:
title_df = pd.DataFrame(list(media_df['title']), index=media_df.index)
title_df.head()

Unnamed: 0_level_0,romaji,english,native,userPreferred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10161,No.6,No.6,NO.6 ナンバー・シックス,No.6
99726,Net-juu no Susume,Recovery of an MMO Junkie,ネト充のススメ,Net-juu no Susume
98526,Robomasters The Animated Series,,ROBOMASTERS THE ANIMATED SERIES,Robomasters The Animated Series
966,Crayon Shin-chan,Shin Chan,クレヨンしんちゃん,Crayon Shin-chan
4876,Hashire Melos!,Run Melos!,走れメロス,Hashire Melos!


In [28]:
# would like to be able to search for shows w/o knowing exact title
# get_close_matches('Fullmetal Alchemist', title_df['userPreferred'], cutoff=0.25)

This notebook is primarily for data cleaning. Let's export content item matrix to a csv to be brought into another file.

In [29]:
# title_df.to_csv('data/title_df.csv')




## Genres


---

In [30]:
genres_df = pd.DataFrame(media_df['genres'])
genres_df.head()

Unnamed: 0_level_0,genres
id,Unnamed: 1_level_1
10161,"[Action, Sci-Fi, Drama]"
99726,"[Comedy, Fantasy, Romance, Adventure, Slice of..."
98526,"[Action, Drama]"
966,"[Comedy, Ecchi, Slice of Life]"
4876,"[Action, Adventure, Drama]"


In [31]:
def clean_column_of_lists(df, column_name):
    '''Take in a dataframe and a column containing lists of strings.
    Return a list of items that are lowercased and have spaces 
    replaced with underscores.
    
    SAVES IN PLACE
    '''
    
    for row in df[column_name]:
        for idx, item in enumerate(row):
            if item == None:
                continue
            row[idx] = (item.replace(' ', '_')).lower()
            row[idx] = row[idx].replace('-', '_')
            row[idx] = row[idx].replace("'", '')
    return None

In [32]:
clean_column_of_lists(genres_df, 'genres')

In [33]:
genres_df['genres']

id
10161                               [action, sci_fi, drama]
99726     [comedy, fantasy, romance, adventure, slice_of...
98526                                       [action, drama]
966                          [comedy, ecchi, slice_of_life]
4876                             [action, adventure, drama]
                                ...                        
99586                        [ecchi, sports, slice_of_life]
99916                              [romance, slice_of_life]
101283                     [horror, mystery, psychological]
101633                [music, slice_of_life, comedy, drama]
101089                                      [slice_of_life]
Name: genres, Length: 9832, dtype: object

In [34]:
vectorizer = CountVectorizer(stop_words=None)

In [35]:
genres_item_matrix = pd.DataFrame(vectorizer.fit_transform([' '.join(row) for row in genres_df['genres']]).todense(), 
                                  columns=vectorizer.get_feature_names(), 
                                  index=media_df.index)

In [36]:
genres_item_matrix

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,mystery,psychological,romance,sci_fi,slice_of_life,sports,supernatural,thriller
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10161,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
99726,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0
98526,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
966,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
4876,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99586,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
99916,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
101283,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0
101633,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0


<b>That's a beautiful matrix!</b>

This notebook is primarily for data cleaning. Let's export content item matrix to a csv to be brought into another file.

In [37]:
# genres_item_matrix.to_csv('data/genres_item_matrix.csv')

In [38]:
# these are the most common genres
np.mean(genres_item_matrix).sort_values(ascending=False)

comedy           0.391172
action           0.268002
adventure        0.213283
fantasy          0.198841
drama            0.190704
sci_fi           0.186127
romance          0.143714
slice_of_life    0.127339
music            0.097539
supernatural     0.096216
mecha            0.082994
ecchi            0.062449
sports           0.052990
mystery          0.052685
horror           0.034886
psychological    0.032343
mahou_shoujo     0.025325
thriller         0.011595
dtype: float64

In [39]:
# I'm surpised there aren't many thrillers. Lets investigate that.
thriller_idxs = genres_item_matrix[genres_item_matrix['thriller'] == 1].index
genres_item_matrix[genres_item_matrix['thriller'] == 1]

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,mystery,psychological,romance,sci_fi,slice_of_life,sports,supernatural,thriller
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
21127,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1
9253,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1
2904,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1
1575,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1
19,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100388,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
104382,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1
101349,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1
102649,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1


In [40]:
for idx, title in title_df[title_df.index.isin(thriller_idxs)].iterrows():
    if title['english'] == None:
        print(title['userPreferred'])
    else:
        print(title['english'])

Steins;Gate 0
Steins;Gate
Code Geass: Lelouch of the Rebellion R2
Code Geass: Lelouch of the Rebellion
Monster
Death Note
Fate/Zero Season 2
the Garden of sinners Chapter 5: Paradox Paradigm
Rainbow
Steins;Gate The Movie – Load Region of Déjà Vu
ERASED
the Garden of sinners Chapter 7: ……not nothing heart. (Murder Speculation Part B)
Puella Magi Madoka Magica the Movie Part III: Rebellion
Puella Magi Madoka Magica
Puella Magi Madoka Magica the Movie Part II: Eternal
PSYCHO-PASS
Steins;Gate: Egoistic Poriomania
When They Cry Kai
Steins;Gate 0: 23β- Divide by Zero
Kaiji - Ultimate Survivor
The Promised Neverland
Kaiji - Against All Rules
Black Lagoon: The Second Barrage
Puella Magi Madoka Magica the Movie Part I: Beginnings
Gankutsuou: The Count of Monte Cristo
Terror in Resonance
Code Geass: Lelouch of the Rebellion I - Initiation
the Garden of sinners Chapter 3: ever cry, never life. (Remaining Sense of Pain)
Black Lagoon
Black Lagoon: Roberta's Blood Trail
Paprika
Phantom: Requiem for 

  
  
  
## Tags


---

In [41]:
tags_df = pd.DataFrame(list(media_df['tags']), index=media_df.index)
tags_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,"{'name': 'Dystopian', 'rank': 92, 'category': ...","{'name': 'Boys' Love', 'rank': 70, 'category':...","{'name': 'Shoujo', 'rank': 70, 'category': 'De...","{'name': 'Cyberpunk', 'rank': 66, 'category': ...","{'name': 'Crossdressing', 'rank': 50, 'categor...","{'name': 'Yandere', 'rank': 44, 'category': 'C...",,,,,...,,,,,,,,,,
99726,"{'name': 'Video Games', 'rank': 95, 'category'...","{'name': 'Virtual World', 'rank': 92, 'categor...","{'name': 'Hikikomori', 'rank': 88, 'category':...","{'name': 'Primarily Adult Cast', 'rank': 83, '...","{'name': 'Female Protagonist', 'rank': 80, 'ca...","{'name': 'Parody', 'rank': 70, 'category': 'Th...","{'name': 'Gender Bending', 'rank': 55, 'catego...","{'name': 'Reverse Harem', 'rank': 20, 'categor...",,,...,,,,,,,,,,
98526,"{'name': 'Real Robot', 'rank': 60, 'category':...","{'name': 'Robots', 'rank': 60, 'category': 'Ca...","{'name': 'College', 'rank': 20, 'category': 'S...",,,,,,,,...,,,,,,,,,,
966,"{'name': 'Episodic', 'rank': 85, 'category': '...","{'name': 'Family Life', 'rank': 73, 'category'...","{'name': 'Kids', 'rank': 72, 'category': 'Demo...","{'name': 'School', 'rank': 66, 'category': 'Se...","{'name': 'Male Protagonist', 'rank': 60, 'cate...","{'name': 'Shounen', 'rank': 52, 'category': 'D...",,,,,...,,,,,,,,,,
4876,"{'name': 'Classic Literature', 'rank': 70, 'ca...","{'name': 'Historical', 'rank': 70, 'category':...","{'name': 'Male Protagonist', 'rank': 66, 'cate...","{'name': 'Foreign', 'rank': 66, 'category': 'S...","{'name': 'Politics', 'rank': 60, 'category': '...","{'name': 'Tragedy', 'rank': 20, 'category': 'T...","{'name': 'Swordplay', 'rank': 20, 'category': ...","{'name': 'Gore', 'rank': 20, 'category': 'Them...",,,...,,,,,,,,,,


In [42]:
# just about every tag falls into a larger category.
tag_categories = {}
for idx, row in tags_df.iterrows():
    for tag in row:
        if tag == None:
            continue
        tag_categories.setdefault(tag['category'], set()).add(tag['name'])

In [43]:
pp.pprint(tag_categories)

{'': {'Reformation'},
 'Cast-Main Cast': {'Anti-Hero',
                    'Ensemble Cast',
                    'Female Protagonist',
                    'Male Protagonist',
                    'Office Lady',
                    'Primarily Adult Cast',
                    'Primarily Child Cast',
                    'Primarily Female Cast',
                    'Primarily Male Cast'},
 'Cast-Traits': {'Age Regression',
                 'Agender',
                 'Aliens',
                 'Amnesia',
                 'Artificial Intelligence',
                 'Asexual',
                 'Centaur',
                 'Chuunibyou',
                 'Cosplay',
                 'Crossdressing',
                 'Delinquents',
                 'Demons',
                 'Detective',
                 'Dinosaurs',
                 'Dissociative Identities',
                 'Dragons',
                 'Dullahan',
                 'Elf',
                 'Ghost',
                 'Goblin',
      

Nothing is done with these categories for now but they are informative.

In [44]:
tags_df.head(3)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,"{'name': 'Dystopian', 'rank': 92, 'category': ...","{'name': 'Boys' Love', 'rank': 70, 'category':...","{'name': 'Shoujo', 'rank': 70, 'category': 'De...","{'name': 'Cyberpunk', 'rank': 66, 'category': ...","{'name': 'Crossdressing', 'rank': 50, 'categor...","{'name': 'Yandere', 'rank': 44, 'category': 'C...",,,,,...,,,,,,,,,,
99726,"{'name': 'Video Games', 'rank': 95, 'category'...","{'name': 'Virtual World', 'rank': 92, 'categor...","{'name': 'Hikikomori', 'rank': 88, 'category':...","{'name': 'Primarily Adult Cast', 'rank': 83, '...","{'name': 'Female Protagonist', 'rank': 80, 'ca...","{'name': 'Parody', 'rank': 70, 'category': 'Th...","{'name': 'Gender Bending', 'rank': 55, 'catego...","{'name': 'Reverse Harem', 'rank': 20, 'categor...",,,...,,,,,,,,,,
98526,"{'name': 'Real Robot', 'rank': 60, 'category':...","{'name': 'Robots', 'rank': 60, 'category': 'Ca...","{'name': 'College', 'rank': 20, 'category': 'S...",,,,,,,,...,,,,,,,,,,


In [45]:
def clean_tag(tag):
    result = tag
    return result.replace("'", '').replace(' ', '_').replace('-', '_').lower()

In [46]:
all_shows_tags = []
for idx, row in tags_df.iterrows():
    show_tags_dict = {idx: {}}
    for tag in row:
        if tag != None:
            show_tags_dict[idx][clean_tag(tag['name'])] = (tag['rank'] / 100)
    all_shows_tags.append(show_tags_dict)

In [47]:
all_shows_tags

[{10161: {'dystopian': 0.92,
   'boys_love': 0.7,
   'shoujo': 0.7,
   'cyberpunk': 0.66,
   'crossdressing': 0.5,
   'yandere': 0.44}},
 {99726: {'video_games': 0.95,
   'virtual_world': 0.92,
   'hikikomori': 0.88,
   'primarily_adult_cast': 0.83,
   'female_protagonist': 0.8,
   'parody': 0.7,
   'gender_bending': 0.55,
   'reverse_harem': 0.2}},
 {98526: {'real_robot': 0.6, 'robots': 0.6, 'college': 0.2}},
 {966: {'episodic': 0.85,
   'family_life': 0.73,
   'kids': 0.72,
   'school': 0.66,
   'male_protagonist': 0.6,
   'shounen': 0.52}},
 {4876: {'classic_literature': 0.7,
   'historical': 0.7,
   'male_protagonist': 0.66,
   'foreign': 0.66,
   'politics': 0.6,
   'tragedy': 0.2,
   'swordplay': 0.2,
   'gore': 0.2}},
 {5114: {'conspiracy': 0.93,
   'tragedy': 0.92,
   'war': 0.92,
   'military': 0.91,
   'magic': 0.85,
   'politics': 0.84,
   'shounen': 0.81,
   'male_protagonist': 0.75,
   'philosophy': 0.73,
   'steampunk': 0.69,
   'foreign': 0.63,
   'lost_civilization': 0.

In [None]:
# generate set of all tags
all_tags_set = set()
for anime in all_shows_tags:
    for _id, d in anime.items():
        for tag, rank in d.items():
            all_tags_set.add(tag)

In [72]:
sorted(list(all_tags_set))

['4_koma',
 'achronological_order',
 'acting',
 'advertisement',
 'afterlife',
 'age_gap',
 'age_regression',
 'agender',
 'airsoft',
 'aliens',
 'alternate_universe',
 'american_football',
 'amnesia',
 'anachronism',
 'animals',
 'anthology',
 'anti_hero',
 'archery',
 'artificial_intelligence',
 'asexual',
 'assassins',
 'astronomy',
 'athletics',
 'augmented_reality',
 'autobiographical',
 'aviation',
 'badminton',
 'band',
 'bar',
 'baseball',
 'basketball',
 'battle_royale',
 'biographical',
 'bisexual',
 'body_horror',
 'body_swapping',
 'bondage',
 'boxing',
 'boys_love',
 'bullying',
 'calligraphy',
 'card_battle',
 'cars',
 'centaur',
 'cgi',
 'cheerleading',
 'chibi',
 'chuunibyou',
 'circus',
 'classic_literature',
 'college',
 'coming_of_age',
 'conspiracy',
 'cosmic_horror',
 'cosplay',
 'crime',
 'crossdressing',
 'crossover',
 'cult',
 'cultivation',
 'cute_girls_doing_cute_things',
 'cyberpunk',
 'cycling',
 'dancing',
 'death_game',
 'delinquents',
 'demons',
 'denpa',

In [73]:
# populate series with rank in same location of tag
# add series to dataframe
tag_rank_df = pd.DataFrame(columns=sorted(list(all_tags_set)))
for anime_dict in all_shows_tags:
    for _id, d in anime_dict.items():
        tag_rank_series = pd.Series(np.zeros(len(all_tags_set)), index=sorted(list(all_tags_set)), name=_id)
        for tag, rank in d.items():
            tag_rank_series.update(pd.Series([rank], index=[tag]))
        tag_rank_df = tag_rank_df.append(tag_rank_series)

In [74]:
tag_rank_df.head()

Unnamed: 0,4_koma,achronological_order,acting,advertisement,afterlife,age_gap,age_regression,agender,airsoft,aliens,...,witch,work,wrestling,writing,wuxia,yakuza,yandere,youkai,yuri,zombie
10161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.44,0.0,0.0,0.0
99726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
tag_rank_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9832 entries, 10161 to 101089
Columns: 247 entries, aliens to fashion
dtypes: float64(247)
memory usage: 18.6 MB


In [45]:
def filter_tags(df, threshold=60):
    '''Takes in a dataframe where each row is a show
    and each cell is a dictionary with tag name, rank, and 
    category. 
    
    Function filters out tags with a rank less than a threshold.
    
    Returns a series of lists. 
    '''
    keep_tags = []
    keep_tags_idxs = []
    for idx, row in tags_df.iloc[:,:].iterrows():
        t = []
        for tag in row:
            if tag == None:
                continue # skip
            else:
                if tag['rank'] >= threshold:
                    t.append(tag['name'])
                else:
                    continue # ignore
        keep_tags.append(t)
        keep_tags_idxs.append(idx)

    result = pd.DataFrame(keep_tags, index=keep_tags_idxs)
    result['tags'] = (result.values.tolist())
    
    return result['tags']



In [57]:
tags_df2 = pd.DataFrame(filter_tags(tags_df, threshold=0))

In [58]:
tags_df2.head()

Unnamed: 0,tags
10161,"[Dystopian, Boys' Love, Shoujo, Cyberpunk, Cro..."
99726,"[Video Games, Virtual World, Hikikomori, Prima..."
98526,"[Real Robot, Robots, College, None, None, None..."
966,"[Episodic, Family Life, Kids, School, Male Pro..."
4876,"[Classic Literature, Historical, Male Protagon..."


In [59]:
nones_removed = []
for idx, row in tags_df2.iterrows():
    row = [tag for tag in row[0] if tag]
    nones_removed.append(row)
tags_df2['tags'] = nones_removed

In the process of converting a sparse matrix into a series of lists, Nones were retained. They need to be removed. Eventually this functionality should be added to the function filter_tags.

In [60]:
tags_df2.head()

Unnamed: 0,tags
10161,"[Dystopian, Boys' Love, Shoujo, Cyberpunk, Cro..."
99726,"[Video Games, Virtual World, Hikikomori, Prima..."
98526,"[Real Robot, Robots, College]"
966,"[Episodic, Family Life, Kids, School, Male Pro..."
4876,"[Classic Literature, Historical, Male Protagon..."


In [61]:
clean_column_of_lists(tags_df2, 'tags')

In [62]:
tags_df2

Unnamed: 0,tags
10161,"[dystopian, boys_love, shoujo, cyberpunk, cros..."
99726,"[video_games, virtual_world, hikikomori, prima..."
98526,"[real_robot, robots, college]"
966,"[episodic, family_life, kids, school, male_pro..."
4876,"[classic_literature, historical, male_protagon..."
...,...
99586,"[volleyball, primarily_female_cast, female_pro..."
99916,"[yuri, coming_of_age, female_protagonist, scho..."
101283,"[urban_fantasy, philosophy, achronological_ord..."
101633,"[band, full_cgi, primarily_female_cast, school..."


In [85]:
# vectorizer = CountVectorizer(stop_words=None)
# tag_item_matrix = pd.DataFrame(vectorizer.fit_transform([' '.join(row) for row in tags_df2['tags']]).todense(), 
#                                   columns=vectorizer.get_feature_names(), 
#                                   index=media_df.index)

In [86]:
tag_item_matrix

Unnamed: 0_level_0,4_koma,achronological_order,acting,advertisement,afterlife,age_gap,age_regression,agender,airsoft,aliens,...,witch,work,wrestling,writing,wuxia,yakuza,yandere,youkai,yuri,zombie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
99726,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99916,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
101283,0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
101633,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


This notebook is primarily for data cleaning. Let's export content item matrix to a csv to be brought into another file.

In [65]:
# tag_item_matrix.to_csv('data/tag_item_matrix.csv')




## Staff


---

In [None]:
staff_df = pd.DataFrame(media_df['staff'])
staff_df.head()

In [None]:
staff_df = staff_df['staff'].apply(pd.Series)

In [None]:
staff_df = staff_df['edges'].apply(pd.Series)
staff_df.head(3)




## Studios


---

In [None]:
studios_df = pd.DataFrame(media_df['studios'])
studios_df.head()

In [None]:
studios_df = studios_df['studios'].apply(pd.Series)
studios_df.head(3)

In [None]:
studios_df = studios_df['edges'].apply(pd.Series)
studios_df.head(3)

  


## Characters and Voice Actors  
  
  
---  

In [None]:
characters_df = pd.DataFrame(media_df['characters'])
characters_df.head()

In [None]:
characters_df = characters_df['characters'].apply(pd.Series)
characters_df.head()

In [None]:
characters_df = characters_df['edges'].apply(pd.Series)
characters_df.head()