# Tokenizing

In [27]:
import utils.json_utils as jsu

In [28]:
corpus = jsu.read_json("8_coreference_resolution_black_clover.json")

In [29]:
characters = jsu.read_json('3_characters_black_clover.json')

### Tokenize

In [30]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/pauli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
from nltk import word_tokenize
from nltk import sent_tokenize

from nltk.corpus import stopwords
from nltk.corpus import wordnet

In [32]:
stoplist = stopwords.words("english")

In [33]:
tokenized_sentences = list()
for doc in corpus:
    doc_sentences = sent_tokenize(doc)
    for sentence in doc_sentences:
        tokenized_sentences.append([])
        words = word_tokenize(sentence)
        for word in words:
            tokenized_sentences[-1].append(word)
tokenized_sentences

[['In',
  'Hage',
  ',',
  'a',
  'priest',
  'finds',
  'two',
  'babies',
  'abandoned',
  'outside',
  'a',
  'priest',
  'church',
  '.'],
 ['a',
  'priest',
  'takes',
  'two',
  'babies',
  'abandoned',
  'outside',
  'his',
  'church',
  'inside',
  'and',
  'discovers',
  'two',
  'babies',
  'abandoned',
  'outside',
  'his',
  'church',
  'names',
  'to',
  'be',
  'Yuno',
  'and',
  'Asta',
  '.'],
 ['Fifteen',
  'years',
  'later',
  ',',
  'Asta',
  'proposes',
  'to',
  'Sister',
  'Lily',
  ',',
  'who',
  'refuses',
  'repeatedly',
  '.'],
 ['Yuno',
  'and',
  'the',
  'other',
  'orphans',
  'criticize',
  'Asta',
  'and',
  'point',
  'out',
  'Yuno',
  'lack',
  'of',
  'magic',
  '.'],
 ['Asta',
  'tries',
  'to',
  'show',
  'off',
  'Asta',
  'skills',
  ',',
  'but',
  'Yuno',
  'outshines',
  'Asta',
  'with',
  'Asta',
  'magic',
  '.'],
 ['Later',
  ',',
  'at',
  'the',
  'Grimoire',
  'Acceptance',
  'Ceremony',
  ',',
  'a',
  'pair',
  'of',
  'nobles',
  

In [34]:
jsu.write_json(tokenized_sentences, "9_raw_tokenized_sentences_black_clover.json")

### Tag Tokens

In [35]:
pos_tag2lemmatize = {
    'NN' : wordnet.NOUN,
    'VB' : wordnet.VERB,
    'JJ' : wordnet.ADJ,
    'RB' : wordnet.ADV
}

In [36]:
tagged_tokenized_sentences = [nltk.pos_tag(tokenized_sentence) for tokenized_sentence in tokenized_sentences]
tagged_tokenized_sentences

[[('In', 'IN'),
  ('Hage', 'NNP'),
  (',', ','),
  ('a', 'DT'),
  ('priest', 'JJ'),
  ('finds', 'VBZ'),
  ('two', 'CD'),
  ('babies', 'NNS'),
  ('abandoned', 'VBD'),
  ('outside', 'IN'),
  ('a', 'DT'),
  ('priest', 'JJ'),
  ('church', 'NN'),
  ('.', '.')],
 [('a', 'DT'),
  ('priest', 'JJ'),
  ('takes', 'VBZ'),
  ('two', 'CD'),
  ('babies', 'NNS'),
  ('abandoned', 'VBD'),
  ('outside', 'IN'),
  ('his', 'PRP$'),
  ('church', 'NN'),
  ('inside', 'NN'),
  ('and', 'CC'),
  ('discovers', 'NNS'),
  ('two', 'CD'),
  ('babies', 'NNS'),
  ('abandoned', 'VBD'),
  ('outside', 'IN'),
  ('his', 'PRP$'),
  ('church', 'NN'),
  ('names', 'NNS'),
  ('to', 'TO'),
  ('be', 'VB'),
  ('Yuno', 'NNP'),
  ('and', 'CC'),
  ('Asta', 'NNP'),
  ('.', '.')],
 [('Fifteen', 'CD'),
  ('years', 'NNS'),
  ('later', 'RB'),
  (',', ','),
  ('Asta', 'NNP'),
  ('proposes', 'VBZ'),
  ('to', 'TO'),
  ('Sister', 'NNP'),
  ('Lily', 'NNP'),
  (',', ','),
  ('who', 'WP'),
  ('refuses', 'VBZ'),
  ('repeatedly', 'RB'),
  ('.', '.')

In [37]:
tagged2lemmatize_tokenized_sentences = [[(tagged_token[0], tagged_token[1][:2], tagged_token[1]) for tagged_token in tagged_tokenized_sentence if tagged_token[1][:2] in pos_tag2lemmatize] for tagged_tokenized_sentence in tagged_tokenized_sentences]
tagged2lemmatize_tokenized_sentences

[[('Hage', 'NN', 'NNP'),
  ('priest', 'JJ', 'JJ'),
  ('finds', 'VB', 'VBZ'),
  ('babies', 'NN', 'NNS'),
  ('abandoned', 'VB', 'VBD'),
  ('priest', 'JJ', 'JJ'),
  ('church', 'NN', 'NN')],
 [('priest', 'JJ', 'JJ'),
  ('takes', 'VB', 'VBZ'),
  ('babies', 'NN', 'NNS'),
  ('abandoned', 'VB', 'VBD'),
  ('church', 'NN', 'NN'),
  ('inside', 'NN', 'NN'),
  ('discovers', 'NN', 'NNS'),
  ('babies', 'NN', 'NNS'),
  ('abandoned', 'VB', 'VBD'),
  ('church', 'NN', 'NN'),
  ('names', 'NN', 'NNS'),
  ('be', 'VB', 'VB'),
  ('Yuno', 'NN', 'NNP'),
  ('Asta', 'NN', 'NNP')],
 [('years', 'NN', 'NNS'),
  ('later', 'RB', 'RB'),
  ('Asta', 'NN', 'NNP'),
  ('proposes', 'VB', 'VBZ'),
  ('Sister', 'NN', 'NNP'),
  ('Lily', 'NN', 'NNP'),
  ('refuses', 'VB', 'VBZ'),
  ('repeatedly', 'RB', 'RB')],
 [('Yuno', 'NN', 'NN'),
  ('other', 'JJ', 'JJ'),
  ('orphans', 'NN', 'NNS'),
  ('criticize', 'VB', 'VBP'),
  ('Asta', 'NN', 'NNP'),
  ('point', 'VB', 'VB'),
  ('Yuno', 'NN', 'NNP'),
  ('lack', 'NN', 'NN'),
  ('magic', 'NN', 

### Lemmatize and get sentence subjects

In [38]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

In [39]:
def lemmatize(tagged2lemmatize_token):
    return lemmatizer.lemmatize(tagged2lemmatize_token[0], pos=pos_tag2lemmatize[tagged2lemmatize_token[1]])

In [40]:
def get_subjects_in_sent(tokenized_sent):
    return set(map(lambda x : x[0], filter(lambda x: x[2] == 'NNP' and x[0] in characters, tokenized_sent)))

In [41]:
aux = list()
for tagged2lemmatize_tokenized_sentence in tagged2lemmatize_tokenized_sentences:
    subjects = list(get_subjects_in_sent(tagged2lemmatize_tokenized_sentence))
    aux.append({
        "subjects": subjects, 
        "tokens" : []
    })
    for tagged2lemmatize_token in tagged2lemmatize_tokenized_sentence:
        if tagged2lemmatize_token[0] in characters:
            aux[-1]["tokens"].append(tagged2lemmatize_token[0])
        elif tagged2lemmatize_token[0] not in stoplist and tagged2lemmatize_token[0].isalpha():
            lemmatized_token = lemmatize((tagged2lemmatize_token[0].lower(), tagged2lemmatize_token[1]))
            if lemmatized_token not in stoplist:
                aux[-1]["tokens"].append(lemmatized_token)
lemmatizedWsubj_sentences = aux
lemmatizedWsubj_sentences

[{'subjects': [],
  'tokens': ['hage', 'priest', 'find', 'baby', 'abandon', 'priest', 'church']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['priest',
   'take',
   'baby',
   'abandon',
   'church',
   'inside',
   'discovers',
   'baby',
   'abandon',
   'church',
   'name',
   'Yuno',
   'Asta']},
 {'subjects': ['Lily', 'Asta'],
  'tokens': ['year',
   'later',
   'Asta',
   'propose',
   'sister',
   'Lily',
   'refuse',
   'repeatedly']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Yuno',
   'orphan',
   'criticize',
   'Asta',
   'point',
   'Yuno',
   'lack',
   'magic']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Asta',
   'try',
   'show',
   'Asta',
   'skill',
   'Yuno',
   'outshine',
   'Asta',
   'Asta',
   'magic']},
 {'subjects': [],
  'tokens': ['later',
   'grimoire',
   'acceptance',
   'ceremony',
   'pair',
   'noble',
   'criticize',
   'commoner']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Asta',
   'receive',
   'grimoire',
   'Yuno',
   'attain',
 

## Non-Lemmatized Alternative

In [42]:
not2lemmatize_tagged_tokenized_sentence = [[(tagged_token[0], tagged_token[1], tagged_token[1]) for tagged_token in tagged_tokenized_sentence] for tagged_tokenized_sentence in tagged_tokenized_sentences ]
not2lemmatize_tagged_tokenized_sentence

[[('In', 'IN', 'IN'),
  ('Hage', 'NNP', 'NNP'),
  (',', ',', ','),
  ('a', 'DT', 'DT'),
  ('priest', 'JJ', 'JJ'),
  ('finds', 'VBZ', 'VBZ'),
  ('two', 'CD', 'CD'),
  ('babies', 'NNS', 'NNS'),
  ('abandoned', 'VBD', 'VBD'),
  ('outside', 'IN', 'IN'),
  ('a', 'DT', 'DT'),
  ('priest', 'JJ', 'JJ'),
  ('church', 'NN', 'NN'),
  ('.', '.', '.')],
 [('a', 'DT', 'DT'),
  ('priest', 'JJ', 'JJ'),
  ('takes', 'VBZ', 'VBZ'),
  ('two', 'CD', 'CD'),
  ('babies', 'NNS', 'NNS'),
  ('abandoned', 'VBD', 'VBD'),
  ('outside', 'IN', 'IN'),
  ('his', 'PRP$', 'PRP$'),
  ('church', 'NN', 'NN'),
  ('inside', 'NN', 'NN'),
  ('and', 'CC', 'CC'),
  ('discovers', 'NNS', 'NNS'),
  ('two', 'CD', 'CD'),
  ('babies', 'NNS', 'NNS'),
  ('abandoned', 'VBD', 'VBD'),
  ('outside', 'IN', 'IN'),
  ('his', 'PRP$', 'PRP$'),
  ('church', 'NN', 'NN'),
  ('names', 'NNS', 'NNS'),
  ('to', 'TO', 'TO'),
  ('be', 'VB', 'VB'),
  ('Yuno', 'NNP', 'NNP'),
  ('and', 'CC', 'CC'),
  ('Asta', 'NNP', 'NNP'),
  ('.', '.', '.')],
 [('Fifteen',

In [43]:
aux = list()
for tagged_tokenized_sentence in not2lemmatize_tagged_tokenized_sentence:
    subjects = list(get_subjects_in_sent(tagged_tokenized_sentence))
    aux.append({
        "subjects": subjects, 
        "tokens" : []
    })
    for tagged_token in tagged_tokenized_sentence:
        aux[-1]["tokens"].append(tagged_token[0])
nonLemmatizedWsubj_sentences = aux
nonLemmatizedWsubj_sentences

[{'subjects': [],
  'tokens': ['In',
   'Hage',
   ',',
   'a',
   'priest',
   'finds',
   'two',
   'babies',
   'abandoned',
   'outside',
   'a',
   'priest',
   'church',
   '.']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Lily', 'Asta'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   ',',
   'who',
   'refuses',
   'repeatedly',
   '.']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Yuno',
   'and',
   'the',
   'other',
   'orphans',
   'criticize',
   'Asta',
   'and',
   'point',
   'out',
   'Yuno',
   'lack',
   'of',
   'magic',
   '.']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Asta',
   'tr

### Remove sentences with no subject

In [44]:
lemmatizedWsubj_sentences = [lemmatizedWsubj_sentence for lemmatizedWsubj_sentence in lemmatizedWsubj_sentences if len(lemmatizedWsubj_sentence["subjects"]) != 0]
lemmatizedWsubj_sentences

[{'subjects': ['Yuno', 'Asta'],
  'tokens': ['priest',
   'take',
   'baby',
   'abandon',
   'church',
   'inside',
   'discovers',
   'baby',
   'abandon',
   'church',
   'name',
   'Yuno',
   'Asta']},
 {'subjects': ['Lily', 'Asta'],
  'tokens': ['year',
   'later',
   'Asta',
   'propose',
   'sister',
   'Lily',
   'refuse',
   'repeatedly']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Yuno',
   'orphan',
   'criticize',
   'Asta',
   'point',
   'Yuno',
   'lack',
   'magic']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Asta',
   'try',
   'show',
   'Asta',
   'skill',
   'Yuno',
   'outshine',
   'Asta',
   'Asta',
   'magic']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Asta',
   'receive',
   'grimoire',
   'Yuno',
   'attain',
   'clover',
   'Asta',
   'challenge',
   'Yuno',
   'title',
   'wizard',
   'king',
   'Yuno',
   'ignore',
   'Asta']},
 {'subjects': ['Yuno'],
  'tokens': ['grimoire',
   'acceptance',
   'ceremony',
   'noble',
   'Yuno',
   'tower',
  

In [45]:
nonLemmatizedWsubj_sentences = [nonLemmatizedWsubj_sentence for nonLemmatizedWsubj_sentence in nonLemmatizedWsubj_sentences if len(nonLemmatizedWsubj_sentence["subjects"]) != 0]
nonLemmatizedWsubj_sentences

[{'subjects': ['Yuno', 'Asta'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Lily', 'Asta'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   ',',
   'who',
   'refuses',
   'repeatedly',
   '.']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Yuno',
   'and',
   'the',
   'other',
   'orphans',
   'criticize',
   'Asta',
   'and',
   'point',
   'out',
   'Yuno',
   'lack',
   'of',
   'magic',
   '.']},
 {'subjects': ['Yuno', 'Asta'],
  'tokens': ['Asta',
   'tries',
   'to',
   'show',
   'off',
   'Asta',
   'skills',
   ',',
   'but',
   'Yuno',
   'outshines',
   'Asta',
   'with',
   'Asta',
   'magic',
   '.']},
 {'subjects': ['Yuno', 'As

### Repeat sentences for every subject

In [46]:
aux = []
for lemmatizedWsubj_sentence in lemmatizedWsubj_sentences:
    for subject in lemmatizedWsubj_sentence["subjects"]:
        aux.append({
            "subjects" : [subject],
            "tokens" : lemmatizedWsubj_sentence["tokens"]
        })
lemmatizedWsubj_sentences = aux
lemmatizedWsubj_sentences

[{'subjects': ['Yuno'],
  'tokens': ['priest',
   'take',
   'baby',
   'abandon',
   'church',
   'inside',
   'discovers',
   'baby',
   'abandon',
   'church',
   'name',
   'Yuno',
   'Asta']},
 {'subjects': ['Asta'],
  'tokens': ['priest',
   'take',
   'baby',
   'abandon',
   'church',
   'inside',
   'discovers',
   'baby',
   'abandon',
   'church',
   'name',
   'Yuno',
   'Asta']},
 {'subjects': ['Lily'],
  'tokens': ['year',
   'later',
   'Asta',
   'propose',
   'sister',
   'Lily',
   'refuse',
   'repeatedly']},
 {'subjects': ['Asta'],
  'tokens': ['year',
   'later',
   'Asta',
   'propose',
   'sister',
   'Lily',
   'refuse',
   'repeatedly']},
 {'subjects': ['Yuno'],
  'tokens': ['Yuno',
   'orphan',
   'criticize',
   'Asta',
   'point',
   'Yuno',
   'lack',
   'magic']},
 {'subjects': ['Asta'],
  'tokens': ['Yuno',
   'orphan',
   'criticize',
   'Asta',
   'point',
   'Yuno',
   'lack',
   'magic']},
 {'subjects': ['Yuno'],
  'tokens': ['Asta',
   'try',
   'sho

In [47]:
aux = []
for nonLemmatizedWsubj_sentence in nonLemmatizedWsubj_sentences:
    for subject in nonLemmatizedWsubj_sentence["subjects"]:
        aux.append({
            "subjects" : [subject],
            "tokens" : nonLemmatizedWsubj_sentence["tokens"]
        })
nonLemmatizedWsubj_sentences = aux
nonLemmatizedWsubj_sentences

[{'subjects': ['Yuno'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Asta'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Lily'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   ',',
   'who',
   'refuses',
   'repeatedly',
   '.']},
 {'subjects': ['Asta'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   

In [48]:
jsu.write_json(lemmatizedWsubj_sentences, "9_tokenized_sentences_black_clover.json")
jsu.write_json(nonLemmatizedWsubj_sentences, "9_non_lemmatized_tokenized_sentences_black_clover.json")

### Vocabulary

In [49]:
lemmatized_tokens = list(set(token for lemmatizedWsubj_sentence in lemmatizedWsubj_sentences for token in lemmatizedWsubj_sentence["tokens"]))
lemmatized_tokens

['guess',
 'fall',
 'lose',
 'never',
 'guardian',
 'everything',
 'marry',
 'ground',
 'wound',
 'shoulder',
 'individual',
 'sick',
 'shake',
 'bloom',
 'expedition',
 'abra',
 'library',
 'attractive',
 'intrigue',
 'prepared',
 'less',
 'Henry',
 'anything',
 'research',
 'lash',
 'remorsed',
 'properly',
 'limited',
 'uphold',
 'thumb',
 'sun',
 'impatient',
 'incoming',
 'twisted',
 'string',
 'squad',
 'bother',
 'member',
 'relief',
 'mother',
 'horn',
 'regular',
 'candelo',
 'recently',
 'salim',
 'intact',
 'badly',
 'cloth',
 'relocate',
 'ruin',
 'rampaging',
 'feel',
 'barely',
 'elvira',
 'dignity',
 'accept',
 'officially',
 'immediately',
 'feed',
 'terrible',
 'part',
 'letoile',
 'dam',
 'undergo',
 'sens',
 'initially',
 'proud',
 'article',
 'actually',
 'Grey',
 'internally',
 'careful',
 'clads',
 'flying',
 'kamaitachi',
 'belief',
 'recent',
 'lifeless',
 'abari',
 'trip',
 'resonate',
 'third',
 'stand',
 'rubble',
 'way',
 'teardrop',
 'yagos',
 'rejoice',
 '

In [50]:
jsu.write_json(lemmatized_tokens, "9_tokens_black_clover.json")