In [1]:
from characters_real_names import *

import re
import spacy
from spacy.tokens import Span

# Load the spaCy pipeline named "en_core_web_sm"; every later cell that calls
# `nlp(...)` uses this object.
nlp = spacy.load("en_core_web_sm")

# Used to infer gender and to distinguish people who share the same family name.

def get_person_title(span):
    """Return the honorific that directly precedes a PERSON span.

    Args:
        span: object exposing ``label_``, ``start`` (token index of the span's
            first token) and ``doc`` (indexable by token position, yielding
            objects with a ``text`` attribute).

    Returns:
        The preceding token's text followed by a single space (for example
        ``"Mrs. "``) when the span is labelled ``PERSON``, does not start at
        token 0, and the preceding token is one of ``Mr``, ``Mr.``, ``Mrs``,
        ``Mrs.``. Otherwise the empty string.
    """
    recognised_titles = ("Mr", "Mr.", "Mrs", "Mrs.")

    # Nothing to report for non-person spans or spans with no preceding token.
    if span.label_ != "PERSON" or span.start == 0:
        return ""

    preceding_text = span.doc[span.start - 1].text
    return preceding_text + " " if preceding_text in recognised_titles else ""

# Expose the title lookup as `span._.person_title`. `force=True` overwrites an
# existing registration, so re-running this cell does not fail.
Span.set_extension("person_title", getter=get_person_title, force=True)

# Read the whole book; the context manager closes the file handle as soon as the
# text is in memory (the previous bare open(...).read() left it open).
with open("../Homework 1/Harry Potter and the Sorcerer.txt") as book_file:
    book = book_file.read()

# Split on "CHAPTER <word>" headings and drop element 0, i.e. whatever precedes
# the first heading.
# NOTE(review): with IGNORECASE a lowercase "chapter <word>" inside running text
# would also act as a split point -- confirm the book text contains none.
book_chapters = re.split(r'CHAPTER [\w+]+', book, flags=re.IGNORECASE)[1:]
# Discard "THE END" and anything after it in the final chapter.
book_chapters[-1] = book_chapters[-1].split("THE END")[0]


In [2]:
# Run the pipeline once over the whole book (chapters concatenated with no
# separator), then collect every detected sentence with surrounding whitespace
# trimmed.
docs = nlp("".join(book_chapters))

all_sentences = []
for sent in docs.sents:
    all_sentences.append(sent.text.strip())

In [3]:
# Sanity check: display the first extracted sentence.
all_sentences[0]

'Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.'

In [8]:
from tqdm import tqdm

def _substitute_real_names(sentence, entities, real_names):
    """Replace each recognised person mention in `sentence` with its canonical name.

    Args:
        sentence: the sentence text to rewrite.
        entities: list of dicts with at least the keys ``"title"`` (an honorific
            plus trailing space, or ``""``) and ``"name"`` (the mention text).
        real_names: mapping of canonical name -> aliases; only ``in`` is used on
            the values (presumably a collection of alias strings -- confirm
            against `characters_real_names`).

    Returns:
        The sentence with every matched mention replaced by the canonical name
        written with underscores instead of spaces. Unmatched sentences are
        returned unchanged.
    """
    updated = sentence
    for entity in entities:
        name = entity["name"]

        # Already a canonical name (underscore form): leave it alone.
        if "_".join(name.split(" ")) in real_names:
            continue

        titled_name = entity["title"] + name
        canonical = None
        for real, aliases in real_names.items():
            if titled_name in aliases:
                # No break on purpose: as before, the last matching entry wins.
                canonical = "_".join(real.split(" "))
        if canonical is None:
            continue

        # Escape the mention so regex metacharacters in it are taken literally,
        # and require non-word characters on both sides so that longer words and
        # already-substituted tokens such as "Dudley_Dursley" are not rewritten.
        pattern = r"(?<!\w)" + re.escape(name) + r"(?!\w)"
        # Substitute into the running result (not the original sentence) so that
        # replacements for several entities in one sentence all survive. A
        # callable replacement keeps backslashes in the name from being parsed.
        updated = re.sub(pattern, lambda _match: canonical, updated)
    return updated


# 7267 iterations
# tqdm wraps the list itself (not the enumerate object) so the bar knows the total.
for i, sentence in enumerate(tqdm(all_sentences)):
    doc = nlp(sentence)
    entities = [
        dict(title=ent._.person_title, name=ent.text, position=(ent.start, ent.end))
        for ent in doc.ents
        if ent.label_ == "PERSON"
    ]
    # In-place update: later cells read the rewritten sentences from all_sentences.
    all_sentences[i] = _substitute_real_names(sentence, entities, all_real_names)

7267it [01:00, 120.34it/s]


In [10]:
# Preview the first ten sentences after name substitution.
all_sentences[:10]

['Mr. and Mrs. Petunia_Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.',
 "They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.",
 'Mr. Vernon_Dursley was the director of a firm called Grunnings, which made drills.',
 'He was a big, beefy man with hardly any neck, although he did have a very large mustache.',
 'Mrs. Petunia_Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.',
 'The Dursleys_Family had a small son called Dudley_Dursley and in their opinion there was no finer boy anywhere.',
 'The Dursleys_Family had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it.',
 "They didn't think they could bear it if anyone found out about the Potters.",

In [31]:
from gensim.parsing.preprocessing import preprocess_string

from string import punctuation
from gensim.parsing.preprocessing import strip_tags     
from gensim.parsing.preprocessing import strip_short      
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum

# Filters handed to gensim's preprocess_string. The first one lowercases the text
# before calling strip_short with a minimum size of 1; one-letter tokens such as
# "a" still appear in the result.
CUSTOM_FILTERS = [lambda x: strip_short(x.lower(), 1), strip_multiple_whitespaces, strip_tags, strip_non_alphanum]

# Delete all ASCII punctuation except "_", which joins the parts of substituted
# character names (e.g. "Petunia_Dursley") and must survive tokenisation.
punctuation_to_remove = punctuation.replace("_", "")
# Build the translation table once; it is the same for every sentence.
punctuation_table = str.maketrans('', '', punctuation_to_remove)

all_sentences_preprocessed = []
for sent in tqdm(all_sentences):
    sent = sent.translate(punctuation_table)
    all_sentences_preprocessed.append(preprocess_string(sent, CUSTOM_FILTERS))

100%|██████████| 7267/7267 [00:00<00:00, 29114.76it/s]


In [32]:
# Preview only a handful of tokenised sentences; showing 100 of them buries the
# rest of the notebook under output.
all_sentences_preprocessed[:5]

[['mr',
  'and',
  'mrs',
  'petunia_dursley',
  'of',
  'number',
  'four',
  'privet',
  'drive',
  'were',
  'proud',
  'to',
  'say',
  'that',
  'they',
  'were',
  'perfectly',
  'normal',
  'thank',
  'you',
  'very',
  'much'],
 ['they',
  'were',
  'the',
  'last',
  'people',
  'youd',
  'expect',
  'to',
  'be',
  'involved',
  'in',
  'anything',
  'strange',
  'or',
  'mysterious',
  'because',
  'they',
  'just',
  'didnt',
  'hold',
  'with',
  'such',
  'nonsense'],
 ['mr',
  'vernon_dursley',
  'was',
  'the',
  'director',
  'of',
  'a',
  'firm',
  'called',
  'grunnings',
  'which',
  'made',
  'drills'],
 ['he',
  'was',
  'a',
  'big',
  'beefy',
  'man',
  'with',
  'hardly',
  'any',
  'neck',
  'although',
  'he',
  'did',
  'have',
  'a',
  'very',
  'large',
  'mustache'],
 ['mrs',
  'petunia_dursley',
  'was',
  'thin',
  'and',
  'blonde',
  'and',
  'had',
  'nearly',
  'twice',
  'the',
  'usual',
  'amount',
  'of',
  'neck',
  'which',
  'came',
  'in',