In [None]:
# !pip install -U pip setuptools wheel
# !pip install -U 'spacy[apple]'
# !pip install spacy

In [None]:
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_md
# !python -m spacy download en_core_web_trf

In [None]:
import pathlib
import re
from tqdm import tqdm
import typing
from pprint import pprint as pp
import json
import spacy
from spacy import displacy
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
import numpy as np

load the spacy model

In [None]:
# shared spaCy pipeline; the quote extractor below reads POS tags and dependency labels from it
nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

# target word: negro

## Identify all books in PG American Literature corpus that contain 'negro'

The American literature corpus corresponds to "en,PS,fiction", which can be found in ~/surfdrive/Data/PG_en_PS_fiction_050204

In [None]:
# corpus directory; '~' is expanded to the user's home directory and the path made absolute
books_dir = pathlib.Path('~/surfdrive/Data/PG_en_PS_fiction_050204').expanduser().resolve()
# sanity check: shows whether the directory is actually present on this machine
books_dir.exists()

first sift

In [None]:
# collect every corpus file whose raw text contains the target word
target_books = []

for fp in tqdm(books_dir.glob('*')):

    # latin1 can decode any byte sequence, so reading never fails on encoding
    book_txt = fp.read_text(encoding='latin1')

    # the leading space keeps the match at a word start (case-sensitive)
    if ' negro' not in book_txt:
        continue

    target_books.append(fp)

display(len(target_books))

In [None]:
# peek at the first few matching paths (print already applies str to its argument)
print(target_books[:5])

## yield (file id, quote, manner, speaker) via spaCy dep parse and pos tags

In [None]:
def between(x,t):
    """Return True when x lies in the closed interval t = (lower, upper), else False."""
    low, high = t
    return bool(x >= low and x <= high)

print(between(3, (1,5)), between(7, (1,5)))

let's keep it simple ...

    * replace quotations with a short placeholder, e.g., 'oh' ... to remove parser complexity
    * assume that each consecutive pair of quote marks subtends a quotation
    * where subtending quote marks share a VERB head, that VERB is assumed to be the manner of speaking the quote
    * where one of the above VERBs is modified by a dobj or nsubj, we assume that modifier is the speaker

In [None]:
def count_quotes(chunk):
    """Return how many quotation marks (straight ", opening “ or closing ”) occur in chunk."""
    return sum(chunk.count(mark) for mark in ('"', '“', '”'))

def yield_type1(text:str, replacement='"#"', graph = False):
    """Yield (quote, manner, speaker) tuples for the quotations found in text.

    A quotation is a span delimited by a pair of straight marks (") or by an opening
    and a closing curly mark (“ ”). Unless replacement is '', every quotation is
    swapped for replacement before parsing, so the parser sees only a short placeholder
    instead of the quoted words.

    For each quotation: if its opening mark is a child of a VERB token, one tuple
    (quote, verb text, child text) is produced for every nsubj/dobj child of that verb.
    If no such tuple is produced, (quote, None, None) is produced instead.

    Nothing is yielded when any chunk of text holds a number of quotation marks other
    than 0 or 2.

    Uses the module-level spaCy pipeline `nlp`; graph=True also renders the dependency
    parse with displacy.
    """
    quote_pattern = r'(".*?")|(“.*?”)'

    # alternate pieces of text: quotations and the text in between them
    pieces = [piece for piece in re.split(quote_pattern, text) if piece]

    # the quotations on their own, in order of appearance
    quotes = [found.group(0) for found in re.finditer(quote_pattern, text)]

    # quotation marks must come in pairs within a piece, otherwise give up on this text
    marks_per_piece = [count_quotes(piece) for piece in pieces]
    if any(n_marks not in (0, 2) for n_marks in marks_per_piece):
        return

    # mask each quotation with the placeholder (an empty replacement means: no masking)
    if replacement == '':
        masked_pieces = pieces
    else:
        masked_pieces = [
            replacement if n_marks else piece
            for piece, n_marks in zip(pieces, marks_per_piece)
        ]

    doc = nlp("".join(masked_pieces))
    if graph:
        displacy.render(doc, style="dep", jupyter=True)

    # every second quotation-mark token is taken to be an opening mark
    mark_ids = [token.i for token in doc if token.text in ('"', '“', '”')]
    opening_ids = mark_ids[::2]

    verbs = [token for token in doc if token.pos_ == "VERB"]

    # collect everything first, so nothing is yielded if an error occurs part-way
    extracts = []
    for position, opening_i in enumerate(opening_ids):

        speaker_found = False

        for verb in verbs:
            dependents = list(verb.children)

            # only a verb that directly governs the opening mark is a candidate manner
            if all(child.i != opening_i for child in dependents):
                continue

            # each subject / direct object of that verb is reported as a speaker
            for child in dependents:
                if child.dep_ in ("dobj", "nsubj"):
                    extracts.append((quotes[position], verb.text, child.text))
                    speaker_found = True

        if not speaker_found:
            extracts.append((quotes[position], None, None))

    yield from extracts

# quick demo: one straight-quoted quotation followed by its reporting clause
input_ = '"today seems nice", said Tom'
list(yield_type1(input_))
        

In [None]:
# multi-line dialogue: curly-quoted lines whose reporting clause (e.g. "He says:")
# sits on a separate line from the quotation itself
input_ = """
He says:

“What you doin’ with this gun?”

I judged he didn’t know nothing about what he had been doing, so I says:

“Somebody tried to get in, so I was laying for him.”

“Why didn’t you roust me out?”

“Well, I tried to, but I couldn’t; I couldn’t budge you.” 
"""
list(yield_type1(input_))

In [None]:
# Note: doesn't pick up on conjunction wrt. multiple speakers ... this is fine ... we aren't targeting this
input_ = '"oh," said Tom and Dave'
list(yield_type1(input_, graph=True))

## test cases

In [None]:
# Each case is a triple: (extractor function, input text, extracts that must appear in its output).
test_cases = [
    # direct dialogue
    (
        yield_type1, 
        '"blah," said the Mr. Tom Jones to Dave', 
        [('"blah,"', "said", "Jones")]
    ),
    (
        yield_type1, 
        '"blah," Prof. James E. Jones said to Dave',
        [('"blah,"', "said", "Jones")],
    ),
    (
        yield_type1, 
        '"blah," said Tom Liam Smith to Dave',
        [('"blah,"', "said", "Smith")],
    ),
    (
        yield_type1, 
        '"blah," said J.F.K. about Dave',
        [('"blah,"', "said",  "J.F.K.")],
    ),
    (
        yield_type1, 
        '"blah," said the nurse to Dave', 
        [('"blah,"', "said", "nurse")]
    ),
    (
        yield_type1, 
        'Tom Smith shouts "blah", over the state of the Union',
        [('"blah"', "shouts", "Smith")],
    ),
    (
        yield_type1, 
        'Tom S. Smith said to Dave, "blah"',
        [('"blah"', "said", "Smith")],
    ),
    (
        yield_type1, 
        'the nurse said "blah" to Dave', 
        [('"blah"', "said", "nurse")],
    ),
    (
        yield_type1, 
        'the nurse said excitedly, "blah"',
        [('"blah"', "said", "nurse")],
    ),
    # split dialogue
    (
        yield_type1, 
        '"blah," said the very able nurse about Tom, "I don\'t like him"',
        [('"blah,"', "said", "nurse"), ('"I don\'t like him"', "said", "nurse")],
    ),
    (
        yield_type1, 
        '"if we are not quick", she replied, "we will be late"',
        [('"if we are not quick"', "replied", "she"), ('"we will be late"', "replied", "she")],
    ),
    # 
]

# run the test cases every time this cell executes
print("\trun test cases")
for i, (f, input_, expected) in enumerate(test_cases, start=1):
    
    out = list(f(input_))
   
    # membership check only: extra extracts in the output do not fail a case
    for e in expected:
        assert e in out, f"test case {i}: expected {e}, found {out}"
        
print("\ttests successful")


## get the quotations

A large disparity hints at an oversight

In [None]:
# word list with one entry per line; used when rejoining words split by a line-end hyphen
dictionary_fp = pathlib.Path("../dictionaries/english.txt")
with open(dictionary_fp, 'r') as f:
    dictionary = {line.strip('\n') for line in f}
# display(str(dictionary))

In [None]:
def gen_paragraphs(fp: pathlib.Path, *, dictionary: set[str]) -> typing.Generator:
    """Return a generator of paragraph strings for the book at fp.

    Only the text between the Project Gutenberg "*** START OF ... ***" and "*** END OF"
    markers is used; when the markers are not found, nothing is yielded.

    Args:
        fp: path of the book's text file (read as latin-1).
        dictionary: known words, used to decide how to rejoin a word that formatting split
            across a line break with a hyphen. The two halves are glued together when the
            glued form is in dictionary; otherwise the hyphen is kept and only the line
            break is removed.

    Yields:
        One string per paragraph (paragraphs are separated by one or more blank lines),
        with remaining line breaks collapsed to single spaces and surrounding whitespace
        stripped. A paragraph that held only whitespace is yielded as an empty string.
    """

    # As per https://python-notes.curiousefficiency.org/en/latest/python3/text_file_processing.html,
    # latin-1 encoding is an acceptable best approach if in doubt of encoding.
    with open(fp, "r", encoding="latin-1") as f:
        doc = f.read()

    # ignore the extraneous PG text, take only the book
    match = re.search(
        r"\*\*\*\s*START OF.+?\*\*\*(.+)\*\*\*\s*END OF",
        doc,
        flags=re.DOTALL,
    )
    if not match:
        return

    # a word split over a line break: <letters>-<newline><letters>
    pattern_split = re.compile(r"([a-zA-Z']+)-\s*\n\s*([a-zA-Z']+)")

    def _rejoin(split_word: re.Match) -> str:
        """Glue the two halves if that forms a dictionary word, else keep the hyphen."""
        head, tail = split_word.groups()
        joined = head + tail
        return joined if joined in dictionary else f"{head}-{tail}"

    for paragraph in re.split("\n\n\n*", match.group(1)):

        # drop empty paragraphs
        if len(paragraph) == 0:
            continue

        # Decide each split word from its own two halves. Substituting once with a callback
        # (rather than once per distinct pair) means a pair such as ("he", "art") can no
        # longer rewrite the tail of an unrelated split such as "the-/art".
        paragraph = pattern_split.sub(_rejoin, paragraph)

        # collapse every remaining line break (and the whitespace around it) to one space
        paragraph = re.sub(r"\s*\n\s*", " ", paragraph)

        yield paragraph.strip()
        

get list of (fp, quote_count, patterns[:-1] match count ) for all books

In [None]:
%%time
# get paragraphs for book_id
extracts = []

for fp in tqdm(target_books):
# for fp in tqdm([pathlib.Path('~/surfdrive/Data/PG_en_PS_fiction_050204/15603.txt').expanduser()]):  
    
    paragraphs = list(gen_paragraphs(fp, dictionary=dictionary))
    
    # count extracted quotations
    for i, paragraph in enumerate(paragraphs):
        
        if '"' in paragraph or '”' in paragraph:
            for t in yield_type1(paragraph):
                if len(t) > 0:
                    extracts.append([str(fp.stem), i, t[0], t[1], t[2]])
                    
    with open("quotes.json", "w") as f:
        json.dump(extracts, f)
    

## Quick overview of extracted quotes

load

In [None]:
# load previously extracted quotes; later cells unpack each row as (id_, p, quote, manner, speaker)
# NOTE(review): this path differs from the "quotes.json" written by the extraction cell —
# presumably a saved snapshot of an earlier run; confirm
with open('quotes_5Jul/quotes_5Jul.json', 'r') as f:
    quotes = np.array(json.load(f))

overview

In [None]:
# number of extracted quote rows
len(quotes)

In [None]:
# what do the quotes look like? (a fixed slice of ten rows from the middle of the array)
quotes[110000:110010]

In [None]:
# how many distinct books contribute to the quotation set?
len({id_ for id_, _, _, _, _ in quotes})

## Identify problematic spans (i.e., > 1024 tokens w.r.t. GPT-2)

In [None]:
import torch
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# load the pretrained 'gpt2' language model and its matching tokenizer
# NOTE(review): only `tokenizer` is used in the cells visible here; `model` may be needed elsewhere — confirm
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# tokenizer token count of every quote, measured without its surrounding quotation marks
spans_lengths = np.array([
    len(tokenizer.tokenize(quote.strip('"“”')))
    for _, _, quote, _, _ in tqdm(quotes)
])
# how many quotes exceed 1024 tokens?
(spans_lengths>1024).sum()

In [None]:
# row indices (into quotes) of the quotes whose token count exceeds 1024
blacklist = np.flatnonzero(spans_lengths > 1024)
blacklist

In [None]:
# persist the blacklisted row indices as plain Python ints (json cannot serialise numpy integers)
with open("quotes_blacklist.json", 'w') as f:
    json.dump(blacklist.tolist(), f)

# Examine the Quotes (blacklisted quotes removed)

In [None]:
# who are the named speakers? (blacklisted rows are left out of the tally)

# `i in <ndarray>` compares i against every element, i.e. a linear scan per quote;
# a set of plain ints gives the same answer with constant-time lookups
blacklisted = {int(x) for x in blacklist}

speaker_counts = Counter()
# tqdm wraps the array itself (not the enumerate object) so the progress bar knows the total
for i, (id_, p, quote, manner, speaker) in enumerate(tqdm(quotes)):
    if i not in blacklisted:
        speaker_counts[speaker] += 1

In [None]:
# how often does each of these descriptor terms occur as the extracted speaker?
# (exact, case-sensitive match on the speaker token; a Counter returns 0 for unseen terms)
for speaker in ['negro', 'Negro', 'nigger', 'Nigger', 'chinaman', 'Chinaman', 'Oriental', 'oriental', 'coloured', 'Coloured', 'mulatto', 'quadroon', 'black', 'Black', 'jew', 'Jew', 'yid', 'Yid']:
    print(speaker, speaker_counts[speaker])

In [None]:
# how many books contain at least one quotation attributed to the speaker 'negro' / 'Negro'?
# NOTE(review): unlike speaker_counts, blacklisted rows are not skipped here — confirm this is intended
ids_with_negro_descriptors = {
    id_
    for id_, p, quote, manner, speaker in quotes
    if speaker in ('negro', 'Negro')
}
print(len(ids_with_negro_descriptors))