In [51]:
# !pip install -U pip setuptools wheel
# !pip install -U 'spacy[apple]'
# !pip install spacy

In [19]:
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_md
# !python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_tokenizers-0.0.9-cp310-cp310-macosx_11_0_arm64.whl.metadata (1.9 kB)
Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl (236 kB

In [1]:
import pathlib
import re
from tqdm import tqdm
import typing
from pprint import pprint as pp
import json
import spacy
from spacy import displacy
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
import numpy as np

load the spacy model

In [2]:
nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

# target word: negro

## Identify all books in PG American Literature corpus that contain 'negro'

the American literate corpus, corresponds to "en,PS,fiction", which can be found in ~/surfdrive/Data/PG_en_PS_fiction_050204

In [3]:
books_dir = pathlib.Path('~/surfdrive/Data/PG_en_PS_fiction_050204').expanduser().resolve()
books_dir.exists()

True

first sift

In [4]:
target_books = []

for fp in tqdm(books_dir.glob('*')):
    
    with open(fp, 'r', encoding='latin1') as f:
        book_txt = f.read()
        
    if ' negro' in book_txt:
        target_books.append(fp)
        
display(len(target_books))

8760it [00:06, 1297.70it/s]


1974

In [5]:
print(str(target_books[:5]))

[PosixPath('/Users/ryanbrate/surfdrive/Data/PG_en_PS_fiction_050204/13514.txt'), PosixPath('/Users/ryanbrate/surfdrive/Data/PG_en_PS_fiction_050204/8711.txt'), PosixPath('/Users/ryanbrate/surfdrive/Data/PG_en_PS_fiction_050204/15603.txt'), PosixPath('/Users/ryanbrate/surfdrive/Data/PG_en_PS_fiction_050204/34801.txt'), PosixPath('/Users/ryanbrate/surfdrive/Data/PG_en_PS_fiction_050204/6072.txt')]


## yield (file id, quote, manner, speaker) via spaCy dep parse and pos tags

In [6]:
def between(x,t):
    lower, upper = t
    if x >= lower and x <= upper:
        return True
    else:
        return False
        
print(between(3, (1,5)), between(7, (1,5)))

True False


let's keep it simple ...

    * replace quotations with e.g., 'oh' ... to remove parser complexity
    * assume that each consecutive pair of quote marks, subtend a quotation
    * where subtending quote marks share a VERB head, that VERB is assumed the to be the manner of speaking the quote
    * where one of the above VERBs is modified by dobj or nsubj, then we assume the speaker

In [1]:
def count_quotes(chunk):
    return chunk.count('"') + chunk.count('“') + chunk.count('”')

def yield_type1(text:str, replacement='"#"', graph = False):

    # chunk into quotes and text-in-between-quotes
    chunks = [chunk for chunk in re.split(r'(".*?")|(“.*?”)', text) if chunk]
    # print('chunks', chunks)

    # get the quotes only
    quotes = [mo.group(0) for mo in re.finditer(r'(".*?")|(“.*?”)', text)]
    # print('quotes', quotes)

    # get counts of quotes in each chunk
    quote_counts = [count_quotes(chunk) for chunk in chunks]
    # print('quote counts in chunks', quote_counts)

    # ensure where a quotations mark occurs, it does so in pairs
    if all([True if (count == 0 or count == 2) else False for count in quote_counts]):

        # build a chunks version with the quotes replaced with replacement
        if replacement == '':
            chunks_ = chunks
        else:
            chunks_ = [chunk if quote_count == 0 else replacement for chunk, quote_count in zip(chunks, quote_counts)]

        # build a text version with quotes replaced
        text_ = "".join(chunks_)
        # print('text_', text_)

        # text as a list of tokens and their properties
        doc = nlp(text_)
        if graph:
            displacy.render(doc, style="dep", jupyter=True)
        
        tokens = {token.i: {'text':token.text, 'modifiers':list([m.i for m in token.children]), 'pos':token.pos_, 'head':token.head.i, 'dep':token.dep_, 'idx':token.idx} for token in doc}
        # print('tokens', tokens)
        
                # list of indices which are opening quotation marks
        marks_i = [i for i, d in tokens.items() if d['text']=='"' or d['text']=='“' or d['text']=='”']
        # print('marks', marks_i)
        opening_marks_i = [marks_i[x] for x in range(0,len(marks_i),2)]
        # print('opening quotation marks', opening_marks_i)
        
        # list of indices which are verbs
        verbs_i = [i for i, d in tokens.items() if d['pos']=="VERB"]
        # print('verbs', verbs_i)

        # build a list of (quote, manner, speaker)
        extracts = []
        for r, o in enumerate(opening_marks_i):
            
            o_matched = False
            
            # find a verb which is modified by the opening quotation ...
            for v in verbs_i:
                if o in tokens[v]['modifiers']:
                    # for that verb, find a corresponding speaker, i.e., subj or dobj which modifies the verb
                    for m in tokens[v]['modifiers']:
                        if tokens[m]['dep']=="dobj" or tokens[m]['dep']=="nsubj":
                            extracts.append((quotes[r], tokens[v]['text'], tokens[m]['text']))
                            o_matched = True

            if o_matched == False:
                extracts.append((quotes[r], None, None))

        # yield
        for t in extracts:
            yield t

input_ = '"today seems nice", said Tom'
list(yield_type1(input_))
        

NameError: name 're' is not defined

In [9]:
# multi-line dialogue ... 
input_ = """
He says:

“What you doin’ with this gun?”

I judged he didn’t know nothing about what he had been doing, so I says:

“Somebody tried to get in, so I was laying for him.”

“Why didn’t you roust me out?”

“Well, I tried to, but I couldn’t; I couldn’t budge you.” 
"""
list(yield_type1(input_))

[('“What you doin’ with this gun?”', 'says', 'He'),
 ('“Somebody tried to get in, so I was laying for him.”', 'says', 'I'),
 ('“Why didn’t you roust me out?”', None, None),
 ('“Well, I tried to, but I couldn’t; I couldn’t budge you.”', None, None)]

In [11]:
# Note: doesn't pick up on conjuction wrt., multiple speakers ... this is fine ... we aren't targetting this
input_ = '"oh," said Tom and Dave'
list(yield_type1(input_, graph=True))

[('"oh,"', 'said', 'Tom')]

## test cases

In [12]:
test_cases = [
    # direct dialogue
    (
        yield_type1, 
        '"blah," said the Mr. Tom Jones to Dave', 
        [('"blah,"', "said", "Jones")]
    ),
    (
        yield_type1, 
        '"blah," Prof. James E. Jones said to Dave',
        [('"blah,"', "said", "Jones")],
    ),
    (
        yield_type1, 
        '"blah," said Tom Liam Smith to Dave',
        [('"blah,"', "said", "Smith")],
    ),
    (
        yield_type1, 
        '"blah," said J.F.K. about Dave',
        [('"blah,"', "said",  "J.F.K.")],
    ),
    (
        yield_type1, 
        '"blah," said the nurse to Dave', 
        [('"blah,"', "said", "nurse")]
    ),
    (
        yield_type1, 
        'Tom Smith shouts "blah", over the state of the Union',
        [('"blah"', "shouts", "Smith")],
    ),
    (
        yield_type1, 
        'Tom S. Smith said to Dave, "blah"',
        [('"blah"', "said", "Smith")],
    ),
    (
        yield_type1, 
        'the nurse said "blah" to Dave', 
        [('"blah"', "said", "nurse")],
    ),
    (
        yield_type1, 
        'the nurse said excitedly, "blah"',
        [('"blah"', "said", "nurse")],
    ),
    # split dialogue
    (
        yield_type1, 
        '"blah," said the very able nurse about Tom, "I don\'t like him"',
        [('"blah,"', "said", "nurse"), ('"I don\'t like him"', "said", "nurse")],
    ),
    (
        yield_type1, 
        '"if we are not quick", she replied, "we will be late"',
        [('"if we are not quick"', "replied", "she"), ('"we will be late"', "replied", "she")],
    ),
    # 
]

# run test cases on import
print("\trun test cases")
for i, (f, input_, expected) in enumerate(test_cases, start=1):
    
    out = list(f(input_))
   
    for e in expected:
        assert e in out, f"test case {i}: expected {e}, found {out}"
        
print("\ttests successful")


	run test cases
	tests successful


## get the quotatations

A large disparity hints at an oversight

In [13]:
dictionary_fp = pathlib.Path("../dictionaries/english.txt")
with open(dictionary_fp, 'r') as f:
    dictionary = set([line.strip('\n') for line in f.readlines()])
# display(str(dictionary))

In [14]:
def gen_paragraphs(fp: pathlib.Path, *, dictionary: set[str]) -> typing.Generator:
    """Return a generator of paragraph strings for book at fp.

    Note: dictionary is used to help resolve hyphenatic split words due to formatting
    Note: paragraphs assumed as separated by '\n\n'
    Note: paragraphs cleaned up, removing \n is a way sentitive to hyphens
    """

    # open the doc
    # As per https://python-notes.curiousefficiency.org/en/latest/python3/text_file_processing.html,
    # latin-1 encoding is an acceptable best approach if in doubt of encoding, and is close to the python3 permissible model.
    with open(fp, "r", encoding="latin-1") as f:
        doc = f.read()

    # ignore the extraneous PG text, take only the book
    match = re.search(
        r"\*\*\*\s*START OF.+?\*\*\*(.+)\*\*\*\s*END OF",
        doc,
        flags=re.DOTALL,
    )

    if match:

        # true book text
        doc = match.groups()[0]

        # split into presumed paragraphs
        paragraphs = re.split("\n\n\n*", doc)

        # remove empty paragraphs
        paragraphs = [p for p in paragraphs if len(p) != 0]

        # remove newlines (sensitively)
        pattern_split = re.compile(r"([a-zA-Z']+)-\s*\n\s*([a-zA-Z']+)")
        for i, paragraph in enumerate(paragraphs):

            # remove newlines adjacent to hyphenated words
            for x, y in set(re.findall(pattern_split, paragraph)):

                try:
                    if x + y in dictionary:
                        paragraph = re.sub(rf"{x}-\s*\n\s*{y}", f"{x}{y}", paragraph)
                    else:
                        paragraph = re.sub(rf"{x}-\s*\n\s*{y}", f"{x}-{y}", paragraph)
                except:
                    pass

            # remove other newline cases
            paragraph = re.sub(r"\s*\n\s*", r" ", paragraph)

            # strip start and end whitespace
            paragraph = paragraph.strip()

            # re-add amended to paragraph container
            paragraphs[i] = paragraph

        # yield
        for paragraph in paragraphs:
            yield paragraph

    else:

        return
        yield
        

get list of (fp, quote_count, patterns[:-1] match count ) for all books

In [15]:
%%time
# get paragraphs for book_id
extracts = []

for fp in tqdm(target_books):
# for fp in tqdm([pathlib.Path('~/surfdrive/Data/PG_en_PS_fiction_050204/15603.txt').expanduser()]):  
    
    paragraphs = list(gen_paragraphs(fp, dictionary=dictionary))
    
    # count extracted quotations
    for i, paragraph in enumerate(paragraphs):
        
        if '"' in paragraph or '”' in paragraph:
            for t in yield_type1(paragraph):
                if len(t) > 0:
                    extracts.append([str(fp.stem), i, t[0], t[1], t[2]])
                    
    with open("quotes.json", "w") as f:
        json.dump(extracts, f)
    

  0%|                                        | 2/1974 [00:04<1:06:45,  2.03s/it]


KeyboardInterrupt: 

## Quick overview of extracted quotes

load

In [7]:
with open('quotes_5Jul/quotes_5Jul.json', 'r') as f:
    quotes = np.array(json.load(f))

overview

In [8]:
len(quotes)

2379115

In [9]:
# what do the quotes look like?
quotes[110000:110010]

array([['47385', 266, '"My sweetheart! I\'m so sorry for the pain."',
        None, None],
       ['47385', 268, '"Jack,"', 'said', 'she'],
       ['47385', 268, '"you do love me, don\'t you?"', 'said', 'she'],
       ['47385', 269,
        '"My darling, I love you better than anything in the world. You are the dearest little woman I ever saw. It isn\'t much of a heart, dear, but you\'ve got it all. Crying? Why, what is it, sweetheart?"',
        None, None],
       ['47385', 270, '"The baby,"', 'answered', 'she'],
       ['47385', 271,
        '"Dorothy, dearest, you know that was best. He wasn\'t like--"',
        'say', 'Jack'],
       ['47385', 271,
        '"Dorothy, dearest, you know that was best. He wasn\'t like--"',
        'say', 'words'],
       ['47385', 275, '"the den"', None, None],
       ['47385', 278, '"Come,"', 'called', 'she'],
       ['47385', 279, '"Oh, it\'s you,"', 'cried', 'she']], dtype=object)

In [10]:
# how many books in the quotations set?
len(set([id_ for id_, _, _, _, _ in quotes]))

1517

## Identify problematic spans (i.e., > 1024 tokens wrt., gpt-2)

In [12]:
import torch
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [13]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
spans_lengths = np.array([len(tokenizer.tokenize(quote.strip('"“”'))) for id_, p, quote, manner, speaker in tqdm(quotes)])
(spans_lengths>1024).sum()


  0%|                                               | 0/2379115 [00:00<?, ?it/s][A
  0%|                                   | 793/2379115 [00:00<05:00, 7906.28it/s][A
  0%|                                 | 1972/2379115 [00:00<03:53, 10187.52it/s][A
  0%|                                 | 3321/2379115 [00:00<03:23, 11691.84it/s][A
  0%|                                 | 4733/2379115 [00:00<03:07, 12644.13it/s][A
  0%|                                 | 6244/2379115 [00:00<02:55, 13530.95it/s][A
  0%|                                 | 7640/2379115 [00:00<02:53, 13674.61it/s][A
  0%|▏                                | 9449/2379115 [00:00<02:36, 15116.28it/s][A
  0%|▏                               | 11184/2379115 [00:00<02:29, 15826.27it/s][A
  1%|▏                               | 12924/2379115 [00:00<02:25, 16314.84it/s][A
  1%|▏                               | 14556/2379115 [00:01<02:27, 15990.13it/s][A
  1%|▏                               | 16157/2379115 [00:01<02:32, 15534.92

In [20]:
blacklist = np.array([i for i,(id_, p, quote, manner, speaker) in enumerate(quotes)])[spans_lengths>1024]
blacklist

array([  50832,  178776,  224623,  467098,  467392,  491798,  504223,
        504258,  552576,  646885,  647144,  689226,  782551,  929646,
        947499,  951097,  977954,  981894, 1249164, 1391889, 1438775,
       1445689, 1543791, 1642411, 1720062, 1721802, 1733245, 1738547,
       1766189, 1855893, 1896581, 2036587, 2155920, 2159428, 2169109,
       2169154, 2182459, 2182460, 2352396])

In [24]:
with open("quotes_blacklist.json", 'w') as f:
    json.dump([x.item() for x in blacklist], f)

# Examine the Quotes (removed of blacklisted)

In [28]:
# who are the named speakers?
speaker_counts = Counter()
for i, (id_, p, quote, manner, speaker) in tqdm(enumerate(quotes)):
    if i not in blacklist:
        speaker_counts[speaker] += 1

2379115it [00:06, 375302.03it/s]


In [29]:
for speaker in ['negro', 'Negro', 'nigger', 'Nigger', 'chinaman', 'Chinaman', 'Oriental', 'oriental', 'coloured', 'Coloured', 'mulatto', 'quadroon', 'black', 'Black', 'jew', 'Jew', 'yid', 'Yid']:
    print(speaker, speaker_counts[speaker])

negro 792
Negro 24
nigger 3
Nigger 15
chinaman 0
Chinaman 17
Oriental 7
oriental 0
coloured 0
Coloured 0
mulatto 57
quadroon 9
black 144
Black 41
jew 0
Jew 130
yid 0
Yid 0


In [30]:
# how many books containing quotations attributed to the speaker, 'negro'?
ids_with_negro_descriptors = set([id_ for id_, p, quote, manner, speaker in quotes if (speaker == 'negro' or speaker =='Negro')])
print(len(ids_with_negro_descriptors))

248


In [31]:
# what are the counts by speaker?
counter = Counter()
for id_, p, quote, manner, speaker in [(id_, p, quote, manner, speaker) for id_, p, quote, manner, speaker in quotes if id_ in ids_with_negro_descriptors]:
    counter[speaker] += 1

In [32]:
for speaker in ['negro', 'Negro', 'nigger', 'Nigger', 'chinaman', 'Chinaman', 'Oriental', 'oriental', 'coloured', 'Coloured', 'mulatto', 'quadroon', 'black', 'Black', 'jew', 'Jew', 'yid', 'Yid']:
    print(speaker, counter[speaker])

negro 792
Negro 24
nigger 2
Nigger 15
chinaman 0
Chinaman 1
Oriental 1
oriental 0
coloured 0
Coloured 0
mulatto 29
quadroon 2
black 91
Black 17
jew 0
Jew 22
yid 0
Yid 0


In [62]:
# what am I doing when I correct it?: 
#     * look at each word separately, and propose an amendment based on similary sounding words
#     * double check they all make sense together in context, and then propose another

### Black Authors in PG

In [65]:
# works by black authors ...
black_works = [
    ("Paul Laurence Dunbar", ['18338', '15041', '15886', '17854', '25171', '24716']), # PS Fiction
    # ("Harriet Jacobs",['11030']),
    # ("W. E. B. Du Bois",['408', '17700', '15210', '31254', '62799', '35399', '66398', '62582', '5685']),
    # ("Booker T. Washington",['2376', '73923', '60484', '20923', '26507', '61223', '64504', '35399', '69692', '63620', '61953', '45125']),
    # ("Frederick Douglas",['23', '202', '99', '71893', '31839', '34915']),
    # ("Anna J. Cooper",['61741']),
    # ("Solomon Northup", ['45631']),
    # ("James Weldon Johnson", ['11986', '11012', '6316', '17884', '35025']),
    ("Frances Ellen Watkins Harper", ['12352', '11056', '679', '11053', '69249', '69248', '11022']), # PS Fiction
    # ("Ida B. Wells-Barnett", ['14977', '14975', '14976', '64426']),
    ("Charles W. Chesnutt", ['15041', '11057', '11228', '472', '19746', '35063', '10986']), # PS Fiction
    # ("VARIOUS", ['71448']),
    # ("Langston Hughes", ['19435']),  # PS Drama
    # ("William Wells Brown", ['15132', '64971', '64883', '65519', '2046', '15830', '50130', '59500', '59114', '50092', '58414', '2095', '']),
    # ("Alison Moore Dunbar-Nelson", ['18713', '688']),  # PS Fiction
    # ("Zora Neale Hurston", ['19435', '73715', '17187', '15902', '22146']),  # PS Drama
]

In [66]:
# how many negro quotations do we observe?

black_ids = []
for author, ids in black_works:
    black_ids += ids

for id_, p, quote, manner, speaker in tqdm(quotes):
    if id_ in black_ids and speaker == 'negro':
        print(quote, manner, id_)

 52%|██████████████▌             | 1239495/2379115 [00:00<00:00, 3218470.48it/s]

"Nothin', suh, nothin' at all, suh," responded 11228
"Yes, suh," replied 11228


100%|████████████████████████████| 2379115/2379115 [00:00<00:00, 3239813.20it/s]

"Is de shurff in," inquired 11057
"I dunno who-all _is_ comin'," replied 11057
"Dere 's Mistah McSwayne, en Doc' Cain, en Maje' McDonal', en Kunnel Wright, en a heap er yuthers. I wuz so skeered I done furgot mo' d'n half un em. I spec' dey mus' be mos' here by dis time, so I 'll git outen de way, fer I don' want nobody fer ter think I wuz mix' up in dis business." glanced 11057
"Dey won't try ter steal me, will dey, marster?" asked 11057





# Foray into non-canonical forms

let's gradually assemble a list of non-canonical forms (including abbrevs) ...

In [455]:
i = 0

In [95]:
# quick routine ... 
non_canon = {"":"", "missus":"mistress", "marser":"master", "sez":"says", "jes":"just", "ever'":"every", "heah'd":"heard", "he'p":"help", "skeer":"scares", "mos'":"almost", "heah":"hear", "o'":"of", "goin'":"going", "fer":"for", "somewhur":"somewhere", "sep'rate":"separate", "udders":"others", "dem":"them", "'bout":"about", "tol'":"told", "arternoon":"afternoon", "comed":"came", "chile":"child", "sayin'":"saying", "widout":"without", "'upon":"upon", "aldough":"although", "bery":"very", "poh":"poor", "berry":"very", "Jes":"just", "seed":"seen", "eber":"ever", "lubliest":"loveliest", "mysef": "myself","rader":"rather", "dat's":"thats", "spect":"expect", "to-night":"tonight", "ob":"of", "dat":"that", "dar":"there", "de":"the", "an'":"and", "arter":"after", "wid":"with", "wont":"won't", "dis":"this", "gwine":"going", "'em":"them", "dey":"they", "suh":"sir", "marse":"master", "seed":"said", "yestiddy":"yesterday", "sutney":"certainly", "lookin'":"looking"}
# mistis, missis, libbing, Jes
# I rather -> I would rather, dat come -> that came, was from -> were from, "comed up": "who came"
# ending 'like' 
# missing connecting words, 'who'

quote = quotes_negro[i]
tokens = word_tokenize(quote)
potentials  = [token.lower() for token in tokens if token not in dictionary and token.lower() not in non_canon.keys()]
print(quote)
print(potentials)
i+=1

"Dar now,"
['``', ',', "''"]


In [494]:
# contractions = {
#     "roun'":'round', "goin'":"going", "sinkin'":"sinking", "un'der":"under", "'em":"them", "couldn'":"couldn't", "retu'n":"return", "holdin'":"holding", "till":"until", "mornin'":"morning", "an'":"and", "wonderin":"wondering", "lan":"land", "promisin":"promising", "wavin":"waving", "see-sawin":"see-sawing", "cap'en":"captain", "mo":"more", "mo'":"more", "cl'ar":"clear", "workin'":"working", "'scussin'":"discussing", "Cap'in":"Captain", "alway":"always", "t'ing":"things", "won'derful", "'especially", "yo'":["your", "you"], "li'l", "tellin'", "flow'rs":"flowers", "fo'":"for", "wearin'":"wearing", "passin'":"passing", "talkin'":"talking", "reg'lar":"regularly", "li'l'":"little", "'im":"", "tain't":"that ain't", "snapping":"snapping", "'spectable":"respectable", "willin'":"willing", "th'":"the", "yas'm":"yes maam", "had'n":"hadn't", "'spec":"expect", "t'ink":"thing", "bes'":"best", "'fore":"before", "o'":"of", "s'prised":"surprised", "don'":"don't", "w'at":"what", "slave'y":"slavery", "min'":"mind", "worl'":"world"
# }  # i.e., characters in canonical order, with apostrophe's denoted shortening
    
misspell = {
    "wont":"", "mistis":"mister", "dey":"they", "marse":"master", "seed":"said", "yestiddy":"yesterday", "sutney":"certainly", "wuz":"was", "peart":"pert", "ye":"you", "missus":"mistress", "Massa":"Master", "wot":"what", "gwine":"going", "ter":"to", "sah":"sir", "neber":"never", "Mistuh":["Mister", "Mr"], "Missis":["Mrs"], "massa":["master"], "fust-rate":"first rate", "tink":"think", "dat":"that", "Sartainly":"certainly", "yung":"young", "git":"get", "pears":"appears", "lak":"like", "haint":["ain't", "isn't"], "kerrige":"carriage", "ole":"old", "und":"and", "scovered":"discovered", "to-night":"tonight", "agwine":"going", "orter":"ought to", "Ef":"If", "den":"then", "dere":"there", "wudn't":"wouldn't", "ben":"been", "scandle":"skandle", "de":"the", "es": "is", "ergwine": "going", "skornful": "scornful", "wid":"with", "dere": "their", "pinted": "pointed", "gal":"girl", "ax": "ask", "deyselves":"themselves", "cum":"come", "jined":"joined", "marser": "master", "wun":"one", "jes":"yes", "dun":"done", "gon":"gone", "arter":"after", "rong":"wrong", "yu":"you", "ekal":"equal", "yas":"yes", "sar":"sir", "sprises":"surprise", "Jonsin":"Johnson", "ergin":"again", "yo":"your", "menny": "many", "hab": ["have", "has"], "shurrufs":"sherifs", "Dats":"That's", "mity":"mighty", "heep":"heap", "ob":"of", "dis":"this", "sees":"see", "butifullest":"most beautiful", "somewhere":"sunwhere", "I's":"I am", "gib":"give", "hit":"it", "gits":"gets", "Mistis":"Misters", "hed":"had", "Ah":"I", "hopes":"hope", "foh":"for", "mars'r":"master", "dar'll":"there'll", "I'se":"I am", "afeerd":"afaind", "ef":"if", "yer":"your", "An":"And", "gwyn":"going", "de":"the", "quare": "quoir", "tuk":"took", "Dar's":"There's", "marster":"master", "arternoon":"afternoon", "eber":"ever", "sence":"since", "Dey":"They", "heah":"here", "to-day":"today", "seh":"sir", "sich":"such", "ob":"about", "bofe":"both", "hoss":"horse", "arter":"after", "Dis":"This", "'e":"the", "hyer":"higher", "mought":"mighty", "sutny":"certainly", "Mebbe":"Maybe", "They":"Their", "apaht":"apart", "Sutny":"Certainly", "Mas'r":"Master", "fotched":"fetched", "Yassah": "Yes sir", "gittin'":"getting", "Marse":"Master", "hain't":"ain't", "Thankee":["Thank you", "Thanks"], "heself":"his self", "behave":"behaves", "quatah":"quarter", "ovah":"over", "wheel-barrer":"wheel barrow", "red":"ride", "Mass'":"Master", "whar":"where", "Tankee":["Thanks", "Thank you"], "fah":"far", "Yu":"You", "yu":"you", "enny":"any", "Suttinly":"Certainly", "jest":"just", "kin":"can", "naw":"no", "'er":"have", "skuze":"excuse", "tole":"told", "whut":"what", "fur":"for", "licker":"liquor", "lubiest":"loveliest", "eber":"ever", "nebber":"never", "mistis":"misters", "dere":"there", "shure":"sure", "dey":"they", "mistah":"mister", "suh":"sir", "Injun":"Indian", "sojers":"soldiers", "wha":"where", "fader's":"fathers", "suah":"sir", "pahdon":"pardon", "yassuh":"yes sir", "nussin'":"nursing", "um":"them", "nuffin":"nothing", "forgib":"forgive", "gitt'n":"getting", "Missus":"Mistress", "Cel'bratun":"Celebrating", "Dunno": "Don't know", "dat's":"that's", "Dem":"them", "bin":"been", "out'n":"out of"  
} 

In [473]:
"arter" in misspell

True

In [386]:
# What do we see that's 'non-canonical' in form, in the speaker=negro quotations? 

contractions = {
    "roun'":'round', "goin'":"going", "sinkin'":"sinking", "un'der":"under", "'em":"them", "couldn'":"couldn't", "retu'n":"return", "holdin'":"holding", "till":"until", "mornin'":"morning", "an'":"and", "wonderin":"wondering", "lan":"land", "promisin":"promising", "wavin":"waving", "see-sawin":"see-sawing", "cap'en":"captain", "mo":"more", "mo'":"more", "cl'ar":"clear", "workin'":"working", "'scussin'", "Cap'in", "alway":"always", "t'ing":"things", "won'derful", "'especially", "yo'":["your", "you"], "li'l", "tellin'", "flow'rs":"flowers", "fo'":"for", "wearin'":"wearing", "passin'":"passing", "talkin'":"talking", "reg'lar":"regularly", "li'l'":"little", "'im":"", "tain't":"that ain't", "snapping":"snapping", "'spectable":"respectable", "willin'":"willing", "th'":"the", "yas'm":"yes maam", "had'n":"hadn't", "'spec":"expect", "t'ink":"thing", "bes'":"best", "'fore":"before", "o'":"of", "s'prised":"surprised", "don'":"don't", "w'at":"what", "slave'y":"slavery", "min'":"mind", "worl'":"world"
}  # i.e., characters in canonical order, with apostrophe's denoted shortening
    
misspell = {
    "mistis":"mister", "Dey":"They", "Marse":"Master", "seed":"said", "yestiddy":"yesterday", "sutney":"certainly", "wuz":"was", "peart":"pert", "ye":"you", "missus":"mistress", "Massa":"Master", "wot":"what", "gwine":"going", "ter":"to", "sah":"sir", "neber":"never", "Mistuh":["Mister", "Mr"], "Missis":["Mrs"], "massa":["master"], "fust-rate":"first rate", "tink":"think", "dat":"that", "Sartainly":"certainly", "yung":"young", "git":"get", "pears":"appears", "lak":"like", "haint":["ain't", "isn't"], "kerrige":"carriage", "ole":"old", "und":"and", "scoveredd":"discovered", "to-night":"tonight", "agwine":"going", "orter":"ought to", "Ef":"If", "den":"then", "dere":"there", "wudn't":"wouldn't", "ben":"been", "scandle":"skandle", "de":"the", "es": "is", "ergwine": "going", "skornful": "scornful", "wid":"with", "dere": "their", "pinted": "pointed", "gal":"girl", "ax": "ask", "deyselves":"themselves", "cum":"come", "jined":"joined", "marser": "master", "wun":"one", "jes", "dun":"done", "gon":"gone", "arter":"after", "rong":"wrong", "yu":"you", "ekal":"equal", "yas":"yes", "sar":"sir", "sprises":"surprise", "Jonsin":"Johnson", "ergin":"again", "yo":"your", "menny": "many", "hab": ["have", "has"], "shurrufs":"sherifs", "Dats":"That's", "mity":"mighty", "heep":"heap", "ob":"of", "dis":"this", "sees":"see", "butifullest":"most beautiful", "somewhere":"sunwhere", "I's":"I am", "gib":"give", "hit":"it", "gits":"gets", "Mistis":"Misters","hed":"had", "Ah":"I", "hopes":"hope", "foh":"for", "mars'r":"master", "dar'll":"there'll", "I'se":"I am", "afeerd":"afaind", "ef":"if", "yer":"your", "An":"And", "gwyn":"going", "de":"the", "quare": "quoir", "tuk":"took", "Dar's":"There's", "marster":"master", "arternoon":"afternoon", "eber":"ever", "sence":"since", "Dey":"They", "heah":"here", "to-day":"today", "seh":"sir", "sich":"such", "ob":"about", "bofe":"both", "hoss":"horse", "arter":"after", "Dis":"This", "'e":"the", "hyer":"higher", "mought":"mighty", "sutny":"certainly", "Mebbe":"Maybe", "They":"Their", "apaht":"apart", "Sutny":"Certainly", "Mas'r":"Master", "fotched":"fetched", "Yassah": "Yes sir", "gittin'":"getting", "Marse":"Master", "hain't":"ain't", "Thankee":["Thank you", "Thanks"], "heself":"his self", "behave":"behaves", "quatah":"quarter", "ovah":"over", "wheel-barrer":"wheel barrow", "red":"ride", "Mass'":"Master", "whar":"where", "Tankee":["Thanks", "Thank you"], "fah":"far", "Yu":"You", "yu":"you", "enny":"any", "Suttinly":"Certainly", "jest":"just", "kin":"can", "naw":"no", "'er":"have", "skuze":"excuse", "tole":"told", "whut":"what", "fur":"for", "licker":"liquor", "lubiest":"loveliest", "eber":"ever", "Nebber":"Never", "mistis":"misters", "dere":"there", "shure":"sure", "dey":"they", "Mistah":"Mister", "suh":"sir", "Injun":"Indian", "sojers":"soldiers", "wha":"where", "fader's":"fathers", "suah":"sir", "Pahdon":"pardon", "pahdon":"pardon", "Yassuh":"Yes sir", "yassuh":"yes sir", "nussin'":"nursing", "um":"them", "nuffin":"nothing", "forgib":"forgive", "gitt'n":"getting", "Missus":"Mistress", "Cel'bratun":"Celebrating", "Dunno": "Don't know", "dat's":"that's", "Dem":"them", "bin":"been", "out'n":"out of"
    
}  # i.e., characters in non-canonical order ... but we don't correct teh conjugation here ... e.g., 
# (???) cap'en, mars, sorrer, mout, conwic, sponuality (responsibility?), jurer (juror?), secesh, sho, pintedly, kote, stummic, Tollable, kotched, we's, 

# I ain't a gwine

# other:
#     she'm (she is?)
    
# phrases = {
#     "do seem":"does seem", "an'ef": "and if"
# }

# incorrect verb conjugation
# "how I likes it" rather than "how I like it"

## incorrect tenses, even once non-canonical spelling is sorted out
# "I has gib you licker" should be "I have given you liquor" (also incorrect verb conjugation)
# 'done gone smuggle in a bottle' should be "smuggled in a bottle" ('done gone' a thing?)

## improper pronoun use
# "me" instead of "I" ('and me' rather than 'and I')

## unnecesary additions before verb:
# "er", "done" (fore I done tole you; ), "a-", "here" (dis here Injun)

## missing auxilliary verbs
# 'but it [has] bin so long'

## missing prepositions
# "to" (I gwine red up [to] de house )
# "is" (what dat?)

## non standard grammar
# (subj and obj together) "Fate him merrits" instead of "Fate merrits him", "He is merrited by Fate"


SyntaxError: ':' expected after dictionary key (3840600015.py, line 4)

In [431]:
## possible candidate counterpoint speakers?

# gentleman
for counterpoint in [['gentleman'], ['lady'], ['man'], ['woman'], ['child'], ['boy'], ['girl']]:
    print(counterpoint, len([quote for id_, p, quote, manner, speaker in quotes if speaker in counterpoint]))

['gentleman'] 1478
['lady'] 3997
['man'] 15178
['woman'] 4101
['child'] 933
['boy'] 3362
['girl'] 8288
['slave'] 65
