Which speakers or speaker groups share non-standard linguistic forms shown to be distinguish-
ing of n-word referenced speakers, according to LLM surprisal?


In [1]:
import json
from pprint import pprint as pp
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from collections import Counter, defaultdict
import pathlib
import copy
import re
import numpy as np

# functions for determining phonetic shifts

In [2]:
import requests
lines = requests.get("http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b").text.splitlines()

# ignore eveything up the 'A  AH0'
lines = lines[lines.index('A  AH0'):]

# ignore stresses as deep phenomizer doesnt have these
word2arpa = defaultdict(list)
for line in lines:
    splits = [split for split in line.split(" ") if split != '']
    word = splits[0].split('(')[0]
    arpa = "".join(['[' + split.translate(str.maketrans('', '', '0123456789')) + ']' for split in splits[1:]])
    if word not in word2arpa: # just get the main pronunciaion
        word2arpa[word].append(arpa)

word2arpa['WAS']

['[W][AA][Z]']

In [3]:
# load dp
from dp.phonemizer import Phonemizer
phonemizer = Phonemizer.from_checkpoint('./en_us_cmudict_forward.pt')
phonemizer('WAS', lang='en_us')

  checkpoint = torch.load(checkpoint_path, map_location=device)


'[W][AA][Z]'

In [4]:
def arpaguess(s:str):
    if s.upper() not in word2arpa:
        return [phonemizer(s, lang='en_us')]
    else:
        return word2arpa[s.upper()]

arpaguess("who")

['[HH][UW]']

In [5]:
import difflib
def get_changes(from_:list[str], to_:list[str], return_indices=False)->bool:

    # accumulator
    changes = []

    # state
    in_change = False

    for x in difflib.ndiff(from_, to_): #iterate over diff
        # print(x, in_change)
        
        if x[0] == "+" or x[0] == "-":  # change detected
            sign, entry = x.split(" ")

            # new change? then init addition to changes
            if in_change == False:
                changes.append([[], []])
            
            # regardless, record state
            in_change = True

            # regardless, capture changes
            if sign == '-':
                changes[-1][0].append(entry)
            elif sign == '+':
                changes[-1][1].append(entry)

        elif x[0] == "?":
            pass
            
        else:  # match detected
                
            # regardless, record state    
            in_change = False

    return changes
            
print(get_changes(['a','b','b','c','e'], ['a','b','d','e']))
print(get_changes(["it", "oughta", "happen"], ["it", "ought", "to", "happen"]))
print(get_changes("it oughta happen".split(), "it ought to happen".split()))
list(difflib.ndiff(["it", "oughta", "happen"], ["it", "ought", "to", "happen"]))

[[['b', 'c'], ['d']]]
[[['oughta'], ['ought', 'to']]]
[[['oughta'], ['ought', 'to']]]


['  it', '- oughta', '?      -\n', '+ ought', '+ to', '  happen']

# get variations

## manually assigned phonological variations

In [6]:
DH_D = {
    "de":["the"], 
    "dey":["they"],
    "dey's":["they's"],
    "dey'll":["they'll"],
    "dey'd":["they'd"],
    "dis":["this"],
    "dis'll":["this'll"],
    "dis'd":["this'd"],
    "dat":["that"],
    "dat's":["that"],
    "dat'll":["that'll"],
    "dat'd":["that'd"],
    "De":["The"], 
    "Dey":["They"],
    "Dey's":["They's"],
    "Dey'll":["They'll"],
    "Dey'd":["They'd"],
    "Dis":["This"],
    "Dis'll":["This'll"],
    "Dis'd":["This'd"],
    "Dat":["That"],
    "Dat's":["That"],
    "Dat'll":["That'll"],
    "Dat'd":["That'd"]
}

phonological_variations = {}
phonological_variations["['DH']->['D']"] = set(list(DH_D.keys()))
manually_assigned = set(list(DH_D.keys()))

## get phonological variations from american literature corpus

In [7]:
# build annotations set
annotations = dict()
for speakers in [["man"], ["woman"], ["gentleman"], ["lady"],  ["child"], ["Negro", "negro"]]:

    s = ",".join(speakers)

    with open(f"RQ1_ood_words/RQ1_ood_words_{s}.json", "r") as f:
        annotations.update(json.load(f))

In [8]:
# get phological variations implied by non_canonical:canonical pairs
for non_canonical, corrections in annotations.items():
    if len(corrections) == 1 and non_canonical not in manually_assigned:
        canonical = corrections[0]

        for from_, to_ in get_changes(re.findall(r"\w+", arpaguess(canonical)[0]), re.findall(r"\w+", arpaguess(non_canonical)[0])):

            # capture instances
            if f"{from_}->{to_}" not in phonological_variations:
                phonological_variations[f"{from_}->{to_}"] = set()
            phonological_variations[f"{from_}->{to_}"].add(non_canonical)


In [9]:
phonological_variations["['NG']->['N']"]

{"Callatin'",
 "Comin'",
 "Doin'",
 "Evenin'",
 "Gittin'",
 "Good-evenin'",
 "Good-mornin'",
 "He-said-nothin'",
 "Layin'",
 "Lookin'",
 "Lynchin'",
 "Marryin'",
 "Mawnin'",
 "Meanin'",
 "Mornin'",
 "Noffin'",
 "Nothin'",
 "Overpowerin'",
 "Partin'",
 "Peddlin'",
 "Playin'",
 "S'posin'",
 "Seein'",
 "Shirkin'",
 "Somefin'",
 "Sompin's",
 "Suthin'",
 "Waitin'",
 "Waitin's",
 "Wool-gatherin'",
 "a-bawlin'",
 "a-blamin'",
 "a-burnin'",
 "a-carryin'",
 "a-changin'",
 "a-comin'",
 "a-crawlin'",
 "a-cuttin'",
 "a-dealin'",
 "a-disappearin'",
 "a-dyin'",
 "a-feedin'",
 "a-fetchin'",
 "a-fishin'",
 "a-gittin'",
 "a-hearin'",
 "a-holdin'",
 "a-hurtin'",
 "a-kickin'",
 "a-lookin'",
 "a-prayin'",
 "a-ridin'",
 "a-runnin'",
 "a-rushin'",
 "a-savin'",
 "a-settin'",
 "a-sittin'",
 "a-sleepin'",
 "a-smokin'",
 "a-standin'",
 "a-tormentin'",
 "a-wantin'",
 "a-watchin'",
 "a-wearin'",
 "a-whippin'",
 "a-worryin'",
 "a-worshippin'",
 "a-writin'",
 "achin'",
 "agettin'",
 "allowin'",
 "alookin'",
 "amazi

for each get upper and lowercase word variants

In [10]:
# add in verbs for a-verb cases
for v, wordset in phonological_variations.items():
    to_add = set()
    for word in wordset:
        if word[:2] == "a-" or word[:2] == "A-":
            to_add.add(word[2:])
            annotations[word[2:]] = [x[2:] for x in annotations[word]]

# create lower and upper case versions of all
for v, wordset in phonological_variations.items():
    to_add = set()
    for word in wordset:       
        to_add.add(word.capitalize())
        to_add.add(word.lower())
        annotations[word.capitalize()] = [x.capitalize() for x in annotations[word]]
        annotations[word.lower()] = [x.lower() for x in annotations[word]]
    phonological_variations[v] = wordset.union(to_add)

In [11]:
phonological_variations["['NG']->['N']"]

{"A-bawlin'",
 "A-blamin'",
 "A-burnin'",
 "A-carryin'",
 "A-changin'",
 "A-comin'",
 "A-crawlin'",
 "A-cuttin'",
 "A-dealin'",
 "A-disappearin'",
 "A-dyin'",
 "A-feedin'",
 "A-fetchin'",
 "A-fishin'",
 "A-gittin'",
 "A-hearin'",
 "A-holdin'",
 "A-hurtin'",
 "A-kickin'",
 "A-lookin'",
 "A-prayin'",
 "A-ridin'",
 "A-runnin'",
 "A-rushin'",
 "A-savin'",
 "A-settin'",
 "A-sittin'",
 "A-sleepin'",
 "A-smokin'",
 "A-standin'",
 "A-tormentin'",
 "A-wantin'",
 "A-watchin'",
 "A-wearin'",
 "A-whippin'",
 "A-worryin'",
 "A-worshippin'",
 "A-writin'",
 "Achin'",
 "Agettin'",
 "Allowin'",
 "Alookin'",
 "Amazin'",
 "Anythin'",
 "Askin'",
 "Astonishin'",
 "Awaitin'",
 "B'ilin'",
 "Bad-lookin'",
 "Bearin'",
 "Beatin'",
 "Beggin'",
 "Beginnin'",
 "Bein'",
 "Belongin'",
 "Betrayin'",
 "Bettin'",
 "Bitin'",
 "Blacksmit'in'",
 "Blamin'",
 "Blatherin'",
 "Blessin'",
 "Blockin'",
 "Bloomin'",
 "Blowin'",
 "Boardin'-houses",
 "Book-learnin'",
 "Boostin'",
 "Bossin'",
 "Breakin'",
 "Breavin'",
 "Buckin'-bro

## combine phonological variations with dialectical

In [12]:
dialect_words1 = {
 "I'se": 'I am',
 "i'se": 'i am',
 "we'se": "we are",
 "we's": "we are",
 "they'se": "they are",
 "they's": "they are",
 "dey'se": "dey are",
 "dey's": "dey are",
}
dialect_words1.update({k.capitalize():v.capitalize() for k, v in dialect_words1.items()})
print(dialect_words1.keys())

dialect_words2 = {
 'a-gwine': 'going',
 'agwine': 'going',
 'ergwine': 'going',
 'gwine': 'going',
}
dialect_words2.update({k.capitalize():v.capitalize() for k, v in dialect_words2.items()})
print(dialect_words2.keys())


dict_keys(["I'se", "i'se", "we'se", "we's", "they'se", "they's", "dey'se", "dey's", "We'se", "We's", "They'se", "They's", "Dey'se", "Dey's"])
dict_keys(['a-gwine', 'agwine', 'ergwine', 'gwine', 'A-gwine', 'Agwine', 'Ergwine', 'Gwine'])


In [13]:
variations = copy.deepcopy(phonological_variations)
variations["dialect_words1"] = set(dialect_words1.keys())
variations["dialect_words2"] = set(dialect_words2.keys())
    

# which speaker descriptors in the American Literature corpus share this vocabulary?


## load PG_df

In [14]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('../llama3.1_70B/tokenizer/')

In [15]:
from nltk.tokenize import WhitespaceTokenizer, word_tokenize
tk = WhitespaceTokenizer()
whitespace_tokenize = tk.tokenize

In [16]:
def get_words(s:str)->list[str]:
    """ split into words, ensure that contractions aren't split
        Note: This is my N wrt., 
        Note: why not word_tokenize() ? ... because it splits contractions
    """

    # split on whitespace
    words = whitespace_tokenize(s)

    # split off cases where word is enclosed by '' or `'
    words_ = []
    for word in words:
        if (word[0] == "'" and word[-1] == "'") or (word[0] == "`" and word[-1] == "'") or (word[0] == "\"" and word[-1] == "\""):
            words_.append(word[0])
            words_.append(word[1:-1])
            words_.append(word[-1])
        else:
            words_.append(word)
    words = words_
    

    # split of multiple hyphens, elipses, honorifics, initial (followed by dot), non alphanumeric/hyphen/apostrophe
    words_ = []
    for word in words:
        words_ += re.split(r"(-{2,}|\.\.\.|Mr\.|Mrs\.|Dr\.|Prof\.|[A-Z]\.|[^A-Za-z0-9_'-])", word)
    words = words_

    # finally, remove empty strings
    words = [word for word in words if word != ""]

    return words
                
    
def get_words_indices(words, tokens):

    words_indices = []

    tokens_ = [token[1:] if token[0] == "Ġ" else token for token in tokens]

    w = 0
    state = ["", [], True]  # accum of tokens, assum token indices, add to state?

    i = 0
    
    while i < len(tokens_):

        token = tokens_[i]

        if tokens[i][-1] == "Ġ":  # we have \s+ that's been mapped to a token ... skip it ... it's rare
            i+=1
            continue            
        
        # add more tokens to state and record the previous state
        if state[2] == True:
            state[0] += token
            state[1].append(i)

        # we've completed a word!
        if state[0] == words[w]:
            words_indices.append(state[1])
            w += 1
            state = ["", [], True]

        # state exceeds current word ... we have tokens what span words
        elif state[0][:len(words[w])]==words[w]:
            
            # add a decimal to indicate the root to take from token common between words
            excess = len(state[0]) - len(words[w])
            wanted = len(token) - excess
            state[1][-1] = state[1][-1] + wanted/len(token)

            words_indices.append(state[1])
            w+=1

            # init state for next word, incorporating excess of common token
            state = [token[-excess:], [i + (excess / len(token))], False]

            continue

        i += 1

    return words_indices

import math

def get_surprisals(words, tokens, chain):

    variation_surprisal = []
    
    for word, word_indices in zip(words, get_words_indices(words, tokens)):
        
        # are word_indices a decimal? then we need to apply a power to the probability, to split a token that spans words
        powers, indices = zip(*[math.modf(i) for i in word_indices])
        powers = np.array([1 if p == 0 else p for p in powers])
        indices = [int(i) for i in indices]

        # log(x^p) = p*log(x)
        word_surprisal = (-np.log(chain[indices])*powers).sum()
        variation_surprisal.append(word_surprisal)

    return variation_surprisal
            
        
# test
# s = "what is this lif', if fulling!..."s
s = "it is the parents', not the children's"
words = get_words(s)
tokens = tokenizer.tokenize(s)
print(words)
print(tokens)
words_indices = get_words_indices(words, tokens)
print(words_indices)

# yields [[0], [1], [2], [3, 4.5], [4.5], [6, 7]], where 4.2 means we take square root wrt., token 4 probability as estimated contribution to prob chain

['it', 'is', 'the', "parents'", ',', 'not', 'the', "children's"]
['it', 'Ġis', 'Ġthe', 'Ġparents', "',", 'Ġnot', 'Ġthe', 'Ġchildren', "'s"]
[[0], [1], [2], [3, 4.5], [4.5], [5], [6], [7, 8]]


In [17]:
with open('../../PG/extract_quotes_via_spaCy/quotes_blacklist.json', 'r') as f:
    PG_blacklist = json.load(f) 

In [18]:
with open('../../PG/extract_quotes_via_spaCy/quotes_5Jul.json', 'r') as f:
    PG_df = pd.DataFrame([t for i,t in enumerate(json.load(f))], columns = ["id", "p", "quote", "manner", "speaker"])
PG_df['i'] = list(range(len(PG_df)))

In [19]:
# ignore (as done for chains) ... those quotes in blacklist or not of speakers of interest
PG_df = PG_df.loc[(PG_df.loc[:,'i'].isin(PG_blacklist)==False),:]

In [20]:
# reset indices, reset 'i'
PG_df.reset_index(drop=True, inplace=True)
PG_df['i'] = list(range(len(PG_df)))

In [21]:
bad_encoding_i = []
bad_encoding_speakers = Counter()
for i, (q, speaker) in tqdm(enumerate(zip(PG_df.loc[:,'quote'], PG_df.loc[:,'speaker'])), total=len(PG_df)):
    try:
        s = q[1:-1]
        words = get_words(s)
        tokens = tokenizer.tokenize(s)
        words_indices = get_words_indices(words, tokens)

        assert len(words) == len(words_indices)
    except:
        bad_encoding_i.append(i)
        bad_encoding_speakers[speaker] += 1

  0%|                                               | 0/2379076 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████████████████████████| 2379076/2379076 [02:01<00:00, 19656.48it/s]


In [22]:
# drop from quotes
PG_df.drop(bad_encoding_i, inplace=True)
PG_df.reset_index(drop=True, inplace=True)
PG_df['i'] = list(range(len(PG_df)))

In [23]:
PG_df.iloc[0]

id                      8711
p                          4
quote      "The Dodge Club,"
manner                  None
speaker                 None
i                          0
Name: 0, dtype: object

In [24]:
PG_df["words"] = PG_df["quote"].progress_apply(lambda x: get_words(x[1:-1]))


100%|██████████████████████████████| 2368938/2368938 [00:28<00:00, 82092.22it/s]


In [25]:
PG_df["sid"] = PG_df['speaker'] + "_" + PG_df['id']
PG_df['count'] = 0

init. a containeer for capturing (speaker, book id) tuples for speakers of interest

# identify words demonstrative of variations (of interest) in each quote

In [26]:
# we specify the variations of interest, i.e., variations which distinguish nword speakers wrt., normative reference speakers in literature
V = [
    "['NG']->['N']", 
    'dialect_words2', 
    "['T']->[]", 
    "['ER']->['AH']", 
    "['AH']->[]", 
    "['R']->[]", 
    'dialect_words1', 
    "['AE']->['AA', 'R']", 
    "['TH']->['F']", 
    "['TH']->['T']"
]

In [27]:
for v in V:
    # PG_df["x"] = PG_df.loc[:, "words"].progress_apply(lambda x: bool(x.intersection(variation[v])))
    PG_df[v] = PG_df.loc[:, "words"].progress_apply(lambda x: variations[v].intersection(x))

100%|█████████████████████████████| 2368938/2368938 [00:06<00:00, 371314.59it/s]
100%|█████████████████████████████| 2368938/2368938 [00:03<00:00, 779800.48it/s]
100%|█████████████████████████████| 2368938/2368938 [00:04<00:00, 574921.62it/s]
100%|█████████████████████████████| 2368938/2368938 [00:04<00:00, 483182.13it/s]
100%|█████████████████████████████| 2368938/2368938 [00:12<00:00, 196215.11it/s]
100%|█████████████████████████████| 2368938/2368938 [00:12<00:00, 192681.40it/s]
100%|████████████████████████████| 2368938/2368938 [00:02<00:00, 1122588.45it/s]
100%|█████████████████████████████| 2368938/2368938 [00:13<00:00, 173120.59it/s]
100%|████████████████████████████| 2368938/2368938 [00:01<00:00, 1605835.50it/s]
100%|█████████████████████████████| 2368938/2368938 [00:15<00:00, 149025.94it/s]


In [28]:
PG_df.loc[PG_df.loc[:, 'dialect_words2']!=set(), ['dialect_words2', 'quote']]

Unnamed: 0,dialect_words2,quote
16534,{gwine},"""Marse Richard, you can't reason homesickness ..."
16890,{gwine},"""Miss Margaret, honey, that's foolishness! Not..."
16993,{gwine},"""I suttingly ain't gwine do it to they offspri..."
17705,{gwine},"""Yo' Uncle Richard ain't gwine come back to-ni..."
20230,{gwine},"""I ain't a gwine to stay here, missis,"""
...,...,...
2368386,{gwine},"""Happen so, honey, happen so! De French tombst..."
2368461,{gwine},"""'Well, Gin'l,' he said, 'I'm glad you is got ..."
2368581,{gwine},"""an' Colonel French, my husban' Bud is done go..."
2368586,{gwine},"""Thank'y, suh, thank'y, Mars' Colonel, an' Mis..."


# get sids which ...

In [29]:
sids = set()

# get sids which are in top 100 for any single variation
for v in V:
    print(v)
    
    targets = variations[v]

    # speakers to ignore
    speakers_ignore = ["he", "He", "she", "She", "I", "man", "woman", "gentleman", "lady", "child", "negro", "Negro"]
    
    # sum counts by sid
    agg_df = PG_df.loc[(PG_df.loc[:,v]!=set()) & (~PG_df.loc[:,'speaker'].isin(speakers_ignore)) & (PG_df.loc[:,'speaker'].str.istitle()), ['sid', 'count']].groupby('sid').sum()
    S = sorted(list(zip(agg_df.index, agg_df['count'])), key = lambda x: x[1], reverse=True)
    sids = sids.union([s[0] for s in S[:min(100, len(S))]])
    # print(len(sids), len(S))

# No. quotes matching
print("No. matching quotes", len(PG_df.loc[PG_df.loc[:, 'sid'].isin(sids),:]))
print("No. matching sids", len(sids))

['NG']->['N']
dialect_words2
['T']->[]
['ER']->['AH']
['AH']->[]
['R']->[]
dialect_words1
['AE']->['AA', 'R']
['TH']->['F']
['TH']->['T']
No. matching quotes 13977
No. matching sids 677


## build a container of quotes for which to get chains

In [30]:
# # get all quotes for sids of interest
# selected_quotes = []
# for i, row in PG_df.loc[PG_df.loc[:, 'sid'].isin(sids),:].iterrows():
#     selected_quotes.append([i, row["speaker"], row["id"], row["quote"][1:-1]])

# with open("RQ2/RQ2_selected_sids.json", "w") as f:
#     json.dump(selected_quotes, f)

# get contributions by speaker

without correcting for downstream effect

In [31]:
with open("RQ2/RQ2_selected_sids.json", "r") as f:
    RQ2_quotes = json.load(f)
print(len(RQ2_quotes))

with open("RQ2/chains_llama3.1_70B_sids.json", "r") as f:
    RQ2_chains = json.load(f)
print(len(RQ2_chains))

13977
13977


In [32]:
RQ2_quotes[0]

[15904, 'Cely', '57418', "Well, she wa'n't jes as you might say a baby,"]

In [33]:
bad_encoding_i = []
bad_encoding_speakers = Counter()
for (i, speaker, id_, quote), chain in zip(RQ2_quotes, RQ2_chains):
    try:
        words = get_words(quote)
        tokens = tokenizer.tokenize(quote)
        words_indices = get_words_indices(words, tokens)

        get_surprisals(words, tokens, np.array(chain[1:]))

        assert len(words) == len(words_indices)
    except:
        bad_encoding_i.append(i)
        bad_encoding_speakers[speaker] += 1

display(bad_encoding_i)
display(bad_encoding_speakers)

[1662288]

Counter({'Chad': 1})

In [34]:
I = [i for i, speaker, id_, quote in RQ2_quotes if i not in bad_encoding_i]
PG_RQ2 = PG_df.loc[I, :]
print(len(PG_RQ2))
print(PG_RQ2.iloc[0])

13976
id                                                                 57418
p                                                                    781
quote                    "Well, she wa'n't jes as you might say a baby,"
manner                                                              said
speaker                                                             Cely
i                                                                  15904
words                  [Well, ,, she, wa'n't, jes, as, you, might, sa...
sid                                                           Cely_57418
count                                                                  0
['NG']->['N']                                                         {}
dialect_words2                                                        {}
['T']->[]                                                          {jes}
['ER']->['AH']                                                        {}
['AH']->[]                                   

In [35]:
i2c = {i:c for c, (i, speaker, id_, quote) in enumerate(RQ2_quotes)}

In [36]:
# get words and surprisals for the selected quotes
PG_RQ2["words"] = PG_RQ2["quote"].progress_apply(lambda x: get_words(x[1:-1]))
PG_RQ2['surprisals'] = [get_surprisals(get_words(quote), tokenizer.tokenize(quote), np.array(chain[1:])) for (i, speaker, id_, quote), chain in zip(RQ2_quotes, RQ2_chains) if i not in bad_encoding_i]


100%|█████████████████████████████████| 13976/13976 [00:00<00:00, 112610.66it/s]


In [37]:
PG_RQ2

Unnamed: 0,id,p,quote,manner,speaker,i,words,sid,count,['NG']->['N'],dialect_words2,['T']->[],['ER']->['AH'],['AH']->[],['R']->[],dialect_words1,"['AE']->['AA', 'R']",['TH']->['F'],['TH']->['T'],surprisals
15904,57418,781,"""Well, she wa'n't jes as you might say a baby,""",said,Cely,15904,"[Well, ,, she, wa'n't, jes, as, you, might, sa...",Cely_57418,0,{},{},{jes},{},{},{},{},{},{},{},"[10.300479921654476, 24.79325874935989, 10.009..."
15905,57418,781,"""but she was the onlies' one I had, and when a...",said,Cely,15905,"[but, she, was, the, onlies', one, I, had, ,, ...",Cely_57418,0,{},{},{},{},{'Pears},{},{},{},{},{},"[11.430847028240985, 9.08608527794871, 8.40651..."
15910,57418,788,"""I haven't always lived in Maryland, Miss Marg...",began,Cely,15910,"[I, haven't, always, lived, in, Maryland, ,, M...",Cely_57418,0,{},{},{},{},{},{},{},{},{},{},"[12.306202682202313, 31.10923871682436, 8.8106..."
15935,57418,845,"""take her and put her to bed. This has been ha...",appeared,Cely,15935,"[take, her, and, put, her, to, bed, ., This, h...",Cely_57418,0,{},{},{},{},{},{},{},{},{},{},"[15.361773905911711, 9.071644365321236, 7.3089..."
16552,57418,1510,"""You git a chile in that frame of mind,""",moralized,Cely,16552,"[You, git, a, chile, in, that, frame, of, mind...",Cely_57418,0,{},{},{},{},{},{},{},{},{},{},"[13.834049620527484, 5.058872014911746, 7.3611..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2363015,13531,384,"""Six dollars, sah,""",said,Grandison,2363015,"[Six, dollars, ,, sah, ,]",Grandison_13531,0,{},{},{},{},{},{},{},{},{},{},"[9.321138481481842, 10.102986883647903, 20.862..."
2363020,13531,388,"""Wot par'ble?""",said,Grandison,2363020,"[Wot, par'ble, ?]",Grandison_13531,0,{},{},{},{},{par'ble},{},{},{},{},{},"[26.023117188254314, 31.57235209086705, 12.995..."
2363023,13531,392,"""Git up,""",said,Grandison,2363023,"[Git, up, ,]",Grandison_13531,0,{},{},{},{},{},{},{},{},{},{},"[7.981849275995357, 7.652085454951323, 21.0512..."
2368152,19746,554,"""Yo' boss is a godsen' ter dis town,""",declared,Archie,2368152,"[Yo', boss, is, a, godsen', ter, dis, town, ,]",Archie_19746,0,{},{},{},{},{},{},{},{},{},{},"[17.326783821761197, 9.31851796445723, 9.10037..."


In [38]:
i = 15904
words = PG_RQ2.loc[i, 'words']
tokens = tokenizer.tokenize(PG_RQ2.loc[i, 'quote'][1:-1])
chain = np.array(RQ2_chains[i2c[i]][1:])
get_surprisals(words, tokens, chain)

[10.300479921654476,
 24.79325874935989,
 10.009807888948112,
 29.223640937683193,
 8.621276364842362,
 8.466506710226957,
 9.343876706343544,
 11.368004967266902,
 9.567148656004152,
 8.09024207962894,
 5.450214310618973,
 18.80000372577162]

get (proportional) contributions of each variation to each ...

In [39]:
contributions = {}

# consider each speaker in turn
for sid in tqdm(set(PG_RQ2.loc[:,'sid'])):
    contributions[sid] = {}

    # get x_bar ...
    mask = (PG_RQ2.loc[:,"sid"] == sid)

    # get_suprisals
    surprisals = []
    for x in PG_RQ2.loc[mask, "surprisals"]:
        surprisals += x
    x_bar = np.array(surprisals).mean()

    # get words
    words = []
    for x in PG_RQ2.loc[mask, "words"]:
        words += x

    # consider each variation in turn
    for v in V:    

        ss = 0
        counter = 0

        for word, surprisal in zip(words, surprisals):
            if word not in variations[v]:
                ss += surprisal
                counter += 1

        contributions[sid][v] = 100*(x_bar - ss / counter) / x_bar


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1148.80it/s]


In [40]:
for v in V:
    X = []
    for sid, d in tqdm(contributions.items()):
        n = sum(PG_RQ2.loc[:, "sid"]==sid)
        if n >= 10:  # only consider sids with 10 or more quotations
            X.append((sid, d[v]))
    print(v)
    print('\t', sorted(X, key=lambda x: x[1], reverse=True)[:20])

100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1433.88it/s]


['NG']->['N']
	 [('Burl_27363', 5.0237572883687385), ('Ben_36531', 4.951759365001411), ('Ambrose_33289', 4.539784754388284), ('Linda_12352', 4.219323337624955), ('Prince_31160', 3.9231533777084504), ('Babe_31160', 3.7375233827211245), ('Alf_44879', 3.6213234145547366), ('Amanda_26928', 3.497682815502206), ('Becky_31160', 3.483738944024594), ('Janet_3619', 3.46690663366562), ('Zachariah_6013', 3.3815881393569294), ('Angel_20292', 3.324276720088045), ('Jeff_44222', 3.2946494937382225), ('Anderson_12352', 3.2884609215557106), ('Milly_11057', 3.238522275042771), ('Belindy_23810', 3.2124256253596584), ('Beamish_429', 3.1828198733948163), ('Cudjo_31406', 3.1226514675612336), ('Doggett_36283', 3.078475563429472), ('Claggett_41591', 3.02825965728287)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1439.59it/s]


dialect_words2
	 [('Bob_18318', 2.4701891701163667), ('Joshua_41857', 2.059180331290361), ('Clarissa_41857', 1.9052400665448301), ('Fanny_33407', 1.8003195503495064), ('Hannah_41857', 1.7770361199403635), ('Dicey_46381', 1.7291073734520865), ('Dan_2059', 1.7186236832935526), ('Eliab_6058', 1.651127453160761), ('Alek_39644', 1.302438442839571), ('Ned_41857', 1.2839947589992733), ('Brad_26934', 1.2785848073832007), ('Basha_15796', 1.2557677761867438), ('Tildy_24430', 1.189013049907974), ('Tildy_26429', 1.1833781725712298), ('Bijah_13531', 1.0562457417125855), ('Jerry_11228', 1.0445837060779983), ('Belindy_23810', 1.0025535039819542), ('Letty_10973', 0.983627187870083), ('Isham_10973', 0.9772770924919062), ('Tom_12352', 0.971969764141541)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1439.61it/s]


['T']->[]
	 [('Doret_4082', 2.254186586395587), ('Letty_10973', 2.201963060656129), ('Delilah_36829', 2.104290405118421), ('Demming_31134', 1.7658862169047125), ('Wellington_11057', 1.7492445390611828), ('Pallas_46586', 1.6626509385117156), ('Jazon_4097', 1.5898973202254052), ('Dicey_46381', 1.360936751597923), ('Jerry_11228', 1.3330586099164363), ('Cookie_12639', 1.2642277484760476), ('Ephraim_35423', 1.2619491906758005), ('Isham_10973', 1.251436488189813), ('Hightower_31160', 1.2485266906238108), ('Bud_26112', 1.242183214019435), ('Sandy_11228', 1.0935572319123175), ('Bobby_33234', 1.044625350422049), ('Zachariah_6013', 1.010342772923554), ('Chunk_5309', 0.9877815292235083), ('Alexandre_15881', 0.9694619755892787), ('Cook_35359', 0.9168127962515469)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1440.73it/s]


['ER']->['AH']
	 [('Jerry_11228', 5.124436952591369), ('Jeremiah_33058', 3.5097541189142527), ('Joshua_63223', 2.3856905219849254), ('Zachariah_6013', 2.340702278398244), ('Andy_52782', 2.283474676258064), ('Jeff_44222', 2.2302710196175073), ('Claggett_41591', 2.0465139992376846), ('Sandy_11228', 1.7350835224591872), ('Colonel_27741', 1.4327649151666209), ('Captain_23745', 1.3815921129205189), ('Jeems_18817', 1.3667210834908914), ('Fanny_33407', 1.3505951268650902), ('Jim_15886', 1.3218702719990483), ('Jefferson_33963', 1.298444824049504), ('Isham_10973', 1.1418222260078534), ('Alek_39644', 1.0378051284877434), ('Dicey_46381', 0.9990810686537477), ('Bennett_9149', 0.9908614789523023), ('Eradicate_4230', 0.9766336544711471), ('Bob_18318', 0.9666847296663272)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1434.96it/s]


['AH']->[]
	 [('Maria_18687', 4.754375078461045), ('B._472', 4.543408278400094), ('Creole_29439', 3.4022494516635224), ('Prince_31160', 2.6109482973364257), ('Cruzatte_42925', 2.2543982728899454), ('Andy_3734', 2.1741950811370128), ('Jupiter_35462', 2.126707056128358), ('Barnes_28439', 2.1071855359397547), ('Matthias_18332', 1.9396432235382728), ('Pallas_46586', 1.9319592913521024), ('Jute_5309', 1.9213133900608785), ('Maria_27949', 1.8595802974667386), ('Birdsall_6719', 1.8007581036062736), ('Cousin_26631', 1.7569124579573492), ('Eradicate_4230', 1.6231048364494995), ('Grandison_13531', 1.6222117140432941), ('Jimmy_5187', 1.5897117591240058), ('Tempy_24430', 1.5528666212499014), ('Tempy_26429', 1.4875933876361727), ('Dicey_46381', 1.4671087954998645)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1436.50it/s]


['R']->[]
	 [('Ephraim_35423', 1.96962986489898), ('Jim_15886', 1.63926859483702), ('Delphy_16505', 1.6193266342693942), ('Alf_44879', 1.1959370202402297), ('Bowles_6431', 1.1575914728379826), ('Euonymus_15881', 1.066958564541016), ('Sam_40013', 0.8622749617344542), ('Milly_11057', 0.8464884491285958), ('Jane_11228', 0.7846867358227055), ('Jeff_44222', 0.7845579985257668), ('Jute_5309', 0.7682214299503576), ('Harry_4746', 0.7594165658701767), ('Abel_6872', 0.6781406570415642), ('Sheba_6719', 0.6567212318887231), ('Ann_16138', 0.650656036729779), ('Blensop_9908', 0.6394215138992858), ('Fanny_33407', 0.6363455524686377), ('Gordon_25884', 0.5573727771538056), ('Birdsall_6719', 0.5546085985196278), ('Dicey_46381', 0.5250503158746274)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1438.67it/s]


dialect_words1
	 [('Hannibal_6090', 2.4940127046218867), ('Tom_12352', 2.2732973558414487), ('Daniel_12352', 1.7798142971852615), ('Hundred_55012', 1.4965679457796939), ('Wool_3792', 1.329159603769922), ('Dicey_46381', 1.30913675067905), ('Joshua_63223', 1.2728611779946524), ('Barbara_36390', 1.2658972685675633), ('Wool_24337', 1.1326599884000423), ('Sheba_6719', 1.1149819081535406), ('Maria_27949', 0.9981341042671477), ('Salters_12352', 0.9130790619049489), ('Maria_18687', 0.876759845648751), ('Abel_6872', 0.8357839425079276), ('Anderson_12352', 0.7869745999967459), ('Emile_4955', 0.7635485778733359), ('Dolf_30111', 0.742360589828714), ('Berry_6058', 0.6950652255807209), ('Chunk_5309', 0.564127487531683), ('Mammy_41598', 0.5467301251225783)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1436.13it/s]


['AE']->['AA', 'R']
	 [('Wool_24337', 2.033975185734332), ('Wool_3792', 1.683982376828534), ('Joshua_63223', 1.536891114565755), ('Zachariah_6013', 1.5105027588872986), ('Anderson_12352', 1.393760747512505), ('Joe_32757', 1.3925188944810236), ('Basha_15796', 1.378525569038142), ('Jute_5309', 1.2222785787882213), ('Harbert_50701', 0.9462382035596003), ('Joe_23789', 0.9166743142730934), ('Delilah_36829', 0.8841389653303217), ('Janet_3619', 0.8156937122054391), ('Chunk_5309', 0.8122396545778734), ('Abel_6872', 0.8041307446116265), ('Dicey_46381', 0.797506715061419), ('Katie_6376', 0.7867484606278997), ('Cely_57418', 0.7494841964464356), ('Daniel_12352', 0.6779883031017934), ('Jinkey_5309', 0.5651245891861945), ('Bub_5660', 0.5307405200304011)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1436.34it/s]


['TH']->['F']
	 [('Wool_3792', 1.5903951184648466), ('Isham_10973', 1.15733442463221), ('Cudjo_31406', 1.0436377769697767), ('Jeremiah_33058', 1.0113319404012198), ('Letty_10973', 1.002820187278338), ('Hugh_15796', 0.9696525407738117), ('Milly_11057', 0.8884700344694664), ('Zachariah_6013', 0.7884711328569556), ('Tempy_24430', 0.7677996241253745), ('Toby_31406', 0.7315420226436544), ('Wellington_11057', 0.697402610282019), ('Andy_52782', 0.6729406991457646), ('Tempy_26429', 0.6283763788657377), ('Wool_24337', 0.6207960070241343), ('Jute_5309', 0.583442768185019), ('Jim_15402', 0.5785007870096804), ('Jovial_15774', 0.525670025911959), ('Emile_4955', 0.5215454842111956), ('Nimbus_6058', 0.5190237257931488), ('Ned_41857', 0.5044843476757126)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1437.89it/s]

['TH']->['T']
	 [('Onondago_8880', 5.600656777386363), ('Koerner_40398', 2.8475917567868034), ('Creoles_42925', 2.581401109167171), ('Maka_12190', 2.389473738771174), ('Ikey_22804', 2.2754514506013233), ('Cruzatte_42925', 2.0469175756651268), ('Flopper_15578', 1.513498406544359), ('Caesar_9845', 1.4920894109695102), ('Doret_4082', 1.3585520792240515), ('Ches_25809', 1.3086256800691978), ('Belindy_23810', 1.0159469058202717), ('Corlaer_70683', 0.9475260308608753), ('Nick_10434', 0.9299336418751593), ('Laguerre_34567', 0.7858776853132002), ('Chainbearer_34916', 0.7437243341793532), ('Dooley_35374', 0.7435202002665775), ('Juan_51987', 0.5914309465587408), ('Fritz_37149', 0.5140027238813977), ('General_15328', 0.46288671189114444), ('Jinkey_5309', 0.4448252019653905)]





# get contributions by speaker

correcting for downstream effect

## get corrected quotations

In [41]:
from collections import deque

def word_swaps(words:list[str], replacements:dict[str,str]):
    swaps = [(word, replacements[word]) for word in words if word in replacements.keys()]
    return swaps

def replace_strings(quote:str, replacements:dict[str,str]):

    # identify what is to be swapped in order
    words = get_words(quote)
    swaps = deque(word_swaps(words, replacements)) # state

    # for every word (in quote), associate it with its quote.find index
    word_and_in_text_index = []
    start = 0
    for word in words:
        quote_index = quote.find(word, start)
        word_and_in_text_index.append((word, quote_index))
        start = quote_index + len(word)
    word_and_in_text_index = deque(word_and_in_text_index)

    # build the new quote
    new_quote:str = ""
    start = 0
    while len(swaps) > 0:
        old_word, new_word = swaps.popleft()

        while True:
            word, quote_index = word_and_in_text_index.popleft()

            if old_word == word:
                new_quote += quote[start:quote_index] + new_word
                start = quote_index + len(old_word)
                break      

    new_quote += quote[start:]
                
    return new_quote
        
display(replace_strings("Dat water is dealt, Dat's _is_ good", {"Dat":"That", "_is_":"is"}))
display(replace_strings("'cause I say so!", {"'cause":"because"}))
display(replace_strings("don' do it", {"don'":"don't"}))
display(replace_strings("and _why_ do you think that?", {"_why_":"why"}))
display(replace_strings("I'se going", {"I'se":"I am"}))


"That water is dealt, Dat's is good"

'because I say so!'

"don't do it"

'and why do you think that?'

'I am going'

In [42]:
# for each quote wrt., our sids of interest, for each variation of interest, build an corrected context
corrected_quotes = []
count_no_variation = 0
count_variation = 0

# iterate over the 
for i, row in tqdm(PG_RQ2.loc[PG_RQ2.loc[:,"sid"].isin(sids),:].iterrows()):
    if i not in bad_encoding_i:
        
        quote = row['quote'][1:-1]
    
        some_variation = False
        
        # handle phonetic variations
        for v in ["['NG']->['N']", "['T']->[]", "['ER']->['AH']", "['AH']->[]", "['R']->[]", "['AE']->['AA', 'R']","['TH']->['F']", "['TH']->['T']"]:
            demonstrative_words:set = row[v]
            if len(demonstrative_words) > 0:
                replacements = {x: annotations[x][0] for x in demonstrative_words}
                corrected_quote = replace_strings(quote, replacements)
                corrected_quotes.append([i, replacements, v, corrected_quote])
                some_variation = True
    
        for v in ['dialect_words1']:
            demonstrative_words:set = row[v]
            if len(demonstrative_words) > 0:
                replacements = {x: dialect_words1[x] for x in demonstrative_words}
                corrected_quote = replace_strings(quote, replacements)
                corrected_quotes.append([i, replacements, v, corrected_quote])
                some_variation = True
    
        for v in ['dialect_words2']:
            demonstrative_words:set = row[v]
            if len(demonstrative_words) > 0:
                replacements = {x: dialect_words2[x] for x in demonstrative_words}
                corrected_quote = replace_strings(quote, replacements)
                corrected_quotes.append([i, replacements, v, corrected_quote])
                some_variation = True
    
        # let's check that everything adds up!
        if some_variation == False:
            count_no_variation += 1
        else:
            count_variation += 1
            
print(len(corrected_quotes))
print(count_no_variation + count_variation)

13976it [00:00, 34181.42it/s]

3972
13976





In [43]:
# with open("RQ2_downstream/all_corrected.json", "w") as f:
#     json.dump(corrected_quotes, f)

## load corrected and chains

In [44]:
with open("RQ2_downstream/all_corrected.json", "r") as f:
    corrected_quotes = json.load(f)
print(len(corrected_quotes))

with open("RQ2_downstream/chains_llama3.1_70B.json", "r") as f:
    chains_corrected = json.load(f)
print(len(chains_corrected))


3972
3972


## let's check ... are we able to calculate surprisals for all corrected quotes?

In [45]:
corrected_quotes[0]

[15904,
 {'jes': 'just'},
 "['T']->[]",
 "Well, she wa'n't just as you might say a baby,"]

In [46]:
for c, (i, r, v, quote) in tqdm(enumerate(corrected_quotes)):
    words = get_words(quote)
    tokens = tokenizer.tokenize(quote)
    chain = np.array(chains_corrected[c][1:])
    get_surprisals(words, tokens, chain)

3972it [00:00, 4137.81it/s]


## calculate contributions

In [47]:
# consider all variations found in the corrected quotes
V = set([v for i, r, v, quote in corrected_quotes])

contributions_c = {}

for sid in tqdm(set(PG_RQ2.loc[:,'sid'])):
    contributions_c[sid] = {}

    # for quotation collection corresponding to sid, get \bar{S}_collection, which we label x_bar
    surprisals = []
    for s in PG_RQ2.loc[PG_RQ2.loc[:,'sid']==sid, "surprisals"]:
        surprisals += s
    x_bar =  np.mean(surprisals)
    
    # consider each variation in turn
    for v in V:

        # where I is the PG_RQ2 indices in the corrections which match sid and variation considered
        i2c = {i:c for c, (i, d, variation, quote) in enumerate(corrected_quotes) if v==variation and PG_RQ2.loc[i,'sid']==sid}
        I = list(i2c.keys())

        # collect surprisal for all non-corrected quotes (wrt., variation considered), for sid
        surprisals = []
        for s in PG_RQ2.loc[(~PG_RQ2.index.isin(I)) & (PG_RQ2.loc[:,"sid"]==sid), "surprisals"]:
            surprisals += s

        # collect surprisals for 
        for i in I:

            # get surprisals (by word) wrt., corrected quotations
            quote_c = corrected_quotes[i2c[i]][3]
            chain_c = np.array(chains_corrected[i2c[i]][1:])  # ignore standard beginning token given to all chains
            words_c = get_words(quote_c)
            tokens_c = tokenizer.tokenize(quote_c)
            surprisals_c = get_surprisals(words_c, tokens_c, chain_c)


            # sum the surprisals where corresponding words are unchanged between correction and original 
            matcher = difflib.SequenceMatcher(None, PG_RQ2.loc[i, 'words'], words_c)
            opcodes = matcher.get_opcodes()
            for tag, i1, i2, j1, j2 in opcodes:  # where x1
                if tag == "equal":
                    surprisals += surprisals_c[j1:j2]
    
        # report
        contributions_c[sid][v] = 100*(x_bar - np.array(surprisals).mean())/x_bar


100%|█████████████████████████████████████████| 677/677 [00:12<00:00, 53.96it/s]


In [50]:
for v in V:
    X = []
    for sid, d in tqdm(contributions_c.items()):
        n = sum(PG_RQ2.loc[:, "sid"]==sid)
        if n >= 10:  # only consider sids with 10 or more quotations
            X.append((sid, d[v]))
    print(v)
    print('\t', sorted(X, key=lambda x: x[1], reverse=True)[:20])

100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1307.53it/s]


dialect_words1
	 [('Tom_12352', 2.9390584210041886), ('Hannibal_6090', 2.8765395830710205), ('Daniel_12352', 1.7571945091994754), ('Hundred_55012', 1.7363105441098197), ('Wool_3792', 1.7118320125233144), ('Joshua_63223', 1.7110194134608334), ('Dicey_46381', 1.4484662490347922), ('Wool_24337', 1.4386347448116659), ('Sheba_6719', 1.331762845806213), ('Barbara_36390', 1.3195163701594073), ('Abel_6872', 1.2199185289922951), ('Maria_27949', 1.1147815387527567), ('Berry_6058', 1.0157565320395492), ('Salters_12352', 1.0081329249788686), ('Bud_26112', 1.0006277481286676), ('Maria_18687', 0.9882277756710929), ('Dolf_30111', 0.9244306490905603), ('Anderson_12352', 0.8843221392018511), ('Chunk_5309', 0.8413416632258779), ('Emile_4955', 0.8084398733823298)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1414.22it/s]


['T']->[]
	 [('Doret_4082', 2.5152303813209005), ('Cookie_12639', 2.4316030346248985), ('Delilah_36829', 2.2090681785638995), ('Letty_10973', 2.1550970149701425), ('Jazon_4097', 1.8889194641496019), ('Wellington_11057', 1.7699896782760387), ('Bobby_33234', 1.6406721586819855), ('Jerry_11228', 1.5402490585732327), ('Ephraim_35423', 1.5317041916152614), ('Pallas_46586', 1.4029264434828412), ('Hightower_31160', 1.3786341060610938), ('Isham_10973', 1.200921503728235), ('Demming_31134', 1.1607504374846436), ('Dicey_46381', 1.1351165726720815), ('Jim_15886', 1.0896395265387953), ('Bud_26112', 0.9889766906199602), ('Alexandre_15881', 0.9433574869578928), ('Jefferson_33963', 0.8273196513713529), ('Bishop_23637', 0.8018186546903455), ('Billy_5187', 0.7895140022097054)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1408.18it/s]


['TH']->['T']
	 [('Onondago_8880', 4.823120470558763), ('Koerner_40398', 2.76979678353384), ('Creoles_42925', 2.5598885277259145), ('Ikey_22804', 2.3675608555697076), ('Maka_12190', 2.3625756164924256), ('Cruzatte_42925', 2.0397382209565094), ('Doret_4082', 1.4602869554285065), ('Flopper_15578', 1.3945470362966472), ('Ches_25809', 1.3061893413743704), ('Chainbearer_34916', 1.1989139812889937), ('Caesar_9845', 1.051862365587308), ('Belindy_23810', 1.0237251945244326), ('Corlaer_70683', 1.0097741926082913), ('Nick_10434', 0.9585991112407487), ('Laguerre_34567', 0.8271469806036231), ('Dooley_35374', 0.7567051500888772), ('Fritz_37149', 0.5277453102092591), ('Juan_51987', 0.4386541285003227), ('Jack_26429', 0.43476599925028825), ('Jack_24430', 0.43476599925028825)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1416.34it/s]


['R']->[]
	 [('Delphy_16505', 1.9163038674516177), ('Ephraim_35423', 1.6468348059607572), ('Jim_15886', 1.6094660533569665), ('Euonymus_15881', 1.3717100686964163), ('Jane_11228', 1.261987239744906), ('Alf_44879', 1.0434213400539092), ('Bowles_6431', 0.9367745277667101), ('Abel_6872', 0.9285424049269105), ('Ann_16138', 0.8759228224168657), ('Harry_4746', 0.8111702752885832), ('Milly_11057', 0.7881116955287221), ('Jute_5309', 0.7625077571603367), ('Sam_40013', 0.7483206680883507), ('Dicey_46381', 0.7119731632172335), ('Daniel_50494', 0.6212210095730941), ('Tempy_24430', 0.594295848279164), ('Blensop_9908', 0.5937418836584701), ('Sheba_6719', 0.5921101600208288), ('Gordon_25884', 0.5776979530550072), ('Birdsall_6719', 0.5658378828641712)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1417.21it/s]


['NG']->['N']
	 [('Burl_27363', 5.189947058678523), ('Ben_36531', 4.931543202476135), ('Ambrose_33289', 4.503479545680077), ('Prince_31160', 4.101124427248771), ('Linda_12352', 3.9652283175278633), ('Milly_11057', 3.828532824341438), ('Becky_31160', 3.6758563915914424), ('Alf_44879', 3.638549012633227), ('Cudjo_31406', 3.4859939188703493), ('Amanda_26928', 3.437100028413665), ('Babe_31160', 3.4326120773926836), ('Zachariah_6013', 3.3585662723361436), ('Isham_10973', 3.333115327390869), ('Anson_21850', 3.3045592161142086), ('Doggett_36283', 3.249514899511497), ('Janet_3619', 3.2003096001130062), ('Mammy_41598', 3.2002365151193093), ('Belindy_23810', 3.1967056641166742), ('Elmer_6908', 3.189593254688915), ('Adam_18286', 3.1012280898964306)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1415.02it/s]


dialect_words2
	 [('Bob_18318', 2.7043134099076855), ('Fanny_33407', 2.519934855104532), ('Dicey_46381', 2.4721133377249522), ('Clarissa_41857', 2.183391354221218), ('Hannah_41857', 2.1165024198144518), ('Brad_26934', 1.8327371302003264), ('Joshua_41857', 1.801227246891708), ('Dan_2059', 1.793590333777779), ('Ned_41857', 1.6256525219359832), ('Eliab_6058', 1.601874081337017), ('Crissy_50701', 1.5225806301720866), ('Basha_15796', 1.4808506298585142), ('Jerry_11228', 1.471704826729092), ('Tildy_26429', 1.4470826230813558), ('Alek_39644', 1.4348588885738172), ('Tildy_24430', 1.386505079425229), ('Isham_10973', 1.378114423759967), ('Belindy_23810', 1.2591974168433793), ('Ike_34575', 1.1043427201615992), ('Tom_12352', 1.055262424465669)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1403.22it/s]


['AE']->['AA', 'R']
	 [('Joshua_63223', 2.0452126231398227), ('Wool_3792', 1.955522627404107), ('Zachariah_6013', 1.637462285799035), ('Anderson_12352', 1.6193297896360848), ('Joe_32757', 1.5967325997593314), ('Wool_24337', 1.454798877938226), ('Jute_5309', 1.317396491105517), ('Tom_12352', 1.08153637853513), ('Dicey_46381', 1.0411117515835295), ('Janet_3619', 0.9947544484150048), ('Katie_6376', 0.984004559248543), ('Ephraim_35423', 0.9672366225375747), ('Chunk_5309', 0.9366949559909402), ('Harbert_50701', 0.9080875425227295), ('Basha_15796', 0.9029097286110728), ('Abel_6872', 0.804503855938033), ('Jinkey_5309', 0.6995238597820032), ('Daniel_12352', 0.6749502635601802), ('Cely_57418', 0.6666512758706676), ('Cleave_22066', 0.6399427185490221)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1344.85it/s]


['AH']->[]
	 [('Maria_18687', 5.366696980928583), ('B._472', 5.161058933060685), ('Creole_29439', 3.49950243197036), ('Prince_31160', 2.6734868683704756), ('Matthias_18332', 2.6051050813405436), ('Maria_27949', 2.4834037930474118), ('Cruzatte_42925', 2.3822249491135943), ('Jupiter_35462', 2.1654665447362866), ('Andy_3734', 1.9773327672747163), ('Jute_5309', 1.9476604693048507), ('Jimmy_5187', 1.8647880121891884), ('Benton_18332', 1.8512349143094913), ('Pallas_46586', 1.790331041570259), ('Birdsall_6719', 1.7500202907986002), ('Tempy_24430', 1.7373022444675341), ('Cousin_26631', 1.7308123022612714), ('Grandison_13531', 1.7298466210163927), ('Ann_28439', 1.6737135144721909), ('Eradicate_4230', 1.6545249355146354), ('Tempy_26429', 1.6538865640092455)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1385.82it/s]


['ER']->['AH']
	 [('Jerry_11228', 4.917827065427178), ('Jeremiah_33058', 3.6944668660165174), ('Captain_23745', 3.238635428570913), ('Jeff_44222', 2.353986837777089), ('Andy_52782', 2.3533455533941705), ('Colonel_27741', 2.326687126570132), ('Zachariah_6013', 2.193595195693582), ('Claggett_41591', 2.1757300944633826), ('Sandy_11228', 1.7660462381408744), ('Joshua_63223', 1.7623612957699868), ('Jeems_18817', 1.492446905981333), ('Fanny_33407', 1.355208369090468), ('Bob_18318', 1.315079803846893), ('Jim_15886', 1.2036866626914364), ('Jefferson_33963', 1.1327912678388048), ('Isham_10973', 1.0481550029336035), ('Alek_39644', 1.029551807434017), ('Dicey_46381', 0.9683081583943917), ('Anderson_12352', 0.9481247849869717), ('Agnes_41590', 0.810560185583066)]


100%|███████████████████████████████████████| 677/677 [00:00<00:00, 1078.39it/s]

['TH']->['F']
	 [('Wool_3792', 1.3662486074351676), ('Isham_10973', 1.3094789735899826), ('Cudjo_31406', 1.1533292337068506), ('Jim_15402', 1.050447066973604), ('Letty_10973', 1.036659753302553), ('Milly_11057', 1.030502800687011), ('Jeremiah_33058', 1.0117688730228742), ('Hugh_15796', 0.9764983906040955), ('Zachariah_6013', 0.8893256203471266), ('Jute_5309', 0.8665683014544661), ('Wellington_11057', 0.8396746756312371), ('Tempy_24430', 0.7604670637940983), ('Toby_31406', 0.7370884682816873), ('Andy_52782', 0.662246796982849), ('Wool_24337', 0.6399077658616067), ('Tempy_26429', 0.6340718719652171), ('Plato_33058', 0.5377648860985372), ('Emile_4955', 0.5328142965962968), ('Nimbus_6058', 0.5261473984842786), ('Ned_41857', 0.5118613547959292)]



