RQ3: According to LLM suprisal, to what extent do quotations attributed to speakers referenced by
’n-word’ differ between the American Literate and American News domains?

In [1]:
import json
from itertools import chain
import re
import string
from collections import defaultdict, Counter
from tqdm import tqdm
import pandas as pd
import pathlib
import numpy as np

import arviz as az
import pymc as pm
from pprint import pprint as pp
import copy

from transformers import AutoTokenizer
from nltk.tokenize import WhitespaceTokenizer, word_tokenize

tqdm.pandas()

# Tokenizing functions

In [2]:
tokenizer = AutoTokenizer.from_pretrained('../llama3.1_70B/tokenizer/')

tk = WhitespaceTokenizer()
whitespace_tokenize = tk.tokenize

In [3]:
def get_words(s:str)->list[str]:
    """ split into words, ensure that contractions aren't split
        Note: This is my N wrt., 
        Note: why not word_tokenize() ? ... because it splits contractions
    """

    # split on whitespace
    words = whitespace_tokenize(s)

    # split off cases where word is enclosed by '' or `'
    words_ = []
    for word in words:
        if (word[0] == "'" and word[-1] == "'") or (word[0] == "`" and word[-1] == "'") or (word[0] == "\"" and word[-1] == "\""):
            words_.append(word[0])
            words_.append(word[1:-1])
            words_.append(word[-1])
        else:
            words_.append(word)
    words = words_
    

    # split of multiple hyphens, elipses, honorifics, initial (followed by dot), non alphanumeric/hyphen/apostrophe
    words_ = []
    for word in words:
        words_ += re.split(r"(-{2,}|\.\.\.|Mr\.|Mrs\.|Dr\.|Prof\.|[A-Z]\.|[^A-Za-z0-9_'-])", word)
    words = words_

    # finally, remove empty strings
    words = [word for word in words if word != ""]

    return words
                
    
def get_words_indices(words, tokens):

    words_indices = []

    tokens_ = [token[1:] if token[0] == "Ġ" else token for token in tokens]

    w = 0
    state = ["", [], True]  # accum of tokens, assum token indices, add to state?

    i = 0
    
    while i < len(tokens_):

        token = tokens_[i]

        if tokens[i][-1] == "Ġ":  # we have \s+ that's been mapped to a token ... skip it ... it's rare
            i+=1
            continue            
        
        # add more tokens to state and record the previous state
        if state[2] == True:
            state[0] += token
            state[1].append(i)

        # we've completed a word!
        if state[0] == words[w]:
            words_indices.append(state[1])
            w += 1
            state = ["", [], True]

        # state exceeds current word ... we have tokens what span words
        elif state[0][:len(words[w])]==words[w]:
            
            # add a decimal to indicate the root to take from token common between words
            excess = len(state[0]) - len(words[w])
            wanted = len(token) - excess
            state[1][-1] = state[1][-1] + wanted/len(token)

            words_indices.append(state[1])
            w+=1

            # init state for next word, incorporating excess of common token
            state = [token[-excess:], [i + (excess / len(token))], False]

            continue

        i += 1

    return words_indices
            

# Identify phonological variations in the LOC n-word quotations

load tuples

In [4]:
with open("../../LOC/tuples_news.json", "r") as f:
    tuples_news = json.load(f)
len(tuples_news)

1742

identify ood words

In [5]:
# scowl english words en_us large
with open('../../wordlist-en_US-large-2020.12.07/en_US-large.txt', 'r') as f:
    en_words = [line.strip('\n') for line in f.readlines()]
en_words += [w.capitalize() for w in en_words]
en_words = set(en_words)
len(en_words)

303817

In [6]:
def is_standard_contr(word:str)->bool:
    # ignore standard contractions 'll, 's, 'd, 've where, the 'stem' is in dict
    if (word[-3:] == "'ll" and word[:-3] in en_words) or (word[-3:] == "'ve" and word[:-3] in en_words) or (word[-2:] == "'s" and word[:-2] in en_words) or (word[-3:] == "n't" and word[:-3] in en_words) or (word[-3:] == "'re" and word[:-3] in en_words)  or (word[-2:] == "'d" and word[:-2] in en_words) or (word[-2:] == "'m" and word[:-2] in en_words) or (word[-1:] == "'" and word[:-1] in en_words):
        return False
    else:
        return True

def is_hyphenated(word:str)->bool:
    if all([w in en_words for w in word.split('-')]):
        return True
    else:
        False
        

get ood words (commented out, so don't overwrite annotations)

In [7]:
# get all words from all quotations
words = list(chain(*[get_words(tuple_[1][0]) for tuple_ in tuples_news]))

# get ood words
ood_words = list(set([word for word in words if not (word.strip('-') in en_words or word in string.punctuation or not is_standard_contr(word.strip('-')) or word.isnumeric() or is_hyphenated(word))]))

# save ood words
with open(f"RQ3_ood_words/RQ3_ood_words_pre.json", 'w') as f:
        json.dump({word:[] for word in ood_words}, f)


annotate ... cell below useful for examining parent quotations of ood words


In [8]:
word = "yah"
for tuple_ in tuples_news:
    q = tuple_[1][0]
    words = get_words(q)
    if word in words:
        print(q.replace(word, '<<'+word+'>>'))

Soup? - <<yah>>, ha! What a looking feller dat is to make soup ob! Heah Caesar, bite him,


# Build amended quotations for each targeted variation

## Variations targetted in LOC

dialect words

In [9]:
dialect_words1 = {
 "i's": 'i am',
 "i'se": 'i am',
 "we'se": "we are",
 "we's": "we are",
 "they'se": "they are",
 "they's": "they are",
 "dey'se": "dey are",
 "dey's": "dey are",
}
dialect_words1.update({k.capitalize():v.capitalize() for k, v in dialect_words1.items()})
print(dialect_words1.keys())

dialect_words2 = {
 'a-gwine': 'going',
 'agwine': 'going',
 'ergwine': 'going',
 'gwine': 'going',
}
dialect_words2.update({k.capitalize():v.capitalize() for k, v in dialect_words2.items()})
print(dialect_words2.keys())


dict_keys(["i's", "i'se", "we'se", "we's", "they'se", "they's", "dey'se", "dey's", "I's", "I'se", "We'se", "We's", "They'se", "They's", "Dey'se", "Dey's"])
dict_keys(['a-gwine', 'agwine', 'ergwine', 'gwine', 'A-gwine', 'Agwine', 'Ergwine', 'Gwine'])


In [10]:
tuples_news[0]

['https://chroniclingamerica.loc.gov/lccn/sn83025182/1910-05-22/ed-1/seq-6/#words=negro',
 ["Hit only cost me a string er fish ter git married, jedge, but, please God, I'd give a whale ter git rid er her.",
  'said',
  'negro']]

italicised

In [11]:
# get list of italicised words
def is_italicised(word:str)->bool:
    if len(word) > 2:
        if word[0] == "_" and word[-1] == "_":
            return True
    return False

italicised_words = dict()
for url, (quote, manner, speaker) in tuples_news:
    words = get_words(quote)
    for word in words:
        if is_italicised(word):
            italicised_words[word] = word.strip("_")

italicised_words

{}

In [12]:
regularised_irregular = {
 'beated': 'beat',
 'becomed': 'became',
 'beginnned': 'began',
 'bended': 'bent',
 'bidded': 'bid',
 'binded': 'bound', #
 'bited': 'bit',
 'bleeded': 'bled', #
 'breaked': 'broke',
 'bringed': 'brought',
 'builded': 'built',
 'buyed': 'bought',
 'catched': 'caught',  # 
 'choosed': 'chose',
 'comed': 'came',
 'dealed': 'dealt',
 'doesed': 'did',
 'drawed': 'drew',
 'drinked': 'drunk',
 'drived': 'drove', # 
 'eated': 'ate',
 'falled': 'fell',
 'feeded': 'fed',
 'feeled': 'felt',  # 
 'fighted': 'fought',
 'finded': 'found',
 'forgetted': 'forgot',
 'getted': 'got',
 'gived': 'gave',
 'goed': 'went',
 'growed': 'grew',
 'hased': 'had',
 'haved': 'had',
 'holded': 'held',
 'hurted': 'hurt',
 'ised': 'was',
 'keeped': 'kept',
 'knowed': 'knew',
 'leaded': 'led',
 'maked': 'made',
 'meeted': 'met',
 'mistaked': 'mistook',
 'readed': 'read',
 'rided': 'rode',
 'rised': 'rose',
 'runned': 'ran',
 'sayed': 'said',
 'seeked': 'sought',
 'sended': 'sent',
 'shalled': 'should',
 'shooted': 'shot',
 'sinked': 'sunk',
 'sitted': 'sat',
 'sleeped': 'slept',
 'speaked': 'spoke',
 'spended': 'spent',
 'springed': 'sprung',
 'standed': 'stood',
 'stealed': 'stole',
 'striked': 'struck',
 'swimmed': 'swum',
 'swinged': 'swung',
 'taked': 'took',
 'thinked': 'thought',
 'throwed': 'threw',
 'understanded': 'understood',
 'winned': 'won',
 'writed': 'wrote'}
regularised_irregular.update({k.capitalize():v.capitalize() for k, v in regularised_irregular.items()})

### various resources and functions for identifying phonological variants

load cmu

In [13]:
# import
import requests
lines = requests.get("http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b").text.splitlines()

# ignore eveything up the 'A  AH0'
lines = lines[lines.index('A  AH0'):]

# ignore stresses as deep phenomizer doesnt have these
word2arpa = defaultdict(list)
for line in lines:
    splits = [split for split in line.split(" ") if split != '']
    word = splits[0].split('(')[0]
    arpa = "".join(['[' + split.translate(str.maketrans('', '', '0123456789')) + ']' for split in splits[1:]])
    if word not in word2arpa: # just get the main pronunciaion
        word2arpa[word].append(arpa)

word2arpa['WAS']

['[W][AA][Z]']

load deep phonomizer

In [14]:
# load dp
from dp.phonemizer import Phonemizer
phonemizer = Phonemizer.from_checkpoint('./en_us_cmudict_forward.pt')
phonemizer('WAS', lang='en_us')

  checkpoint = torch.load(checkpoint_path, map_location=device)


'[W][AA][Z]'

useful functions ...

In [15]:
def arpaguess(s:str):
    if s.upper() not in word2arpa:
        return [phonemizer(s, lang='en_us')]
    else:
        return word2arpa[s.upper()]

arpaguess("going")

['[G][OW][IH][NG]']

In [16]:
import difflib
def get_changes(from_:list[str], to_:list[str], return_indices=False)->bool:

    # accumulator
    changes = []

    # state
    in_change = False

    for x in difflib.ndiff(from_, to_): #iterate over diff
        # print(x, in_change)
        
        if x[0] == "+" or x[0] == "-":  # change detected
            sign, entry = x.split(" ")

            # new change? then init addition to changes
            if in_change == False:
                changes.append([[], []])
            
            # regardless, record state
            in_change = True

            # regardless, capture changes
            if sign == '-':
                changes[-1][0].append(entry)
            elif sign == '+':
                changes[-1][1].append(entry)

        elif x[0] == "?":
            pass
            
        else:  # match detected
                
            # regardless, record state    
            in_change = False

    return changes
            
print(get_changes(['a','b','b','c','e'], ['a','b','d','e']))
print(get_changes(["it", "oughta", "happen"], ["it", "ought", "to", "happen"]))
print(get_changes("it oughta happen".split(), "it ought to happen".split()))
list(difflib.ndiff(["it", "oughta", "happen"], ["it", "ought", "to", "happen"]))
    

[[['b', 'c'], ['d']]]
[[['oughta'], ['ought', 'to']]]
[[['oughta'], ['ought', 'to']]]


['  it', '- oughta', '?      -\n', '+ ought', '+ to', '  happen']

specify 'DH->D' phonological variations

In [17]:
phonological_variations_LOC = {}


In [18]:
DH_D = {
    "de":["the"], 
    "dey":["they"],
    "dey's":["they's"],
    "dey'll":["they'll"],
    "dey'd":["they'd"],
    "dis":["this"],
    "dis'll":["this'll"],
    "dis'd":["this'd"],
    "dat":["that"],
    "dat's":["that"],
    "dat'll":["that'll"],
    "dat'd":["that'd"],
    "De":["The"], 
    "Dey":["They"],
    "Dey's":["They's"],
    "Dey'll":["They'll"],
    "Dey'd":["They'd"],
    "Dis":["This"],
    "Dis'll":["This'll"],
    "Dis'd":["This'd"],
    "Dat":["That"],
    "Dat's":["That"],
    "Dat'll":["That'll"],
    "Dat'd":["That'd"]
}

for speakers in [["Negro", "negro"]]:

    s = ",".join(speakers)
    phonological_variations_LOC = {"['DH']->['D']":set(list(DH_D.keys()))}

manually_assigned = set(list(DH_D.keys()))

identify phonological variations from annotations wrt., n-word attributed quotations in LOC 

In [19]:
# phonological_variations_LOC = {}

# ------
# get a dict of words by phonological variation: i.e., variation2words['man']["['NG']->['N']"] = ["comin'", ...]
# ------
with open(f"RQ3_ood_words/RQ3_ood_words.json", "r") as f:
    annotations = json.load(f)

for non_canonical, corrections in annotations.items():
    if len(corrections) == 1 and non_canonical not in manually_assigned:
        canonical = corrections[0]

        for from_, to_ in get_changes(re.findall(r"\w+", arpaguess(canonical)[0]), re.findall(r"\w+", arpaguess(non_canonical)[0])):
            # capture instances
            if f"{from_}->{to_}" not in phonological_variations_LOC:
                phonological_variations_LOC[f"{from_}->{to_}"] = set()
            phonological_variations_LOC[f"{from_}->{to_}"].add(non_canonical)

phonological_variations_LOC

{"['DH']->['D']": {"'dat",
  'Dat',
  "Dat'd",
  "Dat'll",
  "Dat's",
  'De',
  "Deh's",
  'Dese',
  'Dey',
  "Dey'd",
  "Dey'll",
  "Dey's",
  'Dis',
  "Dis'd",
  "Dis'll",
  'brudder',
  'dan',
  'dat',
  "dat'd",
  "dat'll",
  "dat's",
  'dats',
  'de',
  'dem',
  'dese',
  'dey',
  "dey'd",
  "dey'll",
  "dey's",
  'deze',
  'dis',
  "dis'd",
  "dis'll",
  'disyeh',
  'furder',
  'nudder',
  'togedder',
  'wid'},
 "['IH', 'K']->[]": {"'cept",
  "'scuse",
  "'specs",
  "'sperience",
  "'splain",
  "'spress",
  "'spression",
  "s'peck",
  'spected'},
 "['T']->[]": {"'sackly",
  "'specs",
  "Doan'",
  "Jes'",
  'des',
  "didn'",
  'doan',
  "doan'",
  "excep'",
  'ezzactly',
  "genalmun's",
  'gennulman',
  "hain'",
  'jes',
  "jes'",
  "las'",
  "lef'",
  'lif',
  "mos'ly",
  "nex'",
  "s'peck",
  "spec'able",
  "trus'"},
 "['G', 'EH']->['JH', 'IY']": {'giti'},
 "[]->['IY']": {'Jedge', 'giti', 'jedge', 'lawdy', 'useter'},
 "['AH']->['EH']": {'Jedge', "Jes'", 'jedge', 'jes', "jes'", '

## build quotations corrected of variations (targeting each variation separately)

thus, e.g., if a quote has 2 targetted variations, then there'll be 2 instnaces of a corrected quotations, each noted against the variation in question

Note: once built, chain chains...


In [20]:
corrected_quotes = []

get corrected quotations wrt., dialect_words

In [21]:
from collections import deque

def word_swaps(words:list[str], replacements:dict[str,str]):
    swaps = [(word, replacements[word]) for word in words if word in replacements.keys()]
    return swaps

def replace_strings(quote:str, replacements:dict[str,str]):

    # identify what is to be swapped in order
    words = get_words(quote)
    swaps = deque(word_swaps(words, replacements)) # state

    # for every word (in quote), associate it with its quote.find index
    word_and_in_text_index = []
    start = 0
    for word in words:
        quote_index = quote.find(word, start)
        word_and_in_text_index.append((word, quote_index))
        start = quote_index + len(word)
    word_and_in_text_index = deque(word_and_in_text_index)

    # build the new quote
    new_quote:str = ""
    start = 0
    while len(swaps) > 0:
        old_word, new_word = swaps.popleft()

        while True:
            word, quote_index = word_and_in_text_index.popleft()

            if old_word == word:
                new_quote += quote[start:quote_index] + new_word
                start = quote_index + len(old_word)
                break      

    new_quote += quote[start:]
                
    return new_quote
        
display(replace_strings("Dat water is dealt, Dat's _is_ good", {"Dat":"That", "_is_":"is"}))
display(replace_strings("'cause I say so!", {"'cause":"because"}))
display(replace_strings("don' do it", {"don'":"don't"}))
display(replace_strings("and _why_ do you think that?", {"_why_":"why"}))
display(replace_strings("I'se going", {"I'se":"I am"}))


# replacements = {'de':'the'}
# q = PG_df.loc[13811, "quote"][1:-1]
# print(q)
# print(replace_strings(q, replacements))



"That water is dealt, Dat's is good"

'because I say so!'

"don't do it"

'and why do you think that?'

'I am going'

In [22]:
# get corrections for italicised words
def is_italicised(word:str)->bool:
    if len(word) > 2:
        if word[0] == "_" and word[-1] == "_":
            return True
    return False

targets = set(italicised_words.keys())

counter = 0
for i, tuple_ in enumerate(tuples_news):
    
    quote = tuple_[1][0]
    words = get_words(quote)

    targets_present = set(words).intersection(targets)

    if len(targets_present) > 0:
        counter += 1

        # get dict of all corrections to be made
        replacements = {t:t[1:-1] for t in targets_present}

        # make corrections
        corrected_quote = replace_strings(quote, replacements)

        # record
        corrected_quotes.append([i, replacements, "italicised", corrected_quote])

targets = set(italicised_words.keys())

counter

0

In [23]:
targets = set(dialect_words1.keys())

counter = 0
for i, tuple_ in enumerate(tuples_news):
    
    quote = tuple_[1][0]
    words = get_words(quote)

    targets_present = set(words).intersection(targets)

    if len(targets_present) > 0:
        counter += 1

        # get dict of all corrections to be made
        replacements = {t:dialect_words1[t] for t in targets_present}

        # make corrections
        corrected_quote = replace_strings(quote, replacements)

        # record
        corrected_quotes.append([i, replacements, "dialect_words1", corrected_quote])

counter

59

In [24]:
targets = set(dialect_words2.keys())

counter = 0
for i, tuple_ in enumerate(tuples_news):
    
    quote = tuple_[1][0]
    words = get_words(quote)

    targets_present = set(words).intersection(targets)

    if len(targets_present) > 0:
        counter += 1

        # get dict of all corrections to be made
        replacements = {t:dialect_words2[t] for t in targets_present}

        # make corrections
        corrected_quote = replace_strings(quote, replacements)

        # record
        corrected_quotes.append([i, replacements, "dialect_words2", corrected_quote])

counter

42

get corrected quotations wrt., regularised irregular words

In [25]:
targets = set(regularised_irregular.keys())

counter = 0
for i, tuple_ in enumerate(tuples_news):
    
    quote = tuple_[1][0]
    words = get_words(quote)

    targets_present = set(words).intersection(targets)

    if len(targets_present) > 0:
        counter += 1

        # get dict of all corrections to be made
        replacements = {t:regularised_irregular[t] for t in targets_present}

        # make corrections
        corrected_quote = replace_strings(quote, replacements)

        # record
        corrected_quotes.append([i, replacements, "regularised_irregular", corrected_quote])

counter

12

get corrected quotations wrt., phonological variations

In [26]:
with open(f"RQ3_ood_words/RQ3_ood_words.json", "r") as f:
    annotations = json.load(f)

counter = 0
for v in tqdm(phonological_variations_LOC):

    targets:set = phonological_variations_LOC[v].intersection([k for k,v in annotations.items() if len(v) == 1])

    for i, tuple_ in enumerate(tuples_news):
        
        quote = tuple_[1][0]
        words = get_words(quote)
    
        targets_present = set(words).intersection(targets)
    
        if len(targets_present) > 0:
            counter += 1
    
            # get dict of all corrections to be made
            replacements = {t:annotations[t][0] for t in targets_present}
    
            # make corrections
            corrected_quote = replace_strings(quote, replacements)
    
            # record
            corrected_quotes.append([i, replacements, v, corrected_quote])

counter

100%|█████████████████████████████████████████| 272/272 [00:03<00:00, 74.07it/s]


2637

In [27]:
display(corrected_quotes[:1])
len(corrected_quotes)

[[86,
  {"I's": 'I am'},
  'dialect_words1',
  'Jist like a woman! always wants to visit in bad weather! And now I am got to sue de white folks of this train in the Federal Court for my damages and rights. I is going to do that very thing, if God spares me and I can git a lawyer!']]

2750

In [28]:
# out of interest: what dialect words do we see in LOC?
print(set(chain(*[x[1].keys() for x in corrected_quotes if x[2] == "dialect_words2"])))
print(set(chain(*[x[1].keys() for x in corrected_quotes if x[2] == "dialect_words1"])))

{'gwine'}
{"I'se", "they's", "We'se", "I's", "dey's", "They's", "they'se", "i'se", "we's", "Dey's"}


save corrected quotes (commented out, so as not to overwrite)

In [None]:
# with open("RQ3_downstream/all_corrected.json", "w") as f:
#     json.dump(corrected_quotes, f)

# Load chains for calculating .... 

$\bar{S}_{nword/lit}$ 

$\bar{S}_{nword/lit, \backslash v}$

$\bar{S}_{nword/news}$

$\bar{S}_{nword/news, \backslash v}$


load PG quotes and corresponding chains

In [29]:
with open('../../PG/extract_quotes_via_spaCy/quotes_blacklist.json', 'r') as f:
    PG_blacklist = json.load(f) 

In [30]:
with open('../../PG/extract_quotes_via_spaCy/quotes_5Jul.json', 'r') as f:
    PG_df = pd.DataFrame([t for i,t in enumerate(json.load(f))], columns = ["id", "p", "quote", "manner", "speaker"])
PG_df['i'] = list(range(len(PG_df)))

In [31]:
# ignore (as done for chains) ... those quotes in blacklist or not of speakers of interest
speakers_of_interest = set(['man', 'woman', 'child', 'gentleman', 'lady', 'negro', 'Negro'])
PG_df = PG_df.loc[(PG_df.loc[:,'i'].isin(PG_blacklist)==False)&(PG_df.loc[:,'speaker'].isin(speakers_of_interest)==True),:]

In [32]:
# reset indices, reset 'i'
PG_df.reset_index(drop=True, inplace=True)
PG_df['i'] = list(range(len(PG_df)))

In [33]:
chains_dir = pathlib.Path('../../PG/Snellius/mwcgln/llama3.1_70B/')

# get orders list of chains fps - they need to be re-assembled in this order, to correspond to 
ordered_chains_fps = sorted(list(chains_dir.glob('*.json')), key=lambda c: int(re.search(r"_(\d+)\.json", str(c)).groups()[0]))

# re-assemble
PG_chains = []
for chain_fp in tqdm(ordered_chains_fps):
    with open(chain_fp, 'r') as f:
        PG_chains += json.load(f)
len(PG_chains)

100%|█████████████████████████████████████████████| 2/2 [00:00<00:00,  9.64it/s]


26502

In [34]:
bad_encoding_i = []
bad_encoding_speakers = Counter()
for i, (q, speaker) in enumerate(zip(PG_df.loc[:,'quote'], PG_df.loc[:,'speaker'])):
    try:
        s = q[1:-1]
        words = get_words(s)
        tokens = tokenizer.tokenize(s)
        words_indices = get_words_indices(words, tokens)

        assert len(words) == len(words_indices)
    except:
        bad_encoding_i.append(i)
        bad_encoding_speakers[speaker] += 1

display(bad_encoding_i)
display(bad_encoding_speakers)

[399,
 1199,
 3217,
 3880,
 3882,
 3995,
 3997,
 4401,
 5371,
 5878,
 5881,
 5884,
 5885,
 6795,
 7345,
 7347,
 7349,
 7350,
 7352,
 7353,
 7358,
 7362,
 7364,
 7765,
 8793,
 8803,
 9064,
 10471,
 12156,
 12194,
 12670,
 13484,
 13525,
 14547,
 15114,
 16441,
 16461,
 17035,
 17101,
 17106,
 18217,
 18809,
 19027,
 19129,
 19603,
 20159,
 20164,
 20467,
 21675,
 21676,
 22704,
 22771,
 23625,
 24905,
 25744,
 25756,
 25765,
 26422]

Counter({'man': 39, 'lady': 6, 'negro': 6, 'woman': 5, 'gentleman': 2})

In [35]:
# drop from quotes
PG_df.drop(bad_encoding_i, inplace=True)
PG_df.reset_index(drop=True, inplace=True)
PG_df['i'] = list(range(len(PG_df)))

In [36]:
# drop from chains
for i in sorted(bad_encoding_i, reverse=True):
    del PG_chains[i]

In [37]:
display(len(PG_df))
display(len(PG_chains))

26444

26444

In [38]:
import math

def get_surprisals(words, tokens, chain):

    variation_surprisal = []
    
    for word, word_indices in zip(words, get_words_indices(words, tokens)):
        
        # are word_indices a decimal? then we need to apply a power to the probability, to split a token that spans words
        powers, indices = zip(*[math.modf(i) for i in word_indices])
        powers = np.array([1 if p == 0 else p for p in powers])
        indices = [int(i) for i in indices]

        # log(x^p) = p*log(x)
        word_surprisal = (-np.log(chain[indices])*powers).sum()
        variation_surprisal.append(word_surprisal)

    return variation_surprisal

In [39]:
PG_df["words"] = PG_df["quote"].progress_apply(lambda x: get_words(x[1:-1]))

100%|█████████████████████████████████| 26444/26444 [00:00<00:00, 111153.93it/s]


In [40]:
PG_df['surprisals'] = [get_surprisals(PG_df.loc[i, 'words'], tokenizer.tokenize(PG_df.loc[i, 'quote'][1:-1]), np.array(PG_chains[i][1:])) for i in tqdm(range(len(PG_df)))]


100%|██████████████████████████████████| 26444/26444 [00:02<00:00, 13106.07it/s]


load PG corrected quotes and corresponding chains

In [41]:
with open("RQ1_downstream/all_corrected.json", "r") as f:
    corrected_quotes_PG = json.load(f)
len(corrected_quotes_PG)

8653

In [42]:
with open("RQ1_downstream/chains_llama3.1_70B_all_corrected.json", "r") as f:
    chains_corrected_PG = json.load(f)
len(chains_corrected_PG)

8653

load LOC quotes and corresponding chains

In [43]:
with open("../../LOC/tuples_news.json", "r") as f:
    tuples_news = json.load(f)
len(tuples_news)

1742

In [44]:
with open("../../LOC/chains_llama3.1_news.json", "r") as f:
    chains_LOC = json.load(f)
len(chains_LOC)

1742

load LOC corrected quotes and corresponding chains

In [45]:
with open("RQ3_downstream/all_corrected.json", "r") as f:
    corrected_quotes_LOC = json.load(f)
len(corrected_quotes_LOC)

2750

In [46]:
with open("RQ3_downstream/chains_llama3.1_70B_all_corrected.json", "r") as f:
    chains_corrected_LOC = json.load(f)
len(chains_corrected_LOC)

2750

## Deviation between observed mean LLM surprisal over words between n-word quotes of PG and LOC

calculate x_bar_PG

In [47]:
x_bar_PG = np.array(list(chain(*3PG_df.loc[PG_df.loc[:,'speaker'].isin(["Negro", "negro"]), 'surprisals']))).mean()
x_bar_PG

15.267566609622147

### calculate x_bar_LOC

In [48]:
# add word surprisals to tuples_news
surprisals = []
for i, tuple_ in enumerate(tuples_news):
    c = np.array(chains_LOC[i][1:])
    q = tuple_[1][0]
    words = get_words(q)
    tokens = tokenizer.tokenize(sq)
    tuple_[1] += [list(get_surprisals(words, tokens, c))]


In [49]:
x_bar_LOC = np.array(list(chain(*[tuple_[1][-1] for tuple_ in tuples_news]))).mean()
x_bar_LOC

13.373269680636792

calc. $(\bar{S}_{nword, lit} - \bar{S}_{nword, news}) / \bar{S}_{nword, news} $

In [50]:
100*(x_bar_PG - x_bar_LOC) / x_bar_LOC

14.164800188903053

## Deviation between latent mean LLM surprisal over words between n-word quotes of PG and LOC

In [51]:
def get_hdi(d: np.ndarray, hdi=0.89)->tuple[float]:
    """ Return (lowerbound::float, upperbound::float) wrt.,
        prescribed highest density interval
    """
    lb = (1-hdi)/2
    ub = hdi+lb
    return (np.quantile(d, lb), np.quantile(d, ub))

def bayesian_bootstrapping(data, a=3, target_accept=0.95):
    with pm.Model() as model:

        w = pm.Dirichlet("w", a=np.ones(len(data))*a)
        
        mean = pm.Deterministic("mean", pm.math.sum(w * data))

        ## using NUTS sampler
        trace = pm.sample(target_accept=target_accept)
        
    return model, trace

### estimate the $\bar{Z}_{n-word, literature}$ wrt., PG & n-word

In [52]:
s = ",".join(["Negro", "negro"])
trace_PG = az.from_netcdf(f'RQ1_population_samples/TRACE_{s}.nc')
posterior_samples_PG = az.extract(trace_PG, var_names=['mean'], combined=True)
print(get_hdi(posterior_samples_PG))

(15.167998199577706, 15.343414115553507)


### estimate the $\bar{Z}_{n-word, news}$ wrt., LOC & n-word

In [53]:
# # sample the data (commented out now data has been sampled)
# data = np.array(list(chain(*[tuple_[1][-1] for tuple_ in tuples_news])))
# sample_data = list(np.random.choice(data, size=10000))
# with open("RQ3_population_samples/sample_10000.json", "w") as f:
#     json.dump(sample_data, f)

In [54]:
trace_LOC = az.from_netcdf('RQ3_population_samples/TRACE.nc')
posterior_samples_LOC = az.extract(trace_LOC, var_names=['mean'], combined=True)
print(get_hdi(posterior_samples_LOC))

(13.400298910739892, 13.541473622224425)


### estimate $(\bar{Z}_{nword, lit} - \bar{Z}_{nword, news}) / \bar{Z}_{nword, news}$


In [55]:
pc = 100*(posterior_samples_PG - posterior_samples_LOC) / posterior_samples_LOC
get_hdi(pc, hdi=0.99)

(11.823071442419211, 14.724926470554383)

# calculate $C(\bar{S}_{\text{nword}}, v) - C(\bar{S}_{\text{normative reference speaker}}, v)$

consider all phonological variations from PG, from news, and manually specified

### get phonological variations in PG n-word quotes

In [56]:
phonological_variations_PG = {}

DH_D = {
    "de":["the"], 
    "dey":["they"],
    "dey's":["they's"],
    "dey'll":["they'll"],
    "dey'd":["they'd"],
    "dis":["this"],
    "dis'll":["this'll"],
    "dis'd":["this'd"],
    "dat":["that"],
    "dat's":["that"],
    "dat'll":["that'll"],
    "dat'd":["that'd"],
    "De":["The"], 
    "Dey":["They"],
    "Dey's":["They's"],
    "Dey'll":["They'll"],
    "Dey'd":["They'd"],
    "Dis":["This"],
    "Dis'll":["This'll"],
    "Dis'd":["This'd"],
    "Dat":["That"],
    "Dat's":["That"],
    "Dat'll":["That'll"],
    "Dat'd":["That'd"]
}

for speakers in [["Negro", "negro"]]:

    s = ",".join(speakers)
    phonological_variations_PG = {"['DH']->['D']":set(list(DH_D.keys()))}

manually_assigned = set(list(DH_D.keys()))

In [57]:
# ------
# get a dict of words by phonological variation: i.e., variation2words['man']["['NG']->['N']"] = ["comin'", ...]
# ------
with open(f"RQ1_ood_words/RQ1_ood_words_Negro,negro.json", "r") as f:
    annotations = json.load(f)

for non_canonical, corrections in annotations.items():
    if len(corrections) == 1 and non_canonical not in manually_assigned:
        canonical = corrections[0]

        for from_, to_ in get_changes(re.findall(r"\w+", arpaguess(canonical)[0]), re.findall(r"\w+", arpaguess(non_canonical)[0])):
            # capture instances
            if f"{from_}->{to_}" not in phonological_variations_PG:
                phonological_variations_PG[f"{from_}->{to_}"] = set()
            phonological_variations_PG[f"{from_}->{to_}"].add(non_canonical)

phonological_variations_PG

{"['DH']->['D']": {'Brudder',
  'Dat',
  "Dat'd",
  "Dat'll",
  "Dat's",
  'Dats',
  'De',
  'Dem',
  'Dey',
  "Dey'd",
  "Dey'll",
  "Dey's",
  'Dis',
  "Dis'd",
  "Dis'll",
  'anudder',
  'brudder',
  'dan',
  'dass',
  'dat',
  "dat'd",
  "dat'll",
  "dat's",
  'dat-what',
  'de',
  'dem',
  'dese',
  'dey',
  "dey'd",
  "dey'll",
  "dey's",
  "dey'se",
  "deyse'f",
  'deyselves',
  'deze',
  'dis',
  "dis'd",
  "dis'll",
  'fedders',
  'furder',
  'nudder',
  'togedder',
  'wid'},
 "['OW']->['AA']": {"don'", 'fokses', 'naw', 'totin'},
 "['T']->[]": {"'spec",
  "'speck",
  "Baptis'",
  "Cap'en",
  "Fus'",
  "Jes'",
  'Marser',
  "Mas'r",
  'Masser',
  "ain'",
  "cap'en",
  "couldn'",
  'dass',
  'des',
  "des'",
  'dess',
  "don'",
  "don'e",
  "expec'",
  'fack',
  "greates'",
  "had'n",
  'inturrup',
  'jes',
  "jes'",
  "jus'",
  "las'",
  'marser',
  "marser's",
  "mas'r",
  'masser',
  "mos'",
  "mus'",
  'nex',
  "nex'",
  'speckin',
  'spek',
  'widgable'},
 "['AE', 'S', 'T',

In [58]:
# use PG phonological variations as a base
variations = copy.deepcopy(phonological_variations_PG)  # {"man":{"italicised":set of non canonical words}

# add in phonological variations from LOC
for v, wordset in phonological_variations_LOC.items():
    if v not in variations:
        variations[v] = set()
    variations[v] = variations[v].union(wordset)   

# add in ...
variations["dialect_words1"] = set(dialect_words1.keys())
variations["dialect_words2"] = set(dialect_words2.keys())
variations["regularised_irregular"] = set(regularised_irregular.keys())
variations["italicised"] = set(italicised_words.keys())

# First, as a check, without considering the downstream effects ...


## get $C(\bar{S_{nword}, lit}, v) = \frac{\bar{S_{i}} - \bar{S}_{i,\backslash v}}{\bar{S_{i}}}$

i.e., the proportional reduction in mean word LLM surprisal due to the ommission of words demonstrative of the variation

doesn't consider downstream effects due to the upstream instances of variation in question!


In [59]:
contributions_lit = {}

# get mean of word LLM surprisal
x_bar = np.array(list(chain(*PG_df.loc[PG_df.loc[:,"speaker"].isin(["Negro","negro"]), "surprisals"]))).mean()

# consider each variation in turn
V = variations.keys()
for v in tqdm(V):    

    ss = 0
    counter = 0

    # surprisals for speaker, ignoring corrected quotes
    surprisals = list(chain(*PG_df.loc[PG_df.loc[:,"speaker"].isin(speakers), "surprisals"]))
    words = list(chain(*PG_df.loc[PG_df.loc[:,"speaker"].isin(speakers), "words"]))

    for word, surprisal in zip(words, surprisals):
        if word not in variations[v]:
            ss += surprisal
            counter += 1

    contributions_lit[v] = 100*(x_bar - ss / counter) / x_bar   


100%|████████████████████████████████████████| 432/432 [00:00<00:00, 492.81it/s]


In [60]:
v = "['NG']->['N']"
display(x_bar_PG)
display(contributions_lit[v])


15.267566609622147

1.5571436397028469


## get $C(\bar{S}_{nword, news}, v) = \frac{\bar{S_{i}} - \bar{S}_{i,\backslash v}}{\bar{S_{i}}}$

i.e., the proportional reduction in mean word LLM surprisal due to the ommission of words demonstrative of the variation

doesn't consider downstream effects due to the upstream instances of variation in question!

In [61]:
contributions_news = {}

# get mean of word LLM surprisal
words = list(chain(*[get_words(x[1][0]) for x in tuples_news]))
surprisals = list(chain(*[x[1][-1] for x in tuples_news]))
x_bar = np.array(surprisals).mean()

# consider each variation in turn
V = variations.keys()
for v in tqdm(V):    

    ss = 0
    counter = 0

    for word, surprisal in zip(words, surprisals):
        if word not in variations[v]:
            ss += surprisal
            counter += 1

    contributions_news[v] = 100*(x_bar - ss / counter) / x_bar   
       

100%|████████████████████████████████████████| 432/432 [00:01<00:00, 387.74it/s]


In [62]:
v = "['NG']->['N']"
display(x_bar_LOC)
display(contributions_news[v])


13.373269680636792

0.15873526998720544

## get $C(\bar{S}_{\text{nword}}, v) - C(\bar{S}_{\text{normative reference speaker}}, v)$

i.e., let's look at the relative contributions of words demonstrative of each variation...

again ... does not correct for downstream effects

In [101]:
# biggest differences between ...
xbar_lit = np.array(list(chain(*PG_df.loc[PG_df.loc[:,"speaker"].isin(["Negro","negro"]), "surprisals"]))).mean()
xbar_news = np.array(list(chain(*[x[1][-1] for x in tuples_news]))).mean()

pp(
    sorted(
        [
            (
                # varatiation
                v, 
                # difference in proportional contributions
                round(contributions_lit[v] - contributions_news[v], 2), 
                # percentage reduction in difference in mean word LLM surprisal betweeen nword and normative reference speaker
                round(100*(contributions_lit[v]*xbar_lit/100 - contributions_news[v]*xbar_news/100)/(xbar_lit - xbar_news), 1),
                # proportional contribution of variation wrt., nword mean word LLM surprisal
                round(contributions_lit[v], 2), 
                # proportional contribution of variation wrt., normative ref mean word LLM surprisal
                round(contributions_news[v], 2)
            ) 
            if v in contributions_news
            else (
                # varatiation
                v, 
                # difference in proportional contributions
                contributions_lit[v] - 0, 
                # reduction in difference in mean word LLM surprisal betweeen nword and normative reference speaker
                100*(contributions_lit[v]*xbar_lit/100 - 0)/(xbar_lit - xbar_news),
                # proportional contribution of variation wrt., nword mean word LLM surprisal
                contributions_lit[v], 
                # proportional contribution of variation wrt., normative ref mean word LLM surprisal
                0 
            ) for v in contributions_lit.keys()
        ], 
        key = lambda x: x[1], reverse=True
    )[:15]
)

[("['NG']->['N']", 1.4, 11.4, 1.56, 0.16),
 ('dialect_words2', 0.46, 3.7, 0.45, -0.01),
 ("['ER']->['AH']", 0.42, 3.5, 0.48, 0.06),
 ("['T']->[]", 0.41, 3.3, 0.47, 0.07),
 ("['AH']->[]", 0.38, 3.1, 0.4, 0.01),
 ("['TH']->['T']", 0.25, 2.0, 0.24, -0.01),
 ("['R']->[]", 0.24, 2.0, 0.28, 0.04),
 ("['AE']->['AA', 'R']", 0.24, 2.0, 0.28, 0.04),
 ("['TH']->['F']", 0.22, 1.7, 0.21, -0.01),
 ("['OW', 'IH', 'NG']->['W', 'AY', 'N']", 0.22, 1.8, 0.21, -0.01),
 ('dialect_words1', 0.22, 1.8, 0.23, 0.01),
 ("['T', 'ER']->[]", 0.18, 1.5, 0.21, 0.03),
 ("['L']->[]", 0.16, 1.3, 0.16, 0.0),
 ("['ER']->['AA', 'R']", 0.14, 1.1, 0.17, 0.03),
 ("['AH']->['IH']", 0.13, 1.0, 0.15, 0.02)]


# Next, considering the downstream effects

# get $C_{n\text{-word},lit}(\bar{S}_{i}, v) = \frac{\bar{S}_{i} - \bar{S}_{i,\backslash v}}{\bar{S_{i}}}$

i.e., contribution correcting for downstream effect

In [64]:
# consider all variations found in the corrected quotes
V = variations.keys()

contributions_lit_c = {}  # estimated contributions based on 'correction' of words demonstrative of a each variation
contributions_lit_c[s] = {}

x_bar =  np.array(list(chain(*PG_df.loc[PG_df.loc[:, 'speaker'].isin(["Negro","negro"]), "surprisals"]))).mean() 

# consider each variation in turn
for v in tqdm(V):

    # get PG_df indices which contain words demonstrative of the variation
    i2c = {i:c for c, (i, d, variation , _) in enumerate(corrected_quotes_PG) if v==variation}  
    I = list(i2c.keys())

    # surprisals for speaker, ignoring I ... we will add to these
    surprisals = list(chain(*PG_df.loc[(~PG_df.loc[:,"i"].isin(I)) & (PG_df.loc[:,"speaker"].isin(speakers)), "surprisals"]))

    # add surprisals for I
    for i in PG_df.loc[(PG_df.loc[:,"i"].isin(I)) & (PG_df.loc[:,"speaker"].isin(["Negro", "negro"])), "i"]:

        # get surprisals (by word) wrt., corrected quotations
        quote_c = corrected_quotes_PG[i2c[i]][3]
        chain_c = np.array(chains_corrected_PG[i2c[i]][1:])  # ignore standard beginning token given to all chains
        words_c = get_words(quote_c)
        tokens_c = tokenizer.tokenize(quote_c)
        surprisals_c = get_surprisals(words_c, tokens_c, chain_c)

        # sum the surprisals where corresponding words are unchanged between correction and original 
        matcher = difflib.SequenceMatcher(None, PG_df.loc[i, 'words'], words_c)
        opcodes = matcher.get_opcodes()
        for tag, i1, i2, j1, j2 in opcodes:  # where x1
            if tag == "equal":
                surprisals += surprisals_c[j1:j2]

    # report
    contributions_lit_c[v] = 100*(x_bar - np.array(surprisals).mean())/x_bar

100%|████████████████████████████████████████| 432/432 [00:01<00:00, 351.12it/s]


In [66]:
v = "['NG']->['N']"
display(xbar_lit)
display(contributions_lit[v])
display(contributions_lit_c[v])

15.267566609622147

1.5571436397028469

1.5278906256067426

# get $C_{n\text{-word},news}(\bar{S}_{i}, v) = \frac{\bar{S}_{i} - \bar{S}_{i,\backslash v}}{\bar{S_{i}}}$

i.e., contribution correcting for downstream effect

In [79]:
corrected_quotes_LOC[0]

[86,
 {"I's": 'I am'},
 'dialect_words1',
 'Jist like a woman! always wants to visit in bad weather! And now I am got to sue de white folks of this train in the Federal Court for my damages and rights. I is going to do that very thing, if God spares me and I can git a lawyer!']

In [97]:
# consider all variations found in the corrected quotes
V = variations.keys()

contributions_news_c = {}  # estimated contributions based on 'correction' of words demonstrative of a each variation

# get x_bar of the original collection
surprisals = []
for x in tuples_news:
    surprisals += x[1][-1]
x_bar =  np.array(surprisals).mean()

# consider each variation in turn
for v in ["['NG']->['N']"]: #tqdm(V):

    I = [x[0] for x in corrected_quotes_LOC if x[2] == v]  # tuples_news indices corrected wrt., current variation
    i2c = {x[0]:c for c, x in enumerate(corrected_quotes_LOC) if x[0] in I}

    # surprisals for speaker, ignoring I ... we will add to these
    surprisals = []
    for i, x in enumerate(tuples_news):
        if i not in I:
            surprisals += x[1][-1]

    # add surprisals for I
    for i in I:

        quote = tuples_news[i][1][0]
        words = get_words(quote)

        # get surprisals (by word) wrt., corrected quotations
        quote_c = corrected_quotes_LOC[i2c[i]][3]
        chain_c = np.array(chains_corrected_LOC[i2c[i]][1:])  # ignore standard beginning token given to all chains
        words_c = get_words(quote_c)
        tokens_c = tokenizer.tokenize(quote_c)
        surprisals_c = get_surprisals(words_c, tokens_c, chain_c)

        # sum the surprisals where corresponding words are unchanged between correction and original 
        matcher = difflib.SequenceMatcher(None, words, words_c)
        opcodes = matcher.get_opcodes()
        for tag, i1, i2, j1, j2 in opcodes:  # where x1
            if tag == "equal":
                surprisals += surprisals_c[j1:j2]
            else:
                if v == "['NG']->['N']":
                    print(words[i1:i2], words_c[j1:j2])

    # report
    contributions_news_c[v] = 100*(x_bar - np.array(surprisals).mean())/x_bar

['nutfin'] ['nothing']
['sich'] ['such']
["wantin'"] ['wanting']
["Dancin'"] ['Dancing']
['bress'] ['bless']
["doin'"] ['doing']
["readin'"] ['reading']
["cain't"] ["can't"]
['Heah'] ['Here']
['heah'] ['hear']
["nothin'"] ['nothing']
['aginst'] ['against']
["roarin'"] ['roaring']
['Dancin'] ['Dancing']
["Dancin'"] ['Dancing']
["'publicans"] ['republicans']
["'publicans"] ['republicans']
['sah'] ['sir']
['mah'] ['my']
["hopin'"] ['hoping']
["'cause"] ['because']
['perceeded'] ['preceded']
["th'"] ['the']
["th'"] ['the']
["th'"] ['the']
["th'"] ['the']
["summin'"] ['something']
["sumpin'"] ['something']
["Dancin'"] ['Dancing']
["Dancin'"] ['Dancing']
["Dancin'"] ['Dancing']
["noffin'"] ['nothing']
['wuz'] ['was']
['wuz'] ['was']
['sint'] ["isn't"]
["showin'"] ['showing']
['debbil'] ['devil']
["Dancin'"] ['Dancing']
['nuffin'] ['nothing']
['truf'] ['truth']
["a-tryin'"] ['a-trying']
["mawnin'"] ['morning']
["mawnin'"] ['morning']
["showin'"] ['showing']
["sett'n"] ['setting']
['somewhars'

In [102]:
v = "['NG']->['N']"
display(xbar_lit)
display(contributions_news[v])
display(contributions_news_c[v])


15.267566609622147

0.15873526998720544

-0.7015255528268064

## get $C(\bar{S}_{n-\text{word},lit }, v) - C(\bar{S}_{n-\text{word},news}, v)$

In [100]:
# biggest differences between ...
xbar_lit = np.array(list(chain(*PG_df.loc[PG_df.loc[:,"speaker"].isin(["Negro","negro"]), "surprisals"]))).mean()

# get x_bar of the original collection
surprisals = []
for x in tuples_news:
    surprisals += x[1][-1]
xbar_news =  np.array(surprisals).mean()

pp(
    sorted(
        [
            (
                # varatiation
                v, 
                # difference in proportional contributions
                round(contributions_lit_c[v] - contributions_news_c[v], 2), 
                # percentage reduction in difference in mean word LLM surprisal betweeen nword and normative reference speaker
                round(100*(contributions_lit_c[v]*xbar_lit/100 - contributions_news_c[v]*xbar_news/100)/(xbar_lit - xbar_news), 1),
                # proportional contribution of variation wrt., nword mean word LLM surprisal
                round(contributions_lit_c[v], 2), 
                # proportional contribution of variation wrt., normative ref mean word LLM surprisal
                round(contributions_news_c[v], 2)
            ) 
            if v in contributions_news_c
            else (
                # varatiation
                v, 
                # difference in proportional contributions
                contributions_lit_c[v] - 0, 
                # reduction in difference in mean word LLM surprisal betweeen nword and normative reference speaker
                100*(contributions_lit_c[v]*xbar_lit/100 - 0)/(xbar_lit - xbar_news),
                # proportional contribution of variation wrt., nword mean word LLM surprisal
                contributions_lit_c[v], 
                # proportional contribution of variation wrt., normative ref mean word LLM surprisal
                0 
            ) for v in contributions_lit.keys()
        ], 
        key = lambda x: x[1], reverse=True
    )[:15]
)

[("['NG']->['N']", 2.23, 17.3, 1.53, -0.7),
 ('dialect_words2',
  0.4955245887615404,
  3.993806118703225,
  0.4955245887615404,
  0),
 ("['T']->[]", 0.44716408583037237, 3.6040323781250168, 0.44716408583037237, 0),
 ("['ER']->['AH']",
  0.42954214584137784,
  3.4620038827735264,
  0.42954214584137784,
  0),
 ("['AH']->[]", 0.42877018058885635, 3.455782033002858, 0.42877018058885635, 0),
 ("['R']->[]", 0.3085463740836171, 2.4868077683060195, 0.3085463740836171, 0),
 ('dialect_words1',
  0.2802483746226149,
  2.258732863533135,
  0.2802483746226149,
  0),
 ("['AE']->['AA', 'R']",
  0.26734129680233776,
  2.15470499581009,
  0.26734129680233776,
  0),
 ('italicised',
  0.26483787154652916,
  2.1345280049379456,
  0.26483787154652916,
  0),
 ("['TH']->['F']",
  0.24593656345737672,
  1.9821881178566234,
  0.24593656345737672,
  0),
 ("['TH']->['T']",
  0.23445056999205938,
  1.8896138399669649,
  0.23445056999205938,
  0),
 ("['T', 'ER']->[]",
  0.198993420783924,
  1.6038379518053985,
  

In [103]:
variations["['TH']->['T']"]

{"T'anks",
 "anyt'ing",
 'anyting',
 "eberyt'ing",
 'eberyting',
 'fortwid',
 'nuttin',
 "nuttin'",
 "pant'er",
 "pant'ers",
 'someting',
 "t'ing",
 "t'ings",
 "t'ink",
 "t'inks",
 "t'rough",
 'tief',
 'tink',
 'tinks',
 'troo'}