import pathlib
import typing
import numpy as np
from scipy.sparse import load_npz, lil_matrix, csr_matrix
import json
from tqdm import tqdm
import pandas as pd
from nltk.corpus import wordnet as wn
from functools import partial
import scipy
from pprint import pprint as pp
from copy import copy
import re
from collections import defaultdict
import os
from top2vec import Top2Vec
from tqdm.contrib.concurrent import process_map
from itertools import cycle

In [40]:
import pathlib
import typing
import numpy as np
from scipy.sparse import load_npz, lil_matrix, csr_matrix
import json
from tqdm import tqdm
import pandas as pd
from nltk.corpus import wordnet as wn
from functools import partial
import scipy
from pprint import pprint as pp
from copy import copy
import re
from collections import defaultdict
import os
from top2vec import Top2Vec
from tqdm.contrib.concurrent import process_map
from itertools import cycle

# Noun-Feature mutual affinity scores

The noun-feature mutual affinity scores are given by: 

```
    some metric proportional to (LLR-based) rank within row 
    
                            x
                            
    some metric proportional to (LLR-based) rank within column
```                       
                      

<div>
<img src="Affinity_in_one.png" width="200"/>
</div>

However, to avoid unnecessary computation, we retrieve the affinity scores for one row or one column at a time, as required, using the functions below,

i.e., with respect to the function `affinities_for_row`:
<div>
<img src="Affinity_for_row.png" width="200"/>
</div>



In [41]:
def affinities_for_row(i, L, cutoff=0):
    """Rank the features (columns) of L by mutual affinity with the noun at row i.

    Only the columns whose LLR in row i exceeds `cutoff` are considered. For
    each of them two normalised ranks are multiplied together:

        Mr = 1 - (Rr - 1) / Nc
            Rr: rank of L[i, j] among the considered entries of row i
            Nc: number of considered columns
        Mc = 1 - (Rc - 1) / Nr
            Rc: rank of L[i, j] within column j (over all rows)
            Nr: number of rows of column j whose LLR exceeds `cutoff`

    Ranks are 1-based and count strictly greater values only, so tied values
    share the better rank.

    Note: the selected columns are converted to a dense array, so memory use
    grows with (number of rows) x (number of considered columns).

    Args:
        i: index of the row (noun) of interest.
        L: 2D scipy sparse matrix of LLR scores.
        cutoff: only entries of row i strictly greater than this are ranked.

    Returns:
        A list of tuples (column index, llr, mutual affinity, Rr, Nc, Rc, Nr),
        sorted by descending mutual affinity. Empty if no entry of row i
        exceeds `cutoff`.
    """

    # ------
    # get subset of L, for those columns where LLR(row) > cutoff
    # ------

    # ravel() rather than squeeze(): squeeze() turns the row of a one-column
    # matrix into a 0-d array, which cannot be used as a column selector
    row = L[i, :].toarray().ravel()
    column_ids = np.flatnonzero(row > cutoff)
    if column_ids.size == 0:
        return []

    M = L[:, column_ids].toarray()
    row_llrs = M[i, :]
    n_columns = M.shape[1]

    # within-row rank (Rr) = 1 + number of strictly greater entries in the row;
    # obtained from a sort instead of an n_columns x n_columns comparison
    ascending = np.sort(row_llrs)
    rank_within_row = n_columns - np.searchsorted(ascending, row_llrs, side="right") + 1
    Mr = 1 - (rank_within_row - 1) / n_columns

    # within-column rank (Rc) of the row's entry, for each considered column
    ranks_within_cols = (M > row_llrs).sum(axis=0) + 1
    # number of rows above the cutoff in each column; >= 1 because row i
    # itself is above the cutoff in every considered column
    Nr = (M > cutoff).sum(axis=0)
    Mc = 1 - (ranks_within_cols - 1) / Nr

    # affinity
    mutual_affinity = Mr * Mc  # high means high affinity

    # return sorted by descending mutual affinity
    return sorted(
        zip(
            column_ids,
            row_llrs,
            mutual_affinity,
            rank_within_row,
            [n_columns] * n_columns,
            ranks_within_cols,
            Nr,
        ),
        key=lambda x: x[2],
        reverse=True,
    )


... and with respect to the function `affinities_for_column`:

<div>
<img src="Affinity_for_col.png" width="200"/>
</div>

In [74]:
def affinities_for_column(j, L, cutoff=0):
    """Rank the nouns (rows) of L by mutual affinity with the feature at column j.

    Only the rows whose LLR in column j exceeds `cutoff` are considered. For
    each of them two normalised ranks are multiplied together:

        Mc = 1 - (Rc - 1) / Nr
            Rc: rank of L[i, j] among the considered entries of column j
            Nr: number of considered rows
        Mr = 1 - (Rr - 1) / Nc
            Rr: rank of L[i, j] within row i (over all columns)
            Nc: number of columns of row i whose LLR exceeds `cutoff`

    Ranks are 1-based and count strictly greater values only, so tied values
    share the better rank.

    Note: the selected rows are converted to a dense array, so memory use
    grows with (number of considered rows) x (number of columns).

    Args:
        j: index of the column (feature) of interest.
        L: 2D scipy sparse matrix of LLR scores.
        cutoff: only entries of column j strictly greater than this are ranked.

    Returns:
        A list of tuples (row index, llr, mutual affinity, Rr, Nc, Rc, Nr),
        sorted by descending mutual affinity. Empty if no entry of column j
        exceeds `cutoff`.
    """

    # ------
    # get L subset, for those rows where LLR(row, j) > cutoff
    # ------

    # ravel() rather than squeeze(): squeeze() turns the column of a one-row
    # matrix into a 0-d array, which cannot be used as a row selector
    column = L[:, j].toarray().ravel()
    row_ids = np.flatnonzero(column > cutoff)
    if row_ids.size == 0:
        return []

    M = L[row_ids, :].toarray()
    col_llrs = M[:, j]
    n_rows = M.shape[0]

    # within-column rank (Rc) = 1 + number of strictly greater entries in the
    # column; obtained from a sort instead of an n_rows x n_rows comparison
    ascending = np.sort(col_llrs)
    rank_within_col = n_rows - np.searchsorted(ascending, col_llrs, side="right") + 1
    Mc = 1 - (rank_within_col - 1) / n_rows

    # within-row rank (Rr) of the column's entry, for each considered row
    ranks_within_rows = (M > col_llrs.reshape(-1, 1)).sum(axis=1) + 1
    # number of columns above the cutoff in each row; >= 1 because column j
    # itself is above the cutoff in every considered row
    Nc = (M > cutoff).sum(axis=1)
    Mr = 1 - (ranks_within_rows - 1) / Nc

    # affinity
    mutual_affinity = Mr * Mc  # high means high affinity

    # return sorted by descending mutual affinity
    return sorted(
        zip(
            row_ids,
            col_llrs,
            mutual_affinity,
            ranks_within_rows,
            Nc,
            rank_within_col,
            [n_rows] * n_rows,
        ),
        key=lambda x: x[2],
        reverse=True,
    )


In [43]:
from scipy.stats import binom

def LLR(c: np.ndarray, F: np.ndarray):
    """Return the log-likelihood ratio (LLR) of every noun for one feature.

    Two binomial models are compared for each noun:
        1. the noun occurs with different probabilities with the feature
           (P(n|f)) and with all other features (P(n|f'));
        2. the noun occurs with one shared probability regardless of feature.

    LLR = 2 * (log-likelihood under model 1 - log-likelihood under model 2).

    The log-likelihoods are taken from ``binom.logpmf`` directly. Taking
    ``np.log(binom.pmf(...))`` underflows to log(0) = -inf once the counts are
    large, which turns the LLR into inf or nan.

    Args:
        c: 1D array, the column of F holding the noun counts for the feature.
        F: 2D count table with one row per noun and one column per feature.

    Returns:
        1D array of LLRs, one per noun.
    """

    total = F.sum()

    # study corpus, i.e., noun and noun' counts for feature
    fn = c
    f_n = c.sum() - c

    # reference corpus, i.e, noun and noun' counts for feature'
    _fn = F.sum(axis=1) - c
    _f_n = total - fn - f_n - _fn

    n_study = fn + f_n
    n_ref = _fn + _f_n

    # generative process 1: separate probabilities per feature group
    pnf = fn / n_study  # P(n|f)
    pn_f = _fn / n_ref  # P(n|f')

    # generative process 2: one probability regardless of feature group
    p = (fn + _fn) / total

    # the likelihood of each process is the product over the feature group
    # and the not-feature group, hence the sum of the two log-pmfs
    LLRs = 2 * (
        binom.logpmf(fn, n_study, pnf)
        + binom.logpmf(_fn, n_ref, pn_f)
        - binom.logpmf(fn, n_study, p)
        - binom.logpmf(_fn, n_ref, p)
    )

    return LLRs

# sanity check of LLR against the worked example from the Dunning paper
F = np.array([[110, 2442], [111, 29114]])
c = F[:, 0]
LLR(c, F)[0]

270.7218769362335

In [44]:
def log_binom(k: np.ndarray, n: float, p: np.ndarray) -> np.ndarray:
    """Return k*log(p) + (n-k)*log(1-p) element-wise, i.e. the binomial
    log-likelihood without the binomial coefficient.

    Entries where p is not strictly between 0 and 1 are left at 0.
    """

    out = np.zeros(len(k))

    # only evaluate the logs where both log(p) and log(1-p) are finite
    interior = np.logical_and(p > 0, p < 1)
    k_in = k[interior]
    p_in = p[interior]
    out[interior] = k_in * np.log(p_in) + (n - k_in) * np.log(1 - p_in)

    return out



def get_llr_profile(row_is: list[int], F) -> lil_matrix:
    """Build the LLR profile of the rows `row_is` of the count table F.

    The summed counts of the rows in `row_is` form the study corpus; the
    counts of all remaining rows form the reference corpus. For every column
    the LLR compares separate study/reference probabilities against one
    pooled probability.

    Args:
        row_is: indices of the rows of F that make up the study corpus.
        F: 2D count table (scipy sparse matrix), one column per feature.

    Returns:
        A 1 x (number of columns) lil_matrix of LLRs. Columns where the study
        count is 0 are left at 0, as they are irrelevant to the study.
    """

    # get the study corpus and ref_corpus
    # (summed while still sparse, and flattened with ravel(): squeeze() would
    # give a 0-d array, breaking len(), when F has a single column)
    study = np.asarray(F[row_is, :].sum(axis=0)).ravel()
    global_profile = np.asarray(F.sum(axis=0)).ravel()
    ref = global_profile - study  # counts aggregated over all other rows

    # build llr profile
    llr_profile = np.zeros(len(study))

    # i.e., where study = 0, make the result 0, as the entry is irrelevant to study
    mask = study > 0
    if not mask.any():
        # nothing to score; also avoids dividing by a study total of 0
        return lil_matrix(llr_profile)

    n1 = study.sum()
    n2 = ref.sum()
    n = n1 + n2

    study_k = study[mask]
    ref_k = ref[mask]

    # n2 is 0 when the study covers the whole table; the resulting nan
    # probabilities are mapped to a log-likelihood of 0 by log_binom
    with np.errstate(divide="ignore", invalid="ignore"):
        study_mle = study_k / n1
        ref_mle = ref_k / n2
    p = (study_k + ref_k) / n  # combined mle

    llr_profile[mask] = 2 * (
        # alt
        log_binom(study_k, n1, study_mle)
        + log_binom(ref_k, n2, ref_mle)
        # null
        - log_binom(study_k, n1, p)
        - log_binom(ref_k, n2, p)
    )

    return lil_matrix(llr_profile)


# English Literature

## token level

### Load the frequency tables and LLR tables

In [107]:
# English literature (PR): LLR and frequency tables for noun, adj pairs
L_adj = load_npz(pathlib.Path('LLR/llr_scores/PR/adj/llr_profiles.npz'))
F_adj = load_npz(pathlib.Path('LLR/llr_scores/PR/adj/freq_profiles.npz'))
print(L_adj.shape)
print(F_adj.sum())

# feature -> column index, plus the inverse lookup
with open('LLR/llr_scores/PR/adj/feature2i.json', "r", encoding='utf-8') as f:
    feature2j_adj = json.loads(f.read())
j2feature_adj = dict(zip(feature2j_adj.values(), feature2j_adj.keys()))

# noun -> row index, plus the inverse lookup
with open('LLR/llr_scores/PR/adj/noun2i.json', "r", encoding='utf-8') as f:
    noun2i_adj = json.loads(f.read())
i2noun_adj = dict(zip(noun2i_adj.values(), noun2i_adj.keys()))
    

(243529, 299557)
29763170


In [108]:
# English literature (PR): LLR and frequency tables for agent, verb pairs
L_agent = load_npz(pathlib.Path('LLR/llr_scores/PR/agent/llr_profiles.npz'))
F_agent = load_npz(pathlib.Path('LLR/llr_scores/PR/agent/freq_profiles.npz'))
print(L_agent.shape)
print(F_agent.sum())

# feature -> column index, plus the inverse lookup
with open('LLR/llr_scores/PR/agent/feature2i.json', "r", encoding='utf-8') as f:
    feature2j_agent = json.loads(f.read())
j2feature_agent = dict(zip(feature2j_agent.values(), feature2j_agent.keys()))

# noun -> row index, plus the inverse lookup
with open('LLR/llr_scores/PR/agent/noun2i.json', "r", encoding='utf-8') as f:
    noun2i_agent = json.loads(f.read())
i2noun_agent = dict(zip(noun2i_agent.values(), noun2i_agent.keys()))

(238142, 48704)
16157119


In [109]:
# English literature (PR): LLR and frequency tables for patient, verb pairs
L_patient = load_npz(pathlib.Path('LLR/llr_scores/PR/patient/llr_profiles.npz'))
F_patient = load_npz(pathlib.Path('LLR/llr_scores/PR/patient/freq_profiles.npz'))
print(L_patient.shape)
print(F_patient.sum())

# feature -> column index, plus the inverse lookup
with open('LLR/llr_scores/PR/patient/feature2i.json', "r", encoding='utf-8') as f:
    feature2j_patient = json.loads(f.read())
j2feature_patient = dict(zip(feature2j_patient.values(), feature2j_patient.keys()))

# noun -> row index, plus the inverse lookup
with open('LLR/llr_scores/PR/patient/noun2i.json', "r", encoding='utf-8') as f:
    noun2i_patient = json.loads(f.read())
i2noun_patient = dict(zip(noun2i_patient.values(), noun2i_patient.keys()))

(163198, 35484)
8996080


### quick feature -> noun

In [146]:
# list the nouns with the highest mutual affinity for one feature,
# separately for each feature type in which the feature occurs
feature = "treat"
for feature2j, i2noun, L, F, feature_type in [
    (feature2j_adj, i2noun_adj, L_adj, F_adj, "adj"),
    (feature2j_agent, i2noun_agent, L_agent, F_agent, "agent"),
    (feature2j_patient, i2noun_patient, L_patient, F_patient, "patient"),
]:
    print(f"where feature is a {feature_type}")
    if feature not in feature2j:
        continue
    # shown per noun: noun, LLR, Rr/Nc, Rc/Nr, position in the ranking, mutual affinity
    pp(
        [
            (i2noun[i], llr, f'{Rr}/{Nc}', f'{Rc}/{Nr}', rank, mutual_affinity)
            for rank, (i, llr, mutual_affinity, Rr, Nc, Rc, Nr) in enumerate(
                affinities_for_column(feature2j[feature], L), start=1
            )
        ][:100]
    )
            

where feature is a adj
where feature is a agent
[('husband', 153.55315428091853, '7/2209', '3/2730', 1, 0.9965532279701588),
 ('Writers', 42.23149923517485, '1/38', '15/2730', 2, 0.9948717948717949),
 ('fecklessness', 37.36393086754833, '1/3', '17/2730', 3, 0.9941391941391942),
 ('gudeman', 31.950964720876073, '1/35', '25/2730', 4, 0.9912087912087912),
 ('Tolstoy', 31.950964720876073, '1/41', '25/2730', 5, 0.9912087912087912),
 ('compromising', 31.280646647021058, '1/1', '28/2730', 6, 0.9901098901098901),
 ('Recommender', 31.280646647021058, '1/1', '28/2730', 7, 0.9901098901098901),
 ('Cavolfiore', 31.280646647021058, '1/1', '28/2730', 8, 0.9901098901098901),
 ('dacotah', 27.462364790582797, '1/2', '34/2730', 9, 0.987912087912088),
 ('world', 188.6730186945788, '35/2593', '1/2730', 10, 0.9868877747782492),
 ('father', 170.32571349559294, '48/3550', '2/2730', 11, 0.9863991126244648),
 ('author', 96.01161881857843, '23/1683', '5/2730', 12, 0.9854820560702914),
 ('Confident', 23.645689231

### quick noun -> feature

In [148]:
# list the features with the highest mutual affinity for one noun,
# separately for each feature type in which the noun occurs
noun = "negress"
for noun2i, j2feature, L, F, feature_type in [
    (noun2i_adj, j2feature_adj, L_adj, F_adj, "adj"),
    (noun2i_agent, j2feature_agent, L_agent, F_agent, "agent"),
    (noun2i_patient, j2feature_patient, L_patient, F_patient, "patient"),
]:
    print(feature_type)
    if noun not in noun2i:
        continue
    # shown per feature: feature, LLR, Rr/Nc, Rc/Nr, position in the ranking, mutual affinity
    pp(
        [
            (j2feature[j], llr, f'{Rr}/{Nc}', f'{Rc}/{Nr}', rank, mutual_affinity)
            for rank, (j, llr, mutual_affinity, Rr, Nc, Rc, Nr) in enumerate(
                affinities_for_row(noun2i[noun], L), start=1
            )
        ][:50]
    )
        

adj
[('old', 276.72010190039873, '1/87', '670/33270', 1, 0.9798917944093778),
 ('surly', 162.33497600474948, '2/87', '7/652', 2, 0.9794090684718991),
 ('full-blooded', 51.90150745264964, '3/87', '2/183', 3, 0.971672633628541),
 ('blind_as_negro', 23.52840633913273, '6/87', '1/1', 4, 0.9425287356321839),
 ('long_on_errand', 23.52840633913273, '6/87', '1/1', 5, 0.9425287356321839),
 ('thick-lipped', 29.266646675692073, '4/87', '4/35', 6, 0.8827586206896552),
 ('NEG_real', 21.100723623696467, '10/87', '6/297', 7, 0.8814582607686056),
 ('hideous', 22.48589895409532, '9/87', '250/2679', 8, 0.8236475267405491),
 ('poor', 28.99200202897191, '5/87', '2791/20374', 9, 0.8233798090647423),
 ('aged', 23.136929231201066, '8/87', '199/1890', 10, 0.8232074438970991),
 ('shriveled', 14.760171735489507, '15/87', '3/28', 11, 0.7791461412151067),
 ('unprotected', 18.43107686676376, '14/87', '29/281', 12, 0.7658199370065857),
 ('deformed', 19.97101309140271, '11/87', '31/210', 13, 0.7586206896551725),
 ('

### relate the original paragraph to the document topic

In [None]:
# build document (cleaned paragraph fed to top2vec) 2 topic
# NOTE(review): `model` is not defined in the visible part of this notebook;
# presumably a trained/loaded Top2Vec model -- confirm before running.
document2topic = {}

# get_topic_sizes() is documented to return (topic_sizes, topic_nums);
# reversing and zipping gives {topic number: topic size}
topic2size = dict(zip(*model.get_topic_sizes()[::-1]))

# ask each topic for all of its documents and record the topic per document
for i in range(model.get_num_topics()):
    for doc, score, id_ in zip(*model.search_documents_by_topic(topic_num=i, num_docs=topic2size[i])):
        document2topic[doc] = i

In [None]:
# build original paragraph (i.e, pre-cleaning version) to topic
df = pd.read_csv('/Users/ryanbrate/surfdrive/Data/LREC_2023/top2vecs/PR.csv')

In [None]:
# map each original (pre-cleaning) paragraph to the topic of its cleaned version
paragraph2topic = {}
for original, document in tqdm(zip(df['text'].values, df['clean'].values), total=len(df)):
    # cleaned paragraphs without a topic are skipped; an explicit membership
    # test replaces the bare `except: pass`, which would also have hidden
    # unrelated errors
    if document in document2topic:
        paragraph2topic[original] = document2topic[document]

# American Literature

In [173]:
# American literature (PS): LLR and frequency tables for noun, adj pairs
L_adj = load_npz(pathlib.Path('LLR/llr_scores/PS/adj/llr_profiles.npz'))
F_adj = load_npz(pathlib.Path('LLR/llr_scores/PS/adj/freq_profiles.npz'))
print(L_adj.shape)
print(F_adj.sum())

# feature -> column index, plus the inverse lookup
with open('LLR/llr_scores/PS/adj/feature2i.json', "r", encoding='utf-8') as f:
    feature2j_adj = json.loads(f.read())
j2feature_adj = dict(zip(feature2j_adj.values(), feature2j_adj.keys()))

# noun -> row index, plus the inverse lookup
with open('LLR/llr_scores/PS/adj/noun2i.json', "r", encoding='utf-8') as f:
    noun2i_adj = json.loads(f.read())
i2noun_adj = dict(zip(noun2i_adj.values(), noun2i_adj.keys()))

(166863, 232349)
22592953


In [150]:
# American literature (PS): LLR and frequency tables for agent, verb pairs
L_agent = load_npz(pathlib.Path('LLR/llr_scores/PS/agent/llr_profiles.npz'))
F_agent = load_npz(pathlib.Path('LLR/llr_scores/PS/agent/freq_profiles.npz'))
print(L_agent.shape)
print(F_agent.sum())

# feature -> column index, plus the inverse lookup
with open('LLR/llr_scores/PS/agent/feature2i.json', "r", encoding='utf-8') as f:
    feature2j_agent = json.loads(f.read())
j2feature_agent = dict(zip(feature2j_agent.values(), feature2j_agent.keys()))

# noun -> row index, plus the inverse lookup
with open('LLR/llr_scores/PS/agent/noun2i.json', "r", encoding='utf-8') as f:
    noun2i_agent = json.loads(f.read())
i2noun_agent = dict(zip(noun2i_agent.values(), noun2i_agent.keys()))

(180325, 29058)
14266666


In [151]:
# American literature (PS): LLR and frequency tables for patient, verb pairs
L_patient = load_npz(pathlib.Path('LLR/llr_scores/PS/patient/llr_profiles.npz'))
F_patient = load_npz(pathlib.Path('LLR/llr_scores/PS/patient/freq_profiles.npz'))
print(L_patient.shape)
print(F_patient.sum())

# feature -> column index, plus the inverse lookup
with open('LLR/llr_scores/PS/patient/feature2i.json', "r", encoding='utf-8') as f:
    feature2j_patient = json.loads(f.read())
j2feature_patient = dict(zip(feature2j_patient.values(), feature2j_patient.keys()))

# noun -> row index, plus the inverse lookup
with open('LLR/llr_scores/PS/patient/noun2i.json', "r", encoding='utf-8') as f:
    noun2i_patient = json.loads(f.read())
i2noun_patient = dict(zip(noun2i_patient.values(), noun2i_patient.keys()))

(116132, 22389)
7725577


## quick feature -> noun

In [70]:
(L_adj[:, feature2j_adj['savage']] > 0).shape

(166863, 1)

In [172]:
# list the nouns with the highest mutual affinity for one feature,
# separately for each feature type in which the feature occurs
feature = "onery"
for feature2j, i2noun, L, F, feature_type in [
    (feature2j_adj, i2noun_adj, L_adj, F_adj, "adj"),
    (feature2j_agent, i2noun_agent, L_agent, F_agent, "agent"),
    (feature2j_patient, i2noun_patient, L_patient, F_patient, "patient"),
]:
    print(f"where feature is a {feature_type}")
    if feature not in feature2j:
        continue
    # shown per noun: noun, LLR, Rr/Nc, Rc/Nr, position in the ranking, mutual affinity
    pp(
        [
            (i2noun[i], llr, f'{Rr}/{Nc}', f'{Rc}/{Nr}', rank, mutual_affinity)
            for rank, (i, llr, mutual_affinity, Rr, Nc, Rc, Nr) in enumerate(
                affinities_for_column(feature2j[feature], L), start=1
            )
        ][:100]
    )
            

where feature is a adj
[('gallinipper', 51.63140007185507, '1/1', '2/41', 1, 0.975609756097561),
 ('nigger', 69.51608022407709, '19/609', '1/41', 2, 0.9704433497536946),
 ('hiller', 25.797841986394133, '1/1', '4/41', 3, 0.926829268292683),
 ('gad', 35.85428607403742, '3/20', '3/41', 4, 0.8560975609756097),
 ('thief', 18.798363545157372, '69/700', '6/41', 5, 0.7927526132404181),
 ('head', 19.662037242644146, '687/3914', '5/41', 6, 0.7442700998292558),
 ('Slim', 14.997050034807899, '10/52', '9/41', 7, 0.6655722326454033),
 ('feller', 15.320459501776213, '164/796', '8/41', 8, 0.6594558156636843),
 ('galoot', 13.629633022768303, '17/83', '11/41', 9, 0.6103438142815163),
 ('niggah', 14.972664144525197, '10/40', '10/41', 10, 0.6048780487804879),
 ('McCoy', 17.4841060802014, '5/13', '7/41', 11, 0.5909943714821764),
 ('darkey', 11.927284297322103, '16/117', '15/41', 12, 0.574108818011257),
 ('cayuse', 12.46988052696156, '28/113', '12/41', 13, 0.5568745952946255),
 ('whelp', 12.375177460299255,

## quick noun -> feature

In [170]:
# list the features with the highest mutual affinity for one noun,
# separately for each feature type in which the noun occurs
noun = "negro"
for noun2i, j2feature, L, F, feature_type in [
    (noun2i_adj, j2feature_adj, L_adj, F_adj, "adj"),
    (noun2i_agent, j2feature_agent, L_agent, F_agent, "agent"),
    (noun2i_patient, j2feature_patient, L_patient, F_patient, "patient"),
]:
    print(feature_type)
    if noun not in noun2i:
        continue
    # shown per feature: feature, LLR, Rr/Nc, Rc/Nr, position in the ranking, mutual affinity
    pp(
        [
            (j2feature[j], llr, f'{Rr}/{Nc}', f'{Rc}/{Nr}', rank, mutual_affinity)
            for rank, (j, llr, mutual_affinity, Rr, Nc, Rc, Nr) in enumerate(
                affinities_for_row(noun2i[noun], L), start=1
            )
        ][:50]
    )
        
        

adj
[('old', 3166.254305335693, '1/872', '23/28879', 1, 0.9992382007687247),
 ('aged', 298.08812963202945, '4/872', '10/1612', 2, 0.9909957144808432),
 ('free', 519.7414618574549, '2/872', '33/4059', 3, 0.9909785367661851),
 ('full-blooded', 157.58774953667125, '9/872', '1/166', 4, 0.9908256880733946),
 ('ignorant', 245.19641768129077, '6/872', '6/1325', 5, 0.9905141076683399),
 ('jamaican', 122.47426499364894, '14/872', '1/24', 6, 0.9850917431192661),
 ('poor', 256.6365706899669, '5/872', '142/12979', 7, 0.9845989746315679),
 ('giant', 208.1344996400876, '7/872', '20/2147', 8, 0.9843305999837623),
 ('runaway', 366.30436805994395, '3/872', '6/353', 9, 0.9835746030095901),
 ('gigantic', 136.35241262504132, '11/872', '18/2085', 10, 0.9804721360526257),
 ('faithful', 170.66775751224486, '8/872', '36/2367', 11, 0.9773045274667349),
 ('burly', 137.83792073684162, '10/872', '9/573', 12, 0.975861392638135),
 ('frightened',
  132.79773896551342,
  '12/872',
  '25/1936',
  13,
  0.9751450072029

In [None]:
## Preparing for topic modelling:

E.g., L_feature > some_LLR gives us (noun, feature) pairs with some degree of significant association. We assume these
come from contextually meaningful sentences.

Can we use bert topic to allocate a topic to every span of text matching L_feature > some_LLR value,
and then back-calculate a new LLR matrix, of ...

In [None]:
# number of (noun, adj) entries with LLR > 100; summed on the sparse boolean
# matrix, since .toarray() would allocate the full dense rows x columns array
(L_adj > 100).sum()

# find books matching tuples of interest 

In [None]:
# Hand-picked triples to look up in the extracted tuples of each book.
# Each entry appears to be [noun, feature, feature type], where the feature
# type is one of "adj", "agent" or "patient" -- TODO confirm element order
# against the tuple extraction code. Entries are compared against the first
# three elements of every extracted tuple; a few entries are repeated.
tuples_of_interest = [
["barbarian", "negro", "adj"],
["gypsy", "witchy", "adj"],
["tribe", "savage", "adj"],
["Indians", "savage", "adj"],
["race", "savage", "adj"],
["native", "superstitious", "adj"],
["negro", "superstitious", "adj"],
["culture", "primitive", "adj"],
["race", "primitive", "adj"],
["savage", "wily", "adj"],
["Oriental", "wily", "adj"],
["redskin", "wily", "adj"],
["native", "ignorant", "adj"],
["native", "treacherous", "adj"],
["cripple", "miserable", "adj"],
["cripple", "hopeless", "adj"],
["cripple", "hapless", "adj"],
["cripple", "wretched", "adj"],
["cripple", "poor", "adj"],
["cripple", "deformed", "adj"],
["dwarf", "ugly", "adj"],
["dwarf", "misshapen", "adj"],
["dwarf", "hunchbacked", "adj"],
["dwarf", "grotesque", "adj"],
["dwarf", "shrivelled", "adj"],
["dwarf", "hideous", "adj"],
["dwarf", "crooked", "adj"],
["native", "uncivlised", "adj"],
["native", "ignorant", "adj"],
["native", "hostile", "adj"],
["native", "little", "adj"],
["native", "cannibal", "adj"],
["native", "wretched", "adj"],
["negress", "surly", "adj"],
["negress", "full-blooded", "adj"],
["negress", "slatternly", "adj"],
["negress", "big", "adj"],
["negress", "fat", "adj"],
["negress", "huge", "adj"],
["negress", "gigantic", "adj"],
["negro", "half-naked", "adj"],
["negro", "big", "adj"],
["negro", "naked", "adj"],
["negro", "tall", "adj"],
["negro", "burly", "adj"],
["negro", "huge", "adj"],
["negro", "giant", "adj"],
["negro", "diminutive", "adj"],
["negro", "pure-blooded", "adj"],
["negro", "stout", "adj"],
["negro", "gigantic", "adj"],
["negro", "full-blooded", "adj"],
["nigger", "silly", "adj"],
["nigger", "worthless", "adj"],
["slave", "negro", "adj"],
["servant", "native", "adj"],
["tribe", "savage", "adj"],
["tribe", "hostile", "adj"],
["tribe", "primitive", "adj"],
["tribe", "barbarous", "adj"],
["tribe", "warlike", "adj"],
["tribe", "wild", "adj"],
["tribesman", "wild", "adj"],
["tribesman", "savage", "adj"],
["tribesman", "ferocious", "adj"],
["tribesman", "primitive", "adj"],
["tribesman", "quarrelsome", "adj"],
["oriental", "lazy", "adj"],
["oriental", "insensible", "adj"],
["spinster", 'sour', 'adj'],
["spinster", 'unwanted', 'adj'],
["bachelor", 'eligible', 'adj'],
# entries with feature type "agent"
["negro", "grin", "agent"],
["negro", "obey", "agent"],
["negro", "shuffle", "agent"],
["negro", "murder", "agent"],
["negro", "mutilate", "agent"],
["negress", "grin", "agent"],
["nigger", "NEG_work", "agent"],
["nigger", "moan", "agent"],
["nigger", "rob", "agent"],
["nigger", "groan", "agent"],
["tribe", "fight", "agent"],
["gypsy", "steal", "agent"],
# entries with feature type "patient"
["negro", "lynch", "patient"],
["negro", "sell", "patient"],
["nigger", "marry", "patient"],
["nigger", "beat", "patient"],
["nigger", "shoot", "patient"],
["slave", "sell", "patient"],
["slave", "torment", "patient"],
]

In [None]:
def gen_dir(
    directory: pathlib.Path, *, pattern: re.Pattern = re.compile(".+")
) -> typing.Generator:
    """Yield the path of every entry in `directory` whose name matches `pattern`.

    Args:
        directory (pathlib.Path): directory whose entries are listed
        pattern (re.Pattern): regex matched against the start of each entry
            name [default: any non-empty name]

    Yields:
        pathlib.Path: `directory / name` for each matching entry, in the order
        returned by os.listdir
    """

    for name in os.listdir(directory):
        if not re.match(pattern, name):
            continue
        yield directory / name

# Scan every tuple file of both corpora and record each occurrence of a
# tuple of interest as (corpus dir, file stem, matched triple).
results = []
tuples_dir = pathlib.Path("/Users/ryanbrate/surfdrive/Data/LREC_2023/tuples/PG_sample")
for d in ['PR', 'PS']:
    # raw string: "\." in a normal string literal is an invalid escape sequence
    for fp in tqdm(gen_dir(tuples_dir/d, pattern=re.compile(r".*\.json$"))):

        # explicit encoding, consistent with the other JSON files read in this notebook
        with open(fp, "r", encoding="utf-8") as f:
            doc = json.load(f)

        try:
            # doc alternates [text, tuples, text, tuples, ...]; only the
            # tuple lists (odd positions) are needed here
            for extracted in doc[1::2]:
                for t in extracted:
                    if t[:3] in tuples_of_interest:
                        results.append((d, fp.stem, t[:3]))
        except (TypeError, KeyError, IndexError):
            # best effort: a file without the expected structure is abandoned
            # at the first malformed entry; other errors are no longer hidden
            continue
        

In [None]:
# catalogue indexed by the "Text#" column, used to attach title and authors
# to every match (the file stem is looked up as an integer "Text#")
df = pd.read_csv(
    "/Users/ryanbrate/Projects/LREC_2023/sample_PG/PG_index/pg_catalog.csv"
).set_index("Text#")

# (corpus dir, file stem, title, authors, matched triple)
results_ = [
    (d, i, df.loc[int(i), "Title"], df.loc[int(i), "Authors"], t)
    for d, i, t in results
]


In [None]:
[r for r in results if r[1]=="6173"]
# results

In [None]:
print(df.loc[61014,:])

In [None]:
print(len(results))
results_

In [None]:
[t for t in results_ if "Twain" in str(t[3])]

In [None]:
[t for t in results_ if "surly" in str(t[4])]

## Ranking by book

In [None]:
# group the matches per book and per author, and count them
by_book = defaultdict(list)
by_author = defaultdict(list)
book2count = defaultdict(int)
author2count = defaultdict(int)
author2i = {}  # author -> running integer id, in order of first appearance
for r in results_:
    corpus, book, title, author, triple = r[0], r[1], r[2], r[3], r[4]

    by_book[book].append((corpus, title, author, triple))
    book2count[book] += 1

    # first sighting of an author gets the next free id
    author_id = author2i.setdefault(author, len(author2i))

    by_author[author_id].append((corpus, book, title, triple, author))
    author2count[author_id] += 1
    

In [None]:
sorted(book2count.items(), key=lambda x: x[1], reverse=True)

In [None]:
by_book['30125']

In [None]:
sorted(author2count.items(), key=lambda x: x[1], reverse=True)

In [None]:
by_author[118]