In [1]:
import sys
sys.path.append('..')
from osp import *

df_meta = get_corpus_metadata()

In [2]:


def get_recog_words(txt):
    return [
        w.lower() for w in tokenize_agnostic(txt)
        if w.strip().isalpha() and w.lower() in get_ok_words()
    ]

def count_recog_words(txt, n=SLICE_LEN):
    return Counter(get_recog_words(txt.lower())[:n])

def get_text_freqs(id, slice_len=SLICE_LEN, force=False):
    stash = HashStash(f'osp_freqs_slices_{slice_len}')
    if not force and id in stash:
        return {int(k):v for k,v in stash[id].items()}
    slices = get_text_slices(id)
    freqs = {
        int(slice_num): dict(count_recog_words(txt, slice_len))
        for slice_num, txt in slices.items()
    }
    stash[id] = freqs
    return freqs

In [3]:
def iter_slice_word_freqs(df_meta=None):
    df_meta = get_corpus_metadata() if df_meta is None else df_meta
    for id in tqdm(df_meta.index):
        for slice_num, freqs in get_text_freqs(id).items():
            yield id, slice_num, freqs


In [7]:
def get_words_freqs_slices(words, slice_len=SLICE_LEN):
    stash = HashStash(f'osp_word_freqs_slices_{slice_len}')
    # stash.clear()
    if not any(w not in stash for w in words):
        word2text2count = {
            w: stash[w]
            for w in words
        }
    else:        
        word2text2count = defaultdict(dict)
        for id, slice_num, freqs in iter_slice_word_freqs():
            for w, c in freqs.items():
                if w in words:
                    word2text2count[w][f'{id}__{slice_num:02d}'] = c

        for k,v in tqdm(list(word2text2count.items()), desc='saving to stash'):
            stash[k] = v

        for w in words:
            if w not in stash:
                stash[w] = {}

    return pd.DataFrame(word2text2count).rename_axis('id__slice')

In [8]:
# stash = HashStash(f'osp_word_freqs_slices_1000')
# stash

In [9]:
words = get_non_content_words()
# words =['insofar']
df_freqs = get_words_freqs_slices(words)
df_freqs

100%|██████████| 57620/57620 [05:38<00:00, 170.26it/s]
saving to stash: 100%|██████████| 641/641 [00:20<00:00, 30.83it/s] 


Unnamed: 0_level_0,it,of,the,few,in,and,its,here,not,without,...,streamside,blimey,shush,ahhhh,northeastward,downrange,meself,ciao,nyah,yuk
id__slice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
phil/10.2307/40231690__01,15.0,55.0,67.0,1.0,26.0,31.0,3.0,2.0,17.0,1.0,...,,,,,,,,,,
phil/10.2307/40231690__02,19.0,52.0,68.0,,14.0,23.0,1.0,,2.0,,...,,,,,,,,,,
phil/10.2307/40231690__03,22.0,48.0,57.0,,24.0,24.0,3.0,1.0,11.0,3.0,...,,,,,,,,,,
phil/10.2307/40231690__04,19.0,53.0,49.0,,26.0,24.0,4.0,1.0,13.0,2.0,...,,,,,,,,,,
phil/10.2307/40230399__01,22.0,51.0,49.0,,15.0,20.0,,1.0,12.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
phil/10.2307/42964179__01,,,,,,,,,,,...,,,,,,,,,,
phil/10.2307/42968544__01,,,,,,,,,,,...,,,,,,,,,,
lit/458048__01,,,,,,,,,,,...,,,,,,,,,,
lit/457913__01,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# df_freqs.sum(axis=1)

In [None]:
num_docs_s.loc['insofar']

In [None]:
non_content_words = get_non_content_words()
num_docs_s['is_non_content'] = num_docs_s.index.isin(non_content_words)
num_docs_s[num_docs_s.is_non_content].query('num_docs_rank < 10_000')