In [3]:
import sys; sys.path.append('..')
from osp import *

In [4]:
# Open the 'osp_slices_1000_nlp' stash; later cells read serialized stanza documents from it.
input_stash = HashStash('osp_slices_1000_nlp')
# Bare expression so the notebook displays the stash as this cell's output.
input_stash


Config,Param,Value
Path,Root Dir,/Users/ryan/.cache/hashstash/osp_slices_1000_nlp
,Filename,data.db
Engine,Engine,lmdb
,Serializer,hashstash
,Compress,lz4
,B64,True
Stats,Len,8756


In [5]:
# Peek at a single stash entry to sanity-check its contents.
key,docstr = next(input_stash.items())  # first (key, value) pair produced by the stash
doc = stanza.Document.from_serialized(docstr)  # rebuild the stanza Document from its serialized form
word = doc.sentences[1].words[5]  # sixth word of the second sentence; not used again in the cells visible here
# Bare expression so the notebook displays the key.
key

'phil/10.2307/20140633__02'

In [6]:
def get_word_context(doc, sent_i, word_i, context_len=2):
    """Build a keyword-in-context snippet for one word of a parsed document.

    The target word is upper-cased and surrounded by neighbouring words taken
    from the same sentence only. On each side, words are added one at a time
    until the accumulated context text is at least ``context_len`` characters
    long (so the limit is a character budget, not a word count, and the last
    word added may overshoot it).

    Args:
        doc: object exposing ``sentences``; each sentence exposes ``words``
            and each word exposes ``text``.
        sent_i: index of the sentence within ``doc.sentences``.
        word_i: index of the target word within that sentence's ``words``.
        context_len: minimum number of characters to gather on each side
            before no further words are added.

    Returns:
        The snippet as a single string, with spacing around punctuation
        tidied, quotes replaced by spaces, and the result passed through
        ``remove_left_right_punct`` (defined elsewhere).

    Raises:
        IndexError: if ``sent_i`` or ``word_i`` is out of range.
    """
    sent = doc.sentences[sent_i]
    word = sent.words[word_i]

    # Words after the target, nearest first.
    next_context = ''
    for w in sent.words[word_i + 1:]:
        if len(next_context) >= context_len:
            break
        next_context += w.text + ' '

    # Words before the target, nearest first, prepended so reading order is kept.
    # Exactly one separator is added per word, mirroring the forward loop, so both
    # sides spend the character budget at the same rate.
    prev_context = ''
    for w in reversed(sent.words[:word_i]):
        if len(prev_context) >= context_len:
            break
        prev_context = w.text + ' ' + prev_context

    out = f'{prev_context.strip()} {word.text.upper()} {next_context.strip()}'

    # Tokenisation leaves a space before closing punctuation and after opening
    # brackets; undo that. The replacements are applied in this exact order.
    for old, new in (('\n', ' '), (' ,', ','), (' .', '.'), (' !', '!'),
                     (' ?', '?'), (' :', ':'), (' ;', ';'),
                     ('( ', '('), ('[ ', '[')):
        out = out.replace(old, new)
    out = out.strip()
    for old, new in ((' )', ')'), (' ]', ']'), ('"', ' '), ("'", ' ')):
        out = out.replace(old, new)

    return remove_left_right_punct(out.strip()).strip()

# NOTE(review): neither constant is referenced in the cells visible here;
# presumably they are consumed further down the notebook. Confirm before changing.
FEAT_N = 100
FEAT_MIN_COUNT = 0


In [7]:
# Output stashes filled by gen_pos_counts, one entry per document id.
STASH_POS_COUNTS = HashStash('osp_slices_1000_pos_counts')  # per-1000 frequencies of xpos tags and deprel labels
STASH_FEAT2WORD2COUNT = HashStash('osp_slices_1000_feat2word2count')  # feature -> lower-cased word -> count
STASH_FEAT2WORD2EG = HashStash('osp_slices_1000_feat2word2eg')  # feature -> lower-cased word -> example context
# Default context_len of get_pos_counts, which forwards it to get_word_context.
CONTEXT_LEN = 15


def get_pos_counts(doc, feat2word2count=None, feat2word2eg=None, context_len=CONTEXT_LEN):
    """Tally part-of-speech and dependency-relation usage for one parsed document.

    Words whose ``pos`` is ``'X'`` or whose ``deprel`` is ``'flat'`` are
    skipped. Every other word is counted once under its ``xpos`` tag and once
    under its ``deprel`` label.

    Args:
        doc: object exposing ``sentences``; each sentence exposes ``words`` and
            each word exposes ``text``, ``pos``, ``xpos`` and ``deprel``.
        feat2word2count: optional mapping ``feature -> word -> count`` to
            accumulate into; it is mutated in place. It must create missing
            inner counters on access, as the default ``defaultdict(Counter)``
            does. A fresh one is created when ``None``.
        feat2word2eg: optional mapping ``feature -> word -> example context``
            to fill; it is mutated in place and must create missing inner dicts
            on access, as the default ``defaultdict(dict)`` does. For each
            (feature, word) pair the first example encountered is kept.
        context_len: forwarded to ``get_word_context`` when an example is built.

    Returns:
        A 3-tuple ``(rel_counts, feat2word2count, feat2word2eg)``.
        ``rel_counts`` maps every seen xpos tag and deprel label to its
        frequency per 1000 counted words, rounded to an int. Tags and labels
        share that dict, so a label spelled exactly like a tag replaces it.
    """
    pos_counts = Counter()
    deprel_counts = Counter()
    if feat2word2count is None:
        feat2word2count = defaultdict(Counter)
    if feat2word2eg is None:
        feat2word2eg = defaultdict(dict)

    for sent_i, sent in enumerate(doc.sentences):
        for word_i, word in enumerate(sent.words):
            if word.pos == 'X' or word.deprel == 'flat':
                continue
            pos = word.xpos
            deprel = word.deprel
            pos_counts[pos] += 1
            deprel_counts[deprel] += 1

            eg_word = word.text.lower()
            for feat in (deprel, pos):
                feat2word2count[feat][eg_word] += 1

            deprel_egs = feat2word2eg[deprel]
            pos_egs = feat2word2eg[pos]
            # Building the context string is the costly step; skip it once both
            # features already hold an example for this word.
            if eg_word in deprel_egs and eg_word in pos_egs:
                continue
            eg_context = get_word_context(doc, sent_i, word_i, context_len=context_len).strip()
            # Keep the first example seen for each (feature, word) pair.
            if eg_word not in deprel_egs:
                deprel_egs[eg_word] = eg_context
            if eg_word not in pos_egs:
                pos_egs[eg_word] = eg_context

    def _per_thousand(counts):
        # Nothing is divided when `counts` is empty, so a zero total is harmless.
        total = sum(counts.values())
        return {k: int(round(v / total * 1000)) for k, v in counts.items()}

    rel_counts = {**_per_thousand(pos_counts), **_per_thousand(deprel_counts)}
    return rel_counts, feat2word2count, feat2word2eg

def gen_pos_counts(id, force=False):
    """Return the counts for document ``id``, computing and caching them if needed.

    Args:
        id: key of the document in ``input_stash`` (also used as the key in the
            three output stashes).
        force: when true, recompute even if all three stashes already hold
            an entry for ``id``.

    Returns:
        The 3-tuple ``(counts, feat2word2count, feat2word2eg)``, either read
        back from the stashes or freshly produced by ``get_pos_counts``.
    """
    stashes = (STASH_POS_COUNTS, STASH_FEAT2WORD2COUNT, STASH_FEAT2WORD2EG)

    # Reuse the cached result only when every stash has this document.
    if not force and all(id in stash for stash in stashes):
        return tuple(stash[id] for stash in stashes)

    doc = stanza.Document.from_serialized(input_stash[id])
    counts, feat2word2count, feat2word2eg = get_pos_counts(doc)

    STASH_POS_COUNTS[id] = counts
    STASH_FEAT2WORD2COUNT[id] = feat2word2count
    STASH_FEAT2WORD2EG[id] = feat2word2eg
    return counts, feat2word2count, feat2word2eg


In [11]:
import random
ids = list(input_stash.keys())  # materialize every key of the input stash as a list
random.shuffle(ids)  # in-place shuffle; no seed is set in this cell, so the order differs between runs

In [12]:
# Compute the counts for every document; with force=False, gen_pos_counts returns
# the stored result for ids already present in all three output stashes.
# NOTE(review): the loop variable `id` shadows the builtin of the same name in the
# notebook namespace after this cell runs.
for id in tqdm(ids):
    gen_pos_counts(id, force=False)

100%|██████████| 8756/8756 [02:50<00:00, 51.43it/s]
