In [1]:
import sys
sys.path.append('..')
from osp import *

In [2]:
# Take the first (id, serialized document) pair from JSTOR_STASH and rebuild it
# as a stanza Document; the loop exits after that one item.
# NOTE(review): `id` shadows the builtin of the same name for the rest of the
# session, and `doc` stays undefined if the stash yields nothing.
for id,docstr in JSTOR_STASH.items():
    doc = stanza.Document.from_serialized(docstr)
    break

In [5]:
# Bind `sent` to the first sentence of `doc` (the loop stops immediately).
for sent in doc.sentences:
    break

In [7]:
# Smoke test: run the pipeline returned by get_nlp() on a one-word input and
# display the result.
get_nlp()("Hello")

[
  [
    {
      "id": 1,
      "text": "Hello",
      "lemma": "hello",
      "upos": "INTJ",
      "xpos": "UH",
      "head": 0,
      "deprel": "root",
      "start_char": 0,
      "end_char": 5,
      "ner": "O",
      "multi_ner": [
        "O"
      ],
      "misc": "SpaceAfter=No"
    }
  ]
]

In [14]:
def get_sent(sent):
    """Return a sentence object for `sent`.

    A `str` is run through the pipeline returned by `get_nlp()` and the first
    sentence of the result is returned. Any other value is returned unchanged.
    """
    return get_nlp()(sent).sentences[0] if isinstance(sent, str) else sent


In [19]:
# Parse an example sentence; this rebinds `sent` from any earlier cell.
sent = get_sent("Loose maxims and corruptive principles haunt us.")

In [24]:
# First token of the example sentence.
tok=sent.tokens[0]

In [49]:
# Inspect the first entry of the token's dict form (it carries the 'upos' tag
# that the helpers below read).
tok.to_dict()[0]

{'id': 1,
 'text': 'Loose',
 'lemma': 'Loose',
 'upos': 'ADJ',
 'xpos': 'JJ',
 'feats': 'Degree=Pos',
 'head': 2,
 'deprel': 'amod',
 'start_char': 0,
 'end_char': 5,
 'ner': 'O',
 'multi_ner': ('O',)}

In [None]:
# Display the token. This cell previously held an unfinished `tok.` attribute
# lookup, which is a SyntaxError under Restart & Run All.
tok

In [120]:
def detokenize(tokens):
    """Rebuild surface text from a sequence of tokens.

    Each token contributes its `text` followed by its `spaces_after`; the
    concatenation is stripped of leading and trailing whitespace.
    """
    pieces = (
        piece
        for token in tokens
        for piece in (token.text, token.spaces_after)
    )
    return ''.join(pieces).strip()

def get_punct_tok(tok):
    """Return the 'upos' tag from the first entry of `tok.to_dict()`.

    Despite the name, this returns the tag for any token, not only
    punctuation. Best-effort: any exception raised while reading the tag
    (e.g. no 'upos' key, empty list, object without `to_dict`) yields ''.
    """
    try:
        first_entry = tok.to_dict()[0]
        upos = first_entry['upos']
    except Exception:
        upos = ''
    return upos

def is_punct_tok(tok):
    """Return True when `get_punct_tok(tok)` is exactly 'PUNCT'."""
    upos = get_punct_tok(tok)
    return upos == 'PUNCT'

def find_phrase_window(sent, tok_i, n, before=True):
    """Collect the tokens next to position `tok_i` until `n` non-punctuation
    tokens have been gathered.

    Args:
        sent: object exposing a `tokens` list.
        tok_i: index of the centre token; it is never part of the window.
        n: number of non-punctuation tokens to collect.
        before: scan leftwards from `tok_i` when True, rightwards otherwise.

    Returns:
        `(phrase, phrase_nopunct)`, both in sentence order. `phrase` holds
        every token scanned, punctuation included; `phrase_nopunct` holds only
        the tokens for which `is_punct_tok` is false. If the sentence runs out
        first, `phrase_nopunct` has fewer than `n` items.
    """
    if before:
        candidates = reversed(sent.tokens[:tok_i])
    else:
        candidates = sent.tokens[tok_i+1:]

    window = []
    window_nopunct = []
    for candidate in candidates:
        window.append(candidate)
        if not is_punct_tok(candidate):
            window_nopunct.append(candidate)
        if len(window_nopunct) == n:
            break

    # The leftward scan walked away from the centre; restore sentence order.
    if before:
        window.reverse()
        window_nopunct.reverse()
    return window, window_nopunct



def find_parallelism(sent, max_n=10, center_pos=frozenset({'SCONJ', 'CCONJ', 'ADP'})):
    """Find spans "A <centre> B" where A and B have the same POS-tag sequence.

    Every token whose tag (as returned by `get_punct_tok`) is in `center_pos`
    is tried as a centre. For each window size n from 2 to `max_n`, the n
    non-punctuation tokens before and after the centre are compared tag by
    tag; punctuation is ignored for the comparison but kept in the reported
    text.

    Args:
        sent: a sentence object with a `tokens` list, or a string (parsed via
            `get_sent`).
        max_n: largest number of non-punctuation tokens per side.
        center_pos: collection of tags allowed for the centre token. The
            default is immutable so it cannot be changed by accident between
            calls.

    Returns:
        A list of dicts, one per match, with keys 'phrase', 'phrase_pos',
        'part_pos', 'center_pos', 'phrase1', 'phrase2', 'phrase3'.
    """
    sent = get_sent(sent)
    ld = []
    for tok_i, tok in enumerate(sent.tokens):
        center_tag = get_punct_tok(tok)
        if center_tag not in center_pos:
            continue
        for n in range(2, max_n + 1):
            # Once a side cannot supply n non-punctuation tokens it cannot
            # supply more either, so larger windows are skipped entirely.
            before_phrase, before_phrase_nopunct = find_phrase_window(sent, tok_i, n, before=True)
            if len(before_phrase_nopunct) != n:
                break
            after_phrase, after_phrase_nopunct = find_phrase_window(sent, tok_i, n, before=False)
            if len(after_phrase_nopunct) != n:
                break

            before_phrase_pos = [get_punct_tok(side_tok) for side_tok in before_phrase_nopunct]
            after_phrase_pos = [get_punct_tok(side_tok) for side_tok in after_phrase_nopunct]

            # A mismatch at this size does not rule out a match at a larger
            # size (the alignment shifts), so keep going.
            if before_phrase_pos != after_phrase_pos:
                continue

            phrase = before_phrase + [tok] + after_phrase
            ld.append({
                'phrase': detokenize(phrase),
                'phrase_pos': ' '.join([get_punct_tok(phrase_tok) for phrase_tok in phrase]),
                'part_pos': ' '.join(before_phrase_pos),
                'center_pos': center_tag,
                'phrase1': detokenize(before_phrase),
                'phrase2': tok.text,
                'phrase3': detokenize(after_phrase),
            })
    return ld
    
            

In [121]:
# Same example with extra commas, to check that punctuation next to the
# conjunction does not block a match. Rebinds `sent`.
sent = get_sent("Loose maxims, and, corruptive principles, haunt us.")

In [122]:
# Run the detector on the example sentence and display the matches.
find_parallelism(sent)

[{'phrase': 'Loose maxims, and, corruptive principles',
  'phrase_pos': 'ADJ NOUN PUNCT CCONJ PUNCT ADJ NOUN',
  'part_pos': 'ADJ NOUN',
  'center_pos': 'CCONJ',
  'phrase1': 'Loose maxims,',
  'phrase2': 'and',
  'phrase3': ', corruptive principles'}]

In [123]:
def find_parallelism_in_doc(doc):
    """Run `find_parallelism` on every sentence of `doc`.

    Args:
        doc: object exposing a `sentences` iterable whose items have `tokens`.

    Returns:
        A list of dicts, one per match, each being the match dict from
        `find_parallelism` with an extra leading 'sent' key holding the
        detokenized sentence text.
    """
    ld = []
    for sent in doc.sentences:
        matches = find_parallelism(sent)
        if not matches:
            continue
        # The sentence text is the same for every match, so build it once.
        sent_text = detokenize(sent.tokens)
        ld.extend({'sent': sent_text, **d} for d in matches)
    return ld



In [129]:
# Stash in which find_parallelism_in_stash stores each document's results,
# keyed by document id (presumably persisted across sessions by HashStash —
# TODO confirm).
STASH_PARALLELISM = HashStash('osp_parallel')

def find_parallelism_in_stash(stash, force=False):
    """Collect parallelism matches for every document in `stash`.

    Per-document results are read from STASH_PARALLELISM when present;
    otherwise the serialized document is rebuilt with stanza, analysed with
    `find_parallelism_in_doc`, and the result is written to STASH_PARALLELISM.

    Args:
        stash: mapping-like object of document id -> serialized stanza
            document, supporting `.items()` and `len()`.
        force: when true, recompute and overwrite results even if cached.

    Returns:
        A flat list of match dicts, each prefixed with an 'id' key.
    """
    rows = []
    for doc_id, serialized in tqdm(stash.items(), total=len(stash)):
        if force or doc_id not in STASH_PARALLELISM:
            parsed = stanza.Document.from_serialized(serialized)
            doc_matches = find_parallelism_in_doc(parsed)
            STASH_PARALLELISM[doc_id] = doc_matches
        else:
            doc_matches = STASH_PARALLELISM[doc_id]

        rows.extend({'id': doc_id, **d} for d in doc_matches)
    return rows



In [131]:
# next(STASH_PARALLELISM.items())

In [132]:
# Long-running: the recorded run below took about 1h36m for 8,840 documents.
# Documents already present in STASH_PARALLELISM are read back instead of
# being recomputed (unless force=True is passed).
ld_pmla = find_parallelism_in_stash(PMLA_STASH)

100%|██████████| 8840/8840 [1:36:36<00:00,  1.53it/s]  


In [133]:
# Long-running: the recorded run below took about 3h35m for 16,859 documents.
# Documents already present in STASH_PARALLELISM are read back instead of
# being recomputed (unless force=True is passed).
ld_jstor = find_parallelism_in_stash(JSTOR_STASH)

100%|██████████| 16859/16859 [3:35:25<00:00,  1.30it/s]  
