In [1]:
import sys; sys.path.append('..')
from adjective_reading.parsing import *
import networkx as nx


OK


In [2]:
def get_keyword_sent_corpus():
    """Load the keyword-sentence corpus and flag rows that already have a stored parse.

    Reads ``keyword_sent_corpus.csv.gz`` from ``PATH_DATA`` and adds a boolean
    ``IN_STASH_NLP`` column that is True when the row's ``(url, context1)``
    pair is a key in ``STASH_NLP``.

    Returns:
        pandas.DataFrame: the corpus with the extra ``IN_STASH_NLP`` column.
    """
    corpus_path = os.path.join(PATH_DATA, 'keyword_sent_corpus.csv.gz')
    corpus = pd.read_csv(corpus_path)

    # One membership test per row, in file order, with a progress counter.
    in_stash = []
    for pair in tqdm(zip(corpus['url'], corpus['context1'])):
        in_stash.append(pair in STASH_NLP)

    corpus['IN_STASH_NLP'] = in_stash
    return corpus

# Load the corpus (with its IN_STASH_NLP flag column) for the exploratory cells below.
df = get_keyword_sent_corpus()

91549it [00:02, 38935.85it/s]


In [3]:
# How many corpus rows do / do not have a stored parse.
df['IN_STASH_NLP'].value_counts()

IN_STASH_NLP
False    48965
True     42584
Name: count, dtype: int64

In [4]:
# Number of entries in the parse stash.
# The keys appear to be (url, sentence) tuples — see the fixed `key` used further down.
keys = STASH_NLP.keys_l()
# Counter(url for url,sent in keys)
len(keys)

38943

In [5]:
def detokenize_tree(tree_str):
    """Turn a bracketed constituency-tree string into plain text.

    Every whitespace-separated chunk that ends with ``)`` is treated as a
    leaf; the word is whatever precedes its first ``)``. Words are joined
    with single spaces, and the space before ``,``, ``.``, ``!`` and ``?``
    is removed.

    Args:
        tree_str: bracketed tree such as ``"(NP (JJ close) (NN reading))"``.

    Returns:
        str: the detokenized surface text.
    """
    words = []
    for chunk in tree_str.split():
        if not chunk.endswith(')'):
            # Opening chunks like "(NP" carry labels, not words.
            continue
        words.append(chunk.partition(')')[0])

    text = ' '.join(words)
    for mark in (',', '.', '!', '?'):
        text = text.replace(' ' + mark, mark)
    return text

def find_lowest_common_constituent(tree, id1, id2):
    """
    Return the deepest tree node whose leaves include both words.

    id1, id2 are 1-based word IDs from Stanza's dependency parse; they are
    converted to 0-based leaf positions internally.

    Nodes must expose ``children`` and ``is_leaf()``. As a side effect, every
    node gets ``start_idx`` / ``end_idx`` attributes (the inclusive range of
    leaf positions it covers). This annotation is done only when the root
    does not already carry ``start_idx``.

    Returns None when the root's leaf range does not contain both positions.
    """

    def annotate(node, cursor=0):
        """Stamp leaf ranges on `node` and its descendants; return the next free leaf position."""
        if node.is_leaf():
            node.start_idx = node.end_idx = cursor
            return cursor + 1

        lo, hi = float('inf'), float('-inf')
        for kid in node.children:
            cursor = annotate(kid, cursor)
            lo = min(lo, kid.start_idx)
            hi = max(hi, kid.end_idx)
        node.start_idx, node.end_idx = lo, hi
        return cursor

    # Reuse ranges left on the tree by an earlier call.
    if not hasattr(tree, 'start_idx'):
        annotate(tree)

    left, right = id1 - 1, id2 - 1

    def covers(node):
        """True when both target leaf positions fall inside the node's range."""
        return (node.start_idx <= left <= node.end_idx
                and node.start_idx <= right <= node.end_idx)

    if not covers(tree):
        return None

    # Walk down through the first child that still covers both positions;
    # the node where no child does is the lowest common constituent.
    current = tree
    while True:
        deeper = next((kid for kid in current.children if covers(kid)), None)
        if deeper is None:
            return current
        current = deeper

def find_smallest_str(tree, id1, id2):
    """Return ``str()`` of the lowest constituent containing both word IDs.

    The constituent is looked up with ``find_lowest_common_constituent``.
    The string is the node's own string form, unmodified.

    Returns:
        The string form of the constituent, or None when no constituent
        contains both IDs.
    """
    constituent = find_lowest_common_constituent(tree, id1, id2)
    return None if constituent is None else str(constituent)

    

def find_smallest_str_detokenized(tree, id1, id2):
    """Return the plain-text phrase of the smallest constituent spanning two words.

    Args:
        tree: constituency tree, passed through to ``find_smallest_str``.
        id1, id2: 1-based word IDs.

    Returns:
        The detokenized phrase, or None when ``find_smallest_str`` finds no
        constituent containing both IDs.
    """
    s = find_smallest_str(tree, id1, id2)
    # find_smallest_str signals "no common constituent" with None; handing
    # that to detokenize_tree would raise AttributeError on ``.split()``.
    if s is None:
        return None
    return detokenize_tree(s)


In [6]:
# detokenize_tree('(S (NP (NNS Reading)) (VP (VBZ is) (NP (JJ immaterial)) (, ,) (ADVP (RB distant)) (VP (VBZ is) (NP (JJ pointless)))) (. .))')

In [7]:


def get_sentence_graph(sentence):
    """Build an undirected dependency graph with case/cc markers folded into edges.

    Args:
        sentence: object exposing ``words`` (each with ``id``, ``text``,
            ``pos``, ``lemma``, ``head`` and ``deprel``) and ``constituency``.

    Returns:
        networkx.Graph whose nodes are labelled ``"<id:02d>_<text>"`` and carry
        ``idx``, ``text``, ``pos`` and ``lemma``. Each edge carries ``deprel``
        (original relation), ``relation`` (simplified relation) and ``phrase``
        (text of the smallest constituent covering both words), plus ``prep``
        and/or ``cc`` when the dependent governs a case / cc marker.
    """
    collapsed = ('case', 'cc')
    words = sentence.words
    graph = nx.Graph()

    # Node label per word id: zero-padded id + surface text.
    labels = {}
    for w in words:
        labels[w.id] = f"{w.id:02d}_{w.text}"

    # Marker lookups keyed by the id of the word the marker attaches to.
    # Values are (marker_text, marker_id); a later marker on the same head
    # replaces an earlier one.
    preps = {}
    conjs = {}
    for w in words:
        if w.deprel == 'case':
            preps[w.head] = (w.text, w.id)
        elif w.deprel == 'cc':
            conjs[w.head] = (w.text, w.id)

    # Marker words themselves get neither a node nor an outgoing edge.
    content_words = [w for w in words if w.deprel not in collapsed]

    for w in content_words:
        graph.add_node(
            labels[w.id],
            idx=w.id,
            text=w.text,
            pos=w.pos,
            lemma=w.lemma,
        )

    for w in content_words:
        if w.head == 0:
            # The root has no governor to link to.
            continue

        attrs = {
            'deprel': w.deprel,
            'relation': w.deprel,
            'phrase': find_smallest_str_detokenized(sentence.constituency, w.id, w.head),
        }

        # A governed preposition turns the link into the meta relation "prep".
        if w.id in preps:
            attrs['prep'] = preps[w.id][0]
            attrs['relation'] = 'prep'

        # A governed conjunction is always recorded, but the relation is only
        # upgraded to "conj_<word>" when the link itself is a conj.
        if w.id in conjs:
            conj_text = conjs[w.id][0]
            attrs['cc'] = conj_text
            if w.deprel == 'conj':
                attrs['relation'] = f'conj_{conj_text.lower()}'

        # NOTE(review): if a kept word's head is itself a case/cc marker,
        # add_edge will presumably re-create that marker as a node without
        # attributes — confirm whether such edges should be skipped.
        graph.add_edge(labels[w.head], labels[w.id], **attrs)

    return graph

In [8]:
# Small hand-written sentence (with a coordinated adjective pair) used to
# sanity-check the graph builder.
sentdoc = get_nlp_doc("Close or distant reading is immaterial.")
# sentdoc.sentences[0].words

In [9]:
# Simplified dependency graph of the example's first sentence.
g=get_sentence_graph(sentdoc.sentences[0])

In [10]:
# Show every edge with its attribute dict (deprel, relation, phrase, ...).
list(g.edges(data=True))

[('01_Close',
  '04_reading',
  {'deprel': 'amod',
   'relation': 'amod',
   'phrase': 'Close or distant reading'}),
 ('01_Close',
  '03_distant',
  {'deprel': 'conj',
   'relation': 'conj_or',
   'phrase': 'Close or distant',
   'cc': 'or'}),
 ('04_reading',
  '06_immaterial',
  {'deprel': 'nsubj',
   'relation': 'nsubj',
   'phrase': 'Close or distant reading is immaterial.'}),
 ('05_is',
  '06_immaterial',
  {'deprel': 'cop', 'relation': 'cop', 'phrase': 'is immaterial'}),
 ('06_immaterial',
  '07_.',
  {'deprel': 'punct',
   'relation': 'punct',
   'phrase': 'Close or distant reading is immaterial.'})]

In [11]:
# (number of nodes, number of edges) of the example graph.
g.order(),g.size()

(6, 5)

In [12]:
# A random stash key was drawn once with the commented-out lines below; the
# result is pinned here so the following cells are reproducible.
# import random

# key=random.choice(keys)
# key
key = ('http://www.jstor.org/stable/44016498',
 "Miller's notes are usually very helpful and thorough but they provide the only documentation of his references and extensive further reading; unfortunately, there is no bibliography.")

In [13]:
import stanza

# Fetch the serialized parse stored under `key` and rebuild the Stanza document.
docstr = STASH_NLP[key]
doc = stanza.Document.from_serialized(docstr)
# doc

In [14]:
# Graph of the stored document's first sentence.
docg = get_sentence_graph(doc.sentences[0])

In [15]:
# Keep only the edges that have 'reading' in either endpoint label.
[
    (src, dst, attrs)
    for src, dst, attrs in docg.edges(data=True)
    if 'reading' in src.lower() or 'reading' in dst.lower()
]

[('18_references',
  '22_reading',
  {'deprel': 'conj',
   'relation': 'conj_and',
   'phrase': 'his references and extensive further reading',
   'cc': 'and'}),
 ('20_extensive',
  '22_reading',
  {'deprel': 'amod',
   'relation': 'amod',
   'phrase': 'extensive further reading'}),
 ('21_further',
  '22_reading',
  {'deprel': 'amod',
   'relation': 'amod',
   'phrase': 'extensive further reading'})]

In [16]:
# list(g.nodes(data=True))

In [17]:
def get_keyword_rels(g, keyword):
    """Return one flat record per edge touching a node whose text equals `keyword`.

    Matching is case-insensitive on both sides: the node's ``text`` attribute
    and `keyword` are both lower-cased before comparison. Nodes without a
    ``text`` attribute never match.

    Each record merges, in this order:
      * the keyword node's attributes, with ``idx`` exposed as ``id``;
      * the edge attributes;
      * the neighbouring node's attributes, each key prefixed with ``head_``;
      * the keyword node's attributes again, so ``idx`` is also present and
        the node's own values win on any key collision.

    The ``head_`` prefix is applied to whichever node is adjacent; this
    function does not check whether that neighbour is the syntactic head or
    the dependent.

    Args:
        g: graph exposing ``nodes(data=True)``, ``nodes[name]``,
            ``neighbors(name)`` and ``get_edge_data(a, b)``.
        keyword: word to look for, in any casing.

    Returns:
        list[dict]: one record per (keyword node, neighbour) pair.
    """
    # Node text is lower-cased below, so the keyword must be too; otherwise a
    # keyword like "Reading" would silently match nothing.
    target = keyword.lower()

    def rename_idx(d):
        return {
            k if k != 'idx' else 'id': v
            for k, v in d.items()
        }

    out = []
    for n, d in g.nodes(data=True):
        if d.get('text', '').lower() != target:
            continue
        for neighbor in list(g.neighbors(n)):
            neighbor_data = g.nodes[neighbor]
            edge_data = g.get_edge_data(n, neighbor)
            head_data = {f'head_{k}': v for k, v in neighbor_data.items()}
            out.append({**rename_idx(d), **edge_data, **head_data, **d})
    return out


In [18]:
# One record per edge attached to the word "reading" in the stored sentence.
get_keyword_rels(docg, 'reading')

[{'id': 22,
  'text': 'reading',
  'pos': 'NOUN',
  'lemma': 'reading',
  'deprel': 'amod',
  'relation': 'amod',
  'phrase': 'extensive further reading',
  'head_idx': 20,
  'head_text': 'extensive',
  'head_pos': 'ADJ',
  'head_lemma': 'extensive',
  'idx': 22},
 {'id': 22,
  'text': 'reading',
  'pos': 'NOUN',
  'lemma': 'reading',
  'deprel': 'amod',
  'relation': 'amod',
  'phrase': 'extensive further reading',
  'head_idx': 21,
  'head_text': 'further',
  'head_pos': 'ADJ',
  'head_lemma': 'further',
  'idx': 22},
 {'id': 22,
  'text': 'reading',
  'pos': 'NOUN',
  'lemma': 'reading',
  'deprel': 'conj',
  'relation': 'conj_and',
  'phrase': 'his references and extensive further reading',
  'cc': 'and',
  'head_idx': 18,
  'head_text': 'references',
  'head_pos': 'NOUN',
  'head_lemma': 'reference',
  'idx': 22}]

In [19]:
# def get_all_keyword_rels(keys=None, n=None, shuffle=True    ):
#     if keys is None:
#         keys = STASH_NLP.keys_l()
#     if shuffle:
#         random.shuffle(keys)
#     out = []
#     for url,sent in tqdm(keys):
#         docstr = STASH_NLP[(url,sent)]
#         doc = stanza.Document.from_serialized(docstr)
#         docg = stanza_to_simplified_graph(doc.sentences[0])
#         for d in get_keyword_rels(docg, 'reading'):
#             out.append({
#                 'url': url,
#                 'context': sent,
#                 **d
#             })
#         if n is not None and len(out) >= n:
#             break
#     return out[:n]

        

def get_all_keyword_rels(keys=None, n=None, shuffle=True, keyword='reading'):
    """Collect the relations of `keyword` for every corpus row that has a stored parse.

    For each corpus row flagged ``IN_STASH_NLP``, the stored parse for
    ``(url, context1)`` is deserialized, the first sentence is converted with
    ``get_sentence_graph`` and every relation of `keyword` found by
    ``get_keyword_rels`` is emitted as one record. A record is the corpus row
    merged with the relation dict; relation keys win on a name collision.

    Args:
        keys: optional iterable of ``(url, sentence)`` pairs; when given, only
            corpus rows whose ``(url, context1)`` is among them are processed.
            None (the default) processes every parsed row.
        n: maximum number of records to return; None means no limit.
        shuffle: accepted for backward compatibility and ignored. Rows are
            always processed in corpus order, so the output is deterministic.
        keyword: word whose relations are collected (matched lower-cased).

    Returns:
        list[dict]: at most `n` records.
    """
    df = get_keyword_sent_corpus()
    df_parsed = df[df.IN_STASH_NLP]

    if keys is not None:
        wanted = {tuple(k) for k in keys}
        keep = pd.Series(
            [pair in wanted for pair in zip(df_parsed['url'], df_parsed['context1'])],
            index=df_parsed.index,
            dtype=bool,
        )
        df_parsed = df_parsed[keep]

    # get_keyword_rels lower-cases node text, so compare against a lower-cased keyword.
    target = keyword.lower()

    out = []
    for _, row in tqdm(df_parsed.iterrows(), total=len(df_parsed)):
        url, sent = row['url'], row['context1']
        doc = stanza.Document.from_serialized(STASH_NLP[(url, sent)])
        # Only the first sentence of the stored document is analysed.
        docg = get_sentence_graph(doc.sentences[0])
        row_data = row.to_dict()
        for rel in get_keyword_rels(docg, target):
            out.append({**row_data, **rel})
        # Checked once per row, so the last row may overshoot n; trimmed below.
        if n is not None and len(out) >= n:
            break
    return out[:n]
        

In [20]:
# Full run over every parsed corpus row (n=None applies no record limit).
odf = pd.DataFrame(get_all_keyword_rels(n=None))

91549it [00:01, 53118.50it/s]
100%|██████████| 42584/42584 [01:15<00:00, 560.40it/s]


In [24]:
# Spot-check the first output record.
odf.iloc[0]

url                            http://www.jstor.org/stable/461288
page_num                                                        4
sent_num                                                        4
sent            We are all our lifetime __reading__ the copiou...
context0        194 Pascal et le d6s6quilibre ment cette circu...
context1        We are all our lifetime reading the copious se...
context2        One moral we have already deduced, in consider...
token_num                                                       5
token0                                                   lifetime
token1                                                    reading
token2                                                        the
lemma                                                        read
pos                                                          VERB
year                                                         1967
decade                                                       1960
period    

In [25]:
# Persist the relation table as a gzip-compressed CSV without the index column.
# NOTE(review): the input corpus is read via PATH_DATA while this writes to a
# hard-coded relative path — confirm both point at the same directory.
odf.to_csv('../data/keyword_sent_corpus_parsed.csv.gz', index=False)

In [23]:
# Constituent phrase recorded for the first output record.
odf.iloc[0].phrase

'are all our lifetime reading the copious sense of this first of forms'