In [1]:
import numpy as np
import pandas as pd
import jsonlines

from scicite.compute_features import get_formulaic_features, get_agent_features
from scicite.resources.lexicons import FORMULAIC_PATTERNS, AGENT_PATTERNS



In [2]:
# Input paths, grouped per split: *_TRUE points into scicite/data/acl-arc and
# *_PRED into preds/origin (presumably model prediction dumps -- TODO confirm).
FILE_TRAIN_TRUE = "scicite/data/acl-arc/train.jsonl"
FILE_TRAIN_PRED = "preds/origin/out_train.jsonl"

FILE_DEV_TRUE = "scicite/data/acl-arc/dev.jsonl"
FILE_DEV_PRED = "preds/origin/out_dev.jsonl"

FILE_TEST_TRUE = "scicite/data/acl-arc/test.jsonl"
FILE_TEST_PRED = "preds/origin/out_test.jsonl"

In [40]:
def get_patterns_from_cit(citation):
    """Print a citation's text and the lexicon patterns matched in its sentence.

    The sentence that is analysed is the first entry of
    ``citation['sents_before']``; the text that is printed is
    ``citation['text']``. Everything is written to stdout and nothing is
    returned.

    Args:
        citation: mapping providing ``'sents_before'`` (indexable; element 0
            is handed to the feature extractors) and ``'text'``.
    """
    def _print_section(title, matched):
        # Header line first, then one matched pattern per line.
        print(title)
        for entry in matched:
            print(entry)

    sentence = citation['sents_before'][0]
    print("Citation text:\n")
    print(citation['text'], end='\n\n')

    # Only the third element of each returned triple (the matched patterns)
    # is needed here; the first two are discarded.
    _, _, formulaic_hits = get_formulaic_features(sentence, count=False)
    _print_section("Formulaic patterns:\n", formulaic_hits)

    print()

    _, _, agent_hits = get_agent_features(sentence, count=False)
    _print_section("Agent patterns:\n", agent_hits)

In [34]:
# Read every record of the dev split into memory, in file order.
dev_list = []
with jsonlines.open(FILE_DEV_TRUE) as dev_reader:
    dev_list.extend(dev_reader)

In [35]:
# Print the citation text and the matched formulaic/agent patterns for dev record 11.
get_patterns_from_cit(dev_list[11])

Citation text:

Our knowledge extractors rely extensively on MetaMap ( Aronson 2001 ) , a system for identifying segments of text that correspond to concepts in the UMLS Metathesaurus .

Formulaic patterns:

a @WORK_NOUN for
@WORK_NOUN for #VV
@SELF_NOM

Agent patterns:



In [36]:
# Print the citation text and the matched formulaic/agent patterns for dev record 111.
get_patterns_from_cit(dev_list[111])

Citation text:

OT therefore holds out the promise of simplifying grammars , by factoring all complex phenomena into simple surface-level constraints that partially mask one another .1 Whether this is always possible under an appropriate definition of `` simple constraints '' ( e.g. , Eisner 1997b ) is of course an empirical question .

Formulaic patterns:

@GOOD_ADJ

Agent patterns:



In [37]:
# Print the citation text and the matched formulaic/agent patterns for dev record 27.
get_patterns_from_cit(dev_list[27])

Citation text:

This Principle of Finitism is also assumed by Johnson-Laird ( 1983 ) , Jackendoff ( 1983 ) , Kamp ( 1981 ) , and implicitly or explicitly by almost all researchers in computational linguistics .

Formulaic patterns:

researcher in @DISCIPLINE
@PROFESSIONALS

Agent patterns:



In [38]:
# Print the citation text and the matched formulaic/agent patterns for dev record 54.
get_patterns_from_cit(dev_list[54])

Citation text:

More specifically , the notion of the phrasal lexicon ( used first by Becker 1975 ) has been used successfully in a number of areas :

Formulaic patterns:


Agent patterns:



In [39]:
# Print the citation text and the matched formulaic/agent patterns for dev record 99.
get_patterns_from_cit(dev_list[99])

Citation text:

Discrepancies in length throw constituents off balance , and so prosodic phrasing will cross constituent boundaries in order to give the phrases similar lengths ; this is the case in Chickens were eating II the remaining green vegetables , where the subject-predicate boundary finds no prosodic correspondent .4 The most explicit version of this approach is the analysis presented in Gee and Grosjean ( 1983 ) ( henceforth G&G ) .

Formulaic patterns:

in order to
the @WORK_NOUN @GIVEN
the @WORK_NOUN

Agent patterns:



***Pattern coverage***

In [45]:
# Read every record of the training split into memory, in file order.
train_list = []
with jsonlines.open(FILE_TRAIN_TRUE) as train_reader:
    train_list.extend(train_reader)

In [46]:
# Number of records read from the training split.
len(train_list)

1688

All distinct patterns defined in the formulaic and agent lexicons:

In [55]:
# Distinct patterns across every group of both lexicons (formulaic first,
# then agent); duplicates shared between groups collapse in the set.
all_patterns = set().union(
    *FORMULAIC_PATTERNS.values(),
    *AGENT_PATTERNS.values()
)

In [56]:
# Number of distinct patterns defined across the formulaic and agent lexicons.
len(all_patterns)

618

TRAIN (citation sentence only, i.e. `sents_before[0]`): distinct patterns matched

In [47]:
# Distinct patterns matched when only the first 'sents_before' sentence of
# each training record is analysed.
train_met_patterns = set()
for cit in train_list:
    first_sentence = cit['sents_before'][0]
    # Only the third element of each returned triple (matched patterns) is kept.
    _, _, formulaic_hits = get_formulaic_features(first_sentence, count=False)
    _, _, agent_hits = get_agent_features(first_sentence, count=False)
    train_met_patterns |= set(formulaic_hits + agent_hits)

In [48]:
# Number of distinct patterns matched in the first 'sents_before' sentence of the training records.
len(train_met_patterns)

117

TRAIN (all context sentences, i.e. `sents_before` and `sents_after`): distinct patterns matched

In [58]:
# Distinct patterns matched in any context sentence of the training records:
# all of 'sents_before' first, then all of 'sents_after', per record.
all_train_met_patterns = set()
for cit in train_list:
    for context_key in ('sents_before', 'sents_after'):
        for cit_sent in cit[context_key]:
            # Only the third element of each returned triple (matched patterns) is kept.
            _, _, formulaic_hits = get_formulaic_features(cit_sent, count=False)
            _, _, agent_hits = get_agent_features(cit_sent, count=False)
            all_train_met_patterns |= set(formulaic_hits + agent_hits)

In [60]:
# Number of distinct patterns matched over all context sentences
# ('sents_before' + 'sents_after'). This must read `all_train_met_patterns`,
# not the citation-only `train_met_patterns`, so the figure is reproducible
# on a fresh Restart & Run All.
len(all_train_met_patterns)

180

All pattern types (the keys of the formulaic and agent lexicons):

In [75]:
# Pattern types are the keys of the two lexicons; a key present in both
# lexicons is counted once.
all_pattern_types = set(FORMULAIC_PATTERNS.keys()) | set(AGENT_PATTERNS.keys())

In [76]:
# Number of distinct pattern types (lexicon keys) across both lexicons.
len(all_pattern_types)

42

TRAIN (citation sentence only, i.e. `sents_before[0]`): pattern types matched

In [70]:
# Pattern types (lexicon keys) with at least one pattern matched in the first
# 'sents_before' sentence of a training record.
train_met_pattern_types = set()
for cit in train_list:
    first_sentence = cit['sents_before'][0]
    _, _, formulaic_hits = get_formulaic_features(first_sentence, count=False)
    _, _, agent_hits = get_agent_features(first_sentence, count=False)

    for pat in formulaic_hits + agent_hits:
        # A matched pattern is looked up in both lexicons, whichever extractor
        # produced it; every type whose pattern list contains it is recorded.
        for lexicon in (FORMULAIC_PATTERNS, AGENT_PATTERNS):
            train_met_pattern_types.update(
                key for key, value_list in lexicon.items() if pat in value_list
            )

In [71]:
# Number of pattern types matched in the first 'sents_before' sentence of the training records.
len(train_met_pattern_types)

35

TRAIN (all context sentences, i.e. `sents_before` and `sents_after`): pattern types matched

In [77]:
# Pattern types (lexicon keys) with at least one pattern matched in any
# context sentence of a training record: all of 'sents_before' first, then
# all of 'sents_after'.
all_train_met_pattern_types = set()
for cit in train_list:
    for context_key in ('sents_before', 'sents_after'):
        for cit_sent in cit[context_key]:
            _, _, formulaic_hits = get_formulaic_features(cit_sent, count=False)
            _, _, agent_hits = get_agent_features(cit_sent, count=False)

            for pat in formulaic_hits + agent_hits:
                # A matched pattern is looked up in both lexicons, whichever
                # extractor produced it.
                for lexicon in (FORMULAIC_PATTERNS, AGENT_PATTERNS):
                    all_train_met_pattern_types.update(
                        key for key, value_list in lexicon.items() if pat in value_list
                    )

In [78]:
# Number of pattern types matched over all context sentences ('sents_before' + 'sents_after').
len(all_train_met_pattern_types)

36