In [16]:
import jsonlines
import numpy as np
from typing import Dict

from scicite.compute_features import get_formulaic_features, get_agent_features

In [18]:
# Citation-intent labels. The position of a label in this list is the
# integer class id used for the targets.
CLASSES = [
    'Background',
    'Uses',
    'CompareOrContrast',
    'Motivation',
    'Extends',
    'Future',
]

# Paths to the ACL-ARC splits (JSON Lines files).
FILE_TRAIN_TRUE = "scicite/data/acl-arc/train.jsonl"
FILE_DEV_TRUE = "scicite/data/acl-arc/dev.jsonl"
FILE_TEST_TRUE = "scicite/data/acl-arc/test.jsonl"

In [26]:
def _merge_flags(accumulated, current):
    """Element-wise ``or`` of two feature vectors (truncated to the shorter one)."""
    return [seen or now for seen, now in zip(accumulated, current)]


def compute_all_patterns_features(item: Dict):
    """Build the pattern-feature vector, label id and feature names for one example.

    The first entry of ``item['sents_before']`` is featurised with the
    ``'InCitSent:'`` prefix. Every other sentence in ``item['sents_before']``
    and every sentence in ``item['sents_after']`` is featurised with the
    ``'InClause:'`` prefix and OR-ed into a running context vector.

    NOTE(review): the context vector is seeded with the first sentence's own
    flags, so an ``InClause:`` feature is also set whenever the matching
    ``InCitSent:`` feature is set. This is kept from the original; confirm it
    is intended.

    Args:
        item: Mapping with the keys ``'sents_before'`` (non-empty sequence of
            sentences), ``'sents_after'`` (sequence of sentences) and
            ``'intent'`` (a label from ``CLASSES``).

    Returns:
        A tuple ``(x, y, feature_names)`` where ``x`` is the concatenation of
        the first-sentence formulaic features, the first-sentence agent
        features, the context formulaic features and the context agent
        features; ``y`` is the index of ``item['intent']`` in ``CLASSES``; and
        ``feature_names`` lists the names of all four groups in the same order
        as ``x``.

    Raises:
        IndexError: If ``item['sents_before']`` is empty.
        ValueError: If ``item['intent']`` is not in ``CLASSES``.
    """
    cit_sentence = item['sents_before'][0]
    formulaic_features, fn_1, _ = get_formulaic_features(cit_sentence, prefix='InCitSent:')
    agent_features, fn_2, _ = get_agent_features(cit_sentence, prefix='InCitSent:')

    formulaic_clause_features = formulaic_features
    agent_clause_features = agent_features
    fn_3, fn_4 = [], []

    # Remaining "before" sentences first, then the "after" sentences: the same
    # visiting order as the two separate loops this replaces.
    context_sentences = list(item['sents_before'][1:]) + list(item['sents_after'])
    for cur_sentence in context_sentences:
        _formulaic_features, fn_3, _ = get_formulaic_features(cur_sentence, prefix='InClause:')
        _agent_features, fn_4, _ = get_agent_features(cur_sentence, prefix='InClause:')
        formulaic_clause_features = _merge_flags(formulaic_clause_features, _formulaic_features)
        agent_clause_features = _merge_flags(agent_clause_features, _agent_features)

    if not context_sentences:
        # x still carries the two context groups, so their names are needed
        # too. Ask the helpers for the 'InClause:' names and drop the values.
        # This assumes the names do not depend on the sentence, which the loop
        # above already relies on by keeping only the last names it saw.
        _, fn_3, _ = get_formulaic_features(cit_sentence, prefix='InClause:')
        _, fn_4, _ = get_agent_features(cit_sentence, prefix='InClause:')

    x = formulaic_features + agent_features + formulaic_clause_features + agent_clause_features
    y = CLASSES.index(item['intent'])
    feature_names = fn_1 + fn_2 + fn_3 + fn_4
    return x, y, feature_names

In [30]:
def create_data(paths=None):
    """Read JSON Lines files and featurise every example in them.

    Args:
        paths: Optional iterable of JSON Lines file paths, read in the given
            order. When omitted, the test, dev and train files
            (``FILE_TEST_TRUE``, ``FILE_DEV_TRUE``, ``FILE_TRAIN_TRUE``) are
            read in that order.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` stacks the feature vectors
        and ``y`` the class ids returned by ``compute_all_patterns_features``,
        one entry per example across all files.
    """
    if paths is None:
        # Resolved at call time so the defaults follow the module constants.
        paths = (FILE_TEST_TRUE, FILE_DEV_TRUE, FILE_TRAIN_TRUE)

    X = []
    y = []
    for path in paths:
        with jsonlines.open(path, 'r') as reader:
            for item in reader:
                _x, _y, _ = compute_all_patterns_features(item)
                X.append(_x)
                y.append(_y)

    return np.array(X), np.array(y)

In [31]:
# Feature matrix and label vector built from the test, dev and train files together.
X, y = create_data()

In [32]:
X.shape

(1941, 112)

In [33]:
y.shape

(1941,)

In [48]:
# Take the feature-name list from the first training example only.
# Assumes the names are the same for every example -- TODO confirm.
with jsonlines.open(FILE_TRAIN_TRUE, 'r') as reader:
    for item in reader:
        _, _, feature_names = compute_all_patterns_features(item)
        break  # one example is enough

In [49]:
feature_names

['InCitSent:GENERAL_FORMULAIC',
 'InCitSent:THEM_FORMULAIC',
 'InCitSent:US_PREVIOUS_FORMULAIC',
 'InCitSent:TEXTSTRUCTURE_FORMULAIC',
 'InCitSent:HERE_FORMULAIC',
 'InCitSent:METHOD_FORMULAIC',
 'InCitSent:CONTINUE_FORMULAIC',
 'InCitSent:DISCOURSE_CONTRAST_FORMULAIC',
 'InCitSent:GRAPHIC_FORMULAIC',
 'InCitSent:CONTRAST2_FORMULAIC',
 'InCitSent:COMPARISON_FORMULAIC',
 'InCitSent:CONTRAST_FORMULAIC',
 'InCitSent:ALIGN_FORMULAIC',
 'InCitSent:AFFECT_FORMULAIC',
 'InCitSent:GOOD_FORMULAIC',
 'InCitSent:TRADITION_FORMULAIC',
 'InCitSent:IN_ORDER_TO_FORMULAIC',
 'InCitSent:DETAIL_FORMULAIC',
 'InCitSent:NO_TEXTSTRUCTURE_FORMULAIC',
 'InCitSent:USE_FORMULAIC',
 'InCitSent:FUTURE_WORK_FORMULAIC',
 'InCitSent:HEDGING_FORMULAIC',
 'InCitSent:PRESENT_WORK_FORMULAIC',
 'InCitSent:EXTENDING_WORK_FORMULAIC',
 'InCitSent:EXTENDING_WORK2_FORMULAIC',
 'InCitSent:USEFUL_FORMULAIC',
 'InCitSent:MOTIVATING_FORMULAIC',
 'InCitSent:PRIOR_WORK_FORMULAIC',
 'InCitSent:US_AGENT (AS_FORM)',
 'InCitSent:REF_U

In [41]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score

In [57]:
# Baseline: multinomial logistic regression on the pattern features,
# scored with macro-averaged F1 under 5-fold cross-validation.
clf = LogisticRegression(multi_class='multinomial')
scores = cross_val_score(clf, X, y, scoring='f1_macro', cv=5)

In [43]:
scores.mean()

0.4828052889281137

In [44]:
# Same estimator and folds setting, now scored with micro-averaged F1
# (this overwrites the macro-F1 `scores`).
scores = cross_val_score(clf, X, y, scoring='f1_micro', cv=5)

In [45]:
scores.mean()

0.6234145177960936

In [59]:
# Fit on the complete data set (no hold-out).
clf.fit(X, y)

LogisticRegression(multi_class='multinomial')

In [65]:
# Learned weights of the fitted model. Per the scikit-learn documentation,
# coef_ has one row per class and one column per feature for a multiclass fit.
coeffs = clf.coef_

Number of examples per class (1,941 in total):

      'Background': 997,
 
      'Uses': 364,
      
      'CompareOrContrast': 351,
      
      'Motivation': 88,
      
      'Extends': 72,
      
      'Future': 69    

In [71]:
CLASSES

['Background', 'Uses', 'CompareOrContrast', 'Motivation', 'Extends', 'Future']

In [89]:
def get_cv_score_by_class(class_: str, metric: str = 'f1'):
    """Cross-validated score for separating one intent class from all others.

    The global labels ``y`` are binarised (``True`` where an example belongs
    to ``class_``). A fresh logistic regression is then evaluated on the
    global feature matrix ``X`` with 5-fold cross-validation.

    Args:
        class_: A label from ``CLASSES``.
        metric: Scorer name passed to ``cross_val_score`` as ``scoring``.

    Returns:
        The mean of the per-fold scores.

    Raises:
        ValueError: If ``class_`` is not in ``CLASSES``.
    """
    target_id = CLASSES.index(class_)
    is_target = (y == target_id)
    model = LogisticRegression(multi_class='multinomial')
    fold_scores = cross_val_score(model, X, is_target, scoring=metric, cv=5)
    return fold_scores.mean()

In [90]:
# Mean 5-fold F1 of a separate one-vs-rest classifier for each intent class.
for cls in CLASSES:
    print(f"Class={cls}, F1={get_cv_score_by_class(cls)}\n")

Class=Background, F1=0.7261306355435259

Class=Uses, F1=0.4565809737046118

Class=CompareOrContrast, F1=0.26949236350631156

Class=Motivation, F1=0.4437678413540483

Class=Extends, F1=0.4432995532309033

Class=Future, F1=0.4645680332739156



In [91]:
# Mean 5-fold precision of a separate one-vs-rest classifier for each intent class.
for cls in CLASSES:
    print(f"Class={cls}, Precision={get_cv_score_by_class(cls, 'precision')}\n")

Class=Background, Precision=0.6748134880595462

Class=Uses, Precision=0.7746706805327495

Class=CompareOrContrast, Precision=0.6372750503710256

Class=Motivation, Precision=0.8011904761904762

Class=Extends, Precision=0.7314285714285714

Class=Future, Precision=0.8019480519480521



In [92]:
# Mean 5-fold recall of a separate one-vs-rest classifier for each intent class.
for cls in CLASSES:
    print(f"Class={cls}, Recall={get_cv_score_by_class(cls, 'recall')}\n")

Class=Background, Recall=0.7863618090452261

Class=Uses, Recall=0.3269786910197869

Class=CompareOrContrast, Recall=0.17106639839034204

Class=Motivation, Recall=0.31830065359477117

Class=Extends, Recall=0.31999999999999995

Class=Future, Recall=0.34835164835164834

