In [7]:
import numpy as np
import jsonlines

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

In [8]:
from typing import Dict

from scicite.compute_features import get_formulaic_features, get_agent_features

In [9]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
FILE_TEST_PRED = "[TEST]_pred_weights.jsonl"
FILE_TEST_PRED_ONLY_BERT = "pred_test_with_bert.jsonl"
FILE_TEST_TRUE = "scicite/data/acl-arc/test.jsonl"

FILE_DEV_PRED = "[DEV]_pred_weights.jsonl"
FILE_DEV_PRED_ONLY_BERT = "pred_dev_with_bert.jsonl"
FILE_DEV_TRUE = "scicite/data/acl-arc/dev.jsonl"

FILE_TRAIN_PRED = "[TRAIN]_pred_weights.jsonl"
FILE_TRAIN_PRED_ONLY_BERT = "pred_train_with_bert.jsonl"
FILE_TRAIN_TRUE = "scicite/data/acl-arc/train.jsonl"


CLASSES = ['Background', 'Uses', 'CompareOrContrast', 'Motivation', 'Extends', 'Future']

In [11]:
def compute_all_patterns_features(item):
    formulaic_features, fn_1, __ = get_formulaic_features(item['sents_before'][0], prefix='InCitSent:')
    agent_features, fn_2, _ = get_agent_features(item['sents_before'][0], prefix='InCitSent:')
    fn_3, fn_4 = [], []
    
    formulaic_clause_features = formulaic_features
    agent_clause_features = agent_features
    if len(item['sents_before']) > 1:
        for cur_sentence in item['sents_before'][1:]:
            _formulaic_features, fn_3, _ = get_formulaic_features(cur_sentence, prefix='InClause:')
            _agent_features, fn_4, _ = get_agent_features(cur_sentence, prefix='InClause:')
            formulaic_clause_features = [f_1 or f_2 for f_1, f_2 in zip(formulaic_clause_features,
                                                                        _formulaic_features)]
            agent_clause_features = [f_1 or f_2 for f_1, f_2 in zip(agent_clause_features,
                                                                    _agent_features)]
    for cur_sentence in item['sents_after']:
        _formulaic_features, _, _ = get_formulaic_features(cur_sentence, prefix='InClause:')
        _agent_features, _, _ = get_agent_features(cur_sentence, prefix='InClause:')
        formulaic_clause_features = [f_1 or f_2 for f_1, f_2 in zip(formulaic_clause_features,
                                                                    _formulaic_features)]
        agent_clause_features = [f_1 or f_2 for f_1, f_2 in zip(agent_clause_features,
                                                                _agent_features)]
        

    x = formulaic_features + agent_features + formulaic_clause_features + agent_clause_features 
    y = CLASSES.index(item['intent'])
    feature_names = fn_1 + fn_2
    if fn_3:
        feature_names += fn_3 + fn_4 
    return x, y, feature_names

In [12]:
def create_data():
    X = []
    y = []

    with jsonlines.open(FILE_TEST_TRUE, 'r') as reader:
        for item in reader:
            _x, _y, _ = compute_all_patterns_features(item)
            X.append(_x)
            y.append(_y)
    
    with jsonlines.open(FILE_DEV_TRUE, 'r') as reader:
        for item in reader:
            _x, _y, _ = compute_all_patterns_features(item)
            X.append(_x)
            y.append(_y)
    
    with jsonlines.open(FILE_TRAIN_TRUE, 'r') as reader:
        for item in reader:
            _x, _y, _ = compute_all_patterns_features(item)
            X.append(_x)
            y.append(_y)
    
    return np.array(X), np.array(y)

In [13]:
X, y = create_data()

KeyboardInterrupt: 

In [None]:
X.shape

In [None]:
y.shape

In [None]:
with jsonlines.open(FILE_TRAIN_TRUE, 'r') as reader:
    for item in reader:
        _, _, feature_names = compute_all_patterns_features(item)
        break

In [None]:
#feature_names

In [None]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score

In [None]:
clf = LogisticRegression(multi_class='multinomial')
scores = cross_val_score(clf, X, y, scoring='f1_macro', cv=5)

In [None]:
scores.mean()

In [None]:
scores = cross_val_score(clf, X, y, scoring='f1_micro', cv=5)

In [None]:
scores.mean()

In [None]:
clf.fit(X, y)

In [None]:
coeffs = clf.coef_

By class:

      'Background': 997,
 
      'Uses': 364,
      
      'CompareOrContrast': 351,
      
      'Motivation': 88,
      
      'Extends': 72,
      
      'Future': 69    

In [None]:
CLASSES

In [None]:
def get_cv_score_by_class(class_: str, metric: str = 'f1'):
    y_ = y == CLASSES.index(class_)
    clf = LogisticRegression(multi_class='multinomial')
    scores = cross_val_score(clf, X, y_, scoring=metric, cv=5)
    return scores.mean()

In [None]:
for cls in CLASSES:
    print(f"Class={cls}, F1={get_cv_score_by_class(cls)}\n")

In [None]:
for cls in CLASSES:
    print(f"Class={cls}, Precision={get_cv_score_by_class(cls, 'precision')}\n")

In [None]:
for cls in CLASSES:
    print(f"Class={cls}, Recall={get_cv_score_by_class(cls, 'recall')}\n")

Error matrix:

In [None]:
FILE_TEST_PRED = "[TEST]_pred_weights.jsonl"
FILE_TEST_PRED_ONLY_BERT = "pred_test_with_bert.jsonl"
FILE_TEST_TRUE = "scicite/data/acl-arc/test.jsonl"

FILE_DEV_PRED = "[DEV]_pred_weights.jsonl"
FILE_DEV_PRED_ONLY_BERT = "pred_dev_with_bert.jsonl"
FILE_DEV_TRUE = "scicite/data/acl-arc/dev.jsonl"

FILE_TRAIN_PRED = "[TRAIN]_pred_weights.jsonl"
FILE_TRAIN_PRED_ONLY_BERT = "pred_train_with_bert.jsonl"
FILE_TRAIN_TRUE = "scicite/data/acl-arc/train.jsonl"

In [None]:
def get_true_pred(true_file, pred_file):
    # true values
    true = dict()
    with jsonlines.open(true_file) as reader:
        for obj in reader:
            true[obj['citation_id']] = obj['intent']
    
    # true+pred values
    pairs = dict()
    with jsonlines.open(pred_file) as reader:
        for obj in reader:
            pairs[obj['citation_id']] = [true[obj['citation_id']], obj['prediction']]
    
    # encode
    y_true_pred = list(pairs.values())
    y = []
    for pair in y_true_pred:
        y.append([CLASSES.index(pair[0]), CLASSES.index(pair[1])])
    y = np.array(y)
    y_true = y[:, 0]
    y_pred = y[:, 1]
    
    return y_true, y_pred

In [None]:
y_true_test, y_pred_test = get_true_pred(FILE_TEST_TRUE, FILE_TEST_PRED)

In [None]:
y_true_dev, y_pred_dev = get_true_pred(FILE_DEV_TRUE, FILE_DEV_PRED)

In [None]:
y_true_train, y_pred_train = get_true_pred(FILE_TRAIN_TRUE, FILE_TRAIN_PRED)

In [None]:
y_true_all = np.concatenate((y_true_test, y_true_dev, y_true_train))
y_pred_all = np.concatenate((y_pred_test, y_pred_dev, y_pred_train))

In [None]:
error_matrix_all = confusion_matrix(y_true_all, y_pred_all)

In [None]:
error_matrix = confusion_matrix(y_true_test, y_pred_test)

***Only validation:***

In [None]:
df_cm = pd.DataFrame(error_matrix, index = CLASSES, columns = CLASSES)
plt.figure(figsize = (10,7))
cmap = sn.cm.rocket_r
sn.heatmap(df_cm, annot=True, cmap=cmap)

plt.ylabel("Predicted labels")
plt.xlabel("True labels")
plt.show()

***All data:***

In [None]:
df_cm = pd.DataFrame(error_matrix_all, index = CLASSES, columns = CLASSES)
plt.figure(figsize = (10,7))
cmap = sn.cm.rocket_r
sn.heatmap(df_cm, annot=True, cmap=cmap)

plt.title("Scibert with class weights")
plt.ylabel("Predicted labels")
plt.xlabel("True labels")
plt.show()

only sciBERT

In [None]:
y_true_test_only, y_pred_test_only = get_true_pred(FILE_TEST_TRUE, FILE_TEST_PRED_ONLY_BERT)

In [None]:
y_true_dev_only, y_pred_dev_only = get_true_pred(FILE_DEV_TRUE, FILE_DEV_PRED_ONLY_BERT)

In [None]:
y_true_train_only, y_pred_train_only = get_true_pred(FILE_TRAIN_TRUE, FILE_TRAIN_PRED_ONLY_BERT)

In [None]:
y_true_all_only = np.concatenate((y_true_test_only, y_true_dev_only, y_true_train_only))
y_pred_all_only = np.concatenate((y_pred_test_only, y_pred_dev_only, y_pred_train_only))

In [None]:
error_matrix_test_all_only = confusion_matrix(y_true_test_only, y_true_test_only)

In [None]:
error_matrix_all_only = confusion_matrix(y_true_all_only, y_pred_all_only)

***All data***:

In [None]:
df_cm = pd.DataFrame(error_matrix_all_only, index = CLASSES, columns = CLASSES)
plt.figure(figsize = (10,7))
cmap = sn.cm.rocket_r
sn.heatmap(df_cm, annot=True, cmap=cmap)

plt.title("Only scibert")
plt.ylabel("Predicted labels")
plt.xlabel("True labels")
plt.show()

***Linear combination***

In [15]:
def create_train_data():
    X_train = []
    y_train = []
    
    with jsonlines.open(FILE_TRAIN_TRUE, 'r') as reader:
        for item in reader:
            _x, _y, _ = compute_all_patterns_features(item)
            X_train.append(_x)
            y_train.append(_y)
    
    return np.array(X_train), np.array(y_train)

In [16]:
X_train, y_train = create_train_data()

In [17]:
def create_test_data(clf):
    # true values
    true = dict()
    with jsonlines.open(FILE_TEST_TRUE) as reader:
        for obj in reader:
            cur_obj = obj
            _x, _y, _ = compute_all_patterns_features(obj)
            cur_obj['patterns'] = _x
            true[obj['citation_id']] = cur_obj
    with jsonlines.open(FILE_DEV_TRUE) as reader:
        for obj in reader:
            cur_obj = obj
            _x, _y, _ = compute_all_patterns_features(obj)
            cur_obj['patterns'] = _x
            true[obj['citation_id']] = cur_obj
           
    # pred values
    result = {}
    with jsonlines.open(FILE_TEST_PRED, 'r') as reader:
        for item in reader:
            cur_item = {}
            cur_item['patterns'] = true[item['citation_id']]['patterns']
            cur_item['citation_id'] = item['citation_id']
            cur_item['intent'] = true[item['citation_id']]['intent']
            cur_item['prediction'] = CLASSES.index(item['prediction'])
            cur_item['probabilities'] = item['probabilities']
            cur_item['clf_prediction'] = clf.predict([_x])[0]
            cur_item['clf_probabilities'] = clf.predict_proba([_x])
            result[item['citation_id']] = cur_item
    
    with jsonlines.open(FILE_DEV_PRED, 'r') as reader:
        for item in reader:
            cur_item = {}
            cur_item['patterns'] = true[item['citation_id']]['patterns']
            cur_item['citation_id'] = item['citation_id']
            cur_item['intent'] = true[item['citation_id']]['intent']
            cur_item['prediction'] = CLASSES.index(item['prediction'])
            cur_item['probabilities'] = item['probabilities']
            cur_item['clf_prediction'] = clf.predict([_x])[0]
            cur_item['clf_probabilities'] = clf.predict_proba([_x])
            result[item['citation_id']] = cur_item
    
    return result

In [18]:
def create_test_data_info():
    X_info = []
    y_info = []
    
    with jsonlines.open(FILE_TEST_TRUE) as reader:
        for obj in reader:
            X_info.append(obj['citation_id'])
            y_info.append(CLASSES.index(obj['intent']))
    with jsonlines.open(FILE_DEV_TRUE) as reader:
        for obj in reader:
            X_info.append(obj['citation_id'])
            y_info.append(CLASSES.index(obj['intent']))
            
    return X_info, y_info

In [19]:
clf = LGBMClassifier()
scores = cross_val_score(clf, X_train, y_train, scoring='f1_macro', cv=5)
scores.mean()

0.5392998088369465

In [20]:
clf = LGBMClassifier().fit(X_train, y_train)

In [21]:
test_data = create_test_data(clf)

In [23]:
X_test_info, y_test_info = create_test_data_info()

In [50]:
all_weights = []
for w_1 in np.linspace(0, 1, 5):
    for w_2 in np.linspace(0, 1, 5):
        for w_3 in np.linspace(0, 1, 5):
            for w_4 in np.linspace(0, 1, 5):
                for w_5 in np.linspace(0, 1, 5):
                    for w_6 in np.linspace(0, 1, 5):
                        all_weights.append([w_1, w_2, w_3, w_4, w_5, w_6])

In [52]:
def compute_score(X_probs, y, weights):
    y_pred = []
    for _pair in X_probs:
        clf_1_probs, clf_2_probs = _pair
        clf_1_probs = np.array(clf_1_probs)
        clf_2_probs = np.array(clf_2_probs)
        weights = np.array(weights)
    
        res_probs = clf_1_probs * weights + clf_2_probs * (1 - weights)
        y_pred.append(np.argmax(res_probs))
        
    return f1_score(y, y_pred, average='macro')

In [53]:
skf = StratifiedKFold(n_splits = 5, shuffle=True)
weights_info = [{'weights': _w, 'test_scores': [], 'val_scores': []} for _w in all_weights]

X_test_info = np.array(X_test_info)
y_test_info = np.array(y_test_info)
for train_index, test_index in skf.split(X_test_info, y_test_info):
    _X_train_info, _X_test_info = X_test_info[train_index], X_test_info[test_index]
    y_train, y_test = y_test_info[train_index], y_test_info[test_index]
    X_train_probs = [[test_data[_id]['probabilities'], test_data[_id]['clf_probabilities']] 
                     for _id in _X_train_info]
    X_test_probs = [[test_data[_id]['probabilities'], test_data[_id]['clf_probabilities']] 
                     for _id in _X_test_info]
    
    for cur_weights in weights_info:
        cur_weights['test_scores'].append(compute_score(X_train_probs, y_train, cur_weights['weights']))
        cur_weights['val_scores'].append(compute_score(X_test_probs, y_test, cur_weights['weights']))
        
    print("done")

done
done
done
done
done


In [55]:
for cur_weight in weights_info:
    cur_weight['test_f1'] = np.mean(cur_weight['test_scores'])
    cur_weight['val_f1'] = np.mean(cur_weight['val_scores'])

In [61]:
best_weights = sorted(weights_info, key=lambda _info: _info['test_f1'], reverse=True) 

In [64]:
best_weights[:10]

[{'weights': [1.0, 1.0, 1.0, 0.0, 0.5, 0.5],
  'test_scores': [0.7674327601410934,
   0.7411269812336164,
   0.7591717757155104,
   0.7495047384203962,
   0.7857261895099391],
  'val_scores': [0.741111111111111,
   0.8044011544011545,
   0.7810457516339869,
   0.8303530662663791,
   0.630357142857143],
  'test_f1': 0.7605924890041111,
  'val_f1': 0.7574536452539549},
 {'weights': [1.0, 1.0, 1.0, 0.0, 0.5, 0.75],
  'test_scores': [0.7674327601410934,
   0.7411269812336164,
   0.7591717757155104,
   0.7495047384203962,
   0.7857261895099391],
  'val_scores': [0.741111111111111,
   0.8044011544011545,
   0.7810457516339869,
   0.8303530662663791,
   0.630357142857143],
  'test_f1': 0.7605924890041111,
  'val_f1': 0.7574536452539549},
 {'weights': [1.0, 1.0, 1.0, 0.0, 0.5, 1.0],
  'test_scores': [0.7674327601410934,
   0.7411269812336164,
   0.7591717757155104,
   0.7495047384203962,
   0.7857261895099391],
  'val_scores': [0.741111111111111,
   0.8044011544011545,
   0.7810457516339869,
 

In [65]:
weights_info[-1]

{'weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 'test_scores': [0.7530961230345032,
  0.7271520146520146,
  0.7404309874705537,
  0.7388431358200674,
  0.7803058776253117],
 'val_scores': [0.741111111111111,
  0.8044011544011545,
  0.7810457516339869,
  0.8052688911822039,
  0.6290992812219227],
 'test_f1': 0.7479656277204901,
 'val_f1': 0.7521852379100757}