In [33]:
import transformers
import pandas as pd
import ast
import torch
import json

from data_utils import *
from feature_utils import *

from transformers import AutoModelForTokenClassification, AutoTokenizer

import warnings
# Suppress FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

In [34]:
BASELINE = False

if BASELINE:
    MODEL_NAME = '../models/distilbert-base-uncased-finetuned-baseline-argument-classification'
else:
    MODEL_NAME = '../models/distilbert-base-uncased-finetuned-advanced-argument-classification'
    
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Argument labels mapping to IDs differ from baseline and advanced models, because the order of distinct arguments in training
changed when running for different models (unforseen error). However, during training the evaluation results on test set where saved at 'train resulst/' folder. Thus, the correct mappings were extracted.

In [35]:
if BASELINE:
    arg_labels =["ARGM-ADJ","ARGM-ADV","ARGM-COM","ARGM-MNR",
                 "ARGM-PRR","ARGM-PRP","ARGM-REC","O",
                 "ARGM-CXN","ARGM-DIR","ARGM-DIS","ARGM-CAU",
                 "ARG3","ARGA","ARGM-LVB","ARG5",
                 "ARG2","ARGM-MOD","ARGM-LOC","ARGM-NEG",
                 "ARG1","ARGM-EXT","ARG0","ARG4",
                  "ARGM-PRD","ARG1-DSP","ARGM-GOL","ARGM-TMP"]
    label2id = {label: i for i, label in enumerate(arg_labels)}
    id2label = {i: label for i, label in enumerate(arg_labels)}
else:
    arg_labels =['ARGM-ADJ','ARGM-ADV','ARGM-COM','ARGM-MNR',
                 'ARGM-PRR','ARGM-PRP','ARGM-REC','O','ARGM-CXN',
                 'ARGM-DIR','ARGM-DIS','ARGM-CAU','ARG3','ARGA', 
                 'ARGM-LVB','ARG5','ARG2','ARGM-MOD','ARGM-LOC', 
                 'ARGM-NEG','ARG1','ARGM-EXT','ARG0','ARG4','ARGM-PRD',
                 'ARG1-DSP','ARGM-GOL','ARGM-TMP']
    label2id = {label: i for i, label in enumerate(arg_labels)}
    id2label = {i: label for i, label in enumerate(arg_labels)}

## functions for analyzing MFT test type:

1) preprocess_mft_data - converts string like lists to python lists, adds predicate tokens to sentences depending on the model.
2) tokenize_and_align_labels - Tokenizes the input examples and aligns argument labels and ids which is required due to   subtokenization from distilbert model.
3) get_prediction - Gets predictions from tokenized data by aggregating subtoken logits so that it matches input lenghts.
4) evaluate_predictions - calculate failure rate (failure_rate = count_of_fail_predictions / total_ARG0-4_occurances) 

In [36]:
def process_mft_data(data, baseline):
    """
    Prepares mft dataset for predictions.
    
    params:
    data: DataFrame of mft challange dataset.
    """
    data = data.copy()
    # string split for sentence
    data['sentence'] = data['sentence'].apply(lambda x: x.split())
    # convert labels and is_predicate to string format
    data['labels'] = data['labels'].apply(ast.literal_eval)
    data['is_predicate'] = data['is_predicate'].apply(ast.literal_eval)

    predicate_pos = [i.index('1') for i in data['is_predicate'].tolist()]
    # add predicate tokens for baseline or advanced models
    for idx, sent in enumerate(data['sentence']):
        if baseline:
            sent.append('[SEP]')
            sent.append(sent[predicate_pos[idx]])
            data['sentence'][idx] = sent
            
            data['labels'][idx].append('O')
            data['labels'][idx].append('O')
        else:
            sent.insert(predicate_pos[idx], '[PREDICATE]')
            data['sentence'][idx] = sent
            data['labels'][idx].insert(predicate_pos[idx], 'O')

    # map labels to integers corresponding to those in training
    data['labels_mapped'] = data['labels'].apply(lambda x: [label2id[label] for label in x])
    
    return data

In [37]:
def tokenize_and_align_labels(data, sentence_ref, mapping_ref):
    """
    Tokenizes the input examples and aligns argument labels and ids.

    Parameters:
    data: DataFrame containing tokens, sentence IDs, and argument labels/ids.
    sentence_ref: sentences containing column name
    mapping_ref: label_mapping containing column name

    Returns:
    list: A list of new examples with tokenized inputs and aligned labels.
    """
    sentence_lists = data[sentence_ref].tolist()
    sentence_ids = data['ID'].tolist()

    # Tokenize sentences:
    tokenized_inputs = tokenizer(sentence_lists, truncation=True, is_split_into_words=True)

    aligned_examples = []
    
    for i,  arg_label in enumerate(data[mapping_ref]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        arg_ids = []
        labels = []
        for word_idx in word_ids:
            if word_idx is None: # set arg id and label to -100 for first and last special tokens
                arg_ids.append(-100)
                labels.append(-100)
            elif word_idx != previous_word_idx:
                labels.append(arg_label[word_idx])
            else:
                labels.append(arg_label[word_idx])

            previous_word_idx = word_idx

        
        aligned_examples.append({
            'sentence_id': sentence_ids[i],
            'sentence': sentence_lists[i],
            'word_ids': word_ids,
            'input_ids': tokenized_inputs['input_ids'][i],
            'attention_mask': tokenized_inputs['attention_mask'][i],
            'labels': labels,
        })
        

    return aligned_examples

In [38]:
def get_prediction(tokenized_examples):
    '''
    Gets predictions from tokenized data by aggregating subtoken logits so that
    it matches input lenghts.
    
    params:
    tokenized_examples: dict containing tokenizer outpus and labels
    
    returns:
    predicted_labels: list of predicted labels mapped back to propbank annotations.
    '''
    predicted_labels = []
    for example in tokenized_examples:
        
        input_ids = torch.tensor(example['input_ids']).unsqueeze(0)
        attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # subtoken aggregation is done by summing logits of given word subtokens and taking argmax.
        aggregated_logits = aggregate_subtoken_logits([example], logits.detach().numpy())[0]
        aggregated_predictions = np.argmax(aggregated_logits, axis=1)
        # mapping arg ids back to propbank labels
        pred_labels = [id2label[label_id] for label_id in aggregated_predictions]
        predicted_labels.append(pred_labels)
    return predicted_labels

In [39]:
def evaluate_predictions(predicted_labels, gold_labels):
    """
    Compute failure rate by counting incorrect predictions for ARG1-5 labels (ARGMs are not scope of challange data).
    failure_rate = count_of_fail_predictions / total_ARG0-4_occurance
    
    params:
    predicted_labels: list of predicted labels.
    gold_labels: list of gold labels.
    
    returns: 
    failure_rate: float value.
    """
    total_gold_labels = 0
    fail_prediction = 0
    for i in range(len(predicted_labels)):
        
        for pred, label in zip(predicted_labels[i], gold_labels[i]):
            if label in ['ARG0', 'ARG1', 'ARG2', 'ARG3', 'ARG4']:
                total_gold_labels += 1
                if label != pred:
                    fail_prediction += 1
    failure_rate = fail_prediction/total_gold_labels
    return total_gold_labels, fail_prediction,failure_rate

# Conducting tests for MFT dataset.
tested capabilities with minimal functionality test type dataset:
1) Polysemy
2) Passive voice comprehension
3) Robustness

In [40]:
challange_mft_df = pd.read_csv('../Data/challangedataset_mft.csv', encoding='utf-8')
challange_mft_df.head()

Unnamed: 0,ID,sentence,labels,capability,test_type,broad_capability,is_predicate,target_predicate,target_arguments
0,passive_voice_mrt_1,The dog was adopted by family .,"['ARG1', 'ARG1', 'O', 'O', 'O', 'ARG0', 'O']",passive_voice,mft,argument_alternation,"['0', '0', '0', '1', '0', '0', '0']",token 3,"tokens 0, 1, 5"
1,passive_voice_mrt_2,All the cookies have been eaten by children .,"['ARG1', 'ARG1', 'ARG1', 'O', 'O', 'O', 'O', '...",passive_voice,mft,argument_alternation,"['0', '0', '0', '0', '1', '0', '0', '0']",token4,"tokens 0, 1, 6, 7"
2,passive_voice_mrt_3,A novel was being written by the author .,"['ARG1', 'ARG1', 'O', 'O', 'O', 'O', 'ARG0', '...",passive_voice,mft,argument_alternation,"['0', '0', '0', '1', '0', '0', '0', '0', '0', ...",token 3,"tokens 0, 1, 6, 7"
3,passive_voice_mrt_4,The song was sung by the choir with great emot...,"['ARG1', 'ARG1', 'O', 'O', 'O', 'ARG0', 'ARG0'...",passive_voice,mft,argument_alternation,"['0', '0', '0', '1', '0', '0', '0', '0', '0', ...",token 3,"tokens 0, 1, 5, 6"
4,passive_voice_mrt_5,The project was completed by the group ahead o...,"['ARG1', 'ARG1', 'O', 'O', 'O', 'ARG0', 'ARG0'...",passive_voice,mft,argument_alternation,"['0', '0', '0', '1', '0', '0', '0', '0', '0', ...",token 3,"tokens 0, 1, 5, 6"


In [41]:
mft_df = process_mft_data(challange_mft_df, baseline=BASELINE)

In [42]:
# tokenize examples
tokenized_examples_mft = tokenize_and_align_labels(mft_df,'sentence','labels_mapped')
# get predictions
predicted_labels = get_prediction(tokenized_examples_mft)
# append prediction to mft_df
mft_df['predicted_labels'] = predicted_labels
# unique test names
unique_tests = mft_df['capability'].unique()

failure_rates = []
target_totals = []
fail_counts = []
# looping over test names
for capability in unique_tests:
    # row indexes of distinct test
    test_indexes = mft_df[mft_df['capability'] == capability].index
    # get metrics from evaluation
    total_gold_labels, total_fails, failure_rate = evaluate_predictions(predicted_labels[test_indexes[0]:test_indexes[-1]],
                                        mft_df['labels'].tolist()[test_indexes[0]:test_indexes[-1]])
    failure_rates.append(failure_rate), 
    target_totals.append(total_gold_labels)
    fail_counts.append(total_fails)
# saving results to dict
results = {'capabilities': unique_tests,
          'failure_rate': failure_rates,
           'total_targets': target_totals,
           'fail_count': fail_counts,
           'test_types': ['MFT','MFT','MFT']}

In [43]:
pd.DataFrame(results)

Unnamed: 0,capabilities,failure_rate,total_targets,fail_count,test_types
0,passive_voice,0.611111,36,22,MFT
1,polysemy,0.717949,39,28,MFT
2,robustness,0.4,15,6,MFT


In [44]:
if BASELINE:
    # saving challange_mft data with label predictions by baseline model
    # NOTE the sentences here include predicate tokens seperated by "[SEP] predicate_word" at the end of sequence
    mft_df.to_csv('../evaluation results/challange_mft_with_predictions_baseline_model.csv')
    # saving results to this file path
    results_df = pd.DataFrame(results)
    results_df.to_csv('../evaluation results/mft_test_results_baseline_model.csv')

else:
    # saving challange_mft data with label predictions by baseline model
    # NOTE the predicate words in the sentences have special token [PREDICATE] appended before it.
    mft_df.to_csv('../evaluation results/challange_mft_with_predictions_advanced_model.csv')
    # saving results to this file path
    results_df = pd.DataFrame(results)
    results_df.to_csv(f'../evaluation results/mft_test_results_advanced_model.csv')


## functions for analyzing INV test type:


In [45]:
def process_inv_data(data, baseline):
    """
    Prepares challange_inv dataset for tokenization.
    
    params:
    data: DataFrame of mft challange dataset.
    baseline: bool specifying baseline or else advanced model.
    
    returns:
    data: processed DataFrame
    """
    data = data.copy()
    # string split for sentence
    data['sentence1'] = data['sentence1'].apply(lambda x: x.split())
    data['sentence2'] = data['sentence2'].apply(lambda x: x.split())
    # convert labels and is_predicate to string format
    data['labels1'] = data['labels1'].apply(ast.literal_eval)
    data['labels2'] = data['labels2'].apply(ast.literal_eval)

    data['is_predicate1'] = data['is_predicate1'].apply(ast.literal_eval)
    data['is_predicate2'] = data['is_predicate2'].apply(ast.literal_eval)
    
    data['target_tokens2'] = data['target_tokens2'].apply(ast.literal_eval)
    data['target_tokens2'] = data['target_tokens2'].apply(lambda x: [int(i) for i in x])
    data['expected_prediction'] = data['expected_prediction'].apply(ast.literal_eval)


    predicate_pos1 = [i.index('1') for i in data['is_predicate1'].tolist()]
    predicate_pos2 = [i.index('1') for i in data['is_predicate2'].tolist()]
    # add predicate tokens for baseline or advanced models
    for idx, (sent1, sent2) in enumerate(zip(data['sentence1'], data['sentence2'])):
        if baseline:
            sent1.append('[SEP]')
            sent1.append(sent1[predicate_pos1[idx]])
            data['sentence1'][idx] = sent1
            
            data['labels1'][idx].append('O')
            data['labels1'][idx].append('O')
                                         
            sent2.append('[SEP]')
            sent2.append(sent2[predicate_pos2[idx]])
            data['sentence2'][idx] = sent2
            
            data['labels2'][idx].append('O')
            data['labels2'][idx].append('O')
        else:
            sent1.insert(predicate_pos1[idx], '[PREDICATE]')
            data['sentence1'][idx] = sent1
            data['labels1'][idx].insert(predicate_pos1[idx], 'O')
                                         
            sent2.insert(predicate_pos2[idx], '[PREDICATE]')
            data['sentence2'][idx] = sent2
            data['labels2'][idx].insert(predicate_pos2[idx], 'O')

    # map labels to integers corresponding to those in training
    data['labels_mapped1'] = data['labels1'].apply(lambda x: [label2id[label] for label in x])
    data['labels_mapped2'] = data['labels2'].apply(lambda x: [label2id[label] for label in x])
    
    return data

In [46]:
def evaluate_invariance(predictions, target_tokens, expected_predictions):
    """
    Evaluate invariance examples by checking if predictions match expected labels
    after introducing variance.
    
    params:
    predictions: list containing sequence prediction labels.
    target_tokens: array-like object containing target token indexes.
    expected_prediction: array-like object containing expected prediction labels.

    returns:
    total_targets: Total number of targets.
    count_fails: Number of fail predictions.
    failure_rate: float percentage of failure rate. 
    """
    
    total_targets = 0
    count_fails = 0

    for idx, pred in enumerate(predictions):
        current_targets = target_tokens[idx]
        current_gold = expected_predictions[idx]
        total_targets+=len(current_gold)
        for target_index, target_label in zip(current_targets, current_gold):
            
            if pred[target_index] != target_label:
                count_fails+=1
    failure_rate = count_fails/total_targets

    return total_targets, count_fails, failure_rate

# Conducting tests for INV dataset.
tested capabilities with Invariance test type dataset:
1) Verbal phrase ellipsis
2) Noun phrase ellipsis

In [47]:
challange_inv_df = pd.read_csv('../Data/challangedataset_inv.csv', encoding='utf-8')
challange_inv_df.head()

Unnamed: 0,ID,sentence1,sentence2,labels1,labels2,capability,test_type,is_predicate1,is_predicate2,target_tokens2,expected_prediction
0,verb_ellipsis_1,"John can play the guitar , and Mary can play t...","John can play the guitar , and Mary can too .","['ARG0', 'O', 'O', 'ARG1', 'ARG1', 'O' ,'O', '...","['ARG0', 'O', 'O', 'ARG1', 'ARG1', 'O', 'O', '...",verb_ellipsis,inv,"['0', '0', '0', '0', '0', '0', '0', '0', '0', ...","['0', '0', '1', '0', '0', '0', '0', '0', '0', ...",['7'],['ARG0']
1,verb_ellipsis_2,"They have been to Italy , and we have been to ...","They have been to Italy , and we have as well .","['ARG0', 'O', 'O', 'ARG1', 'ARG1', 'O', 'O', '...","['ARG0', 'O', 'O', 'ARG1', 'ARG1', 'O', 'O', '...",verb_ellipsis,inv,"['0', '0', '0', '0', '0', '0', '0', '0', '0', ...","['0', '0', '1', '0', '0', '0', '0', '0', '0', ...",['7'],['ARG0']
2,verb_ellipsis_3,"We could see the movie tonight , or we could s...","We could see the movie tonight , or we could t...","['ARG0', 'O', 'O', 'ARG1', 'ARG1', 'O', 'O', '...","['ARG0', 'O', 'O', 'ARG1', 'ARG1', 'O', 'O', '...",verb_ellipsis,inv,"['0', '0', '0', '0', '0', '0', '0', '0', '0', ...","['0', '0', '1', '0', '0', '0', '0', '0', '0', ...",['8'],['ARG0']
3,verb_ellipsis_4,"You should call your mom more often , and you ...","You should call your mom more often, and your ...","['ARG0', 'O', 'O', 'ARG1', 'ARG1', 'O', 'O', '...","['ARG0', 'O', 'O', 'ARG1', 'ARG1', 'O', 'O', '...",verb_ellipsis,inv,"['0', '0', '0', '0', '0', '0', '0', '0', '0', ...","['0', '0', '1', '0', '0', '0', '0', '0', '0', ...","['9', '10']","['ARG1', 'ARG1']"
4,verb_ellipsis_5,"I used to play the piano , and my brother used...","I used to play the piano , and my brother the ...","['ARG0', 'O', 'O', 'O', 'ARG1', 'ARG1', 'O', '...","['ARG0', 'O', 'O', 'O', 'ARG1', 'ARG1', 'O', '...",verb_ellipsis,inv,"['0', '0', '0', '0', '0', '0', '0', '0', '0', ...","['0', '0', '0', '1', '0', '0', '0', '0', '0', ...","['7', '8' , '9', '10']","['ARG0', 'ARG0', 'ARG1','ARG1']"


In [52]:
inv_df = process_inv_data(challange_inv_df, baseline=BASELINE)
inv_df.head()

Unnamed: 0,ID,sentence1,sentence2,labels1,labels2,capability,test_type,is_predicate1,is_predicate2,target_tokens2,expected_prediction,labels_mapped1,labels_mapped2
0,verb_ellipsis_1,"[John, can, play, the, guitar, ,, and, Mary, c...","[John, can, [PREDICATE], play, the, guitar, ,,...","[ARG0, O, O, ARG1, ARG1, O, O, ARG0, O, O, O, ...","[ARG0, O, O, O, ARG1, ARG1, O, O, ARG0, O, O, O]",verb_ellipsis,inv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",[7],[ARG0],"[22, 7, 7, 20, 20, 7, 7, 22, 7, 7, 7, 7, 7]","[22, 7, 7, 7, 20, 20, 7, 7, 22, 7, 7, 7]"
1,verb_ellipsis_2,"[They, have, been, to, Italy, ,, and, we, have...","[They, have, [PREDICATE], been, to, Italy, ,, ...","[ARG0, O, O, ARG1, ARG1, O, O, ARG0, O, O, O, ...","[ARG0, O, O, O, ARG1, ARG1, O, O, ARG0, O, O, ...",verb_ellipsis,inv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[7],[ARG0],"[22, 7, 7, 20, 20, 7, 7, 22, 7, 7, 7, 20, 20, ...","[22, 7, 7, 7, 20, 20, 7, 7, 22, 7, 7, 7, 7]"
2,verb_ellipsis_3,"[We, could, see, the, movie, tonight, ,, or, w...","[We, could, [PREDICATE], see, the, movie, toni...","[ARG0, O, O, ARG1, ARG1, O, O, O, ARG0, O, O, ...","[ARG0, O, O, O, ARG1, ARG1, O, O, O, ARG0, O, ...",verb_ellipsis,inv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[8],[ARG0],"[22, 7, 7, 20, 20, 7, 7, 7, 22, 7, 7, 7, 20, 2...","[22, 7, 7, 7, 20, 20, 7, 7, 7, 22, 7, 7, 7]"
3,verb_ellipsis_4,"[You, should, call, your, mom, more, often, ,,...","[You, should, [PREDICATE], call, your, mom, mo...","[ARG0, O, O, ARG1, ARG1, O, O, O, O, ARG0, O, ...","[ARG0, O, O, O, ARG1, ARG1, O, O, O, O, ARG1, ...",verb_ellipsis,inv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 10]","[ARG1, ARG1]","[22, 7, 7, 20, 20, 7, 7, 7, 7, 22, 7, 7, 7, 20...","[22, 7, 7, 7, 20, 20, 7, 7, 7, 7, 20, 20, 7, 7]"
4,verb_ellipsis_5,"[I, used, to, play, the, piano, ,, and, my, br...","[I, used, to, [PREDICATE], play, the, piano, ,...","[ARG0, O, O, O, ARG1, ARG1, O, O, ARG0, ARG0, ...","[ARG0, O, O, O, O, ARG1, ARG1, O, O, ARG0, ARG...",verb_ellipsis,inv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[7, 8, 9, 10]","[ARG0, ARG0, ARG1, ARG1]","[22, 7, 7, 7, 20, 20, 7, 7, 22, 22, 7, 7, 7, 7...","[22, 7, 7, 7, 7, 20, 20, 7, 7, 22, 22, 20, 20, 7]"


In [49]:
# tokenized sentence2 column which contain ellipsis examples
tokenized_examples2_inv = tokenize_and_align_labels(inv_df, 'sentence2', 'labels_mapped2')
# get predictions for sentence2 data
predicted_labels2 = get_prediction(tokenized_examples2_inv)
# unique tests
unique_tests = inv_df['capability'].unique()
# target tokens and expected predictions
target_tokens = inv_df['target_tokens2'].tolist()
expected_predictions = inv_df['expected_prediction'].tolist()

# placeholders
failure_rates = []
target_totals = []
fail_counts = []

results = {}
# loop over test names to get results per distinct test
for capability in unique_tests:
    test_results = {}
    test_indexes = inv_df[inv_df['capability'] == capability].index
    # getting evaluation metrics
    total_gold_labels, total_fails, failure_rate = evaluate_invariance(predicted_labels2[test_indexes[0]:test_indexes[-1]],
                                                                 target_tokens[test_indexes[0]:test_indexes[-1]],
                                                                 expected_predictions[test_indexes[0]:test_indexes[-1]])
    
    failure_rates.append(failure_rate), 
    target_totals.append(total_gold_labels)
    fail_counts.append(total_fails)
# save results
results = {'capabilities': unique_tests,
          'failure_rate': failure_rates,
           'total_targets': target_totals,
           'fail_count': fail_counts,
           'test_types': ['INV', 'INV']}

In [50]:
results

{'capabilities': array(['verb_ellipsis', 'noun_ellipsis'], dtype=object),
 'failure_rate': [1.0, 0.5555555555555556],
 'total_targets': [5, 9],
 'fail_count': [5, 5],
 'test_types': ['INV', 'INV']}

In [51]:
if BASELINE:
    # saving challange_inv data with label predictions by baseline model
    # NOTE the sentences here include predicate tokens seperated by "[SEP] predicate_word" at the end of sequence
    inv_df.to_csv('../evaluation results/challange_inv_with_predictions_baseline_model.csv')
    # saving results to this file path
    results_df = pd.DataFrame(results)
    results_df.to_csv('../evaluation results/inv_test_results_baseline_model.csv')

else:
    # saving challange_inv data with label predictions by baseline model
    # NOTE the predicate words in the sentences have special token [PREDICATE] appended before it.
    inv_df.to_csv('../evaluation results/challange_inv_with_predictions_advanced_model.csv')
    # saving results to this file path
    results_df = pd.DataFrame(results)
    results_df.to_csv('../evaluation results/inv_test_results_advanced_model.csv')
