In [1]:
import transformers
import pandas as pd
import ast
import torch
from data_utils import *
from feature_utils import *

from transformers import AutoModelForTokenClassification, AutoTokenizer



In [29]:
MODEL_NAME = '../models/distilbert-base-uncased-finetuned-advanced-argument-classification'
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [30]:
challange_mft_df = pd.read_csv('../Data/challangedataset_mft.csv', encoding='utf-8')
challange_mft_df.head()

Unnamed: 0,ID,sentence,labels,capability,test_type,broad_capability
0,passive_voice_mrt_1,The dog was adopted by family .,"['ARG1', 'ARG1', 'O', 'O', 'O', 'ARG0', 'O']",passive_voice,mft,argument_alternation
1,passive_voice_mrt_2,All the cookies have been eaten by children .,"['ARG1', 'ARG1', 'ARG1', 'O', 'O', 'O', 'O', '...",passive_voice,mft,argument_alternation
2,passive_voice_mrt_3,A novel was being written by the author .,"['ARG1', 'ARG1', 'O', 'O', 'O', 'O', 'ARG0', '...",passive_voice,mft,argument_alternation
3,passive_voice_mrt_4,The song was sung by the choir with great emot...,"['ARG1', 'ARG1', 'O', 'O', 'O', 'ARG0', 'ARG0'...",passive_voice,mft,argument_alternation
4,passive_voice_mrt_5,The project was completed by the group ahead o...,"['ARG1', 'ARG1', 'O', 'O', 'O', 'ARG0', 'ARG0'...",passive_voice,mft,argument_alternation


Argument labels mapping to IDs from training:

In [31]:
labels_mapping = {'ARGM-MOD': 0,
 'ARGM-ADJ': 1,
 'ARG1-DSP': 2,
 'O': 3,
 'ARGA': 4,
 'ARGM-DIS': 5,
 'ARGM-COM': 6,
 'ARGM-CXN': 7,
 'ARGM-ADV': 8,
 'ARGM-CAU': 9,
 'ARGM-LOC': 10,
 'ARGM-NEG': 11,
 'ARGM-PRP': 12,
 'ARGM-REC': 13,
 'ARGM-DIR': 14,
 'ARG0': 15,
 'ARG3': 16,
 'ARGM-GOL': 17,
 'ARG5': 18,
 'ARG4': 19,
 'ARGM-EXT': 20,
 'ARGM-PRD': 21,
 'ARGM-PRR': 22,
 'ARGM-LVB': 23,
 'ARGM-MNR': 24,
 'ARG1': 25,
 'ARGM-TMP': 26,
 'ARG2': 27}

In [32]:
def process_mft_data(data):
    """
    Prepares mft dataset for predictions.
    
    params:
    data: DataFrame of mft challange dataset.
    """
    # string split for sentence
    data['sentence'] = data['sentence'].apply(lambda x: x.split())
    # convert labels to string format
    data['labels'] = data['labels'].apply(ast.literal_eval)
    # map labels to integers corresponding to those in training
    data['labels_mapped'] = data['labels'].apply(lambda x: [labels_mapping[label] for label in x])
    
    return data

In [33]:
mft_df = process_mft_data(challange_mft_df)

In [34]:
mft_df.head()

Unnamed: 0,ID,sentence,labels,capability,test_type,broad_capability,labels_mapped
0,passive_voice_mrt_1,"[The, dog, was, adopted, by, family, .]","[ARG1, ARG1, O, O, O, ARG0, O]",passive_voice,mft,argument_alternation,"[25, 25, 3, 3, 3, 15, 3]"
1,passive_voice_mrt_2,"[All, the, cookies, have, been, eaten, by, chi...","[ARG1, ARG1, ARG1, O, O, O, O, ARG0, O]",passive_voice,mft,argument_alternation,"[25, 25, 25, 3, 3, 3, 3, 15, 3]"
2,passive_voice_mrt_3,"[A, novel, was, being, written, by, the, autho...","[ARG1, ARG1, O, O, O, O, ARG0, ARG0, O]",passive_voice,mft,argument_alternation,"[25, 25, 3, 3, 3, 3, 15, 15, 3]"
3,passive_voice_mrt_4,"[The, song, was, sung, by, the, choir, with, g...","[ARG1, ARG1, O, O, O, ARG0, ARG0, O, O, O, O]",passive_voice,mft,argument_alternation,"[25, 25, 3, 3, 3, 15, 15, 3, 3, 3, 3]"
4,passive_voice_mrt_5,"[The, project, was, completed, by, the, group,...","[ARG1, ARG1, O, O, O, ARG0, ARG0, O, O, O, O]",passive_voice,mft,argument_alternation,"[25, 25, 3, 3, 3, 15, 15, 3, 3, 3, 3]"


In [35]:
def tokenize_and_align_labels(data):
    """
    Tokenizes the input examples and aligns argument labels and ids.

    Parameters:
    data: DataFrame containing tokens, sentence IDs, and argument labels/ids.
    multilabel: True for argument classifcation else argument identification (binary).
    label_all_tokens: bool for labeling all tokens.

    Returns:
    list: A list of new examples with tokenized inputs and aligned labels.
    """
    sentence_lists = data['sentence'].tolist()
    sentence_ids = data['ID'].tolist()

    # Tokenize sentences:
    tokenized_inputs = tokenizer(sentence_lists, truncation=True, is_split_into_words=True)

    aligned_examples = []

    for i,  arg_label in enumerate(data['labels_mapped']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        arg_ids = []
        labels = []
        for word_idx in word_ids:
            if word_idx is None: # set arg id and label to -100 for first and last special tokens
                arg_ids.append(-100)
                labels.append(-100)
            elif word_idx != previous_word_idx:
                labels.append(arg_label[word_idx])
            else:
                labels.append(arg_label[word_idx])

            previous_word_idx = word_idx

        
        aligned_examples.append({
            'sentence_id': sentence_ids[i],
            'sentence': sentence_lists[i],
            'word_ids': word_ids,
            'input_ids': tokenized_inputs['input_ids'][i],
            'attention_mask': tokenized_inputs['attention_mask'][i],
            'labels': labels,
        })
        

    return aligned_examples

In [36]:
tokenized_examples_mft = tokenize_and_align_labels(mft_df)

In [37]:
tokenized_examples_mft[4]['sentence']

['The',
 'project',
 'was',
 'completed',
 'by',
 'the',
 'group',
 'ahead',
 'of',
 'schedule',
 '.']

In [38]:
tokenized_examples_mft[4]['labels']

[-100, 25, 25, 3, 3, 3, 15, 15, 3, 3, 3, 3, -100]

In [43]:
inputs = tokenizer(tokenized_examples_mft[15]['sentence'], return_tensors="pt")
outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=2)
#predictions = [id2label[prediction] for prediction in predictions[0].tolist()]
predictions

tensor([[7, 7, 7],
        [7, 7, 7],
        [7, 7, 7],
        [7, 7, 7],
        [7, 7, 7],
        [7, 7, 7],
        [7, 7, 7]])

In [13]:
len(tokenized_examples_mft[4]['sentence'])

11

In [14]:
len(predictions)

11

In [25]:
predicted_labels = []

example = tokenized_examples_mft[0]
input_ids = torch.tensor(example['input_ids']).unsqueeze(0)
attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0)
outputs = model(input_ids, attention_mask=attention_mask)
logits = outputs.logits
aggregated_logits = aggregate_subtoken_logits([example], logits.detach().numpy())[0]
aggregated_predictions = np.argmax(aggregated_logits, axis=1)
#pred_labels = [id2label[label_id] for label_id in aggregated_predictions]
#predicted_labels.append(pred_labels)


In [26]:
len(tokenized_examples_mft[0])

6

In [27]:
aggregated_predictions

array([16, 16, 16, 16, 16, 16, 16])

In [19]:
tokenized_examples_mft[0]

{'sentence_id': 'passive_voice_mrt_1',
 'sentence': ['The', 'dog', 'was', 'adopted', 'by', 'family', '.'],
 'word_ids': [None, 0, 1, 2, 3, 4, 5, 6, None],
 'input_ids': [101, 1996, 3899, 2001, 4233, 2011, 2155, 1012, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 25, 25, 3, 3, 3, 15, 3, -100]}