# Create and prepare the dataset for fine-tuning.

In [1]:
import os, torch, ast
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, RobertaForSequenceClassification

In [2]:
PATH = '/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/'
CACHE_DIR = PATH + 'cache/'

## Load all the 6 data splits. Merge pap_train and pep_train, pap_dev and pep_dev, as well as pap_test and pep_test, respectively.

In [3]:
dataframes = {}
for root, dirs, files in os.walk('../input/'):
    print(root)
    print(dirs)
    print(files)
    for file in files:
        if file.endswith('.csv'):
            fn = file.replace('.csv', '')
            fp = root + file
            dataframes[fn] = pd.read_csv(fp)
    break
            
print(len(dataframes))
dataframes.keys()

../input/
['.ipynb_checkpoints', 'deprecated']
['et_pap_dev_wikidata.csv', 'et_pap_train_wikidata.csv', 'et_pap_test_wikidata.csv', 'et_pep_test_wikidata.csv', 'et_pep_train_wikidata.csv', 'et_pep_dev_wikidata.csv']
6


dict_keys(['et_pap_dev_wikidata', 'et_pap_train_wikidata', 'et_pap_test_wikidata', 'et_pep_test_wikidata', 'et_pep_train_wikidata', 'et_pep_dev_wikidata'])

### train

In [4]:
et_pap_train_wikidata = dataframes['et_pap_train_wikidata']
et_pap_train_wikidata.drop('original_label', axis=1, inplace=True)
et_pap_train_wikidata = et_pap_train_wikidata[['text', 'event_type', 'subject_types', 'object_types', 'label']]

et_pep_train_wikidata = dataframes['et_pep_train_wikidata']
et_pep_train_wikidata = et_pep_train_wikidata[['text', 'event_type', 'subject_types', 'object_types', 'label']]

train_wikidata = pd.concat([et_pap_train_wikidata, et_pep_train_wikidata])
train_wikidata

Unnamed: 0,text,event_type,subject_types,object_types,label
0,Event occurs year.,"{'sen_id': 0, 'sentence': 'Event occurs year.'...","[{'wd_qid': 'Q1656682', 'wd_label': 'event', '...","[{'wd_qid': 'Q577', 'wd_label': 'year', 'descr...",1
1,Tortoise brings limb.,"{'sen_id': 1, 'sentence': 'Tortoise brings lim...","[{'wd_qid': 'Q729', 'wd_label': 'animal', 'des...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",1
2,Headliner overpowers function.,"{'sen_id': 2, 'sentence': 'Headliner overpower...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",1
3,County receives hour.,"{'sen_id': 3, 'sentence': 'County receives hou...","[{'wd_qid': 'Q15284', 'wd_label': 'municipalit...","[{'wd_qid': 'Q25235', 'wd_label': 'hour', 'des...",0
4,Traveler acknowledges recognition.,"{'sen_id': 4, 'sentence': 'Traveler acknowledg...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q618779', 'wd_label': 'award', 'd...",1
...,...,...,...,...,...
2444,Wool clip dust.,"{'sen_id': 2444, 'sentence': 'Wool clip dust.'...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",0
2445,Rope hook pan.,"{'sen_id': 2445, 'sentence': 'Rope hook pan.',...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",1
2446,Bag contain tree.,"{'sen_id': 2446, 'sentence': 'Bag contain tree...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q756', 'wd_label': 'plant', 'desc...",0
2447,Gorilla bury leaf.,"{'sen_id': 2447, 'sentence': 'Gorilla bury lea...","[{'wd_qid': 'Q729', 'wd_label': 'animal', 'des...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",1


### dev

In [5]:
et_pap_dev_wikidata = dataframes['et_pap_dev_wikidata']
et_pap_dev_wikidata.drop('original_label', axis=1, inplace=True)
et_pap_dev_wikidata = et_pap_dev_wikidata[['text', 'event_type', 'subject_types', 'object_types', 'label']]

et_pep_dev_wikidata = dataframes['et_pep_dev_wikidata']
et_pep_dev_wikidata = et_pep_dev_wikidata[['text', 'event_type', 'subject_types', 'object_types', 'label']]

dev_wikidata = pd.concat([et_pap_dev_wikidata, et_pep_dev_wikidata])
dev_wikidata

Unnamed: 0,text,event_type,subject_types,object_types,label
0,Method seizes bacterium.,"{'sen_id': 0, 'sentence': 'Method seizes bacte...","[{'wd_qid': 'Q1799072', 'wd_label': 'method', ...","[{'wd_qid': 'Q7239', 'wd_label': 'organism', '...",0
1,Technician visits community.,"{'sen_id': 1, 'sentence': 'Technician visits c...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q16334295', 'wd_label': 'group of...",1
2,Inclusion expands range.,"{'sen_id': 2, 'sentence': 'Inclusion expands r...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q4257161', 'wd_label': 'range', '...",1
3,Pencil puts norm.,"{'sen_id': 3, 'sentence': 'Pencil puts norm.',...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q151885', 'wd_label': 'concept', ...",0
4,Solution musters team.,"{'sen_id': 4, 'sentence': 'Solution musters te...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q16334295', 'wd_label': 'group of...",1
...,...,...,...,...,...
301,Girl slide water.,"{'sen_id': 301, 'sentence': 'Girl slide water....","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q283', 'wd_label': 'water', 'desc...",0
302,Plant bury air.,"{'sen_id': 302, 'sentence': 'Plant bury air.',...","[{'wd_qid': 'Q756', 'wd_label': 'plant', 'desc...","[{'wd_qid': 'Q7391292', 'wd_label': 'air', 'de...",0
303,Hand roll plane.,"{'sen_id': 303, 'sentence': 'Hand roll plane.'...","[{'wd_qid': 'Q33767', 'wd_label': 'hand', 'des...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",0
304,Cup spill water.,"{'sen_id': 304, 'sentence': 'Cup spill water.'...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q283', 'wd_label': 'water', 'desc...",1


### test

In [6]:
et_pap_test_wikidata = dataframes['et_pap_test_wikidata']
et_pap_test_wikidata.drop('original_label', axis=1, inplace=True)
et_pap_test_wikidata = et_pap_test_wikidata[['text', 'event_type', 'subject_types', 'object_types', 'label']]

et_pep_test_wikidata = dataframes['et_pep_test_wikidata']
et_pep_test_wikidata = et_pep_test_wikidata[['text', 'event_type', 'subject_types', 'object_types', 'label']]

test_wikidata = pd.concat([et_pap_test_wikidata, et_pep_test_wikidata])
test_wikidata

Unnamed: 0,text,event_type,subject_types,object_types,label
0,Interpretation construes title.,"{'sen_id': 0, 'sentence': 'Interpretation cons...","[{'wd_qid': 'Q151885', 'wd_label': 'concept', ...","[{'wd_qid': 'Q216353', 'wd_label': 'title', 'd...",1
1,Mask sustains axis.,"{'sen_id': 1, 'sentence': 'Mask sustains axis....","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",0
2,Trader ensures strategy.,"{'sen_id': 2, 'sentence': 'Trader ensures stra...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q131841', 'wd_label': 'idea', 'de...",1
3,Animator comprises trip.,"{'sen_id': 3, 'sentence': 'Animator comprises ...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q50847295', 'wd_label': 'journey'...",1
4,Welfare constructs hundred.,"{'sen_id': 4, 'sentence': 'Welfare constructs ...","[{'wd_qid': 'Q151885', 'wd_label': 'concept', ...","[{'wd_qid': 'Q11563', 'wd_label': 'number', 'd...",0
...,...,...,...,...,...
302,Air peel bush.,"{'sen_id': 302, 'sentence': 'Air peel bush.', ...","[{'wd_qid': 'Q7391292', 'wd_label': 'air', 'de...","[{'wd_qid': 'Q756', 'wd_label': 'plant', 'desc...",0
303,Man pull ant.,"{'sen_id': 303, 'sentence': 'Man pull ant.', '...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...",0
304,Hand fasten crab.,"{'sen_id': 304, 'sentence': 'Hand fasten crab....","[{'wd_qid': 'Q33767', 'wd_label': 'hand', 'des...","[{'wd_qid': 'Q729', 'wd_label': 'animal', 'des...",1
305,Student beat man.,"{'sen_id': 305, 'sentence': 'Student beat man....","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...",1


## Load tokenizer and model.

In [7]:
tokenizer = AutoTokenizer.from_pretrained('roberta-large', cache_dir=CACHE_DIR)
model = RobertaForSequenceClassification.from_pretrained('roberta-large', cache_dir=CACHE_DIR)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
vocab = tokenizer.get_vocab()
print("Vocab size:", len(vocab))
tokenizer.special_tokens_map

Vocab size: 50265


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [9]:
special_tokens = {
    'additional_special_tokens': ['[ETYPE]',
                                  '[/ETYPE]',
                                  '[DEF]',
                                  '[/DEF]',
                                  '[EVT]',
                                  '[/EVT]',
                                 ]
}

tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

vocab = tokenizer.get_vocab()
print("Vocab size:", len(vocab))

tokenizer.special_tokens_map

Vocab size: 50271


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[ETYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[/ETYPE]',
  '[/DEF]']}

In [10]:
tokenizer.max_model_input_sizes

{'roberta-base': 512,
 'roberta-large': 512,
 'roberta-large-mnli': 512,
 'distilroberta-base': 512,
 'roberta-base-openai-detector': 512,
 'roberta-large-openai-detector': 512}

## Create dataset.

In [11]:
TEMPLATE_SENT = """[EVT]{sent}[/EVT]"""

# TEMPLATE_SUBJ_BASIC = """\nThe subject "{subj}" has type [STYPE]{stype}[/STYPE], which means [DEF]{stype_desc}[/DEF]. """

# TEMPLATE_SUBJ_EXTEND = """It can also have type [STYPE]{stype}[/STYPE], which means [DEF]{stype_desc}[/DEF]. """

# TEMPLATE_SUBJ_UNK = """\nThe subject "{subj}" has an unknown type. """

TEMPLATE_VERB = """\nThe verb "{verb}" has type [ETYPE]{etype}[/ETYPE], which means [DEF]{etype_desc}[/DEF]. """

TEMPLATE_VERB_UNK = """\nThe verb "{verb}" has an unknown type. """

# TEMPLATE_OBJ_BASIC = """\nThe object "{obj}" has type [OTYPE]{otype}[/OTYPE], which means [DEF]{otype_desc}[/DEF]. """

# TEMPLATE_OBJ_EXTEND = """It can also have type [OTYPE]{otype}[/OTYPE], which means [DEF]{otype_desc}[/DEF]. """

# TEMPLATE_OBJ_UNK = """\nThe object "{obj}" has an unknown type. """

In [12]:
# def populate_templates(sent, subj, verb, obj, stype, etype, otype, stype_desc, etype_desc, otype_desc):
#     """
#     Populate the templates using the information extracted from an item (i.e. a row) in the dataframe.
    
#     Args:
#         sent:str         The sentence expressing the event.
#         subj:str         The subject of the sentence.
#         verb:str         The verb of the sentence.
#         obj:str          The object of the sentence.
#         stype:str        The subject type name.
#         etype:str        The verb type (i.e. event type) name.
#         otype:str        The object type name.
#         stype_desc:str   The subject type description.
#         etype_desc:str   The verb type (i.e. event type) description.
#         otype_desc:str   The object type description.
        
#     Return:
#         prompt:str       A prompt constructed from the templates and the item.
#     """
    
#     prompt = TEMPLATE_SENT.replace('<sent>', sent)
#     prompt += TEMPLATE_SUBJ_BASIC

In [13]:
def populate_templates(row:pd.core.series.Series):
    
    sentence = row['text']
    svo = sentence.split(" ")
    s = svo[0]
    v = svo[1]
    o = svo[2][:-1]
    #print(f"s:{s}, v:{v}, o:{o}")
    
    # Initialize the prompt with the event sentence.
    prompt = TEMPLATE_SENT.format(sent=sentence)
    
    # Populate subject type.
#     if len(row['subject_types']) > 0:
#         for i, st in enumerate(row['subject_types']):
#             subject_type_name = st['wd_label']
#             subject_type_desc = st['description']
#             if i < 1:
#                 prompt += TEMPLATE_SUBJ_BASIC.format(subj=s, stype=subject_type_name, stype_desc=subject_type_desc)
#             else:
#                 prompt += TEMPLATE_SUBJ_EXTEND.format(subj=s, stype=subject_type_name, stype_desc=subject_type_desc)
#     else:
#         prompt += TEMPLATE_SUBJ_UNK
    
    # Populate event type.
    event_type = row['event_type']
    if len(event_type['predicted_mentions']) > 0:
        event_trigger = row['event_type']['predicted_mentions'][0]['trigger_words']
        if v == event_trigger:
            event_type = row['event_type']['predicted_mentions'][0]['event_type']
            event_type_name = event_type['name']
            event_type_desc = event_type['description']
            prompt += TEMPLATE_VERB.format(verb=v, etype=event_type_name, etype_desc=event_type_desc)
        else:
            prompt += TEMPLATE_VERB_UNK.format(verb=v)
    else:
        prompt += TEMPLATE_VERB_UNK.format(verb=v)
        
    # Populate object type.
#     if len(row['object_types']) > 0:
#         for i, ot in enumerate(row['object_types']):
#             object_type_name = ot['wd_label']
#             object_type_desc = ot['description']
#             if i < 1:
#                 prompt += TEMPLATE_OBJ_BASIC.format(obj=o, otype=object_type_name, otype_desc=object_type_desc)
#             else:
#                 prompt += TEMPLATE_OBJ_EXTEND.format(obj=o, otype=object_type_name, otype_desc=object_type_desc)
#     else:
#         prompt += TEMPLATE_OBJ_UNK
    
    return prompt

In [14]:
def create_dataset(data_df:pd.core.frame.DataFrame, tokenizer):

    prompts = []
    for i, row in tqdm(data_df.iterrows()):
        row['event_type'] = ast.literal_eval(row['event_type'])
        row['subject_types'] = ast.literal_eval(row['subject_types'])
        row['object_types'] = ast.literal_eval(row['object_types'])

        prompt = populate_templates(row)
        prompts.append(prompt)

    ids = list(range(len(prompts)))
    input_ids = []
    attention_mask = []
    labels = data_df['label'].tolist()
#     for i, prompt in tqdm(enumerate(prompts)):
#         encoded_inputs = tokenizer(
#             prompt,
#             return_tensors='pt',
#             padding=True, 
#             truncation=True, 
#             max_length=tokenizer.max_len_single_sentence, 
#             add_special_tokens=True)
# #         print(encoded_inputs)
#         input_ids.append(encoded_inputs['input_ids'].squeeze())
#         attention_mask.append(encoded_inputs['attention_mask'].squeeze())
#         ids.append(i)
    encoded_inputs = tokenizer(
        prompts,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
    )
    print(len(ids))
    print(encoded_inputs)
    print(encoded_inputs['input_ids'].shape)
    print(encoded_inputs['attention_mask'].shape)
    print(len(labels))
        
    dataset_dict = {
        'id': ids, 
        'prompt': prompts,
        'input_ids': encoded_inputs['input_ids'], 
        'attention_mask': encoded_inputs['attention_mask'], 
        'labels': labels
    }
        
    hf_dataset = Dataset.from_dict(dataset_dict)

    return hf_dataset

In [15]:
hf_dataset_train = create_dataset(train_wikidata, tokenizer)
print(len(hf_dataset_train))
hf_dataset_train

4911it [00:00, 6010.04it/s]


4911
{'input_ids': tensor([[    0, 50266, 44879,  ...,     1,     1,     1],
        [    0, 50266,   565,  ...,     1,     1,     1],
        [    0, 50266, 28873,  ...,     1,     1,     1],
        ...,
        [    0, 50266,   387,  ...,     1,     1,     1],
        [    0, 50266,   534,  ...,     1,     1,     1],
        [    0, 50266,   104,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([4911, 512])
torch.Size([4911, 512])
4911
4911


Dataset({
    features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4911
})

In [16]:
hf_dataset_dev = create_dataset(dev_wikidata, tokenizer)
print(len(hf_dataset_dev))
hf_dataset_dev

614it [00:00, 6186.85it/s]

614
{'input_ids': tensor([[    0, 50266, 47967,  ...,     1,     1,     1],
        [    0, 50266, 40529,  ...,     1,     1,     1],
        [    0, 50266,  1121,  ...,     1,     1,     1],
        ...,
        [    0, 50266, 21292,  ...,     1,     1,     1],
        [    0, 50266,   347,  ...,     1,     1,     1],
        [    0, 50266, 12645,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([614, 512])
torch.Size([614, 512])
614
614





Dataset({
    features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 614
})

In [17]:
hf_dataset_test = create_dataset(test_wikidata, tokenizer)
print(len(hf_dataset_test))
hf_dataset_test

615it [00:00, 6196.48it/s]

615
{'input_ids': tensor([[    0, 50266, 26267,  ...,     1,     1,     1],
        [    0, 50266, 47661,  ...,     1,     1,     1],
        [    0, 50266, 12667,  ...,     1,     1,     1],
        ...,
        [    0, 50266, 21292,  ...,     1,     1,     1],
        [    0, 50266, 43541,  ...,     1,     1,     1],
        [    0, 50266,   347,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([615, 512])
torch.Size([615, 512])
615
615





Dataset({
    features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 615
})

In [18]:
hf_dataset_train_dev_test = DatasetDict(
    {
        'train': hf_dataset_train,
        'dev': hf_dataset_dev,
        'test': hf_dataset_test
    }
)
hf_dataset_train_dev_test

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4911
    })
    dev: Dataset({
        features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 614
    })
    test: Dataset({
        features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 615
    })
})

In [19]:
hf_dataset_train_dev_test['dev']['prompt'][0]

'[EVT]Method seizes bacterium.[/EVT]\nThe verb "seizes" has an unknown type. '

In [20]:
hf_dataset_train_dev_test.save_to_disk('./output/dataset_5-2_3')

Saving the dataset (0/1 shards):   0%|          | 0/4911 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/614 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/615 [00:00<?, ? examples/s]

## Create pap_test and pep_test separately for evaluation.

### Pap

In [21]:
et_pap_test_wikidata

Unnamed: 0,text,event_type,subject_types,object_types,label
0,Interpretation construes title.,"{'sen_id': 0, 'sentence': 'Interpretation cons...","[{'wd_qid': 'Q151885', 'wd_label': 'concept', ...","[{'wd_qid': 'Q216353', 'wd_label': 'title', 'd...",1
1,Mask sustains axis.,"{'sen_id': 1, 'sentence': 'Mask sustains axis....","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",0
2,Trader ensures strategy.,"{'sen_id': 2, 'sentence': 'Trader ensures stra...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q131841', 'wd_label': 'idea', 'de...",1
3,Animator comprises trip.,"{'sen_id': 3, 'sentence': 'Animator comprises ...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q50847295', 'wd_label': 'journey'...",1
4,Welfare constructs hundred.,"{'sen_id': 4, 'sentence': 'Welfare constructs ...","[{'wd_qid': 'Q151885', 'wd_label': 'concept', ...","[{'wd_qid': 'Q11563', 'wd_label': 'number', 'd...",0
...,...,...,...,...,...
303,Majority stops helmet.,"{'sen_id': 303, 'sentence': 'Majority stops he...","[{'wd_qid': 'Q16334295', 'wd_label': 'group of...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",0
304,Beach picks involvement.,"{'sen_id': 304, 'sentence': 'Beach picks invol...","[{'wd_qid': 'Q40080', 'wd_label': 'beach', 'de...","[{'wd_qid': 'Q1656682', 'wd_label': 'event', '...",0
305,Book realizes size.,"{'sen_id': 305, 'sentence': 'Book realizes siz...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q322481', 'wd_label': 'size', 'de...",0
306,Landfill intersects number.,"{'sen_id': 306, 'sentence': 'Landfill intersec...","[{'wd_qid': 'Q98929991', 'wd_label': 'place', ...","[{'wd_qid': 'Q11563', 'wd_label': 'number', 'd...",0


In [22]:
hf_dataset_pap_test = create_dataset(et_pap_test_wikidata, tokenizer)
hf_dataset_pap_test

308it [00:00, 5958.15it/s]

308
{'input_ids': tensor([[    0, 50266, 26267,  ...,     1,     1,     1],
        [    0, 50266, 47661,  ...,     1,     1,     1],
        [    0, 50266, 12667,  ...,     1,     1,     1],
        ...,
        [    0, 50266, 24751,  ...,     1,     1,     1],
        [    0, 50266, 26902,  ...,     1,     1,     1],
        [    0, 50266, 44644,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([308, 512])
torch.Size([308, 512])
308





Dataset({
    features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 308
})

### Pep

In [23]:
et_pep_test_wikidata

Unnamed: 0,text,event_type,subject_types,object_types,label
0,Worm enter cave.,"{'sen_id': 0, 'sentence': 'Worm enter cave.', ...","[{'wd_qid': 'Q7239', 'wd_label': 'organism', '...","[{'wd_qid': 'Q35509', 'wd_label': 'cave', 'des...",1
1,Elephant toss cat.,"{'sen_id': 1, 'sentence': 'Elephant toss cat.'...","[{'wd_qid': 'Q729', 'wd_label': 'animal', 'des...","[{'wd_qid': 'Q729', 'wd_label': 'animal', 'des...",1
2,Beak tap purse.,"{'sen_id': 2, 'sentence': 'Beak tap purse.', '...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",1
3,Wolf push cup.,"{'sen_id': 3, 'sentence': 'Wolf push cup.', 'p...","[{'wd_qid': 'Q729', 'wd_label': 'animal', 'des...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...",1
4,Pen etch oil.,"{'sen_id': 4, 'sentence': 'Pen etch oil.', 'pr...","[{'wd_qid': 'Q35120', 'wd_label': 'entity', 'd...","[{'wd_qid': 'Q378078', 'wd_label': 'substance'...",0
...,...,...,...,...,...
302,Air peel bush.,"{'sen_id': 302, 'sentence': 'Air peel bush.', ...","[{'wd_qid': 'Q7391292', 'wd_label': 'air', 'de...","[{'wd_qid': 'Q756', 'wd_label': 'plant', 'desc...",0
303,Man pull ant.,"{'sen_id': 303, 'sentence': 'Man pull ant.', '...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...",0
304,Hand fasten crab.,"{'sen_id': 304, 'sentence': 'Hand fasten crab....","[{'wd_qid': 'Q33767', 'wd_label': 'hand', 'des...","[{'wd_qid': 'Q729', 'wd_label': 'animal', 'des...",1
305,Student beat man.,"{'sen_id': 305, 'sentence': 'Student beat man....","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...","[{'wd_qid': 'Q215627', 'wd_label': 'person', '...",1


In [24]:
hf_dataset_pep_test = create_dataset(et_pep_test_wikidata, tokenizer)
hf_dataset_pep_test

307it [00:00, 6330.20it/s]


307
{'input_ids': tensor([[    0, 50266,   771,  ...,     1,     1,     1],
        [    0, 50266, 28888,  ...,     1,     1,     1],
        [    0, 50266,  9325,  ...,     1,     1,     1],
        ...,
        [    0, 50266, 21292,  ...,     1,     1,     1],
        [    0, 50266, 43541,  ...,     1,     1,     1],
        [    0, 50266,   347,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([307, 512])
torch.Size([307, 512])
307


Dataset({
    features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 307
})

In [25]:
hf_dataset_test = DatasetDict(
    {
        'pap': hf_dataset_pap_test,
        'pep': hf_dataset_pep_test,
    }
)
hf_dataset_test

DatasetDict({
    pap: Dataset({
        features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 308
    })
    pep: Dataset({
        features: ['id', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 307
    })
})

In [26]:
hf_dataset_test.save_to_disk('./output/testsets_5-2_3')

Saving the dataset (0/1 shards):   0%|          | 0/308 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/307 [00:00<?, ? examples/s]