In [1]:
import os, torch, ast
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, RobertaForSequenceClassification

In [2]:
# Project root on the shared mount; the cache and data directories are derived from it.
PATH = '/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/'
CACHE_DIR = f'{PATH}cache/'
DATA_PATH = f'{PATH}datasets_augmented_preprocessed/'

## Load all six data splits. Merge pap_train with pep_train, pap_dev with pep_dev, and pap_test with pep_test.

In [3]:
# Read every CSV found under DATA_PATH into a dict keyed by the file name
# without its '.csv' extension.
dataframes = {}
for root, dirs, files in os.walk(DATA_PATH):
    print(root)
    print(dirs)
    print(files)
    for file in files:
        if file.endswith('.csv'):
            # Strip only the trailing extension, not every '.csv' occurrence in the name.
            fn = os.path.splitext(file)[0]
            # os.walk yields sub-directory roots without a trailing separator,
            # so join the parts instead of concatenating them.
            fp = os.path.join(root, file)
            dataframes[fn] = pd.read_csv(fp)

print(len(dataframes))
dataframes.keys()

/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/datasets_augmented_preprocessed/
[]
['pap_dev_processed_augmented.csv', 'pep_dev_processed.csv', 'pap_test_processed_augmented.csv', 'pep_train_processed.csv', 'pap_train_processed_augmented.csv', 'pep_test_processed.csv']
6


dict_keys(['pap_dev_processed_augmented', 'pep_dev_processed', 'pap_test_processed_augmented', 'pep_train_processed', 'pap_train_processed_augmented', 'pep_test_processed'])

### train

In [4]:
# Pap train
# Select the two needed columns into a new frame instead of dropping
# 'original_label' in place: the frame cached in `dataframes` stays untouched,
# so this cell can be re-run without a KeyError.
pap_train = dataframes['pap_train_processed_augmented'][['text', 'label']]

# Pep train
pep_train = dataframes['pep_train_processed'][['text', 'label']]

# Merge Pap train and Pep train
# The row labels of both sources are kept as-is, so index values repeat in the result.
train_wikidata = pd.concat([pap_train, pep_train])
train_wikidata

Unnamed: 0,text,label
0,Event occurs year.,1
1,Tortoise brings limb.,1
2,Headliner overpowers function.,1
3,County receives hour.,0
4,Traveler acknowledges recognition.,1
...,...,...
2444,Wool clip dust.,0
2445,Rope hook pan.,1
2446,Bag contain tree.,0
2447,Gorilla bury leaf.,1


### dev

In [5]:
# Pap dev
# Select the two needed columns into a new frame instead of dropping
# 'original_label' in place: the frame cached in `dataframes` stays untouched,
# so this cell can be re-run without a KeyError.
pap_dev = dataframes['pap_dev_processed_augmented'][['text', 'label']]

# Pep dev
pep_dev = dataframes['pep_dev_processed'][['text', 'label']]

# Merge Pap dev and Pep dev
# The row labels of both sources are kept as-is, so index values repeat in the result.
dev_wikidata = pd.concat([pap_dev, pep_dev])
dev_wikidata

Unnamed: 0,text,label
0,Method seizes bacterium.,0
1,Technician visits community.,1
2,Inclusion expands range.,1
3,Pencil puts norm.,0
4,Solution musters team.,1
...,...,...
301,Girl slide water.,0
302,Plant bury air.,0
303,Hand roll plane.,0
304,Cup spill water.,1


### test

In [6]:
# Pap test
# Select the two needed columns into a new frame instead of dropping
# 'original_label' in place: the frame cached in `dataframes` stays untouched,
# so this cell can be re-run without a KeyError.
pap_test = dataframes['pap_test_processed_augmented'][['text', 'label']]

# Pep test
pep_test = dataframes['pep_test_processed'][['text', 'label']]

# Merge Pap test and Pep test
# The row labels of both sources are kept as-is, so index values repeat in the result.
test_wikidata = pd.concat([pap_test, pep_test])
test_wikidata

Unnamed: 0,text,label
0,Interpretation construes title.,1
1,Mask sustains axis.,0
2,Trader ensures strategy.,1
3,Animator comprises trip.,1
4,Welfare constructs hundred.,0
...,...,...
302,Air peel bush.,0
303,Man pull ant.,0
304,Hand fasten crab.,1
305,Student beat man.,1


In [7]:
# Check the Python/NumPy type of a single label value (first row, by position).
type(test_wikidata['label'].iloc[0])

numpy.int64

## Load tokenizer and model.

In [8]:
# Load the pretrained tokenizer and sequence-classification model from the same
# checkpoint; downloaded files are cached under CACHE_DIR.
checkpoint = 'roberta-large'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=CACHE_DIR)
model = RobertaForSequenceClassification.from_pretrained(checkpoint, cache_dir=CACHE_DIR)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Inspect the tokenizer: report the vocabulary size, then display its special tokens.
vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
print("Vocab size:", vocab_size)
tokenizer.special_tokens_map

Vocab size: 50265


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [10]:
# Display the tokenizer's table of maximum input sizes per checkpoint
# (the recorded output lists 512 for 'roberta-large').
tokenizer.max_model_input_sizes

{'roberta-base': 512,
 'roberta-large': 512,
 'roberta-large-mnli': 512,
 'distilroberta-base': 512,
 'roberta-base-openai-detector': 512,
 'roberta-large-openai-detector': 512}

In [11]:
def create_dataset(data_df: pd.DataFrame, tokenizer, max_length=None, verbose=True):
    """Tokenize the ``text`` column of ``data_df`` and wrap the result in a ``Dataset``.

    Args:
        data_df: Frame with a ``text`` column (the sentences) and a ``label`` column.
        tokenizer: Callable tokenizer. It is invoked on the list of texts with
            ``return_tensors='pt'``, ``padding='max_length'``, ``truncation=True``
            and ``add_special_tokens=True``.
        max_length: Optional target length forwarded to the tokenizer. The default
            ``None`` leaves the choice to the tokenizer (the recorded runs of this
            notebook produced 512-token rows). Pass a smaller value to avoid
            padding short sentences that far.
        verbose: When true (default), print the example count, the raw encoding,
            the tensor shapes and the label count, as the function always did.

    Returns:
        A ``Dataset`` with the columns ``id`` (0..n-1 by position), ``input_ids``,
        ``attention_mask`` and ``labels``.

    Raises:
        KeyError: If ``data_df`` has no ``text`` or ``label`` column.
        ValueError: If any entry of the ``text`` column is missing (NaN/None).
    """
    texts = data_df['text']
    if texts.isna().any():
        # Fail early with a clear message instead of an opaque tokenizer error.
        raise ValueError(
            f"'text' column contains {int(texts.isna().sum())} missing value(s); "
            "drop or fill them before tokenizing."
        )

    # Positional ids: independent of the frame's index, which may contain
    # repeated labels after concatenating several sources.
    ids = list(range(len(data_df)))
    labels = data_df['label'].tolist()

    encoded_inputs = tokenizer(
        texts.tolist(),
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        max_length=max_length,
    )

    if verbose:
        print(len(ids))
        print(encoded_inputs)
        print(encoded_inputs['input_ids'].shape)
        print(encoded_inputs['attention_mask'].shape)
        print(len(labels))

    dataset_dict = {
        'id': ids,
        'input_ids': encoded_inputs['input_ids'],
        'attention_mask': encoded_inputs['attention_mask'],
        'labels': labels
    }

    return Dataset.from_dict(dataset_dict)

In [12]:
# Tokenize the merged training split, print its number of examples, then display it.
hf_dataset_train = create_dataset(data_df=train_wikidata, tokenizer=tokenizer)
print(len(hf_dataset_train))
hf_dataset_train

4911
{'input_ids': tensor([[    0, 44879, 11493,  ...,     1,     1,     1],
        [    0,   565,  2723,  ...,     1,     1,     1],
        [    0, 28873, 17656,  ...,     1,     1,     1],
        ...,
        [    0,   387,  1073,  ...,     1,     1,     1],
        [    0,   534,   368,  ...,     1,     1,     1],
        [    0,   104,   808,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([4911, 512])
torch.Size([4911, 512])
4911
4911


Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4911
})

In [13]:
# Tokenize the merged dev split, print its number of examples, then display it.
hf_dataset_dev = create_dataset(data_df=dev_wikidata, tokenizer=tokenizer)
print(len(hf_dataset_dev))
hf_dataset_dev

614
{'input_ids': tensor([[    0, 47967,   842,  ...,     1,     1,     1],
        [    0, 40529, 14932,  ...,     1,     1,     1],
        [    0,  1121, 27953,  ...,     1,     1,     1],
        ...,
        [    0, 21292,  3825,  ...,     1,     1,     1],
        [    0,   347,   658,  ...,     1,     1,     1],
        [    0, 12645, 45937,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([614, 512])
torch.Size([614, 512])
614
614


Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 614
})

In [14]:
# Tokenize the merged test split, print its number of examples, then display it.
hf_dataset_test = create_dataset(data_df=test_wikidata, tokenizer=tokenizer)
print(len(hf_dataset_test))
hf_dataset_test

615
{'input_ids': tensor([[    0, 26267, 42354,  ...,     1,     1,     1],
        [    0, 47661, 29237,  ...,     1,     1,     1],
        [    0, 12667,  7292,  ...,     1,     1,     1],
        ...,
        [    0, 21292,  1769,  ...,     1,     1,     1],
        [    0, 43541,  1451,  ...,     1,     1,     1],
        [    0,   347,   658,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([615, 512])
torch.Size([615, 512])
615
615


Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 615
})

In [15]:
# Bundle the three tokenized splits under the keys 'train', 'dev' and 'test'.
baseline_splits = {
    'train': hf_dataset_train,
    'dev': hf_dataset_dev,
    'test': hf_dataset_test,
}
hf_dataset_train_dev_test = DatasetDict(baseline_splits)
hf_dataset_train_dev_test

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4911
    })
    dev: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 614
    })
    test: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 615
    })
})

In [16]:
# Write the train/dev/test bundle to disk; the path is relative to the
# notebook's current working directory.
hf_dataset_train_dev_test.save_to_disk('./output/dataset_baseline')

Saving the dataset (0/1 shards):   0%|          | 0/4911 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/614 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/615 [00:00<?, ? examples/s]

## Create test sets for Pap and Pep individually.

In [17]:
# Display the Pap test frame ('text' and 'label' columns) before tokenizing it on its own.
pap_test

Unnamed: 0,text,label
0,Interpretation construes title.,1
1,Mask sustains axis.,0
2,Trader ensures strategy.,1
3,Animator comprises trip.,1
4,Welfare constructs hundred.,0
...,...,...
303,Majority stops helmet.,0
304,Beach picks involvement.,0
305,Book realizes size.,0
306,Landfill intersects number.,0


In [18]:
# Tokenize the Pap-only test frame and display the resulting dataset.
hf_testset_pap = create_dataset(data_df=pap_test, tokenizer=tokenizer)
hf_testset_pap

308
{'input_ids': tensor([[    0, 26267, 42354,  ...,     1,     1,     1],
        [    0, 47661, 29237,  ...,     1,     1,     1],
        [    0, 12667,  7292,  ...,     1,     1,     1],
        ...,
        [    0, 24751, 26558,  ...,     1,     1,     1],
        [    0, 26902, 29238,  ...,     1,     1,     1],
        [    0, 44644, 27995,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([308, 512])
torch.Size([308, 512])
308


Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 308
})

In [19]:
# Display the Pep test frame ('text' and 'label' columns) before tokenizing it on its own.
pep_test

Unnamed: 0,text,label
0,Worm enter cave.,1
1,Elephant toss cat.,1
2,Beak tap purse.,1
3,Wolf push cup.,1
4,Pen etch oil.,0
...,...,...
302,Air peel bush.,0
303,Man pull ant.,0
304,Hand fasten crab.,1
305,Student beat man.,1


In [20]:
# Tokenize the Pep-only test frame and display the resulting dataset.
hf_testset_pep = create_dataset(data_df=pep_test, tokenizer=tokenizer)
hf_testset_pep

307
{'input_ids': tensor([[    0,   771,  8693,  ...,     1,     1,     1],
        [    0, 28888, 44254,  ...,     1,     1,     1],
        [    0,  9325,   677,  ...,     1,     1,     1],
        ...,
        [    0, 21292,  1769,  ...,     1,     1,     1],
        [    0, 43541,  1451,  ...,     1,     1,     1],
        [    0,   347,   658,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([307, 512])
torch.Size([307, 512])
307


Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 307
})

In [21]:
# Bundle the per-source test sets under the keys 'pap' and 'pep'.
per_source_testsets = {
    'pap': hf_testset_pap,
    'pep': hf_testset_pep,
}
hf_testsets = DatasetDict(per_source_testsets)
hf_testsets

DatasetDict({
    pap: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 308
    })
    pep: Dataset({
        features: ['id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 307
    })
})

In [22]:
# Write the per-source test sets to disk; the path is relative to the
# notebook's current working directory.
hf_testsets.save_to_disk('./output/testsets_baseline')

Saving the dataset (0/1 shards):   0%|          | 0/308 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/307 [00:00<?, ? examples/s]