In [1]:
from openprompt import PromptDataLoader, PromptForClassification
from openprompt.data_utils import InputExample, InputFeatures
from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate, ManualVerbalizer, ManualTemplate
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
import torch
import pandas as pd



In [2]:
plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Prepare Data

Load dataset

In [3]:
# load data
pokemon_descriptions = load_dataset('../data/dataset/', delimiter=';')
NUM_CLASSES = np.unique(pokemon_descriptions['train']['labels'])

Using custom data configuration dataset-294e9b13f49dafc6
Found cached dataset csv (C:/Users/fst/.cache/huggingface/datasets/csv/dataset-294e9b13f49dafc6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Make split

In [4]:
# train test split
split_pokemon_descriptions = pokemon_descriptions['train'].train_test_split(
    test_size=0.2, shuffle=True)

Generate InputExamples from existing dataset

In [5]:
dataset = {}
for split in ['train']:
    dataset[split] = []
    for sample in split_pokemon_descriptions[split]:
        input_example = InputExample(text_a = sample['text'], label=int(sample['labels']))
        dataset[split].append(input_example)

Create template

In [6]:
promptTemplate = ManualTemplate(
    text = '{"placeholder":"text_a"} the pokemon is {"mask"}',
    tokenizer = tokenizer,
)

wrapped_example = promptTemplate.wrap_one_example(dataset['train'][0])
print(wrapped_example)

[[{'text': 'There are several small, yellowish crystals scattered across its body and iron sand attracted by magnetism', 'loss_ids': 0, 'shortenable_ids': 1}, {'text': ' the pokemon is', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}], {'label': 74}]


Create verbalizer

In [7]:
mappings = pd.read_csv('../data/pokemon_mapping.csv')
name_to_label_dict = mappings[["name","index"]].set_index('index').to_dict()["name"]

In [8]:
promptVerbalizer = ManualVerbalizer(
    classes = NUM_CLASSES,
    label_words = name_to_label_dict,
    tokenizer = tokenizer,
)

Create dataloader

In [9]:
train_dataloader = PromptDataLoader(
  dataset=dataset["train"],
  template=promptTemplate, 
  tokenizer=tokenizer,
  tokenizer_wrapper_class=WrapperClass, 
  shuffle=True,
  truncate_method="head",
  decoder_max_length=3,
  batch_size=16,
  teacher_forcing=False,
  predict_eos_token=False,
  max_seq_length=80,
)

tokenizing: 8631it [00:07, 1191.73it/s]


Create model

In [10]:
promptModel = PromptForClassification(
    template = promptTemplate,
    plm = plm,
    verbalizer = promptVerbalizer,
    freeze_plm= False
)
promptModel= promptModel.cuda()

In [11]:
epochs = 5
no_decay = ['bias', 'layer_norm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in promptModel.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in promptModel.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = torch.optim.AdamW(params=optimizer_grouped_parameters)
loss_func = torch.nn.CrossEntropyLoss()

In [12]:
for epoch in range(epochs):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        inputs = inputs.cuda()
        logits = promptModel(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()

        if step %100 == 0:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)

Epoch 0, average loss: 7.08217716217041
Epoch 0, average loss: 6.1215168509152855
Epoch 0, average loss: 5.61420868403876
Epoch 0, average loss: 5.438236718161954
Epoch 0, average loss: 5.348947370438801
Epoch 0, average loss: 5.293366494055042
Epoch 1, average loss: 4.999392032623291
Epoch 1, average loss: 5.071452324933345
Epoch 1, average loss: 5.072045409264256
Epoch 1, average loss: 5.07404838447951
Epoch 1, average loss: 5.069258875383106
Epoch 1, average loss: 5.069009586722551
Epoch 2, average loss: 4.9311604499816895
Epoch 2, average loss: 5.05151632516691
Epoch 2, average loss: 5.057443704178084
Epoch 2, average loss: 5.057116315214341
Epoch 2, average loss: 5.059043224315691
Epoch 2, average loss: 5.059736605889783
Epoch 3, average loss: 5.115413665771484
Epoch 3, average loss: 5.047688078172136
Epoch 3, average loss: 5.048951258113728
Epoch 3, average loss: 5.045740878463188
Epoch 3, average loss: 5.046302028427695
Epoch 3, average loss: 5.045262187302946
Epoch 4, average l

In [13]:
torch.save(promptModel.state_dict(),"checkp/bert_trained_model.cp")

### Versuch 1

- True = [133, 12, 46, 44, 70, 0, 101, 134, 113, 109, 136, 38, 91, 143, 59, 110, 127, 116, 98, 80, 149, 48, 46]
- Predicted = [ 94,  78,  94,  94,  78, 105,  94,  94,  94,  94,  54,  94,  94,  78, 98,  94,  78,  94,  94, 105,  28,  78,  94]

In [14]:
dataset_test = {}
for split in ['test']:
    dataset_test[split] = []
    for sample in split_pokemon_descriptions[split]:
        input_example = InputExample(text_a = sample['text'], label=int(sample['labels']))
        dataset_test[split].append(input_example)

In [15]:
test_dataloader = PromptDataLoader(dataset=dataset_test["test"], template=promptTemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=250, decoder_max_length=3,
    batch_size=1,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

tokenizing: 2158it [00:01, 1147.08it/s]


In [16]:
allpreds = []
alllabels = []
for step, inputs in enumerate(test_dataloader):
    inputs = inputs.cuda()
    logits = promptModel(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cuda().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cuda().tolist())

acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print(acc)

0.007877664504170528
