In [1]:
import pandas as pd
import numpy as np

df_data = pd.read_csv('../data/Friends_A_whole.tsv', sep = '\t')
df = df_data[['utterance', 'labels']]


from sklearn.model_selection import train_test_split
Uttr_train, Uttr_test, label_train, label_test = \
    train_test_split(df['utterance'], df['labels'], test_size=0.1, random_state=42, stratify=df['labels'])

Uttr_train, Uttr_valid, label_train, label_valid = \
    train_test_split(Uttr_train, label_train, test_size=0.1, random_state=42, stratify=label_train)

## 构建template

In [2]:
from openprompt.data_utils import InputExample
 

dataset = {}

for split in ['train', 'validation', 'test']:
    dataset[split] = []
    cnt = 0
    for u,l in zip(Uttr_train, label_train):
        input_sample = InputExample(text_a=u, label=int(l),guid=cnt)
        cnt += 1
        dataset[split].append(input_sample)
        
        
print(dataset['train'][0])

{
  "guid": 0,
  "label": 0,
  "meta": {},
  "text_a": "  Okay, (reading the card) Fonzy gives you two thumbs up, collect two cool points. Yeah.  Okay, come on! (blows on the dice) Daddy needs a new pair of electromagnetic microscopes for the Prehistoric Forensics Department! (They all look at him, and he shuts up and rolls the dice.) (he moves his piece) Okay. (reading a card) Take Pinky Tuscadero up to Inspiration Point, collect three cool points!! Yeah! Which gives me five, and let's see who is gonna lose their clothes. Ummmm, I think I pick our strip poker sponsor Mr. Joey Tribianni.",
  "text_b": "",
  "tgt_text": null
}



In [3]:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from openprompt.prompts import ManualTemplate
mytemplate = ManualTemplate(
    text = '{"placeholder":"text_a"} He is {"mask"}',
    tokenizer = tokenizer,
)

wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
print(wrapped_example)

[[{'text': "  Okay, (reading the card) Fonzy gives you two thumbs up, collect two cool points. Yeah.  Okay, come on! (blows on the dice) Daddy needs a new pair of electromagnetic microscopes for the Prehistoric Forensics Department! (They all look at him, and he shuts up and rolls the dice.) (he moves his piece) Okay. (reading a card) Take Pinky Tuscadero up to Inspiration Point, collect three cool points!! Yeah! Which gives me five, and let's see who is gonna lose their clothes. Ummmm, I think I pick our strip poker sponsor Mr. Joey Tribianni.", 'loss_ids': 0, 'shortenable_ids': 1}, {'text': ' He is', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}], {'guid': 0, 'label': 0}]


In [5]:
wrapped_tokenizer = WrapperClass(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer,truncate_method="head")

In [6]:
tokenized_example = wrapped_tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
print(tokenized_example)
print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))

{'input_ids': [101, 3956, 117, 113, 3455, 1103, 3621, 114, 143, 1320, 6482, 3114, 1128, 1160, 18680, 1146, 117, 7822, 1160, 4348, 1827, 119, 2814, 119, 3956, 117, 1435, 1113, 106, 113, 14977, 1113, 1103, 26104, 114, 9979, 2993, 170, 1207, 3111, 1104, 19805, 17599, 15300, 1116, 1111, 1103, 11689, 27516, 2772, 1596, 1370, 5026, 4724, 1951, 106, 113, 1220, 1155, 1440, 1120, 1140, 117, 1105, 1119, 3210, 1116, 1146, 1105, 12205, 1103, 26104, 119, 114, 113, 1119, 5279, 1117, 2727, 114, 3956, 119, 113, 3455, 170, 3621, 114, 5055, 10763, 1183, 17037, 26996, 2692, 1186, 1146, 1106, 1130, 21240, 4221, 117, 7822, 1210, 4348, 1827, 106, 106, 2814, 106, 5979, 3114, 1143, 1421, 117, 1105, 1519, 112, 188, 1267, 1150, 1110, 6100, 3857, 1147, 3459, 1124, 1110, 103, 102], 'loss_ids': [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [7]:
model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = []
    for sample in dataset[split]:
        tokenized_example = wrapped_tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False)
        model_inputs[split].append(tokenized_example)

Token indices sequence length is longer than the specified maximum sequence length for this model (578 > 512). Running this sequence through the model will result in indexing errors


In [8]:
from openprompt import PromptDataLoader

train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")
# next(iter(train_dataloader))


tokenizing: 575it [00:00, 1759.88it/s]


In [9]:
from openprompt.prompts import ManualVerbalizer
import torch

# for example the verbalizer contains multiple label words in each class
myverbalizer = ManualVerbalizer(tokenizer, num_classes=2,
                        label_words=[["disagreeable"], ["agreeable"]])

print(myverbalizer.label_words_ids)
logits = torch.randn(2,len(tokenizer)) # creating a pseudo output from the plm, and
print(myverbalizer.process_logits(logits)) # see what the verbalizer do

Parameter containing:
tensor([[[23423,  1895]],

        [[ 5340,  1895]]])
tensor([[-0.4659, -0.9877],
        [-0.2609, -1.4712]])


In [10]:
from openprompt import PromptForClassification

use_cuda = True
prompt_model = PromptForClassification(plm=plm,template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
if use_cuda:
    prompt_model=  prompt_model.cuda()

In [11]:
from transformers import  AdamW, get_linear_schedule_with_warmup
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# it's always good practice to set no decay to biase and LayerNorm parameters
optimizer_grouped_parameters = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)

for epoch in range(10):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step %100 ==1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)

# Evaluate
validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

allpreds = []
alllabels = []
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print(acc)

Epoch 0, average loss: 4.126529097557068




Epoch 0, average loss: 0.8543983506513577
Epoch 1, average loss: 0.6764234602451324
Epoch 1, average loss: 0.6893489284842622
Epoch 2, average loss: 0.6995877027511597
Epoch 2, average loss: 0.6833851664674049
Epoch 3, average loss: 0.752496600151062
Epoch 3, average loss: 0.7129354307464525
Epoch 4, average loss: 0.7239862978458405
Epoch 4, average loss: 0.6950551268516802
Epoch 5, average loss: 0.7413607835769653
Epoch 5, average loss: 0.7062824911930982
Epoch 6, average loss: 0.7496736645698547
Epoch 6, average loss: 0.6900721946183372
Epoch 7, average loss: 0.6635996699333191
Epoch 7, average loss: 0.6916192197332195
Epoch 8, average loss: 0.620771199464798
Epoch 8, average loss: 0.6913934899311439
Epoch 9, average loss: 0.5646192729473114
Epoch 9, average loss: 0.6917744246768016


tokenizing: 575it [00:00, 1748.21it/s]


0.5704347826086956
