In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '3'


from openprompt.data_utils import InputExample


import json
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, Features, ClassLabel, Value

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainerCallback
import evaluate
from datasets import load_metric
import numpy as np
import time




SEED = 42

df_data = pd.read_csv( '../new_data/CPED_A_with_role_1.tsv', sep = '\t')
df_data.head()



Unnamed: 0,sent,labels,emotions,dialog_state,origin_sent,personality_description,affective_prompt
0,Speaker : what a coincidence; Speaker : the ca...,0,"['neutral', 'neutral', 'neutral', 'neutral']","[1, 1, 0, 1]",Speaker : what a coincidence; Speaker : the ca...,"Speaker is friendly, cooperative, empathetic, ...","The emotion of Speaker is initially neutral, S..."
1,Speaker : right; Others : Your kids go to scho...,1,"['neutral', 'neutral', 'neutral']","[1, 0, 1]",Speaker : right; Others : Your kids go to scho...,"Speaker is friendly, cooperative, empathetic, ...","The emotion of Speaker is initially neutral, t..."
2,Speaker : What a coincidence my son is here; S...,0,"['neutral', 'neutral', 'neutral', 'neutral']","[1, 1, 0, 1]",Speaker : What a coincidence my son is here; S...,"Speaker is friendly, cooperative, empathetic, ...","The emotion of Speaker is initially neutral, S..."
3,Others : class three; Speaker : My son is also...,0,"['neutral', 'neutral', 'neutral', 'neutral', '...","[0, 1, 1, 1, 0]",Others : class three; Speaker : My son is also...,"Speaker is friendly, cooperative, empathetic, ...","First, the emotion of others is neutral, Speak..."
4,"Speaker : Ji Yangyang, it seems that I heard F...",0,"['neutral', 'neutral', 'neutral', 'neutral', '...","[1, 1, 1, 0, 1]","Speaker : Ji Yangyang, it seems that I heard F...","Speaker is friendly, cooperative, empathetic, ...","The emotion of Speaker is initially neutral, S..."


In [2]:
def load_data(tsv_file):
    df = pd.read_csv(tsv_file, sep='\t')
    df['sent_affective'] = df['origin_sent'] + ' ' + df['affective_prompt']
    data_path = '../data/tmp.jsonl'
    json_data = df[['sent_affective', 'personality_description', 'labels']].to_dict(orient="records")
    with open(data_path, 'w') as outfile:
        for row in json_data:
            json.dump(row, outfile)
            outfile.write('\n')

    class_names = ['no', 'yes']
    features = Features({'sent_affective': Value('string'), 'personality_description': Value('string'), 'labels': ClassLabel(names=class_names)})
    dataset_dict = load_dataset("json", data_files=data_path, features=features)

    tmp_dict = dataset_dict['train'].train_test_split(test_size=0.2, shuffle=True, seed=SEED)
    train_dataset, remaining_dataset = tmp_dict['train'], tmp_dict['test']
    tmp_dict = remaining_dataset.train_test_split(test_size=0.5, shuffle=True, seed=SEED)
    valid_dataset, test_dataset = tmp_dict['train'], tmp_dict['test']
    dataset_dict = DatasetDict({
        'train': train_dataset,
        'validation': valid_dataset,
        'test': test_dataset
    })
    return dataset_dict

tsv_file = '../new_data/CPED_A_with_role_1.tsv'
dataset_dict = load_data(tsv_file)
dataset_dict

Downloading and preparing dataset json/default to /home/wenzhy/.cache/huggingface/datasets/json/default-c99757b34ec36bff/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/wenzhy/.cache/huggingface/datasets/json/default-c99757b34ec36bff/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sent_affective', 'personality_description', 'labels'],
        num_rows: 10611
    })
    validation: Dataset({
        features: ['sent_affective', 'personality_description', 'labels'],
        num_rows: 1326
    })
    test: Dataset({
        features: ['sent_affective', 'personality_description', 'labels'],
        num_rows: 1327
    })
})

In [3]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['sent_affective', 'personality_description', 'labels'],
        num_rows: 10611
    })
    validation: Dataset({
        features: ['sent_affective', 'personality_description', 'labels'],
        num_rows: 1326
    })
    test: Dataset({
        features: ['sent_affective', 'personality_description', 'labels'],
        num_rows: 1327
    })
})

In [4]:
dataset = {}

for split in ['train', 'validation', 'test']:
    dataset[split] = []
    cnt = 0
    for data in dataset_dict[split]:
        input_example = InputExample(text_a = data['sent_affective'], text_b = data['personality_description'], label=int(data['labels']), guid=cnt)
        dataset[split].append(input_example)
        cnt += 1
print(dataset['train'][0])

{
  "guid": 0,
  "label": 0,
  "meta": {},
  "text_a": "Speaker : you take this and sort it out; Others : what is this; Speaker : recording evidence; Speaker : Provided by Lou Xuan; Others : Lou Xuan; Speaker : That's Miss Xuan; Others : he is willing to testify; Speaker : Duancongxin found that it contained all his phone calls with Liu Jun in the past three years.; Others : he recorded it all; Speaker : he has the habit of recording; Others : such a terrible habit;  The emotion of Speaker is initially neutral, the emotion of others is neutral, Speaker respond with neutral, Speaker respond with neutral, the emotion of others is neutral, Speaker respond with neutral, the emotion of others is happy, Speaker respond with neutral, the emotion of others is happy, Speaker respond with neutral, the emotion of others is fear, ",
  "text_b": "Speaker is friendly, cooperative, empathetic, and compassionate, often prioritizing harmonious relationships and the well-being of others.",
  "tgt_text":

In [5]:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")

from openprompt.prompts import ManualTemplate
template_text = '{"placeholder":"text_a"} Question: {"placeholder":"text_b"}? Is it correct? {"mask"}.'
mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)

# To better understand how does the template wrap the example, we visualize one instance.

wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
print(wrapped_example)

[[{'text': "Speaker : you take this and sort it out; Others : what is this; Speaker : recording evidence; Speaker : Provided by Lou Xuan; Others : Lou Xuan; Speaker : That's Miss Xuan; Others : he is willing to testify; Speaker : Duancongxin found that it contained all his phone calls with Liu Jun in the past three years.; Others : he recorded it all; Speaker : he has the habit of recording; Others : such a terrible habit;  The emotion of Speaker is initially neutral, the emotion of others is neutral, Speaker respond with neutral, Speaker respond with neutral, the emotion of others is neutral, Speaker respond with neutral, the emotion of others is happy, Speaker respond with neutral, the emotion of others is happy, Speaker respond with neutral, the emotion of others is fear, ", 'loss_ids': 0, 'shortenable_ids': 1}, {'text': ' Question:', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': ' Speaker is friendly, cooperative, empathetic, and compassionate, often prioritizing harmonious relati

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
from openprompt.plms import T5TokenizerWrapper
wrapped_t5tokenizer= T5TokenizerWrapper(max_seq_length=256, decoder_max_length=3, tokenizer=tokenizer,truncate_method="head")

# You can see what a tokenized example looks like by
tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
print(tokenized_example)
print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))
print(tokenizer.convert_ids_to_tokens(tokenized_example['decoder_input_ids']))

{'input_ids': [16778, 3, 10, 25, 240, 48, 11, 1843, 34, 91, 117, 14818, 3, 10, 125, 19, 48, 117, 16778, 3, 10, 5592, 2084, 117, 16778, 3, 10, 7740, 26, 57, 11884, 3, 4, 76, 152, 117, 14818, 3, 10, 11884, 3, 4, 76, 152, 117, 16778, 3, 10, 466, 31, 7, 5964, 3, 4, 76, 152, 117, 14818, 3, 10, 3, 88, 19, 4403, 12, 794, 4921, 117, 16778, 3, 10, 970, 152, 1018, 122, 226, 77, 435, 24, 34, 6966, 66, 112, 951, 3088, 28, 1414, 76, 10745, 16, 8, 657, 386, 203, 5, 117, 14818, 3, 10, 3, 88, 4381, 34, 66, 117, 16778, 3, 10, 3, 88, 65, 8, 7386, 13, 5592, 117, 14818, 3, 10, 224, 3, 9, 9412, 7386, 117, 37, 13868, 13, 16778, 19, 7513, 7163, 6, 8, 13868, 13, 717, 19, 7163, 6, 16778, 3531, 28, 7163, 6, 16778, 3531, 28, 7163, 6, 8, 13868, 13, 717, 19, 7163, 6, 16778, 3531, 28, 7163, 6, 8, 13868, 13, 717, 19, 1095, 6, 16778, 3531, 28, 7163, 6, 8, 13868, 13, 717, 19, 1095, 6, 16778, 3531, 28, 7163, 6, 8, 13868, 13, 717, 19, 2971, 6, 11860, 10, 16778, 19, 2609, 6, 20270, 6, 3, 15, 51, 27826, 6, 11, 21801, 6, 5

In [7]:
model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = []
    for sample in dataset[split]:
        tokenized_example = wrapped_t5tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False)
        model_inputs[split].append(tokenized_example)

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


In [8]:
from openprompt import PromptDataLoader

train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3,
    batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")
# next(iter(train_dataloader))


tokenizing: 10611it [00:16, 631.89it/s]


In [9]:
from openprompt.prompts import ManualVerbalizer
import torch

# for example the verbalizer contains multiple label words in each class
myverbalizer = ManualVerbalizer(tokenizer, num_classes=2,
                        label_words=[["no"], ["yes"]])

print(myverbalizer.label_words_ids)
logits = torch.randn(2,len(tokenizer)) # creating a pseudo output from the plm, and
print(myverbalizer.process_logits(logits)) # see what the verbalizer do

Parameter containing:
tensor([[[ 150]],

        [[4273]]])
tensor([[-0.9410, -0.4947],
        [-0.5593, -0.8477]])


In [10]:
from openprompt import PromptForClassification

use_cuda = True
prompt_model = PromptForClassification(plm=plm,template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
if use_cuda:
    prompt_model=  prompt_model.cuda()

In [11]:
from transformers import  AdamW, get_linear_schedule_with_warmup
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# it's always good practice to set no decay to biase and LayerNorm parameters
optimizer_grouped_parameters = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)

for epoch in range(10):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step %100 ==1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)

# Evaluate
validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

allpreds = []
alllabels = []
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print(acc)



Epoch 0, average loss: 1.7040561437606812
Epoch 0, average loss: 0.7777448433871362
Epoch 0, average loss: 0.7493293055951005
Epoch 0, average loss: 0.7307587437183651
Epoch 0, average loss: 0.7258040940954318
Epoch 0, average loss: 0.7202348490338402
Epoch 0, average loss: 0.714383659147939
Epoch 0, average loss: 0.7114625106807109
Epoch 0, average loss: 0.709003147731844
Epoch 0, average loss: 0.7047489849962046
Epoch 0, average loss: 0.7047207327094859
Epoch 0, average loss: 0.7019664605575767
Epoch 0, average loss: 0.6974901690732025
Epoch 0, average loss: 0.6972014470835618
Epoch 0, average loss: 0.6967823299102368
Epoch 0, average loss: 0.6961770435167692
Epoch 0, average loss: 0.6938605670085858
Epoch 0, average loss: 0.6914871523001359
Epoch 0, average loss: 0.6909613525877385
Epoch 0, average loss: 0.6891216582586337
Epoch 0, average loss: 0.687702362130691
Epoch 0, average loss: 0.6867435375471324
Epoch 0, average loss: 0.6858001969734394
Epoch 1, average loss: 0.611584676812

Epoch 7, average loss: 0.029356451914799082
Epoch 7, average loss: 0.02800854023282915
Epoch 7, average loss: 0.029577363353567775
Epoch 7, average loss: 0.0322625037812614
Epoch 7, average loss: 0.03274802302734844
Epoch 7, average loss: 0.03226741234982478
Epoch 7, average loss: 0.03132250087921733
Epoch 7, average loss: 0.0308871243525511
Epoch 7, average loss: 0.0299997952374312
Epoch 7, average loss: 0.030624240568078647
Epoch 7, average loss: 0.030961175842163374
Epoch 7, average loss: 0.03282383960143831
Epoch 7, average loss: 0.032659994309072626
Epoch 7, average loss: 0.032583309999057036
Epoch 7, average loss: 0.03321843351294593
Epoch 7, average loss: 0.033089033917180696
Epoch 7, average loss: 0.03378937547852216
Epoch 8, average loss: 0.00011076860755565576
Epoch 8, average loss: 0.01106238060236039
Epoch 8, average loss: 0.023683162036833197
Epoch 8, average loss: 0.020166970134245493
Epoch 8, average loss: 0.017803861196949614
Epoch 8, average loss: 0.018099096255366202


tokenizing: 1326it [00:01, 754.11it/s]


0.5950226244343891
