<a href="https://colab.research.google.com/github/samin9796/arg2keypoint/blob/main/T5_OpenPrompt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OpenPrompt Installation
OpenPrompt is a modular and flexible platform for developing prompt-learning pipelines.

In [None]:
!pip install openprompt



# Data Pre-Processing

In [None]:
import pandas as pd

# Read the three dataset splits from CSV files in the working directory,
# in train / dev / test order.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "dev.csv", "test.csv")
)

In [None]:
# Give every split the same six column names so later cells can address
# fields by name ('argument', 'key_point', 'label', 'idx', ...).
column_names = ['idx', 'topic', 'argument', 'key_point', 'stance', 'label']
for split_df in (train_df, dev_df, test_df):
    split_df.columns = list(column_names)


In [None]:
from openprompt.data_utils import InputExample


def build_examples(frame):
    """Convert one split's DataFrame into a list of OpenPrompt ``InputExample`` objects.

    Args:
        frame: DataFrame with ``idx``, ``argument``, ``key_point`` and ``label`` columns.

    Returns:
        One ``InputExample`` per row, in row order: the argument as ``text_a``,
        the key point as ``text_b``, the label cast to ``int`` and ``idx`` as ``guid``.
    """
    return [
        InputExample(text_a=row['argument'], text_b=row['key_point'], label=int(row['label']), guid=row['idx'])
        for _, row in frame.iterrows()
    ]


# One helper instead of three copy-pasted loops, so the splits are always
# converted the same way.
dataset = {
    'train': build_examples(train_df),
    'dev': build_examples(dev_df),
    'test': build_examples(test_df),
}
print(dataset['train'][2])

{
  "guid": 10139,
  "label": 0,
  "meta": {},
  "text_a": "a person created through cloning could potentially have developmental problems caused by imperfections in the cloning process.",
  "text_b": "Cloning is unnatural",
  "tgt_text": null
}



# Obtain a PLM

In [None]:
from openprompt.plms import load_plm

# Load the pre-trained T5 base checkpoint. load_plm hands back four objects,
# unpacked here as the model, its tokenizer, the model config and the
# tokenizer-wrapper class used by the later tokenization cells.
plm_bundle = load_plm("t5", "t5-base")
plm, tokenizer, model_config, WrapperClass = plm_bundle

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


# Define a Template

### Manual Template

In [None]:
from openprompt.prompts import ManualTemplate

# Hand-written prompt: the argument (text_a) and the key point (text_b) are
# inserted at the placeholders, and the model fills in the {"mask"} slot.
template_text = (
    'The argument: {"placeholder":"text_a"} '
    'and the Keypoint: {"placeholder":"text_b"} '
    'are {"mask"}.'
)
mytemplate = ManualTemplate(text=template_text, tokenizer=tokenizer)

### Mixed Template

In [None]:
# from openprompt.prompts import MixedTemplate

# mytemplate1 = MixedTemplate(model=plm, tokenizer=tokenizer, text='Argument: {"placeholder": "text_a"} Keypoint: {"placeholder": "text_b"} {"soft": "Does"} {"soft": "the", "soft_id": 1} argument matches {"soft_id": 1} keypoint? {"mask"}')

# mytemplate = MixedTemplate(model=plm, tokenizer=tokenizer, text='{"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} {"placeholder":"text_b"} {"soft"} {"mask"}.')

In [None]:
# Sanity check: apply the template to the first training example and show
# the resulting wrapped structure.
first_train_example = dataset['train'][0]
wrapped_example = mytemplate.wrap_one_example(first_train_example)
print(wrapped_example)

[[{'text': 'Argument:', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': ' a person created through cloning could potentially have developmental problems caused by imperfections in the cloning process.', 'loss_ids': 0, 'shortenable_ids': 1}, {'text': ' Keypoint:', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': ' Cloning is not understood enough yet', 'loss_ids': 0, 'shortenable_ids': 1}, {'text': ' Are they matched?', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}, {'text': '.', 'loss_ids': 0, 'shortenable_ids': 0}], {'guid': 10137, 'label': 0}]


## Tokenization

In [None]:
# Instantiate the tokenizer wrapper class returned by load_plm. This instance
# is only used for the manual tokenization demo cells that follow.
wrapped_t5tokenizer = WrapperClass(
    tokenizer=tokenizer,
    max_seq_length=128,
    decoder_max_length=3,
    truncate_method="head",
)

In [None]:
# Tokenize the wrapped example and print both the raw feature dict and the
# token strings behind the encoder and decoder id sequences.
tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
print(tokenized_example)
for id_field in ('input_ids', 'decoder_input_ids'):
    print(tokenizer.convert_ids_to_tokens(tokenized_example[id_field]))

{'input_ids': [26583, 10, 3, 9, 568, 990, 190, 3, 3903, 29, 53, 228, 6149, 43, 20697, 982, 2953, 57, 31475, 16, 8, 3, 3903, 29, 53, 433, 5, 4420, 2700, 10, 8932, 29, 53, 19, 59, 7571, 631, 780, 1521, 79, 3, 10304, 58, 32099, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'decoder_input_ids': [0, 32099, 0], 'loss_ids': [0, 1, 0]}
['▁Argument', ':', '▁', 'a', '▁person', '▁created', '▁throu

In [None]:
# Wrap and tokenize every example of every split by hand.
# NOTE(review): none of the cells visible in this notebook read model_inputs;
# the PromptDataLoader cells tokenize the data themselves. Confirm whether
# this pass is still needed.
model_inputs = {
    split: [
        wrapped_t5tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False)
        for sample in dataset[split]
    ]
    for split in ('train', 'dev', 'test')
}



In [None]:
from openprompt import PromptDataLoader

# Data loader for the training split: it receives the template and the
# tokenizer wrapper class, and yields shuffled batches of 4 examples.
train_dataloader = PromptDataLoader(
    dataset=dataset["train"],
    template=mytemplate,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=256,
    decoder_max_length=3,
    batch_size=4,
    shuffle=True,
    teacher_forcing=False,
    predict_eos_token=False,
    truncate_method="head",
)

tokenizing: 17019it [00:26, 642.85it/s]


# Define a Verbalizer

In [None]:
from openprompt.prompts import ManualVerbalizer
import torch

# The verbalizer maps each class index to the word(s) expected at the mask
# position; entry i of label_words belongs to class i.
# The examples printed earlier in this notebook carry label 0 for
# argument/key-point pairs that do not match, so class 0 is "not matched"
# and class 1 is "matched".
# NOTE(review): confirm against the dataset's label definition.
label_words = [["not matched"], ["matched"]]

# num_classes is derived from label_words so the two can never disagree
# (the task is binary; the previous hard-coded value was 3).
myverbalizer = ManualVerbalizer(tokenizer, num_classes=len(label_words),
                        label_words=label_words)

print(myverbalizer.label_words_ids)
logits = torch.randn(2,len(tokenizer)) # pseudo PLM output: 2 rows of vocabulary-sized logits
print(myverbalizer.process_logits(logits)) # show what the verbalizer produces for them

Parameter containing:
tensor([[[4273]],

        [[ 150]]])
tensor([[-2.6860, -0.0706],
        [-0.8528, -0.5555]])


In [None]:
import torch
from openprompt import PromptForClassification

# Use the GPU only when one is actually present, so the notebook also runs
# on CPU-only runtimes instead of failing on .cuda().
use_cuda = torch.cuda.is_available()

# Combine PLM, template and verbalizer into one classification model.
# freeze_plm=False because the training cell also optimizes the PLM weights.
prompt_model = PromptForClassification(plm=plm, template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
if use_cuda:
    prompt_model = prompt_model.cuda()

## Training

In [None]:
import torch
# The AdamW implementation shipped with transformers is deprecated in favour
# of the PyTorch one, so AdamW now comes from torch.optim.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

NUM_EPOCHS = 3
LOG_EVERY = 100  # print the running average loss every LOG_EVERY steps

loss_func = torch.nn.CrossEntropyLoss()

no_decay = ['bias', 'LayerNorm.weight']

# it's always good practice to apply no weight decay to bias and LayerNorm parameters
optimizer_grouped_parameters1 = [
    {'params': [p for n, p in prompt_model.plm.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.plm.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Using a different optimizer for prompt (template) parameters and model parameters.
# weight_decay is spelled out because torch.optim.AdamW defaults to 0.01,
# whereas the transformers implementation used previously defaulted to 0.0.
optimizer_grouped_parameters2 = [
    {'params': [p for n,p in prompt_model.template.named_parameters() if "raw_embedding" not in n], 'weight_decay': 0.0}
]

# eps=1e-6 mirrors the default of the transformers AdamW this replaces.
optimizer1 = AdamW(optimizer_grouped_parameters1, lr=1e-4, eps=1e-6)
optimizer2 = AdamW(optimizer_grouped_parameters2, lr=1e-3, eps=1e-6)

# Models loaded with from_pretrained start in evaluation mode (dropout off);
# switch to training mode explicitly before optimizing.
prompt_model.train()
for epoch in range(NUM_EPOCHS):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer1.step()
        optimizer1.zero_grad()
        optimizer2.step()
        optimizer2.zero_grad()
        # Log periodically instead of on every step, which floods the output.
        if (step + 1) % LOG_EVERY == 0:
            print("Epoch {}, step {}, average loss: {}".format(epoch, step + 1, tot_loss/(step+1)), flush=True)

# Return to evaluation mode so that any inference run after training is
# deterministic (dropout disabled).
prompt_model.eval()




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0.08325915407248964
0.08326281579207791
0.08324045907496207
0.08326768055928338
0.08324646680221472
0.08323089188711912
0.08321179866542003
0.0832650252604568
0.08324464992170551
0.08322450302942508
0.08320510773882905
0.08319209574190398
0.0832652094290352
0.08334322060371827
0.08332011827793227
0.0832987908888145
0.08327692686334302
0.08325903800683104
0.08323629383071661
0.08321319686822483
0.08320670269609581
0.08318413239878451
0.08323726902592773
0.08321436968772522
0.08320874860164854
0.0833262879303969
0.08330443016481454
0.08328406092902181
0.08326073323699786
0.08323847288121539
0.08322658516170878
0.08320340811065532
0.08318079488002482
0.08316253367663376
0.08316276649135734
0.08314538972280139
0.08312213399365793
0.08309987189080685
0.08309328172945232
0.08343183594495533
0.0834139773969344
0.08339076260933924
0.08336843532491049
0.08334530515918148
0.08332210566662923
0.08329908901795052
0.08327624397831893


In [None]:
# from transformers import  AdamW, get_linear_schedule_with_warmup
# loss_func = torch.nn.CrossEntropyLoss()
# no_decay = ['bias', 'LayerNorm.weight']
# # it's always good practice to apply no weight decay to bias and LayerNorm parameters
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]

# optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)

# for epoch in range(4):
#     tot_loss = 0
#     for step, inputs in enumerate(train_dataloader):
#         if use_cuda:
#             inputs = inputs.cuda()
#         logits = prompt_model(inputs)
#         labels = inputs['label']
#         loss = loss_func(logits, labels)
#         loss.backward()
#         tot_loss += loss.item()
#         optimizer.step()
#         optimizer.zero_grad()
#         if step %100 ==1:
#             print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)



Epoch 0, average loss: 0.844295471906662
Epoch 0, average loss: 0.5866554927300004
Epoch 0, average loss: 0.585295420103144
Epoch 0, average loss: 0.5566939960745786
Epoch 0, average loss: 0.528518792348381
Epoch 0, average loss: 0.5018674241048674
Epoch 0, average loss: 0.4726234438235058
Epoch 0, average loss: 0.4575824920682401
Epoch 0, average loss: 0.43705658347780196
Epoch 0, average loss: 0.42523301173901873
Epoch 0, average loss: 0.41005207173630387
Epoch 0, average loss: 0.3964193644316465
Epoch 0, average loss: 0.385574093113689
Epoch 0, average loss: 0.3757387894744371
Epoch 0, average loss: 0.36631443270165925
Epoch 0, average loss: 0.35609927316588136
Epoch 0, average loss: 0.3475033767746945
Epoch 0, average loss: 0.33765647614926875
Epoch 0, average loss: 0.3307706475689166
Epoch 0, average loss: 0.3230010364995437
Epoch 0, average loss: 0.3155888318764297
Epoch 0, average loss: 0.31179301667077686
Epoch 0, average loss: 0.30590222619066215
Epoch 0, average loss: 0.30413

# Evaluate

In [None]:

# Data loader for the dev split; same settings as for training, but the
# order is kept fixed (shuffle=False).
validation_dataloader = PromptDataLoader(dataset=dataset["dev"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

allpreds = []
alllabels = []

# Evaluation mode disables dropout; no_grad avoids building autograd graphs,
# which are not needed for inference and only cost memory.
prompt_model.eval()
with torch.no_grad():
    for step, inputs in enumerate(validation_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        alllabels.extend(labels.cpu().tolist())
        # Predicted class = index of the highest class logit.
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

# Accuracy: fraction of dev examples whose prediction equals the gold label.
acc = sum(int(i==j) for i,j in zip(allpreds, alllabels))/len(allpreds)
print(acc)