In [1]:
import pandas as pd
import numpy as np

In [2]:
# Trait code; it selects which `Friends_<code>_whole.tsv` file is read.
personality = 'A'
df_data = pd.read_csv(
    'data/Friends_' + personality + '_whole.tsv',
    sep='\t',
)
# Keep only the text column and its label column.
df = df_data.loc[:, ['utterance', 'labels']]

from sklearn.model_selection import train_test_split
SEED = 42

# 90/10 train/validation split, stratified on the label column and seeded
# so the same rows land in each part on every run.
(df_train, df_valid, label_train, label_valid) = train_test_split(
    df,
    df['labels'],
    test_size=0.1,
    random_state=SEED,
    stratify=df['labels'],
)

In [3]:
from openprompt.data_utils.utils import InputExample
from openprompt.data_utils.data_processor import DataProcessor
class DF_Processor(DataProcessor):
    """Turns the rows of a DataFrame into OpenPrompt ``InputExample`` objects."""

    def __init__(self):
        super().__init__()
        # Label names stored on the processor.
        self.labels = ['0', '1']

    def get_examples(self, df):
        """Build one ``InputExample`` per row of ``df``.

        For every row, the index value becomes the ``guid``, the ``utterance``
        cell becomes ``text_a`` and the ``labels`` cell becomes ``label``.

        Args:
            df: DataFrame with ``utterance`` and ``labels`` columns.

        Returns:
            A list of ``InputExample`` objects in row order.
        """
        return [
            InputExample(guid=row_id, text_a=row['utterance'], label=row['labels'])
            for row_id, row in df.iterrows()
        ]

# Convert both splits into lists of InputExample objects, using a fresh
# processor instance for each split.
data_train, data_valid = (
    DF_Processor().get_examples(split) for split in (df_train, df_valid)
)

In [4]:
print('load model...')
from openprompt.plms import load_plm
# RoBERTa masked-LM bundle (model, tokenizer, config, tokenizer wrapper) for the main task.
(plm, tokenizer, model_config, WrapperClass) = load_plm("roberta", "roberta-base")
# A second, independent bundle built from T5; the notebook uses it for template generation.
(
    template_generate_model,
    template_generate_tokenizer,
    template_generate_model_config,
    template_tokenizer_wrapper,
) = load_plm('t5', 't5-base')

load model...


In [5]:
df_verbalizer = pd.read_csv('big_five_cleaned.tsv', sep='\t')
# Lower-cased lexicon words, split by the sign of their score in the column
# named by `personality`: positive scores -> `pos`, negative scores -> `neg`.
pos = [word.lower() for word in df_verbalizer.loc[df_verbalizer[personality] > 0, 'word']]
neg = [word.lower() for word in df_verbalizer.loc[df_verbalizer[personality] < 0, 'word']]
from openprompt.prompts import ManualVerbalizer
import torch
classes = [0, 1]
# Class 0 is verbalized by the negative-score words, class 1 by the positive-score ones.
verbalizer = ManualVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    label_words={0: neg, 1: pos},
)

from openprompt.prompts.prompt_generator import LMBFFTemplateGenerationTemplate
# Template skeleton: the utterance, then a mask on each side of the label word.
template = LMBFFTemplateGenerationTemplate(
    text='{"placeholder":"text_a"} {"mask"} {"meta":"labelword"} {"mask"}.',
    tokenizer=template_generate_tokenizer,
    verbalizer=verbalizer,
)

# Sanity check: show how one training example looks once wrapped by the template.
wrapped_example = template.wrap_one_example(data_train[15])
print(wrapped_example)

[[{'text': "  Uh, you left out the stupid part.  I think it's totally insane, I mean, they work for the hospital. It's like returning to the scene of the crime. You know, I say we blow off the dates.", 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}, {'text': ' naive', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}, {'text': '.', 'loss_ids': 0, 'shortenable_ids': 0}], {'guid': 706, 'label': 1}]


In [6]:
import torch  # imported here so the flag below does not rely on an earlier cell

# Use the GPU only when one is actually available; a hard-coded True makes
# every `.cuda()` call in this notebook fail on a CPU-only machine.
cuda = torch.cuda.is_available()
auto_t = True # whether to perform automatic template generation
auto_v = True # whether to perform automatic label word generation

from openprompt.plms import load_plm
from openprompt.prompts.prompt_generator import T5TemplateGenerator
from openprompt.pipeline_base import PromptDataLoader, PromptForClassification
from openprompt.prompts import ManualTemplate
from openprompt.trainer import ClassificationRunner
import copy
import torch
from transformers import  AdamW, get_linear_schedule_with_warmup

def fit(model, train_dataloader, val_dataloader, loss_func, optimizer, num_epochs=10):
    """Train ``model`` and report the best validation score seen.

    After every call to ``train_epoch`` the model is scored with ``evaluate``;
    the highest score over all epochs is returned.

    Args:
        model: model to train (its parameters are updated in place).
        train_dataloader: iterable of training batches, passed to ``train_epoch``.
        val_dataloader: iterable of validation batches, passed to ``evaluate``.
        loss_func: loss callable forwarded to ``train_epoch``.
        optimizer: optimizer forwarded to ``train_epoch``.
        num_epochs: number of train/evaluate rounds. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        The best validation score, or 0.0 when no epoch scored above 0.0
        (including when ``num_epochs`` is 0).
    """
    best_score = 0.0
    for _ in range(num_epochs):
        train_epoch(model, train_dataloader, loss_func, optimizer)
        # max() keeps the earlier value on ties, like the strict `>` comparison did.
        best_score = max(best_score, evaluate(model, val_dataloader))
    return best_score


def train_epoch(model, train_dataloader, loss_func, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Puts ``model`` in training mode. For each batch it computes the loss of
    the model output against ``batch['label']``, back-propagates, takes an
    optimizer step and clears the gradients. Batches are moved to the GPU
    first when the module-level ``cuda`` flag is truthy.
    """
    model.train()
    for batch in train_dataloader:
        batch = batch.cuda() if cuda else batch
        batch_loss = loss_func(model(batch), batch['label'])
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def evaluate(model, val_dataloader):
    """Return the classification accuracy of ``model`` on ``val_dataloader``.

    Puts ``model`` in evaluation mode and runs it under ``torch.no_grad()``.
    For each batch the predicted class is the arg-max of the model output over
    the last dimension, compared with ``batch['label']``. Batches are moved to
    the GPU first when the module-level ``cuda`` flag is truthy.

    Args:
        model: callable mapping a batch to a logits tensor.
        val_dataloader: iterable of batches exposing a ``'label'`` tensor.

    Returns:
        The fraction of correct predictions as a float, or 0.0 when the loader
        yields no examples (the division used to raise ZeroDivisionError).
    """
    model.eval()
    allpreds = []
    alllabels = []
    with torch.no_grad():
        for inputs in val_dataloader:
            if cuda:
                inputs = inputs.cuda()
            logits = model(inputs)
            alllabels.extend(inputs['label'].cpu().tolist())
            allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    if not allpreds:
        # Nothing was evaluated; report the lowest score instead of dividing by zero.
        return 0.0
    return sum(int(pred == gold) for pred, gold in zip(allpreds, alllabels)) / len(allpreds)

In [7]:
# %%
from tqdm import tqdm
# template generation
if auto_t:
    print('performing auto_t...')

    if cuda:
        template_generate_model = template_generate_model.cuda()
    # beam_width is set to 2 here for efficiency; to improve performance, try a larger number.
    template_generator = T5TemplateGenerator(template_generate_model, template_generate_tokenizer, template_tokenizer_wrapper, verbalizer, beam_width=2)


    # batch_size=len(data_train): a single batch holds the whole training set, so all data is registered at once
    dataloader = PromptDataLoader(data_train, template, tokenizer=template_generate_tokenizer, tokenizer_wrapper_class=template_tokenizer_wrapper, batch_size=len(data_train), decoder_max_length=128, max_seq_length=128, shuffle=False, teacher_forcing=False)
    print('pass!')

    for data in dataloader:
        if cuda:
            data = data.cuda()
        template_generator._register_buffer(data)

    template_generate_model.eval()  # evaluation mode: do not update parameters
    print('generating...')

    template_texts = template_generator._get_templates()
    print(template_texts)
    original_template = template.text
    # turn each generated token sequence into a template text based on the original template
    template_texts = [template_generator.convert_template(template_text, original_template) for template_text in template_texts]
    # template_generator._show_template()
    template_generator.release_memory()
    # the candidate template texts
    print(template_texts)
    # iterate over each candidate and select the best one
    # Start below any reachable score so that a candidate is still selected
    # when every validation score is exactly 0.0.
    best_metrics = float('-inf')
    best_template_text = None
    for template_text in tqdm(template_texts):
        # Use a loop-local name: `template` keeps pointing at the generation
        # template until a winner is chosen, even if this loop is interrupted.
        candidate_template = ManualTemplate(tokenizer, template_text)

        train_dataloader = PromptDataLoader(data_train, candidate_template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass)
        valid_dataloader = PromptDataLoader(data_valid, candidate_template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass)

        # every candidate is trained on its own copy of the pretrained model
        model = PromptForClassification(copy.deepcopy(plm), candidate_template, verbalizer)

        loss_func = torch.nn.CrossEntropyLoss()
        no_decay = ['bias', 'LayerNorm.weight']
        # it's always good practice to set no decay for bias and LayerNorm parameters
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
        if cuda:
            model = model.cuda()
        score = fit(model, train_dataloader, valid_dataloader, loss_func, optimizer)

        if score > best_metrics:
            print('best score:', score)
            print('template:', template_text)
            best_metrics = score
            best_template_text = template_text
    if best_template_text is None:
        # Fail loudly instead of silently building a template with text=None.
        raise RuntimeError('auto_t: no candidate template could be selected')
    # use the best template
    template = ManualTemplate(tokenizer, text=best_template_text)
    print(best_template_text)


performing auto_t...


tokenizing: 230it [00:00, 1137.91it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors
tokenizing: 639it [00:00, 1164.53it/s]


pass!
generating...


100%|███████████████████████████████████████████| 18/18 [00:30<00:00,  1.68s/it]


[['<extra_id_0>', '▁I', "'", 'm', '<extra_id_1>', '.', '<extra_id_2>'], ['<extra_id_0>', '▁It', "'", 's', '<extra_id_1>', '.', '<extra_id_2>']]
['{"placeholder": "text_a"} I\'m {"mask"} ..', '{"placeholder": "text_a"} It\'s {"mask"} ..']


  0%|                                                     | 0/2 [00:00<?, ?it/s]
tokenizing: 0it [00:00, ?it/s][A
tokenizing: 145it [00:00, 1449.21it/s][AToken indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors

tokenizing: 310it [00:00, 1566.38it/s][A
tokenizing: 639it [00:00, 1670.72it/s][A

tokenizing: 72it [00:00, 1829.21it/s]
 50%|██████████████████████                      | 1/2 [10:05<10:05, 605.01s/it]

best score: 0.5694444444444444
template: {"placeholder": "text_a"} I'm {"mask"} ..



tokenizing: 0it [00:00, ?it/s][A
tokenizing: 200it [00:00, 1997.35it/s][A
tokenizing: 412it [00:00, 2068.33it/s][A
tokenizing: 639it [00:00, 2060.91it/s][A

tokenizing: 72it [00:00, 2104.49it/s]
100%|████████████████████████████████████████████| 2/2 [20:14<00:00, 607.25s/it]

{"placeholder": "text_a"} I'm {"mask"} ..





In [9]:
from openprompt.prompts.prompt_generator import RobertaVerbalizerGenerator
if auto_v:
    print('performing auto_v...')
    # `plm` is the model handed to the verbalizer generator; move it to the GPU first
    if cuda:
        plm = plm.cuda()
    verbalizer_generator = RobertaVerbalizerGenerator(model=plm, tokenizer=tokenizer, candidate_num=5, label_word_num_per_class=5)
    # to improve performance, try larger numbers

    dataloader = PromptDataLoader(data_train, template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass, batch_size=32)
    for data in dataloader:
        if cuda:
            data = data.cuda()
        verbalizer_generator.register_buffer(data)
    label_words_list = verbalizer_generator.generate()
    verbalizer_generator.release_memory()

    # iterate over each candidate and select the best one
    current_verbalizer = copy.deepcopy(verbalizer)
    # Start below any reachable score so that a candidate is still selected
    # when every validation score is exactly 0.0.
    best_metrics = float('-inf')
    best_label_words = None
    for label_words in tqdm(label_words_list):
        current_verbalizer.label_words = label_words
        train_dataloader = PromptDataLoader(data_train, template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass)
        valid_dataloader = PromptDataLoader(data_valid, template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass)

        # every candidate is trained on its own copy of the pretrained model
        model = PromptForClassification(copy.deepcopy(plm), template, current_verbalizer)

        loss_func = torch.nn.CrossEntropyLoss()
        no_decay = ['bias', 'LayerNorm.weight']
        # it's always good practice to set no decay for bias and LayerNorm parameters
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
        if cuda:
            model = model.cuda()
        score = fit(model, train_dataloader, valid_dataloader, loss_func, optimizer)

        if score > best_metrics:
            best_metrics = score
            best_label_words = label_words
    if best_label_words is None:
        # Fail loudly instead of silently building a verbalizer without label words.
        raise RuntimeError('auto_v: no candidate label words could be selected')
    # use the best verbalizer
    print(best_label_words)
    verbalizer = ManualVerbalizer(tokenizer, num_classes=2, label_words=best_label_words)


performing auto_v...


tokenizing: 639it [00:00, 2045.14it/s]
  0%|                                                     | 0/5 [00:00<?, ?it/s]
tokenizing: 0it [00:00, ?it/s][A
tokenizing: 196it [00:00, 1955.28it/s][A
tokenizing: 406it [00:00, 2038.66it/s][A
tokenizing: 639it [00:00, 2038.99it/s][A

tokenizing: 72it [00:00, 2080.52it/s]
 20%|████████▊                                   | 1/5 [09:59<39:59, 599.93s/it]
tokenizing: 0it [00:00, ?it/s][A
tokenizing: 198it [00:00, 1975.19it/s][A
tokenizing: 407it [00:00, 2038.45it/s][A
tokenizing: 639it [00:00, 2037.56it/s][A

tokenizing: 72it [00:00, 2093.29it/s]
 20%|████████▊                                   | 1/5 [10:34<42:16, 634.12s/it]


KeyboardInterrupt: 