In [54]:
import torch
import csv
from transformers import pipeline
from pprint import pprint
import transformers
from torch.utils.data import DataLoader
import numpy as np
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import process_dataset
from transformers import DataCollatorWithPadding
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from tqdm import tqdm

In [2]:
# Critic: a BERT sequence classifier whose weights are read from `model_path`,
# paired with the stock 'bert-base-uncased' tokenizer.
model_path = 'bert_critic.model'

critic_tokenizer = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased',
)
critic_model = transformers.BertForSequenceClassification.from_pretrained(
    model_path,
)
# Bundle model and tokenizer into a single callable that scores raw text.
classifier = pipeline(
    task="sentiment-analysis",
    model=critic_model,
    tokenizer=critic_tokenizer,
)

In [62]:
# Generator: the gpt2-medium tokenizer with explicit BOS/EOS/PAD special tokens,
# and language-model weights read from the local 'checkpoint-gpt2-medium' path.
gpt_tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2-medium',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)
gpt_model = GPT2LMHeadModel.from_pretrained('checkpoint-gpt2-medium')

# Baseline text-generation pipeline (no critic). Use the first GPU when CUDA is
# available; otherwise fall back to CPU (-1) instead of failing at construction.
naive_generator = pipeline(
    'text-generation',
    model=gpt_model,
    tokenizer=gpt_tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [63]:
def select_the_best(classifier_results:list, decoded_results:list, only_label=None):
    """Return the candidate text with the highest critic score.

    Args:
        classifier_results: one entry per candidate; each entry is a list whose
            first item is a dict with 'label' and 'score' keys.
        decoded_results: candidate texts, index-aligned with classifier_results.
        only_label: when given, only candidates whose first label equals this
            value compete. If no candidate carries that label, every candidate
            competes so that a result is still returned.

    Returns:
        A ``(text, classifier_result)`` tuple for the winning candidate. On a
        score tie the earliest candidate wins.

    Raises:
        ValueError: if classifier_results is empty.
    """
    if not classifier_results:
        raise ValueError("classifier_results must contain at least one entry")

    candidates = range(len(classifier_results))
    if only_label is not None:
        # Filter BEFORE comparing scores: otherwise a high-scoring candidate
        # with the wrong label would shadow every matching one.
        matching = [
            index for index in candidates
            if classifier_results[index][0]['label'] == only_label
        ]
        if matching:
            candidates = matching

    # max() keeps the first maximal element, i.e. the earliest candidate on ties.
    best_index = max(candidates, key=lambda index: classifier_results[index][0]['score'])
    return decoded_results[best_index], classifier_results[best_index]


def inference_with_critic(text, gpt_tokenizer, gpt_model, critic_pipeline, verbose=False, num_beam=8, num_return=5, max_length=120):
    """Generate beam-search continuations of `text` and keep the critic's top pick.

    Args:
        text: prompt string fed to the generator.
        gpt_tokenizer: tokenizer used to encode the prompt and decode the beams.
        gpt_model: model exposing `.to(...)` and `.generate(...)`.
        critic_pipeline: callable applied to each decoded candidate; its result
            is passed to `select_the_best`.
        verbose: when True, pretty-print the candidates and the critic output.
        num_beam: number of beams for the search.
        num_return: number of sequences returned by the search.
        max_length: `max_length` forwarded to `generate`.

    Returns:
        Whatever `select_the_best` returns: the chosen text and its critic result.
    """
    # Run on the GPU when one is available; otherwise stay on CPU instead of crashing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    inputs = gpt_tokenizer(text, return_tensors="pt")
    beam_outputs = gpt_model.to(device=device).generate(
        inputs['input_ids'].to(device=device),
        attention_mask=inputs['attention_mask'].to(device=device),
        num_beams=num_beam,
        no_repeat_ngram_size=1,
        num_return_sequences=num_return,
        early_stopping=False,
        # Padding reuses the EOS id, so the "<|endoftext|>" removal below also
        # clears the padding of beams that finished early.
        pad_token_id=gpt_tokenizer.eos_token_id,
        eos_token_id=gpt_tokenizer.eos_token_id,
        max_length=max_length,
    )
    decoded_results = [
        gpt_tokenizer.decode(beam_output).replace("<|endoftext|>", '').strip()
        for beam_output in beam_outputs
    ]
    if verbose:
        print("====== Generated by GPT: ======\n")
        pprint(decoded_results)

    # Score with the critic that was passed in, not with a notebook-level global.
    classifier_results = [critic_pipeline(decoded_result) for decoded_result in decoded_results]
    if verbose:
        print("====== Classifier results: ======\n")
        pprint(classifier_results)
    return select_the_best(classifier_results, decoded_results)
    

In [59]:
# Try the critic-guided generator on one prompt, printing the intermediate
# candidates and critic scores.
text = "PersonX begins to accept PersonY  xIntent"

inference_with_critic(
    text,
    gpt_tokenizer,
    gpt_model,
    classifier,
    verbose=True,
    num_beam=10,
    num_return=10,
    max_length=120,
)


['PersonX begins to accept PersonY  xIntention’s new relationship is in the '
 'best interest of both parties.',
 'PersonX begins to accept PersonY  xIntention’s new relationship is a good '
 'one.',
 'PersonX begins to accept PersonY  xIntention’s new relationship is a good '
 'one.',
 'PersonX begins to accept PersonY  xIntention’s new relationship is better '
 'than the old one.',
 'PersonX begins to accept PersonY  xIntention’s new friend is accepted by her '
 'family and friends.\n'
 ' Y has a better relationship with his parents than he did before, despite '
 'the fact that they are still fighting over who should take care of him.',
 'PersonX begins to accept PersonY  xIntention’s new relationship is a good '
 'one.',
 'PersonX begins to accept PersonY  xIntention’s new friend is accepted by her '
 'family and friends.',
 'PersonX begins to accept PersonY  xIntention’s new relationship is a good '
 'one.',
 'PersonX begins to accept PersonY  xIntention’s new friend is a good fit

('PersonX begins to accept PersonY  xIntention’s new relationship is in the best interest of both parties.',
 [{'label': 'LABEL_0', 'score': 0.9995953440666199}])

In [37]:
# Baseline for comparison: run the plain text-generation pipeline (no critic)
# on the current `text` prompt.
naive_generator(text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'PersonX begins to accept PersonY  xIntent  [GEN] to have a friend '}]

In [71]:
def trim(text):
    """Keep what precedes the first tab, then the first line of that, stripped."""
    before_tab, _, _ = text.partition('\t')
    first_line, _, _ = before_tab.partition('\n')
    return first_line.strip()
def inference_wrapper(text, gpt_tokenizer, gpt_model, classifier, num_beam=10, num_return=10, max_length=120, naive_pipeline=None):
    """Run the critic-guided and the baseline generator on the same prompt.

    Args:
        text: prompt string given to both generators.
        gpt_tokenizer, gpt_model, classifier: forwarded to `inference_with_critic`.
        num_beam, num_return, max_length: forwarded to `inference_with_critic`.
        naive_pipeline: optional baseline generator; called as
            ``naive_pipeline(text)`` and expected to return a list whose first
            item has a 'generated_text' key. Defaults to the notebook-level
            `naive_generator`.

    Returns:
        dict with keys "New Result" and "Naive", each value passed through `trim`.
    """
    # The critic's label/score is not needed here, only the selected text.
    beam_result, _ = inference_with_critic(
        text,
        gpt_tokenizer,
        gpt_model,
        classifier,
        verbose=False,
        num_beam=num_beam,
        num_return=num_return,
        max_length=max_length,
    )

    # Fall back to the global baseline so existing call sites keep working.
    generator = naive_generator if naive_pipeline is None else naive_pipeline
    naive_result = generator(text)[0]['generated_text']
    return {"New Result": trim(beam_result), "Naive": trim(naive_result)}

In [72]:
# Side-by-side comparison of both generators on a single prompt.
text = "PersonX begins to accept PersonY  xIntent"
inference_wrapper(
    text,
    gpt_tokenizer,
    gpt_model,
    classifier,
    num_beam=10,
    num_return=10,
    max_length=120,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'New Result': 'PersonX begins to accept PersonY  xIntention’s new relationship is in the best interest of both parties.',
 'Naive': 'PersonX begins to accept PersonY  xIntent  [GEN] to be friends with PersonY'}

In [76]:
def get_prompts(texts:list)->list:
    """Return, for each text, the stripped part that precedes its first "[GEN]"."""
    prompts = []
    for raw in texts:
        head, _, _ = raw.partition("[GEN]")
        prompts.append(head.strip())
    return prompts

In [79]:
# Load the prompt file: tab-separated rows, of which only the first column is used.
# newline='' lets the csv module do its own newline handling, as its docs recommend.
with open('./output_4000_DT.tsv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    # csv.reader yields [] for blank lines; drop them so row[0] below cannot
    # raise IndexError.
    data = [row for row in reader if row]

# First column of every row, stripped of surrounding whitespace.
data_ = tuple(row[0].strip() for row in data)

In [80]:
# Peek at the first ten prompts.
data_[:10]

('PersonX announces the result  xWant  [GEN]',
 'PersonX takes a diploma  HinderedBy  [GEN]',
 'PersonX is ranting  xEffect  [GEN]',
 'PersonX is practicing  xEffect  [GEN]',
 'PersonX is the best person for the job  xIntent  [GEN]',
 'PersonX visits a new town  xReact  [GEN]',
 'PersonX organizes others to work  HinderedBy  [GEN]',
 'PersonX informs PersonY about her surprise  xAttr  [GEN]',
 "PersonX doesn't kill anyone  xEffect  [GEN]",
 'PersonX seems to hear PersonY  xIntent  [GEN]')

In [81]:
# (Re)create the comparison file with just its header row; mode 'w' truncates.
with open('results_comparison.tsv', 'w') as out_file:
    print('New Result', 'Naive', sep='\t', file=out_file)

In [83]:
# Append one tab-separated row per prompt in data_[2045:2245]. Mode 'a' keeps
# whatever the file already holds, so re-running this cell adds rows again.
with open('results_comparison.tsv', 'a') as out_file:
    for text in tqdm(data_[2045:2245]):
        comparison = inference_wrapper(
            text,
            gpt_tokenizer,
            gpt_model,
            classifier,
            num_beam=10,
            num_return=10,
            max_length=120,
        )
        print(comparison['New Result'], comparison['Naive'], sep='\t', file=out_file)


  0%|          | 0/200 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/200 [00:01<03:35,  1.08s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 2/200 [00:02<04:49,  1.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 3/200 [00:05<07:18,  2.23s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 4/200 [00:07<06:53,  2.11s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▎         | 5/200 [00:09<05:54,  1.82s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 6/200 [00:10<05:07,  1.59s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▎         | 7/200 [00:13<06:37,  2.06s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 8/200 [00:16<07:36,  2.38s/it]Setting `