In [1]:
! pip install -U accelerate
! pip install -U transformers



In [2]:
!pip install evaluate



In [3]:
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Prefer the first GPU, but fall back to the CPU so tensors sent to DEVICE
# do not fail on machines without CUDA.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint so the tokenizer and the model
# can never be loaded from different names by accident.
MODEL_CHECKPOINT = "Salesforce/dialogstudio-t5-base-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# Smoke test: generate an answer for one hand-written prompt.
input_text = "Answer the following yes/no question by reasoning step-by-step. Can you write 200 words in a single tweet?"
encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded.input_ids

# Pass the attention mask explicitly instead of leaving generate() to infer it.
outputs = model.generate(input_ids, attention_mask=encoded.attention_mask, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

It is not possible to write 200 words in a tweet. Therefore, the final answer is no.


In [6]:
# Sequence-length and batching constants.
Q_LEN = 256       # Question length (used as the tokenizer max_length in predict_answer)
T_LEN = 32        # Target length
BATCH_SIZE = 4

# Adam over all model parameters with a learning rate of 1e-5.
OPTIMIZER = Adam(params=model.parameters(), lr=1e-5)

In [7]:
from datasets import load_dataset
# Load the "amaydle/npc-dialogue" dataset. Later cells read its 'test' split
# and the 'Name', 'Biography', 'Query' and 'Response' columns.
dataset = load_dataset("amaydle/npc-dialogue")

In [10]:
# Move the model to DEVICE - the same device predict_answer sends its input
# tensors to - instead of a separately hard-coded 'cuda'.
model.to(DEVICE)

T5ForConditionalGeneration(
  (shared): Embedding(32103, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32103, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [11]:
# `evaluate` metric id and result-dict label for every supported `metric` value.
_METRIC_IDS = {"bleu": "google_bleu", "rouge": "rouge"}
_SCORE_LABELS = {"bleu": "BLEU Score: ", "rouge": "ROUGE Score: "}
# Metrics that were already loaded, keyed by `metric`, so repeated calls reuse them.
_LOADED_METRICS = {}


def _load_metric(metric):
    """Return the `evaluate` metric for `metric`, loading it on first use only."""
    if metric not in _LOADED_METRICS:
        _LOADED_METRICS[metric] = evaluate.load(_METRIC_IDS[metric])
    return _LOADED_METRICS[metric]


def predict_answer(context, question, ref_answer=None, metric=None, max_new_tokens=None):
    """Generate an answer to `question` given `context`, optionally scoring it.

    Args:
        context: Text passed to the tokenizer as the second sequence.
        question: Text passed to the tokenizer as the first sequence.
        ref_answer: Optional reference answer to score the prediction against.
        metric: "bleu" (scored with the "google_bleu" metric) or "rouge".
            Only consulted when `ref_answer` is given.
        max_new_tokens: Optional cap forwarded to `model.generate`. When None
            (the default) the call is made exactly as before, without it.

    Returns:
        The decoded prediction (str) when no scoring is requested, i.e. when
        `ref_answer` is empty/None or `metric` is None. Otherwise a dict holding
        the reference answer, the predicted answer and the metric result; in
        that case the context and question are also printed.

    Raises:
        ValueError: if `ref_answer` is given and `metric` is neither None,
            "bleu" nor "rouge".
    """
    scoring = bool(ref_answer) and metric is not None
    # Reject unsupported metrics before the (expensive) generation step rather
    # than silently returning None afterwards.
    if scoring and metric not in _METRIC_IDS:
        raise ValueError(
            f"Unsupported metric {metric!r}; expected one of {sorted(_METRIC_IDS)}"
        )

    inputs = tokenizer(
        question,
        context,
        max_length=Q_LEN,
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",  # already batched: tensors of shape (1, Q_LEN)
    )
    input_ids = inputs["input_ids"].to(DEVICE)
    attention_mask = inputs["attention_mask"].to(DEVICE)

    # Only forward max_new_tokens when the caller asked for it, so the default
    # generation behaviour is unchanged.
    generate_kwargs = {}
    if max_new_tokens is not None:
        generate_kwargs["max_new_tokens"] = max_new_tokens
    outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs
    )

    # A single sequence was generated, so decode the only row of the batch.
    predicted_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if not scoring:
        return predicted_answer

    score = _load_metric(metric).compute(
        predictions=[predicted_answer], references=[ref_answer]
    )

    print("\nContext: \n", context)
    print("Question: \n", question)
    return {
        "Reference Answer: ": ref_answer,
        "Predicted Answer: ": predicted_answer,
        # The label follows the metric that was actually computed.
        _SCORE_LABELS[metric]: score,
    }

In [12]:
# Walk the test split row by row, so the columns are not looked up again via
# dataset['test'][column][i] on every iteration.
ref_answers = []
predictions = []
for row in tqdm(dataset['test'], desc="Generating answers"):
  # The context is the character name and biography joined by ":".
  context = row["Name"] + ":" + row['Biography']
  question = row['Query']
  ref_answers.append(row['Response'])
  predictions.append(predict_answer(context, question))

In [13]:
import locale
# Monkey-patch: make locale.getpreferredencoding always report "UTF-8".
# NOTE(review): presumably a workaround so the `!pip` shell call below runs in
# a hosted runtime whose preferred encoding is not UTF-8 - confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"
# Install the rouge_score package.
# NOTE(review): presumably needed by evaluate.load('rouge') in the next cell - confirm.
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [14]:
# Score all generated answers against their references with ROUGE and show the result.
rouge = evaluate.load("rouge")
results = rouge.compute(references=ref_answers, predictions=predictions)
print(results)

{'rouge1': 0.14257018771061827, 'rouge2': 0.03429938262163584, 'rougeL': 0.13143513014158015, 'rougeLsum': 0.1314416752485637}
