In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

! pip install transformers==4.27.4

! pip install torchtext==0.10.1

import torch
device = torch.device("cuda")
torch.cuda.init()

from google.colab import drive
drive.mount('/content/gdrive')

from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, TextDataset

In [3]:
model_path = "/content/gdrive/My Drive/model/chitchat_generator.pt"

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
model = GPT2LMHeadModel.from_pretrained('gpt2')

model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [4]:
test_path = '/content/gdrive/My Drive/dataset/combined_test.txt'

In [5]:
from torch.utils.data import DataLoader

In [6]:
import re
def getResponse(input_text, model,tokenizer, device):
  input_ids = tokenizer.encode(input_text, return_tensors='pt')
  if(len(input_ids)<=0):
    print(input_text)
    return None
  input_ids = input_ids.to(device)
  model = model.to(device)
  output_ids = model.generate(input_ids,pad_token_id=tokenizer.eos_token_id, max_length=70,early_stopping=True)
  output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
  messages = output_text.split("\n")
  first_bot_response = None
  for message in messages:
    if message.startswith("Bot:"):
        first_bot_response = message.strip()
        break
  return first_bot_response

In [49]:
print(getResponse("User: Hello, why do you hate me?",model, tokenizer, device))

Bot: I hate you.


In [11]:
with open('/content/gdrive/My Drive/dataset/combined_test.txt', 'r', encoding='utf-8') as f:
    test_data = f.readlines()

In [41]:
len(test_data)

36273

In [13]:
def prepare(test_data):
  input = {}
  response = {}
  conv_id = 1
  idx = 1
  for i in range(len(test_data)):
    input[idx]=""
    response[idx]=""
    if(test_data[i]=="\n" or test_data[i]=='\n'):
      idx+=1
  for data in test_data:
    if(conv_id>=idx):
      break
    if(data.startswith("User")):
      input[conv_id]+= data 
    elif(data.startswith("Bot")):
      response[conv_id]+=data
    else:
      conv_id+=1
  return input,response

In [14]:
input, response = prepare(test_data)

In [None]:
# from tqdm import tqdm
# def getResponses(input, response, model,tokenizer, device):
#   generated_response = {}
#   with open("/content/gdrive/My Drive/dataset/generated_responses.json", 'a') as f:
#     for idx, text in tqdm(input.items()):
#       value = getResponse(text, model,tokenizer, device)
#       if(value==None):
#         response2[idx]="None"
#       generated_response[idx] = value
#       f.write(str(idx)+ ":" )
#       f.write(str(value))
#       f.write("\n")
#   return generated_response

In [40]:
from tqdm import tqdm
def getResponses(input, response, model,tokenizer, device):
  generated_response = {}
  for idx, text in tqdm(input.items()):
    value = getResponse(text, model,tokenizer, device)
    if(value==None):
      response2[idx]="None"
    generated_response[idx] = value
  return generated_response

In [16]:
input_demo_keys = list(input.keys())[:7000]
input_demo = {key: input[key] for key in input_demo_keys}

In [17]:
response_demo_keys = list(response.keys())[:7000]
response_demo = {key: response[key] for key in response_demo_keys}
response2=response_demo.copy()

In [None]:
generated_response = getResponses(input_demo, response_demo, model,tokenizer, device)

In [30]:
for key, value in generated_response.items():
  if(value is None):
    generated_response[key]="None"

In [50]:
import json
# open file for writing
with open("/content/gdrive/My Drive/dataset/generated_responses.json", "w") as outfile:
    # write dictionary to file in JSON format
    json.dump(generated_response, outfile)

In [20]:
import json
# open file for writing
with open("/content/gdrive/My Drive/dataset/modified_responses.json", "w") as outfile:
    # write dictionary to file in JSON format
    json.dump(response2, outfile)

In [26]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip install bert-score


In [None]:
from bert_score import score
P, R, F1 = score(list(generated_response.values()), list(response2.values()), lang="en", verbose=True)

In [32]:
print(P.mean().item())
print(R.mean().item())
print(F1.mean().item())

0.869292676448822
0.857213020324707
0.8629080653190613


In [34]:
from nltk.translate.bleu_score import corpus_bleu
bleu_score = corpus_bleu(list(generated_response.values()), list(response2.values()))
bleu_score

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


1.1876857836124988e-231

In [None]:
!pip install rouge

In [38]:
from rouge import Rouge
rouge = Rouge()
scores = rouge.get_scores(list(generated_response.values()), list(response2.values()), avg=True)

In [39]:
scores

{'rouge-1': {'r': 0.212496510112691,
  'p': 0.2587420801793697,
  'f': 0.20327748922264302},
 'rouge-2': {'r': 0.015346934717142175,
  'p': 0.023703335199954014,
  'f': 0.0159509371007501},
 'rouge-l': {'r': 0.2081268174395909,
  'p': 0.2510940967382122,
  'f': 0.19811036254725495}}