In [None]:
from google.colab import drive

# Attach the user's Google Drive to the Colab VM; the fine-tuned checkpoint
# and the validation workbook read further down both live under this path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os

# Restrict CUDA to the first device and enumerate devices by PCI bus id.
# Both variables are only consulted when CUDA is initialised, so this cell
# must run before anything touches the GPU.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    'CUDA_VISIBLE_DEVICES': '0',
})

## Installing Necessary Libraries and Modules

In [None]:
! pip -q install transformers
! pip -q install evaluate
! pip -q install bert_score
! pip -q install rouge_score
! pip -q install sacrebleu

[K     |████████████████████████████████| 5.5 MB 4.2 MB/s 
[K     |████████████████████████████████| 163 kB 65.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 36.5 MB/s 
[K     |████████████████████████████████| 72 kB 1.0 MB/s 
[K     |████████████████████████████████| 441 kB 10.2 MB/s 
[K     |████████████████████████████████| 212 kB 57.2 MB/s 
[K     |████████████████████████████████| 115 kB 64.8 MB/s 
[K     |████████████████████████████████| 95 kB 4.5 MB/s 
[K     |████████████████████████████████| 127 kB 33.9 MB/s 
[K     |████████████████████████████████| 115 kB 38.5 MB/s 
[K     |████████████████████████████████| 60 kB 3.3 MB/s 
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 118 kB 4.8 MB/s 
[?25h

## Import Libraries

In [None]:
import warnings
# Blanket-suppress everything raised through the `warnings` module for the rest
# of the session to keep cell output short.
# NOTE(review): this also hides deprecation and numerical warnings from every
# library below; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Model and tokenizer classes for the DialoGPT causal language model.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import numpy as np
# Hugging Face `evaluate` provides the BERTScore, perplexity, ROUGE, METEOR and
# TER metrics loaded in the evaluation cells.
import evaluate

## Model Loading

In [None]:
# DialoGPT variant used for both the tokenizer id and the checkpoint folder name.
model_size = "small"

# The tokenizer comes from the public base model on the Hugging Face hub, while
# the weights come from the fine-tuned checkpoint saved on Google Drive.
base_tokenizer_id = f'microsoft/DialoGPT-{model_size}'
finetuned_checkpoint_dir = f'/content/drive/MyDrive/ChatBotProject/output/output-{model_size}'

tokenizer = AutoTokenizer.from_pretrained(base_tokenizer_id)
model = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint_dir)

## Loading validation data

In [None]:
# Workbook holding the customer questions and the reference chatbot responses
# that the generated replies are scored against.
validation_workbook_path = "/content/drive/MyDrive/ChatBotProject/input/validation_data.xlsx"
val_data = pd.read_excel(validation_workbook_path)

# Preview the first rows as the cell output.
val_data.head()

Unnamed: 0,Questions By Customer,Response By Chatbot
0,I would like to book a table for five people,Sure! I can process your request. Let me check...
1,Is it possible to cancel my reservation,"Sure, I can cancel your reservation."
2,who am i speaking with?,"Hello, I am Adam and i am here to assist you t..."
3,Hey are u the new chatbot?,I am Adam chatbot
4,Adam how are you feeling?,i am great what about you


In [None]:
# Pull the prompt column and the reference-answer column out of the frame as
# plain arrays; row i of one corresponds to row i of the other.
question_column = val_data['Questions By Customer']
answer_column = val_data['Response By Chatbot']

validation_questions = question_column.values
true_answers = answer_column.values

In [None]:
# Show only a short preview: dumping the whole array floods the notebook output
# and adds nothing the first few rows do not already show.
validation_questions[:5]

['I would like to book a table for five people'
 'Is it possible to cancel my reservation' 'who am i speaking with?'
 'Hey are u the new chatbot?' 'Adam how are you feeling?'
 'How many people can fit in the restuarant'
 'I would like to hold a party event' 'Yo Adam watsapp'
 'I loved your restuarant food' 'Thank You Adam'
 'Does Robusta beans contain caffeine and how do they taste'
 'I want to drink a cup of tea, what are the available options'
 'I want to drink a cup of coffee, what are the available options'
 'What are the benifits of drinking a coffee, does it contain any nutrients '
 'What process is followed to make a tea bag'
 'Can you explain what does gluten-free means'
 'can you give me the options of milk used for tea and coffee'
 'Are noodles healthy to eat at your restaurant'
 'Do you take care of temperature while brewing a coffee'
 'Can I choose the flour type used to prepare noodles'
 'what receipe do you follow to prepare noodles'
 'what at are hot drinks at your resta

In [None]:
# Show only a short preview of the reference answers; some of them are
# paragraph-length, so printing the full array buries the rest of the notebook.
true_answers[:5]

['Sure! I can process your request. Let me check the availability'
 'Sure, I can cancel your reservation.'
 'Hello, I am Adam and i am here to assist you today' 'I am Adam chatbot'
 'i am great what about you' 'You want to book restaurant for party event'
 'Could you please tell me how many people are accompanying you'
 'Hi! How may I assist you today?'
 'Thanks for the positive feedback. Hope you liked our restaurant'
 "It's been my pleasure serving you!"
 'Robusta beans are typically cheaper to produce because the Robusta plant is easier to grow. They have a higher caffeine content and taste more bitter than Arabica beans. These beans are often used to make instant coffee and espresso blends.'
 'Sure! We have many varieties of Tea like Green Tea,Lemon Tea and Ginger Tea. May I know which one you prefer ?'
 'Sure! what type of coffee do you prefer ? We have Espresso, Mocha Latte and Macchiato'
 'Coffee contains caffeine, the most commonly consumed stimulant in the world and is widely 

## Model prediction

In [None]:
# Generation below samples (do_sample=True), so it is stochastic. Seed torch so
# that a fresh Restart & Run All reproduces the same replies and, in turn, the
# same metric values in the evaluation cells.
SEED = 42
torch.manual_seed(SEED)

predicted_answers = []
for question in validation_questions:
    # Each user turn is terminated with EOS before it is handed to the model.
    new_user_input_ids = tokenizer.encode(question + tokenizer.eos_token, return_tensors='pt')

    # Single-turn evaluation: there is no chat history to prepend, so the model
    # input is just the encoded question, placed on the model's device.
    bot_input_ids = new_user_input_ids.to(model.device)

    # NOTE(review): the "right-padding was detected" message this cell emits is
    # presumably triggered because the prompt ends in EOS, which is also passed
    # as the pad id; nothing is actually padded here - confirm before changing
    # the tokenizer's padding side.
    chat_history_ids = model.generate(
        bot_input_ids,
        # pad id == eos id, so padding cannot be told apart from the EOS
        # terminator by id alone; state explicitly that every position is real.
        attention_mask=torch.ones_like(bot_input_ids),
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8,
    )

    # generate() returns prompt + continuation; keep only the new tokens.
    reply_ids = chat_history_ids[0, bot_input_ids.shape[-1]:]
    predicted_answers.append(tokenizer.decode(reply_ids, skip_special_tokens=True))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

In [None]:
# Pair every reference answer with the reply generated for the same question.
result = pd.DataFrame(data={"True Answers": true_answers, "Predicted Answers": predicted_answers})

# Shuffle whole rows (pairs stay aligned) so the preview is not just the first
# few rows of the workbook. A fixed random_state keeps the displayed sample and
# the downstream array order identical across runs.
result = result.sample(frac=1, random_state=42).reset_index(drop=True)
result.head()

Unnamed: 0,True Answers,Predicted Answers
0,Could you please tell me how many people are a...,Yes I am here to help you. what's your name?
1,Thanks for the order! Your Pasta noodles will ...,"Yes,A vegan diet is based on plants (such as v..."
2,Sure! what type of coffee do you prefer ? We h...,Sure! what type of coffee do you prefer? We ha...
3,"Hello, I am Adam and i am here to assist you t...",I am Adam chatbot
4,we use in-house recipe for all the items and m...,We follow different temperatures for different...


In [None]:
# Unpack the two columns of the shuffled result frame into parallel arrays:
# references first, model replies second.
val_data_ans, predicted_sents = (
    result[column_name].values
    for column_name in ('True Answers', 'Predicted Answers')
)

### BLEU SCORE

In [None]:
import nltk

# Sentence-level BLEU, averaged over all reference/reply pairs.
#
# sentence_bleu expects *token lists*. Passing raw strings makes it iterate
# over characters and score character n-grams, which inflates the result and
# is not comparable to standard word-level BLEU. Tokenise on whitespace first.
#
# NOTE(review): many replies are only a few words long, so unsmoothed 4-gram
# BLEU will be ~0 for pairs without a shared 4-gram; consider passing
# nltk.translate.bleu_score.SmoothingFunction().method1 if that is too harsh.
BLEU_scores = []
for reference, hypothesis in zip(val_data_ans, predicted_sents):
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    B_score = nltk.translate.bleu_score.sentence_bleu([reference_tokens], hypothesis_tokens)
    BLEU_scores.append(B_score)
print("BLEU score is {}".format(np.mean(BLEU_scores)))

BLEU score is 0.4884521411436808


### BERTScore (Precision, Recall and F1)

In [None]:
from evaluate import load

# BERTScore between generated replies and reference answers.
# `predictions` / `references` are defined here and reused by the ROUGE,
# METEOR and TER cells further down.
predictions = predicted_sents
references = val_data_ans

bertscore = load("bertscore")
results = bertscore.compute(predictions=predictions, references=references, lang="en")

# compute() returns one value per pair for each key; report the mean of each.
report_templates = {
    'precision': "Precision score - {}",
    'recall': "Recall score - {}",
    'f1': "F1 score - {}",
}
for metric_key, template in report_templates.items():
    print(template.format(np.mean(results[metric_key])))

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Precision score - 0.9139922010271173
Recall score - 0.9112933576107025
F1 score - 0.9125735195059526


### Perplexity

In [None]:
# Perplexity of the generated replies as scored by the stock `gpt2` model
# (model_id below), not by the fine-tuned chatbot itself.
perplexity = evaluate.load("perplexity", module_type="metric")
results_pxty = perplexity.compute(model_id='gpt2', predictions=predicted_sents)

mean_perplexity = results_pxty['mean_perplexity']
print("Perplexity value : {}".format(mean_perplexity))

Downloading builder script:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


  0%|          | 0/3 [00:00<?, ?it/s]

Perplexity value : 134.49190184944555


### ROUGE

In [None]:
# ROUGE between generated replies and references; only the ROUGE-1 entry of
# the returned dict is reported.
rouge = evaluate.load('rouge')
results = rouge.compute(references=references, predictions=predictions)

rouge1_value = results['rouge1']
print("Rouge value : {}".format(rouge1_value))

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Rouge value : 0.4999208605220576


### METEOR

In [None]:
# METEOR between generated replies and references. Loading the metric
# downloads the NLTK resources it needs (see the cell output).
meteor = evaluate.load('meteor')
results = meteor.compute(references=references, predictions=predictions)

meteor_value = results['meteor']
print("METEOR score is {}".format(meteor_value))

Downloading builder script:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


METEOR score is 0.4778000300402429


### TER

In [None]:
# Translation Edit Rate between generated replies and references, computed
# case-sensitively. Unlike the scores above, TER is an error rate, so lower
# values are better.
ter = evaluate.load("ter")
results = ter.compute(
    predictions=predictions,
    references=references,
    case_sensitive=True,
)

ter_value = results['score']
print("TER score : {}".format(ter_value))

Downloading builder script:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

TER score : 71.10778443113772
