<a href="https://colab.research.google.com/github/radwaahmed20112000/QA-Chatbot/blob/main/bert_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparing Dataset

### Imports


In [None]:
import numpy as np
import pandas as pd 
import os
import io
import gzip
from google.colab import drive
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive; `drive_root_path` below points inside this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the GPUs visible to Keras before any training is attempted.
# NOTE(review): `_get_available_gpus` is a private backend helper — confirm it exists in the installed Keras version.
from keras import backend as K
K._get_available_gpus()

['/device:GPU:0']

### Global Variables

In [None]:
# Folder on the mounted Drive that holds every dataset file read by the loaders below.
drive_root_path = '/content/drive/My Drive/Colab Notebooks/chatbot project/Chatbot/'
# Share of each dataset held out of training; data_split halves it into dev and test.
test_dev_ratio = 0.2
# Empty placeholders that load_split_chitchat_dataset() later replaces.
# Each name gets its own frame: a chained `a = b = c = DataFrame(...)` would bind
# all three names to one shared object, so mutating one would mutate them all.
chitchat_train_set, chitchat_dev_set, chitchat_test_set = (
    pd.DataFrame(columns=['question', 'answer']) for _ in range(3)
)
# Per-category splits filled by load_split_amazon_dataset(), one entry per category file.
categories_train_set, categories_dev_set, categories_test_set = [], [], []

### Dataset parsing


In [None]:
def parse(path):
  """Yield one evaluated record per line of a gzip-compressed file.

  Args:
    path: Location of the gzip file. Every line must be a Python expression
      (presumably a dict literal in the Amazon QA dumps — verify against the data).

  Yields:
    The object produced by evaluating each line, in file order.
  """
  # The context manager closes the archive once the generator is exhausted,
  # closed, or garbage-collected, instead of leaving the handle open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY(review): eval runs arbitrary code found in the file. Only use on
      # trusted dumps; ast.literal_eval is the safe choice if lines are pure literals.
      yield eval(l)

In [None]:
def getDF(path):
  """Collect every record produced by `parse(path)` into a DataFrame.

  Each record becomes one row; rows are labelled 0, 1, 2, ... in the order
  the records are yielded.
  """
  # enumerate supplies the running row label that keys the dict.
  records_by_row = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_row, orient='index')

## Data load and Split

### Split

In [None]:
def data_split(dataset):
  """Shuffle `dataset` and cut it into train, dev and test partitions.

  A fraction `test_dev_ratio` of the rows is held out of training; the
  held-out rows are then divided evenly between test and dev. All random
  steps use fixed seeds.

  Returns:
    Tuple `(train, dev, test)`.
  """
  shuffled = dataset.sample(frac=1, random_state=1).reset_index(drop=True)

  # First cut: training rows versus everything held out.
  train, held_out = train_test_split(shuffled, test_size=test_dev_ratio,
                                     random_state=0)

  # Second cut: the held-out rows are halved; the first part is test, the second dev.
  test, dev = train_test_split(held_out, test_size=0.5, random_state=0)

  return train, dev, test

### Load

In [None]:
def load_split_amazon_dataset():
  """Load the Amazon QA category files and store their train/dev/test splits.

  For every category file under `drive_root_path`, the "question" and
  "answer" columns are split with `data_split`. The three module-level lists
  `categories_train_set`, `categories_dev_set` and `categories_test_set` end
  up with one frame per category, in the order of the file list.

  Calling the function again replaces the previous contents instead of
  appending duplicates, so the cell can be re-run safely.
  """
  global categories_train_set, categories_dev_set, categories_test_set

  category_files = ['qa_Clothing_Shoes_and_Jewelry.json.gz',
                    'qa_Health_and_Personal_Care.json.gz',
                    'qa_Sports_and_Outdoors.json.gz']

  # Collect into locals first: if a file fails to load, the globals keep
  # their previous (consistent) contents.
  train_parts, dev_parts, test_parts = [], [], []

  for category_file in category_files:
    df = getDF(drive_root_path + category_file)
    train, dev, test = data_split(df[['question', 'answer']])

    train_parts.append(train)
    dev_parts.append(dev)
    test_parts.append(test)

  # Slice assignment swaps the contents in place, so any other reference to
  # these lists sees the refreshed splits too.
  categories_train_set[:] = train_parts
  categories_dev_set[:] = dev_parts
  categories_test_set[:] = test_parts

In [None]:
def load_split_chitchat_dataset():
  """Load the chitchat persona TSV files and store their train/dev/test split.

  Reads every persona file under `drive_root_path + 'chitchat/'`, keeps the
  "Question" and "Answer" columns (renamed to lower case), prints the total
  row count, and assigns the result of `data_split` to the module-level
  `chitchat_train_set`, `chitchat_dev_set` and `chitchat_test_set`.
  """
  global chitchat_train_set, chitchat_dev_set, chitchat_test_set
  files = ["English_Professional.tsv", "English_Friendly.tsv", "English_Witty.tsv", "English_Caring.tsv",   "English_Enthusiastic.tsv"]

  # Read all files, then concatenate once: growing a frame with concat inside
  # the loop re-copies the accumulated rows on every iteration, and starting
  # from an empty frame is unnecessary.
  frames = [pd.read_csv(drive_root_path + 'chitchat/' + file, sep='\t') for file in files]
  data = pd.concat(frames)[["Question", "Answer"]]

  print('length of dataset = ', len(data))
  # rename returns a new frame; no in-place mutation is needed.
  data = data.rename(columns={'Question': 'question', 'Answer': 'answer'})
  chitchat_train_set, chitchat_dev_set, chitchat_test_set = data_split(data)

### Shuffle Chitchat Dataset

In [None]:
def shuffle_dataset():
  """Reshuffle each chitchat split with a fixed seed and renumber its rows from 0."""
  global chitchat_train_set, chitchat_dev_set, chitchat_test_set

  def _reshuffled(frame):
    # Full-size sample == permutation of the rows; the old index is discarded.
    return frame.sample(frac=1, random_state=1).reset_index(drop=True)

  chitchat_train_set = _reshuffled(chitchat_train_set)
  chitchat_dev_set = _reshuffled(chitchat_dev_set)
  chitchat_test_set = _reshuffled(chitchat_test_set)

## Generate Data after processing


In [None]:
# Fill the chitchat train/dev/test globals from Drive, then reshuffle each split.
load_split_chitchat_dataset()
shuffle_dataset()

length of dataset =  48965


In [None]:
# Short aliases for the chitchat splits; the rest of the notebook works on these names.
train_data = chitchat_train_set
test_data = chitchat_test_set
val_data = chitchat_dev_set

In [None]:
# (rows, columns) of the validation split.
val_data.shape

(4897, 2)

# Preprocessing Data

### Data Status

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer 
# Word-level Keras tokenizer, used in this section only to measure sequence lengths.
keras_tokenizer = Tokenizer()
keras_tokenizer.fit_on_texts(train_data['question'])
tokenized_questions = keras_tokenizer.texts_to_sequences(train_data['question'])
# Longest training question, counted in Keras tokens.
maxlen_questions = max([len(x) for x in tokenized_questions])
print('questions max len = ', maxlen_questions)

questions max len =  15


In [None]:
# Same measurement for the answers.
# NOTE(review): this fits the tokenizer that was already fitted on the questions — presumably the vocabulary is extended rather than reset; verify.
keras_tokenizer.fit_on_texts(train_data['answer'])
tokenized_answers = keras_tokenizer.texts_to_sequences(train_data['answer'])
# Longest training answer, counted in Keras tokens.
maxlen_answers = max([len(x) for x in tokenized_answers])
print('answers max len = ', maxlen_answers)

answers max len =  26


### Prepare BERT Uncased Tokenizer

In [None]:
# Install Hugging Face Transformers into the runtime (no version is pinned here).
!pip install transformers
from transformers import BertTokenizerFast
import tensorflow as tf
# Tokenizer loaded from the "bert-base-uncased" checkpoint; applied to both questions and answers below.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 27.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.7 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 72.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Process Data

In [None]:
# Token budgets for padding/truncation. The Keras word counts measured above
# peak at 15 (questions) and 26 (answers) on the training split.
encoder_max_length = 16
decoder_max_length = 32

def process_data_to_model_inputs(data):
  """Tokenize question/answer text into the columns the encoder-decoder model consumes.

  Args:
    data: DataFrame with text columns "question" and "answer". It is left
      unmodified.

  Returns:
    A copy of `data` with five extra columns: "input_ids", "attention_mask"
    (questions, padded/truncated to `encoder_max_length`), "decoder_input_ids",
    "decoder_attention_mask" (answers, padded/truncated to
    `decoder_max_length`) and "labels" (answer token ids with padding
    replaced by -100).
  """
  # Work on a copy: writing columns into the caller's frame would also change
  # every alias of it (e.g. the chitchat_* globals) and makes the calling cell
  # fail on re-run once the text columns have been dropped.
  data = data.copy()

  # tokenize the inputs and labels
  inputs = tokenizer(data["question"].tolist(), padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(data["answer"].tolist(), padding="max_length", truncation=True, max_length=decoder_max_length)

  data["input_ids"] = inputs.input_ids
  data["attention_mask"] = inputs.attention_mask
  # NOTE(review): decoder_input_ids are the same, unshifted ids as the labels.
  # Recent transformers releases derive shifted decoder inputs from `labels`
  # when decoder_input_ids is omitted and compute the loss without a further
  # shift — confirm that passing this column does not let the decoder simply
  # copy its input.
  data["decoder_input_ids"] = outputs.input_ids
  data["decoder_attention_mask"] = outputs.attention_mask

  # We have to make sure that the PAD token is ignored: padding positions
  # become -100 in the labels.
  pad_id = tokenizer.pad_token_id
  data["labels"] = [[-100 if token == pad_id else token for token in ids] for ids in outputs.input_ids]

  return data

In [None]:
# Tokenize the training split and display the resulting frame.
train_data = process_data_to_model_inputs(train_data)
train_data

Unnamed: 0,question,answer,input_ids,attention_mask,decoder_input_ids,decoder_attention_mask,labels
0,Do I seem bad to you today?,"I don't have a way to know how you look, but I...","[101, 2079, 1045, 4025, 2919, 2000, 2017, 2651...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]","[101, 1045, 2123, 1005, 1056, 2031, 1037, 2126...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 2123, 1005, 1056, 2031, 1037, 2126..."
1,Are you ever hungry?,"I don't need to eat, but food does sound prett...","[101, 2024, 2017, 2412, 7501, 1029, 102, 0, 0,...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 1045, 2123, 1005, 1056, 2342, 2000, 4521...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 2123, 1005, 1056, 2342, 2000, 4521..."
2,Tell me what is up with you,"You know, same ol', same ol'.","[101, 2425, 2033, 2054, 2003, 2039, 2007, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]","[101, 2017, 2113, 1010, 2168, 19330, 1005, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[101, 2017, 2113, 1010, 2168, 19330, 1005, 101..."
3,What's happening with you?,"Oh, not much!","[101, 2054, 1005, 1055, 6230, 2007, 2017, 1029...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]","[101, 2821, 1010, 2025, 2172, 999, 102, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 2821, 1010, 2025, 2172, 999, 102, -100, ..."
4,do you like golden retrievers?,I like a lot of things.,"[101, 2079, 2017, 2066, 3585, 12850, 2869, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]","[101, 1045, 2066, 1037, 2843, 1997, 2477, 1012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[101, 1045, 2066, 1037, 2843, 1997, 2477, 1012..."
...,...,...,...,...,...,...,...
39167,Sometimes I feel bummed out,Sorry to hear that. Here's a virtual high five...,"[101, 2823, 1045, 2514, 26352, 7583, 2041, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[101, 3374, 2000, 2963, 2008, 1012, 2182, 1005...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 3374, 2000, 2963, 2008, 1012, 2182, 1005..."
39168,I didn't find you very funny,Sometimes humor is tricky for a bot.,"[101, 1045, 2134, 1005, 1056, 2424, 2017, 2200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]","[101, 2823, 8562, 2003, 24026, 2005, 1037, 285...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[101, 2823, 8562, 2003, 24026, 2005, 1037, 285..."
39169,I'm laughing so hard my sides hurt,You're laughing!,"[101, 1045, 1005, 1049, 5870, 2061, 2524, 2026...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]","[101, 2017, 1005, 2128, 5870, 999, 102, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 2017, 1005, 2128, 5870, 999, 102, -100, ..."
39170,What is your opinion of Eliza?,We're all here to help.,"[101, 2054, 2003, 2115, 5448, 1997, 13234, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]","[101, 2057, 1005, 2128, 2035, 2182, 2000, 2393...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[101, 2057, 1005, 2128, 2035, 2182, 2000, 2393..."


In [None]:
# Keep only the tokenized columns; the raw text columns are removed.
train_data = train_data.drop(['question', 'answer'], axis=1)

In [None]:
# Tokenize the validation split the same way as the training split.
val_data = process_data_to_model_inputs(val_data)

In [None]:
# Keep only the tokenized columns; the raw text columns are removed.
val_data = val_data.drop(['question', 'answer'], axis=1)

# BERT Model

## Configure Model

In [None]:
# Installs transformers again (an earlier cell already does this when the notebook runs top to bottom).
!pip install transformers
from transformers import EncoderDecoderModel
# Assemble a seq2seq model whose encoder and decoder are both loaded from "bert-base-uncased".
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
bert2bert

In [None]:
# Write the assembled model to the local "bert2bert" directory.
bert2bert.save_pretrained("bert2bert")

In [None]:
# Reload the model from the local "bert2bert" directory.
bert2bert = EncoderDecoderModel.from_pretrained("bert2bert")

In [None]:
# Special-token wiring taken from the tokenizer: the decoder starts from the CLS id,
# the SEP id marks end-of-sequence, and the PAD id is used for padding.
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.eos_token_id = tokenizer.sep_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id
# Copy the encoder's vocabulary size onto the top-level config.
bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size

In [None]:
# Generation (beam search) settings; they shape the text that ROUGE/BLEU score later.
# Targets are tokenized to at most `decoder_max_length` tokens, so generation is
# capped at the same length instead of a summarization-sized 142.
bert2bert.config.max_length = decoder_max_length
# No forced minimum: a floor of 56 tokens exceeds the longest training target
# and would force every reply to continue past its natural end.
bert2bert.config.min_length = 0
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
# Metric back-ends: rouge_score is needed by the ROUGE metric loaded below (versions are not pinned).
!pip install rouge_score
!pip install sacrebleu

In [None]:
# Install Hugging Face Datasets (version not pinned) and load its ROUGE metric.
!pip install datasets
import datasets
# Metric object used by compute_metrics and by the test-set evaluation further down.
rouge = datasets.load_metric("rouge")

In [None]:
def compute_metrics(pred):
    """Compute ROUGE-2 precision/recall/F-measure for one evaluation pass.

    Args:
        pred: Prediction object exposing `predictions` (generated token ids)
            and `label_ids` (reference token ids, with -100 at ignored
            positions). Neither array is modified.

    Returns:
        Dict with keys "rouge2_precision", "rouge2_recall" and
        "rouge2_fmeasure", each rounded to 4 decimals (the metric's `mid`
        aggregate).
    """
    pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)

    # -100 is not a valid token id, so map it back to PAD before decoding.
    # np.where builds a new array instead of overwriting the caller's label_ids.
    labels_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

## Training

### Convert DataFrames to Datasets

In [None]:
from datasets import Dataset
# Wrap the tokenized pandas frames as Hugging Face datasets.
dataset_train = Dataset.from_pandas(train_data)
dataset_val = Dataset.from_pandas(val_data)
# Expose the five model-input columns as torch tensors on both datasets.
dataset_val.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
dataset_train.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

In [None]:
# Per-device batch size, used for both training and evaluation.
batch_size = 64
# Only referenced by the commented-out resume_from_checkpoint argument below.
checkpoint_name = 'checkpoint-1500'
# Drive folder passed to the trainer as its output directory.
path = '/content/drive/My Drive/Colab Notebooks/chatbot project/BertModel/checkpoints/model-2/'
# NOTE(review): learning_rate=2e-3 is far above the ~1e-5 to 1e-4 range commonly
# used when fine-tuning BERT checkpoints — confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir=path,
    evaluation_strategy = "epoch",
    learning_rate=2e-3,
    num_train_epochs=10,
    logging_strategy= "epoch"
    #resume_from_checkpoint = path + checkpoint_name
    # logging_steps=1000,
    # save_steps=500,
    # eval_steps=7500,
    # warmup_steps=2000,
    # save_total_limit=3,
)

### Training Step

In [None]:
# instantiate trainer
# Bundle the model, training arguments, metric callback and the train/validation datasets.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)
# Start fine-tuning.
trainer.train()

## Evaluation

In [None]:
# Repeats the installs and imports from earlier cells, then rebinds `tokenizer` and `bert2bert`.
!pip install datasets
!pip install transformers
import tensorflow as tf
from transformers import BertTokenizerFast
from transformers import EncoderDecoderModel
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
# Checkpoint directory inside the trainer's output folder on Drive.
save_path = '/content/drive/My Drive/Colab Notebooks/chatbot project/BertModel/checkpoints/model-2/checkpoint-1500'
# Load the fine-tuned weights and move the model to the GPU.
bert2bert = EncoderDecoderModel.from_pretrained(save_path).to('cuda')

### ROUGE Testing

In [None]:
def generate_answer(batch, max_length=16):
    """Generate an answer for every question in `batch` and attach it to the batch.

    Args:
        batch: Mapping-like batch whose "question" entry is a list of strings.
        max_length: Token length questions are padded/truncated to. Defaults
            to 16, the encoder length used when the training data was
            tokenized and by `answer`.

    Returns:
        The same `batch` with an added "pred_answer" entry holding the
        decoded generations, one string per question.
    """
    # Padding to the training-time length instead of 512 keeps the tensors small;
    # padded positions are masked out via attention_mask either way.
    inputs = tokenizer(batch["question"], padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

    # Follow whatever device the model lives on rather than assuming 'cuda'.
    device = bert2bert.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = bert2bert.generate(input_ids, attention_mask=attention_mask)

    batch["pred_answer"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return batch

In [None]:
# Convert the test DataFrame (still holding the raw "question"/"answer" text) to a Hugging Face dataset.
from datasets import Dataset
dataset_test = Dataset.from_pandas(test_data)

In [None]:
# NOTE(review): rebinds the `batch_size` name that the training-arguments cell set to 64.
batch_size = 16
# Run generation over the test set in batches; the "question" column is dropped from the result.
results = dataset_test.map(generate_answer, batched=True, batch_size=batch_size, remove_columns=["question"])

In [None]:
# ROUGE-2 (mid aggregate) of the generated answers against the reference answers.
# The references live in the "answer" column of the test set; this dataset has
# no "highlights" column.
rouge.compute(predictions=results["pred_answer"], references=results["answer"], rouge_types=["rouge2"])["rouge2"].mid

### Chatbot Answers

In [None]:
def answer(batch):
    """Generate a reply for every question in `batch`.

    Args:
        batch: Mapping-like object whose "question" entry is a list of strings.

    Returns:
        List of decoded reply strings, one per question, with special tokens
        removed.
    """
    # Pad/truncate to 16 tokens — the encoder length the training data was tokenized with.
    inputs = tokenizer(batch["question"], padding="max_length", truncation=True, max_length=16, return_tensors="pt")

    # Follow whatever device the model lives on rather than assuming 'cuda'.
    device = bert2bert.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = bert2bert.generate(input_ids, attention_mask=attention_mask)

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
def bleu_score(y_true, y_pred):
  """Smoothed sentence-level BLEU of a prediction against one or more references.

  Args:
    y_true: A reference or a list of references. Each reference is either a
      plain string (split on whitespace here) or an already tokenized list.
    y_pred: The prediction, as a plain string or a token list.

  Returns:
    The BLEU score computed by `sentence_bleu` with `method1` smoothing.
  """
  def _tokens(text):
    # sentence_bleu compares sequences element by element; a raw string would
    # be scored character by character instead of word by word.
    return text.split() if isinstance(text, str) else text

  # A lone reference string is treated as a one-element reference list.
  if isinstance(y_true, str):
    y_true = [y_true]

  references = [_tokens(reference) for reference in y_true]
  return sentence_bleu(references, _tokens(y_pred), smoothing_function=SmoothingFunction().method1)

In [None]:
# Sample questions for a manual smoke test of the chatbot.
arr = ['how are you', 'nice to meet you', 'tell me a joke', 'are you happy today']

In [None]:
# Ask the model the first sample question and score its reply against hand-written references.
data = [arr[0]]
df = pd.DataFrame(data, columns=['question'])
df = Dataset.from_pandas(df)
# NOTE(review): the name `re` is also the name of the stdlib regex module; later cells that import it would clash.
re = answer(df)
# `answer` returns one string per question; take the only one.
res = re[0]
print(res)
print('bleu score = ', bleu_score(['i am fine', 'i am good', 'i am great thanks'], res ))