In [166]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator, pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

## Function for computing document similarity score using embeddings. This has been used to do retrieval from the Knowledgbase.

In [167]:
embedding_model_name = "distilbert-base-uncased"
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

embedding_tokenizer.pad_token = embedding_tokenizer.eos_token



def compare_documents(doc1, doc2):

    # Tokenize and encode the documents
    encoding1 = embedding_tokenizer(doc1, return_tensors='pt', truncation=True,max_length=200)
    encoding2 = embedding_tokenizer(doc2, return_tensors='pt', truncation=True,max_length=200)

    # Compute model scores
    with torch.no_grad():
        outputs1 = embedding_model(**encoding1)
        outputs2 = embedding_model(**encoding2)

    embeddings1 = outputs1.last_hidden_state.mean(dim=1)
    embeddings2 = outputs2.last_hidden_state.mean(dim=1)

    # Compare the scores
    similarity_score = cosine_similarity(embeddings1, embeddings2)[0][0]

    return similarity_score


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Training dataset for finetuning small LLM for QnA task.

In [3]:
def _load_dataset_from_hf_for_training(name_of_dataset: str = 'squad',
                                       initial_split: str = "train[0:2000]",
                                       test_size_for_train_test_split: float = 0.25):

  subset_qna_dataset = load_dataset("squad", split=initial_split)
  subset_qna_dataset_train_test_split = subset_qna_dataset.train_test_split(test_size=0.25)

  return subset_qna_dataset_train_test_split

In [4]:
def _get_model_and_initialize(model_name: str = 'gpt2'):
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)

  return tokenizer, model

# This is dataset preprocessing function for model training for QnA task.

In [5]:
model_name = "google-t5/t5-base"
def _preprocess_function(examples):
    '''
    This preprocessing function is heavily based out of the hugging face example
    '''
    tokenizer, _ = _get_model_and_initialize(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    prefix = "Answer this question based on the context: "
    questions = [prefix + q.strip() for q in examples["question"]]
    inputs = tokenizer(questions, examples["context"], max_length=512, truncation="only_second", return_offsets_mapping=True, padding="max_length")
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
from google.colab import drive
import os
drive.mount('/content/drive',force_remount = True)

Mounted at /content/drive


# Finetuning the Model based on the dataset defined above.

In [7]:
def finetune_model_and_save(model_name: str = 'gpt2',
                   directory_for_model_storing:str = 'qna_gpt_bot'):
  subset_qna_dataset_train_test_split = _load_dataset_from_hf_for_training()

  tokenizer, model = _get_model_and_initialize(model_name)
  tokenizer.pad_token = tokenizer.eos_token

  tokenized_datasets = subset_qna_dataset_train_test_split.map(_preprocess_function, batched=True)

  training_args = TrainingArguments(
    output_dir = directory_for_model_storing,
    logging_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=0.0001,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy = "epoch",
    load_best_model_at_end = True,
)


  data_collator = DefaultDataCollator()
  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer.train()

In [8]:
finetune_model_and_save(model_name = 'gpt2',
                   directory_for_model_storing = 'qna_gpt_bot')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,3.2757,2.14118
2,1.6749,2.055543
3,0.9427,2.276876


# Function to get LLM response given a context and a question.

In [None]:
def get_answer(context, question,model_path:str):
  qna = pipeline(task="question-answering", model=model_path, verbose=False)
  prefix = "Answer this question based on the context: "
  return qna(prefix + question, context)['answer']

In [8]:
finetune_model_and_save(model_name = "google-t5/t5-base",
                   directory_for_model_storing = 'qna_t5_bot')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google-t5/t5-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google-t5/t5-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google-t5/t5-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google-t5/t5-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,3.2634,1.182161
2,1.0895,0.972934
3,0.6965,1.038409


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


# Sample Knowledgebase for Retrieval.

In [170]:
knowledgebase = [
    "The Pacific Ocean is the largest ocean on Earth, covering more than 63 million square miles.",
    "The light bulb was invented by Thomas Edison in 1879.",
    "Sharks have been around for over 400 million years, predating the dinosaurs.",
    "The Eiffel Tower in Paris was completed in 1889 and is 1,083 feet tall.",
    "Brazil is the fifth largest country in the world by both area and population.",
    "The human skeleton consists of 206 bones.",
    "Venus is the hottest planet in our solar system with surface temperatures around 900 degrees Fahrenheit.",
    "The Great Wall of China stretches over 13,000 miles and was primarily built using stone, brick, and wood.",
    "Coffee is the second most traded commodity in the world, after oil.",
    "The longest river in the world is the Nile River, which runs for 4,135 miles.",
    "Antarctica is the coldest continent on Earth, with temperatures as low as minus 128.6 degrees Fahrenheit.",
    "The theory of relativity was developed by Albert Einstein in the early 20th century.",
    "Australia is the only country that is also a continent.",
    "Honey never spoils and has been found edible in ancient Egyptian tombs after thousands of years.",
    "The loudest animal relative to its size is the water boatman, which can produce sounds up to 99.2 decibels.",
    "The human brain uses approximately 20% of the body’s energy despite making up only 2% of its weight.",
    "Mount Everest is the highest point on Earth, standing at 29,029 feet above sea level.",
    "Octopuses have three hearts; two pump blood to the gills, while the third pumps it to the rest of the body.",
    "A day on Venus is longer than a year on Venus; it takes 243 Earth days to rotate once"
]

# Basic version of Retriever using vector search.

In [185]:
def get_context_from_kb(question:str) -> str:
  context = ""

  similarity_score = 0
  for i in knowledgebase:
    sim_score = compare_documents(question, i)
    if sim_score > similarity_score:
      context = i
      similarity_score = sim_score

  return context, similarity_score

# Example 1

In [188]:
q = 'What is the length of the Great Wall of China'
context, sim_score = get_context_from_kb(q)
print(q,context,sim_score,sep='\n')

What is the length of the Great Wall of China
The Great Wall of China stretches over 13,000 miles and was primarily built using stone, brick, and wood.
0.8406506


In [189]:
print(get_answer(context, q,model_path='/content/qna_t5_bot/checkpoint-564'))

13,000 miles


In [190]:
print(get_answer(context, q,model_path='/content/qna_gpt_bot/checkpoint-564'))

 13,000 miles


# Example 2

In [191]:
q = 'Who invented light bulb'
context, sim_score = get_context_from_kb(q)
print(q,context,sim_score,sep='\n')

Who invented light bulb
The light bulb was invented by Thomas Edison in 1879.
0.82907844


In [192]:
print(get_answer(context, q,model_path='/content/qna_t5_bot/checkpoint-564'))

Thomas Edison


In [193]:
print(get_answer(context, q,model_path='/content/qna_gpt_bot/checkpoint-564'))

 Thomas Edison


# Example 3

In [194]:
q = 'What year was the light bulb invented'
context, sim_score = get_context_from_kb(q)
print(q,context,sim_score,sep='\n')

What year was the light bulb invented
The light bulb was invented by Thomas Edison in 1879.
0.73382187


In [195]:
print(get_answer(context, q,model_path='/content/qna_t5_bot/checkpoint-564'))

1879.


In [196]:
print(get_answer(context, q,model_path='/content/qna_gpt_bot/checkpoint-564'))

 1879
