# QAMaster: Fine-tuned Q/A Chat Box
- Version 2
- Summer 2024

This project creates a Q/A chat box using multiple pre-trained models. It fine-tunes these models on a Q/A dataset and evaluates them to choose the best one based on F1-score and Exact Match.

In [27]:
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

## Intro
- Loading models:
      - Tuner 007
      - GPT2
- Generating sample data

In [16]:
models_to_finetune = [
    "tuner007/pegasus_paraphrase",  # Scientific Research
    "gpt2",                         # GPT-2
]

In [17]:
tokenizer_models = {}
for model_name in models_to_finetune:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, config=config)
    
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
        model.resize_token_embeddings(len(tokenizer))
    
    tokenizer_models[model_name] = (tokenizer, model)

Some weights of PegasusForCausalLM were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
data = [
    {"input": "Hello, how can I assist you today?", "target": "Hi, what can I help you with today?"},
    {"input": "What is the weather like?", "target": "Can you tell me what the weather is like?"},
    {"input": "Tell me a joke.", "target": "Do you know any good jokes to tell me?"},
    {"input": "Goodbye!", "target": "See you later! Have a great day!"},
    {"input": "Can you help me with my homework?", "target": "Sure, I'd be happy to help with your homework!"},
    {"input": "What's the capital of France?", "target": "The capital of France is Paris."},
    {"input": "How do I bake a cake?", "target": "To bake a cake, you need flour, sugar, eggs, and butter. Preheat your oven to 350 degrees."},
    {"input": "What's your name?", "target": "I am a chatbot created to assist you."},
    {"input": "Can you play music?", "target": "I can recommend some good music for you."},
    {"input": "What time is it?", "target": "I don't have access to real-time data, but you can check the time on your device."},
    {"input": "How can I help you?", "target": "What do you need help with today?"},
    {"input": "What's the weather today?", "target": "Can you tell me about today's weather?"},
    {"input": "Make me laugh.", "target": "Do you have any good jokes?"},
    {"input": "Bye!", "target": "Goodbye! Have a great day!"},
    {"input": "Can you assist me with my homework?", "target": "Yes, I can help you with your homework."},
]

## PreProcessing
- Split Dataset into Train and Test Sets
- Tokenizing the dataset

In [18]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [19]:
def tokenize_data(data, tokenizer):
    inputs = [example['input'] for example in data]
    targets = [example['target'] for example in data]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding=True)
    model_inputs['labels'] = labels['input_ids']
    model_inputs['attention_mask'] = model_inputs['attention_mask']
    return Dataset.from_dict(model_inputs)

In [20]:
train_dataset = tokenize_data(train_data, tokenizer_models['gpt2'][0])
test_dataset = tokenize_data(test_data, tokenizer_models['gpt2'][0])

datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})



## Fine-tuning
- Fine tuning funcion defined
- Fine tuned

In [21]:
def fine_tune_model(tokenizer, model, train_dataset):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=2,
        save_steps=10_000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()
    return model

In [22]:
for model_name, (tokenizer, model) in tokenizer_models.items():
    fine_tuned_model = fine_tune_model(tokenizer, model, datasets['train'])
    tokenizer_models[model_name] = (tokenizer, fine_tuned_model)

Step,Training Loss


Step,Training Loss


## Evaluation
- F1-score
- BLEU

In [23]:
def evaluate_model(tokenizer, model, test_data):
    predictions = []
    references = []
    for example in test_data:
        inputs = tokenizer.encode(example["input"], return_tensors="pt")
        attention_mask = tokenizer.batch_encode_plus([example["input"]], padding=True, return_tensors="pt")["attention_mask"]
        outputs = model.generate(inputs, max_length=50, attention_mask=attention_mask, pad_token_id=tokenizer.pad_token_id)
        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(decoded_output)
        references.append(example["target"])
    
    f1 = f1_score(references, predictions, average='weighted')
    return f1

In [24]:
results = {}
for model_name, (tokenizer, model) in tokenizer_models.items():
    f1 = evaluate_model(tokenizer, model, test_data)
    results[model_name] = f1

print("F1 Scores:", results)

F1 Scores: {'tuner007/pegasus_paraphrase': 0.0, 'gpt2': 0.0}


In [25]:
def calculate_bleu(predictions, references):
    scores = []
    smoothing_function = SmoothingFunction().method4
    for pred, ref in zip(predictions, references):
        scores.append(sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing_function))
    return sum(scores) / len(scores)

def evaluate_model_with_bleu(tokenizer, model, test_data):
    predictions = []
    references = []
    for example in test_data:
        inputs = tokenizer.encode(example["input"], return_tensors="pt")
        attention_mask = tokenizer.batch_encode_plus([example["input"]], padding=True, return_tensors="pt")["attention_mask"]
        outputs = model.generate(inputs, max_length=50, attention_mask=attention_mask, pad_token_id=tokenizer.pad_token_id)
        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(decoded_output)
        references.append(example["target"])
    
    bleu = calculate_bleu(predictions, references)
    return bleu

In [26]:
bleu_results = {}
for model_name, (tokenizer, model) in tokenizer_models.items():
    bleu = evaluate_model_with_bleu(tokenizer, model, test_data)
    bleu_results[model_name] = bleu

print("BLEU Scores:", bleu_results)

BLEU Scores: {'tuner007/pegasus_paraphrase': 0.027215891051044813, 'gpt2': 0.017204779418087093}


## Chat box
- Takes a question as input and answers with different models

In [28]:
def answer_question(question, tokenizer, model):
    inputs = tokenizer.encode(question, return_tensors="pt")
    attention_mask = tokenizer.batch_encode_plus([question], padding=True, return_tensors="pt")["attention_mask"]
    outputs = model.generate(inputs, max_length=50, attention_mask=attention_mask, pad_token_id=tokenizer.pad_token_id)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [34]:
question = input('Ask: ')
for model_name, (tokenizer, model) in tokenizer_models.items():
    answer = answer_question(question, tokenizer, model)
    print(f"Answer from {model_name}: {answer}")

Ask: Hi
Answer from tuner007/pegasus_paraphrase: Hi,,.......................................................................................................................................
Answer from gpt2: Hi, I'm not going to tell you how to do this. I'm not going to tell you how to do this. I'm not going to tell you how to do this. I'm not going to tell you how to do this.
