# Q/A GPT2 

June 13, 2024

## Intro

Importing libraries

In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

2024-06-13 17:59:34.994071: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading pre-trained GPT2 model and its tokenizer

In [2]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [3]:
tokenizer.pad_token = tokenizer.eos_token

## Preprocessing

Preprocess function to create input-output pairs 

In [4]:
def preprocess_data(data):
    examples = []
    for item in data:
        question = item["question"]
        answer = item["answer"]
        prompt = f"Q: {question}\nA: {answer}\n"
        examples.append(prompt)
    return examples

fine-tuning data

In [5]:
data = [
    {"question": "What is the capital of France?", "answer": "The capital of France is Paris."},
    {"question": "Who wrote 'Pride and Prejudice'?", "answer": "Jane Austen wrote 'Pride and Prejudice'."},
    {"question": "What is the tallest mountain in the world?", "answer": "The tallest mountain in the world is Mount Everest."}
]

In [6]:
processed_data = preprocess_data(data)
processed_data

['Q: What is the capital of France?\nA: The capital of France is Paris.\n',
 "Q: Who wrote 'Pride and Prejudice'?\nA: Jane Austen wrote 'Pride and Prejudice'.\n",
 'Q: What is the tallest mountain in the world?\nA: The tallest mountain in the world is Mount Everest.\n']

Tokenizer

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

## Processing

Create a dataset object

In [8]:
dataset = Dataset.from_dict({"text": processed_data})

Tokenize the data

In [9]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Define data collator for language modeling

In [10]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Function to generate a response from the model

In [11]:
def generate_response(model, tokenizer, question):
    inputs = tokenizer.encode(f"Q: {question}\nA:", return_tensors="pt", padding=True)
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)
    outputs = model.generate(
        inputs, 
        max_length=50, 
        num_return_sequences=1, 
        attention_mask=attention_mask, 
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("\nA:")[1].split("\nQ:")[0].strip()
    return response

### Testing the model before fine-tuning

In [12]:
questions = [
    "What is the capital of France?",
    "Who wrote 'Pride and Prejudice'?",
    "What is the tallest mountain in the world?"
]

In [13]:
print("Responses before fine-tuning:")
for question in questions:
    response = generate_response(model, tokenizer, question)
    print(f"Question: {question}\nResponse: {response}\n")

Responses before fine-tuning:
Question: What is the capital of France?
Response: The capital of France is Paris.

Question: Who wrote 'Pride and Prejudice'?
Response: I think it was the author of the book, and I think it was the author of the book. I think it was the author of the book. I think it was

Question: What is the tallest mountain in the world?
Response: The tallest mountain in the world is the Himalayas. It is the tallest mountain in the world. It is the tallest mountain in the world. It is the tallest mountain in



## Fine-Tuning

**Set up training arguments**

In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

Initialize Trainer

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

In [16]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=6, training_loss=1.4621446927388508, metrics={'train_runtime': 33.0695, 'train_samples_per_second': 0.272, 'train_steps_per_second': 0.181, 'total_flos': 2351628288000.0, 'train_loss': 1.4621446927388508, 'epoch': 3.0})

In [17]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

### Testing the model after fine-tuning

Load the fine-tuned model and tokenizer

In [18]:
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_model")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_model")

In [19]:
print("Responses before fine-tuning:")
for question in questions:
    response = generate_response(model, tokenizer, question)
    print(f"Question: {question}\nResponse: {response}\n")

Responses before fine-tuning:
Question: What is the capital of France?
Response: The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of

Question: Who wrote 'Pride and Prejudice'?
Response: I wrote 'Pride and Prejudice' in the first place.

Question: What is the tallest mountain in the world?
Response: The tallest mountain in the world is Mount Everest.



# Part 2 - Fixing the looping problem

In [24]:
def generate_response2(model, tokenizer, question):
    inputs = tokenizer.encode(f"Q: {question}\nA:", return_tensors="pt", padding=True)
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)
    outputs = model.generate(
        inputs, 
        max_length=50, 
        num_return_sequences=1, 
        attention_mask=attention_mask, 
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2,  # Prevents repetition of 2-grams
        top_p=0.95,  # Uses nucleus sampling
        top_k=50  # Limits the number of tokens to consider at each step
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the part after "A:" and before any new question part if present
    response = response.split("A:")[1].split("Q:")[0].strip()
    return response

In [25]:
print("Responses after fine-tuning:")
for question in questions:
    response = generate_response(model, tokenizer, question)
    print(f"Question: {question}\nResponse: {response}\n")

Responses after fine-tuning:
Question: What is the capital of France?
Response: France is the capital of France is France.

Question: Who wrote 'Pride and Prejudice'?
Response: I wrote 'Pride and Prejudice' in the early 1970s.

Question: What is the tallest mountain in the world?
Response: The tallest mountain in the world is Mount Everest.



# models
- QWEn 1.5.72B
- Scientific Research

# Concepts
- domain specific Q/A models
- nlp article type Q/As
- how to tune weights in fine-tuning process
- F1-score, exact match,


# Tasks
- Try 5 different models and evalute them **(QWEn 1.5.72B - Scientific Research)**
- Upload the code on github and share with Samira
- (optional) extract question and answers from articles

# Deadline
- Tuesday or Thursday this week(16-22)