# QA Pair Generation - Intellihack Scope 03

This notebook focuses on generating high-quality question-answer pairs from the processed document chunks, which will be used for fine-tuning the Qwen 2.5-3B model.

In [None]:
# Import necessary libraries
import os
import json
import random
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import re
from nltk.tokenize import sent_tokenize

# Fix the seed of Python's global RNG so shuffling and sampling are repeatable
SEED = 42
random.seed(SEED)

In [None]:
# Input and output locations (relative to the current working directory)
processed_dir = Path('../data/processed')
qa_dir = Path('../data/qa_pairs')
# Make sure the output folder exists before later cells write into it
qa_dir.mkdir(parents=True, exist_ok=True)

# Read the processed document chunks from disk
chunks_path = processed_dir / 'chunks.json'
chunks = json.loads(chunks_path.read_text(encoding='utf-8'))

print(f"Loaded {len(chunks)} document chunks")

## QA Generation Strategies

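We'll use rule-based templates to generate four types of questions from each chunk:
1. Definition questions ("What is X?")
2. Technical detail questions ("Explain the technical details of X.")
3. Application questions ("What are the applications or benefits of X?")
4. Description questions ("Describe the key aspects of <topic>.")

Note: explanation ("How does X work?"), comparison ("Compare X and Y"), and component ("What are the components of X?") questions are not produced by the current templates.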

In [None]:
def extract_key_terms(text):
    """
    Extract potential key terms from text using simple regex heuristics.

    Three kinds of candidates are collected:
      * capitalized words, optionally hyphenated (e.g. "DeepSeek-R1"),
      * text enclosed in double quotes,
      * hyphenated terms of any case (e.g. "fine-tuning").

    Args:
        text: The text to scan. Empty or None input yields no terms.

    Returns:
        A list of unique candidate terms in order of first appearance
        (capitalized terms first, then quoted text, then hyphenated terms),
        excluding a small set of common words and any term of three
        characters or fewer.
    """
    if not text:
        return []

    # Find capitalized terms (potential proper nouns).
    # The hyphenated tail is a NON-capturing group: with a capturing group
    # re.findall returns only the group's text (e.g. "-R1" or "") rather than
    # the whole matched term.
    capitalized = re.findall(r'\b[A-Z][a-zA-Z0-9]+(?:-[A-Za-z0-9]+)*\b', text)

    # Find terms in double quotes (the single capturing group is intentional:
    # we want the text without the surrounding quotes)
    quoted = re.findall(r'"([^"]+)"', text)

    # Find hyphenated technical terms; one-or-more hyphenated segments so that
    # "Mixture-of-Experts" is kept whole instead of being cut to "Mixture-of"
    technical = re.findall(r'\b[A-Za-z][A-Za-z0-9]*(?:-[A-Za-z0-9]+)+\b', text)

    # Deduplicate while preserving first-seen order. A set() would return the
    # strings in an order that can change between interpreter runs, which makes
    # any "first N terms" selection by the caller non-reproducible.
    all_terms = list(dict.fromkeys(capitalized + quoted + technical))

    # Filter out common words and very short terms
    common_words = {'the', 'and', 'for', 'with', 'this', 'that', 'these', 'those'}
    return [
        term for term in all_terms
        if term.lower() not in common_words and len(term) > 3
    ]

In [None]:
def generate_qa_pairs(chunk, max_terms=3):
    """
    Generate question-answer pairs from a document chunk.

    For each selected key term three templated questions are produced
    (definition, technical, application), followed by one general
    description question about the chunk's topic. Every pair uses the full
    chunk content as its answer.

    Args:
        chunk: Mapping with the keys 'content', 'topic', 'source' and
            'chunk_id'.
        max_terms: Maximum number of key terms to build questions for
            (default 3, to avoid repetition). None means no limit.

    Returns:
        A list of dicts with the keys 'question', 'answer', 'source',
        'chunk_id' and 'question_type'.
    """
    content = chunk['content']
    topic = chunk['topic']
    source = chunk['source']
    chunk_id = chunk['chunk_id']

    def _make_pair(question, question_type):
        # All pairs from one chunk share the same answer and provenance fields
        return {
            "question": question,
            "answer": content,  # Using full chunk as answer
            "source": source,
            "chunk_id": chunk_id,
            "question_type": question_type,
        }

    # Extract potential terms to ask about
    key_terms = extract_key_terms(content)

    # If no key terms found, use the topic itself
    if not key_terms and topic:
        key_terms = [topic]

    # (question template, question_type) for the per-term questions
    term_templates = (
        ("What is {term}?", "definition"),
        ("Explain the technical details of {term}.", "technical"),
        ("What are the applications or benefits of {term}?", "application"),
    )

    qa_pairs = [
        _make_pair(template.format(term=term), question_type)
        for term in key_terms[:max_terms]
        for template, question_type in term_templates
    ]

    # Add a general question about the topic, but only when there is one:
    # a missing/empty topic would otherwise produce a nonsensical training
    # example such as "Describe the key aspects of None."
    if topic:
        qa_pairs.append(
            _make_pair(f"Describe the key aspects of {topic}.", "description")
        )

    return qa_pairs

In [None]:
# Run the generator over every chunk and flatten the per-chunk lists into one
all_qa_pairs = [
    qa_pair
    for chunk in tqdm(chunks, desc="Generating QA pairs")
    for qa_pair in generate_qa_pairs(chunk)
]

# Tabulate the pairs for inspection
qa_df = pd.DataFrame(all_qa_pairs)

# Report how many pairs were produced and the count per question type
print(f"Generated {len(qa_df)} QA pairs")
type_counts = qa_df['question_type'].value_counts().to_dict()
print(f"Question types: {type_counts}")
qa_df.head()

## Format QA Pairs for Fine-tuning

Now we'll convert each QA pair into the chat-style `messages` format (one `user` turn followed by one `assistant` turn) used for fine-tuning the Qwen 2.5-3B model.

In [None]:
def format_for_finetuning(qa_pair):
    """
    Convert one QA record into a single-turn chat example for Qwen 2.5-3B
    fine-tuning.

    The record's 'question' becomes the ``user`` message and its 'answer'
    becomes the ``assistant`` message; both are returned under a top-level
    'messages' key. Other fields of the record are ignored.
    """
    user_turn = dict(role="user", content=qa_pair['question'])
    assistant_turn = dict(role="assistant", content=qa_pair['answer'])
    return {"messages": [user_turn, assistant_turn]}

In [None]:
# Format all QA pairs (kept in generation order; this list is not mutated below)
qa_records = qa_df.to_dict(orient='records')
formatted_data = [format_for_finetuning(qa_pair) for qa_pair in qa_records]

# Split into train and validation sets (~90% train, ~10% validation).
# Every pair generated from a chunk reuses that chunk's full text as its answer,
# so the split is made per chunk: shuffling individual pairs would place the same
# answer text in both sets and make the validation score meaningless.
TRAIN_FRACTION = 0.9
SPLIT_SEED = 42
# Local RNG so that re-running this cell always yields the same split,
# independent of how much the global RNG has been used so far
split_rng = random.Random(SPLIT_SEED)

# Unique (source, chunk_id) keys in first-seen order, then shuffled
chunk_keys = list(dict.fromkeys((rec['source'], rec['chunk_id']) for rec in qa_records))
split_rng.shuffle(chunk_keys)

# Keep at least one chunk in the training set when any data exists
split_idx = max(1, int(len(chunk_keys) * TRAIN_FRACTION)) if chunk_keys else 0
train_keys = set(chunk_keys[:split_idx])

train_data, val_data = [], []
for rec, example in zip(qa_records, formatted_data):
    target = train_data if (rec['source'], rec['chunk_id']) in train_keys else val_data
    target.append(example)

# Shuffle within each set so pairs from the same chunk are not adjacent
split_rng.shuffle(train_data)
split_rng.shuffle(val_data)

assert len(train_data) + len(val_data) == len(formatted_data)

# Save the datasets (ensure_ascii=False keeps non-ASCII text readable in the
# UTF-8 files instead of writing \uXXXX escapes)
with open(qa_dir / "train.json", 'w', encoding='utf-8') as f:
    json.dump(train_data, f, indent=2, ensure_ascii=False)

with open(qa_dir / "validation.json", 'w', encoding='utf-8') as f:
    json.dump(val_data, f, indent=2, ensure_ascii=False)

print(f"Saved {len(train_data)} training examples and {len(val_data)} validation examples")

## Sample QA Pairs

Let's examine some sample QA pairs to ensure quality.

In [None]:
# Display up to 3 random examples. The count is capped at the dataset size
# because random.sample raises ValueError when asked for more items than exist.
num_samples = min(3, len(formatted_data))
sample_indices = random.sample(range(len(formatted_data)), num_samples)

for idx in sample_indices:
    sample = formatted_data[idx]
    question = sample['messages'][0]['content']
    answer = sample['messages'][1]['content']
    print("="*80)
    print(f"Example {idx}:")
    print(f"Question: {question}")
    print("\nAnswer (truncated): ")
    # Only the first 300 characters are shown to keep the output short
    print(answer[:300], "...")
    print("="*80)