## 1. Data Generation & Dataset Creation

### 1.1. Data Loading and Preprocessing

Make sure to upload the Markdown (`.md`) and PDF (`.pdf`) files into the `data/` directory; the code below reads from the relative path `data`.

In [None]:
import os
import re
from PyPDF2 import PdfReader

def load_and_preprocess_data(data_dir):
    """Load every Markdown and PDF file in ``data_dir`` as cleaned plain text.

    Files are visited in sorted filename order so the returned list is
    reproducible across runs and platforms. Extensions are matched
    case-insensitively; files with any other extension are ignored.

    Args:
        data_dir: Path of the directory that holds the ``.md`` / ``.pdf`` files.

    Returns:
        A list of ``{"filename": str, "text": str}`` dicts, one per file that
        was read successfully. PDFs that fail to parse are reported on stdout
        and skipped.

    Raises:
        OSError: If ``data_dir`` cannot be listed or a Markdown file cannot be
            opened.
        UnicodeDecodeError: If a Markdown file is not valid UTF-8.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(data_dir)):
        filepath = os.path.join(data_dir, filename)
        lowered = filename.lower()
        if lowered.endswith(".md"):
            with open(filepath, "r", encoding="utf-8") as f:
                text = f.read()
            # Strip leading '#' markers and whitespace from every line. Because
            # \s also matches newlines, runs of blank lines are removed too.
            text = re.sub(r"^[#\s]+", "", text, flags=re.MULTILINE)
            documents.append({"filename": filename, "text": text})
        elif lowered.endswith(".pdf"):
            try:
                with open(filepath, "rb") as f:
                    pdf_reader = PdfReader(f)
                    # Extract while the file is still open. `or ""` guards
                    # against an extractor returning None for a page without
                    # text, which would otherwise discard the whole document.
                    page_texts = [
                        page.extract_text() or "" for page in pdf_reader.pages
                    ]
                text = "".join(page_texts)
                # Cleaning: collapse repeated newlines and repeated spaces.
                text = re.sub(r"\n+", "\n", text)
                text = re.sub(r" +", " ", text)
                documents.append({"filename": filename, "text": text})
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the whole load.
                print(f"Error reading PDF {filename}: {e}")
    return documents


# Directory (relative to the working directory) that holds the source files.
data_dir = "data"
# Read and clean every supported file found in that directory.
documents = load_and_preprocess_data(data_dir=data_dir)
# print(documents[0]['text'][:500]) # check a document

### 1.2. Synthetic Data Generation (using Qwen2.5-3B-Instruct itself, initially)

In [None]:
from transformers import pipeline, AutoTokenizer

def _build_qa_prompt(chunk, num_questions):
    """Return the unindented instruction prompt for one context chunk."""
    # Built line by line (not as an indented triple-quoted string) so the
    # "Q:"/"A:" format shown to the model starts at column 0, which is the
    # layout the parser expects back.
    return (
        "Context:\n"
        f"{chunk}\n\n"
        f"Based on the above context, generate {num_questions} question and answer pairs.\n"
        "Format them strictly as follows:\n\n"
        "Q: [Question 1]\n"
        "A: [Answer 1]\n\n"
        "Q: [Question 2]\n"
        "A: [Answer 2]\n\n"
        "...\n"
    )


def _parse_qa_pairs(generated_text):
    """Extract non-empty ``(question, answer)`` tuples from ``Q:``/``A:`` text."""
    # Tolerates leading spaces/tabs before the markers; each answer runs until
    # the next line that starts with "Q:" or the end of the text.
    pattern = r"^[ \t]*Q:[ \t]*(.*?)\n[ \t]*A:[ \t]*(.*?)(?=\n[ \t]*Q:|\Z)"
    pairs = []
    for question, answer in re.findall(pattern, generated_text, re.DOTALL | re.MULTILINE):
        question, answer = question.strip(), answer.strip()
        if question and answer:
            pairs.append((question, answer))
    return pairs


def generate_qa_pairs(
    documents,
    model_name="Qwen/Qwen2.5-3B-Instruct",
    num_questions_per_doc=5,
    chunk_size=4096,
    max_new_tokens=1024,
):
    """Generate question/answer pairs from documents with a text-generation model.

    Each document's text is cut into ``chunk_size``-character chunks and the
    model is prompted once per non-blank chunk.

    Args:
        documents: Iterable of dicts with ``"filename"`` and ``"text"`` keys.
        model_name: Hugging Face model id passed to ``pipeline``.
        num_questions_per_doc: Number of pairs requested in each prompt. Despite
            the name, the request is made once per chunk, not once per document.
        chunk_size: Maximum number of characters of context per prompt.
        max_new_tokens: Generation budget per prompt, excluding prompt tokens.

    Returns:
        A list of ``{"question": str, "answer": str, "source": str}`` dicts,
        where ``source`` is the originating document's filename. Returns an
        empty list, without loading the model, when there is no text to process.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Collect the work first so the model is only loaded when it is needed.
    pending = []
    for doc in documents:
        text = doc["text"]
        for start in range(0, len(text), chunk_size):
            chunk = text[start:start + chunk_size]
            if chunk.strip():
                pending.append((doc["filename"], chunk))
    if not pending:
        return []

    # Imported here so the early-return path above does not need torch.
    import torch

    # Use the first GPU when CUDA is available; -1 selects the CPU.
    device = 0 if torch.cuda.is_available() else -1
    generator = pipeline("text-generation", model=model_name, device=device)

    qa_pairs = []
    for source, chunk in pending:
        prompt = _build_qa_prompt(chunk, num_questions_per_doc)
        outputs = generator(
            prompt,
            # max_length would count the prompt tokens as well, leaving little
            # or no room for the answer once a full chunk is in the prompt.
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            # Return only the completion; otherwise the template's own
            # "Q: [Question 1]" lines would be parsed as generated pairs.
            return_full_text=False,
        )
        generated_text = outputs[0]["generated_text"]
        for question, answer in _parse_qa_pairs(generated_text):
            qa_pairs.append({"question": question, "answer": answer, "source": source})
    return qa_pairs


# Build the first synthetic Q/A set, requesting 3 pairs in each prompt.
initial_qa_pairs = generate_qa_pairs(documents=documents, num_questions_per_doc=3)
# print(initial_qa_pairs[:5])

### 1.3. Data Augmentation and Refinement

### 1.4. Dataset Splitting

### 1.5. Dataset Formatting (JSONL)

## 2. Model Selection and Preparation

## 3. Efficient Fine-tuning (QLoRA)