In [None]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.3


In [None]:
import nltk
# Fetch the 'punkt_tab' tokenizer data into the local NLTK data directory.
# Presumably needed by nltk.sent_tokenize, which the augmentation step calls —
# confirm against the installed NLTK version.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import os
import re
import markdown
import fitz  # PyMuPDF for PDFs
import random
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
import json

# Download NLTK tokenizer resources ('punkt'). The notebook also fetches
# 'punkt_tab' in an earlier cell — presumably so nltk.sent_tokenize works
# regardless of which NLTK release is installed (TODO confirm).
nltk.download('punkt')

# --- Step 1: Data Extraction, Preprocessing & Chunking ---

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path to the PDF document.

    Returns:
        The text of all pages joined with newlines, or an empty string when
        the file cannot be opened or read (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so the underlying file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            return "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        print(f"Error extracting PDF '{pdf_path}': {e}")
        return ""

def extract_text_from_md(md_path: str) -> str:
    """Extract plain text from a Markdown file.

    A leading YAML-style front-matter block (delimited by lines consisting of
    ``---``) is removed, the remaining Markdown is rendered to HTML, and the
    HTML tags are then stripped so that only the readable text is returned.

    Args:
        md_path: Path to the Markdown document.

    Returns:
        The document text without Markdown syntax or HTML markup, or an empty
        string when the file cannot be read (the error is printed, not raised).
    """
    import html  # stdlib; local import keeps this function self-contained

    try:
        # "utf-8-sig" reads plain UTF-8 too, and drops a BOM that would
        # otherwise hide the front-matter delimiter from the pattern below.
        with open(md_path, "r", encoding="utf-8-sig") as f:
            text = f.read()
        # The closing delimiter must sit on its own line, so a "---" that
        # merely ends a metadata line does not terminate the block early.
        text = re.sub(
            r'\A---[ \t]*\n(?:.*?\n)?---[ \t]*(?:\n|\Z)',
            '',
            text,
            flags=re.DOTALL,
        )
        rendered = markdown.markdown(text)
        # Rendering resolves the Markdown syntax; dropping the tags and
        # decoding entities leaves text rather than markup for the chunker.
        without_tags = re.sub(r'<[^>]+>', '', rendered)
        return html.unescape(without_tags)
    except Exception as e:
        print(f"Error extracting Markdown '{md_path}': {e}")
        return ""

def preprocess_text(text: str, lowercase: bool = True) -> str:
    """Collapse whitespace runs to single spaces, trim the ends, and optionally lowercase.

    Args:
        text: Raw text to normalize.
        lowercase: When True (the default) the result is lowercased.

    Returns:
        The normalized text.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    normalized = collapsed.strip()
    if lowercase:
        return normalized.lower()
    return normalized

def chunk_text(text: str, chunk_size: int = 1024, overlap: int = 100) -> list:
    """Break text into overlapping chunks with a recursive character splitter.

    Args:
        text: Text to split; a falsy value yields an empty list.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list produced by the splitter's ``split_text``.
    """
    if text:
        # Separators are tried in this order: paragraph break, line break,
        # space, and finally the empty string.
        boundary_priority = ["\n\n", "\n", " ", ""]
        text_splitter = RecursiveCharacterTextSplitter(
            separators=boundary_priority,
            chunk_overlap=overlap,
            chunk_size=chunk_size,
        )
        return text_splitter.split_text(text)
    return []

def process_documents(folder_path: str) -> list:
    """Turn every supported file directly inside a folder into text chunks.

    Files ending in ``.pdf`` or ``.md`` (matched case-insensitively) are
    extracted, normalized with ``preprocess_text`` and split with
    ``chunk_text``. Other entries, including sub-directories, are skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A flat list of chunks from all processed files, in sorted file-name order.
    """
    all_chunks = []
    # os.listdir returns entries in arbitrary order; sorting keeps the chunk
    # order (and therefore any seeded downstream split) stable between runs.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            # A directory that happens to be named like a document is not one.
            continue
        lowered_name = file_name.lower()
        if lowered_name.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif lowered_name.endswith(".md"):
            text = extract_text_from_md(file_path)
        else:
            continue
        # NOTE(review): preprocess_text collapses all whitespace before
        # chunk_text runs, so the splitter's "\n\n" and "\n" separators can
        # never match here — confirm whether chunking should precede
        # normalization.
        text = preprocess_text(text)
        all_chunks.extend(chunk_text(text))
    return all_chunks

# Folder scanned for .pdf and .md inputs. "/content/" is presumably the Colab
# working directory — adjust for other environments (use a raw string for
# Windows paths).
folder_path = "/content/"
# Extract, normalize and chunk every supported file found in the folder.
document_chunks = process_documents(folder_path)
print(f"Extracted {len(document_chunks)} text chunks.")

# Optional: Apply simple augmentation to expand the dataset
def simple_augmentation(chunk: str) -> str:
    """Return the chunk with some of its longer sentences repeated and tagged.

    The chunk is split into sentences with NLTK. Every sentence is kept; a
    sentence longer than 20 characters is additionally followed, with
    probability 0.3, by a copy of itself suffixed with " (augmented)".
    The pieces are joined back with single spaces.
    """
    pieces = []
    for sentence in nltk.sent_tokenize(chunk):
        pieces.append(sentence)
        if len(sentence) <= 20:
            # Short sentences are never duplicated and draw no random number.
            continue
        if random.random() < 0.3:
            pieces.append(sentence + " (augmented)")
    return " ".join(pieces)

def augment_data(chunks: list) -> list:
    """Expand a list of chunks with randomly added augmented variants.

    Each input chunk is kept in order; with probability 0.5 it is immediately
    followed by the result of ``simple_augmentation`` applied to it.
    """
    expanded = []
    for original in chunks:
        expanded.append(original)
        if random.random() >= 0.5:
            continue
        expanded.append(simple_augmentation(original))
    return expanded

# Seed Python's global RNG so the random augmentation choices (and any later
# random.* draws in the same run) are reproducible under Restart & Run All.
random.seed(42)
augmented_chunks = augment_data(document_chunks)
print(f"Augmented dataset size: {len(augmented_chunks)} chunks.")

# --- Step 2: Synthetic QA Generation using RAGAS Approach ---

# Load a T5 model for question generation
# (tokenizer and weights are both resolved from the same checkpoint name).
qg_model_name = "valhalla/t5-small-e2e-qg"
qg_tokenizer = T5Tokenizer.from_pretrained(qg_model_name)
qg_model = T5ForConditionalGeneration.from_pretrained(qg_model_name)

# Load an extractive QA pipeline to produce answers
# NOTE(review): the checkpoint name indicates a *cased* model, while
# preprocess_text lowercases document text by default — confirm this
# combination is intended.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

def generate_question(context: str) -> str:
    """Generate a single question from context using the T5 question-generation model.

    Args:
        context: Passage to ask a question about. The encoded input is
            truncated to 512 tokens.

    Returns:
        The first non-empty question in the model output, or an empty string
        when the model produced none.
    """
    # The end-to-end QG checkpoint is prompted with the plural
    # "generate questions:" task prefix.
    input_text = "generate questions: " + context
    inputs = qg_tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512)
    outputs = qg_model.generate(inputs, max_length=64)
    decoded = qg_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The model emits several questions joined by a literal "<sep>" marker;
    # returning the raw string would feed a multi-question blob to the QA
    # model, so keep only the first complete question.
    for candidate in decoded.split("<sep>"):
        candidate = candidate.strip()
        if candidate:
            return candidate
    return ""

def generate_qa_pair(context: str) -> dict:
    """Build a question/answer pair for one context passage.

    A question is generated from the context and then answered against the
    same context with the extractive QA pipeline.

    Returns:
        A dict with keys "context", "question" and "answer", or None when the
        context has fewer than 20 whitespace-separated words or when
        generation fails (the error is printed, not raised).
    """
    word_count = len(context.split())
    if word_count < 20:
        return None
    try:
        generated_question = generate_question(context)
        prediction = qa_pipeline(question=generated_question, context=context)
        extracted_answer = prediction.get("answer", "").strip()
    except Exception as e:
        print(f"Error in QA generation: {e}")
        return None
    return {
        "context": context,
        "question": generated_question,
        "answer": extracted_answer,
    }

# Attempt one QA pair per augmented chunk and collect the successful ones
synthetic_qa_pairs = []
for chunk in augmented_chunks:
    qa_pair = generate_qa_pair(chunk)
    if not qa_pair:
        # Nothing usable came back for this chunk; move on.
        continue
    synthetic_qa_pairs.append(qa_pair)

print(f"Generated {len(synthetic_qa_pairs)} synthetic QA pairs.")

# --- Step 3: Quality Evaluation using DeepEval ---
# (This is a placeholder for integration with a quality evaluation system.)
# You would typically pass your QA pairs through DeepEval to get quality scores and filter accordingly.
def evaluate_quality(qa_pair: dict) -> float:
    """Return a placeholder quality score for a QA pair.

    The argument is ignored: the score is drawn uniformly at random from
    [0.0, 1.0]. This is a dummy stand-in to be replaced with an actual
    DeepEval API call.
    """
    score_floor, score_ceiling = 0.0, 1.0
    return random.uniform(score_floor, score_ceiling)

# Minimum score a pair needs in order to be kept
quality_threshold = 0.5
qa_pairs_filtered = list(
    filter(
        lambda pair: evaluate_quality(pair) >= quality_threshold,
        synthetic_qa_pairs,
    )
)
print(f"Filtered dataset size after quality evaluation: {len(qa_pairs_filtered)} QA pairs.")

# --- Step 4: Data Curation with Kiln-AI ---
# (A real curation step might export the filtered data for human review and
# further augmentation.) In this pipeline the filtered dataset is taken as the
# final curated dataset; the name below refers to the same list object.
curated_qa_pairs = qa_pairs_filtered

# --- Step 5: Dataset Splitting ---
def split_dataset(qa_pairs: list, test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42):
    """Split QA pairs into train, validation and test subsets.

    Args:
        qa_pairs: Items to split.
        test_size: Fraction of the whole dataset held out for testing.
        val_size: Fraction of the whole dataset held out for validation.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A ``(train, val, test)`` tuple.
    """
    remainder, test = train_test_split(
        qa_pairs, test_size=test_size, random_state=random_state
    )
    # val_size is relative to the full dataset, so it is rescaled to the
    # share of what is left after the test set has been removed.
    train, val = train_test_split(
        remainder,
        test_size=val_size / (1 - test_size),
        random_state=random_state,
    )
    return train, val, test

train_data, val_data, test_data = split_dataset(curated_qa_pairs)
print(f"Training set: {len(train_data)} QA pairs")
print(f"Validation set: {len(val_data)} QA pairs")
print(f"Test set: {len(test_data)} QA pairs")

# --- Optional: Save the datasets ---
# Each split goes to its own UTF-8 JSON file in the current working directory.
for output_name, split_records in (
    ("train_qa.json", train_data),
    ("val_qa.json", val_data),
    ("test_qa.json", test_data),
):
    with open(output_name, "w", encoding="utf-8") as f:
        json.dump(split_records, f, indent=2, ensure_ascii=False)

print("Datasets saved as train_qa.json, val_qa.json, and test_qa.json.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Extracted 136 text chunks.
Augmented dataset size: 208 chunks.


Device set to use cpu


Generated 208 synthetic QA pairs.
Filtered dataset size after quality evaluation: 113 QA pairs.
Training set: 89 QA pairs
Validation set: 12 QA pairs
Test set: 12 QA pairs
Datasets saved as train_qa.json, val_qa.json, and test_qa.json.
