# Data Preprocessing

This notebook covers the preprocessing steps required to prepare the `seed.jsonl` dataset, which is the input for Synthetic Data Generation (SDG).

1. Configure the paths
2. Loading the files into memory
3. Document Conversion
4. Chunking
5. Saving the processed data

In [None]:
import json
import os
import random
from pathlib import Path

from docling.chunking import HybridChunker
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

from ai_tools.usecase.knowledge_tuning.create_seed_dataset import \
    get_seed_dataset
from ai_tools.usecase.knowledge_tuning.utils import (generate_seed_examples,
                                                     review_seed_examples_file,
                                                     view_seed_example)

# Workspace root: the parent of the current working directory.
WORKSPACE = Path.cwd().parent

# Source PDFs are looked up under SOURCE_DOCUMENT_DIR; this step writes its
# artefacts under OUTPUT_DIR.
SOURCE_DOCUMENT_DIR = WORKSPACE.joinpath("source_documents")
OUTPUT_DIR = WORKSPACE.joinpath("output", "step_01")

# Create the output directory (and any missing parents); a no-op if it exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Recursively collect every PDF below the source-document directory.
available_files = list(SOURCE_DOCUMENT_DIR.rglob("*.pdf"))

print(f"Total PDF files found: {len(available_files)} \n")
print("Available Files:")

# One line per file, showing only the file name (stem + ".pdf").
for pdf_path in available_files:
    print(f"\t{pdf_path.stem}.pdf")

## Document Conversion

The source documents are in PDF format; we use `docling` to read them and convert them into the docling document format.

Configure the docling pipeline.

In [None]:
# Default PDF pipeline options, registered for PDF inputs only.
pipeline_options = PdfPipelineOptions()
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)

# Converter used in the conversion step to turn each source PDF into a docling document.
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

Convert the document to docling format

In [None]:
# Convert every discovered PDF with docling, keep its confidence report and
# persist the converted document as JSON under OUTPUT_DIR/docling_output.

# The target directory does not depend on the file being converted, so it is
# created once up front instead of on every loop iteration.
docling_output_dir = OUTPUT_DIR / "docling_output"
docling_output_dir.mkdir(parents=True, exist_ok=True)

# Maps the PDF stem to the confidence object returned by the converter.
confidence_report = {}
for file in available_files:
    conv_result = doc_converter.convert(file)

    document = conv_result.document
    confidence_report[file.stem] = conv_result.confidence

    document_dict = document.export_to_dict()

    output_file = docling_output_dir / f"{file.stem}.json"
    # Explicit UTF-8, consistent with the other file writes in this notebook.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(document_dict, f, indent=4)

In [None]:
# Print a per-document conversion confidence summary.
#
# The loop variables deliberately use their own names (`file_stem`, `report`):
# reusing `confidence_report` as the loop variable would rebind the dict being
# iterated, so the dict would be lost after the first run and the cell could
# not be re-executed.
for file_stem, report in confidence_report.items():
    print(f"Conversion confidence for {file_stem}:")

    # \x1b[1m ... \033[0m are ANSI escape codes that render the grade in bold.
    print(
        f"Average confidence: \x1b[1m{report.mean_grade.name}\033[0m (score {report.mean_score:.3f})"
    )

    # Keys of `report.pages` whose mean score is below the document-wide mean.
    low_score_pages = [
        page
        for page in report.pages
        if report.pages[page].mean_score < report.mean_score
    ]

    # Page keys are shifted by one for display.
    print(
        f"Pages that scored lower than average: {', '.join(str(x + 1) for x in low_score_pages)}"
    )

    print()

## Chunking

Chunk the document using docling

In [None]:
# Chunk every converted docling JSON document and persist all chunks as JSONL.
chunker = HybridChunker()
convertor = DocumentConverter()

json_files = (OUTPUT_DIR / "docling_output").glob("*.json")

all_chunks = []
for json_path in json_files:
    # Load the saved docling JSON back through the converter, then chunk it.
    docling_document = convertor.convert(json_path).document
    document_chunks = list(chunker.chunk(docling_document))
    print(f"Total chunks created for {json_path.stem}: {len(document_chunks)}")

    # One record per chunk: contextualized text, source file stem, chunk metadata.
    all_chunks.extend(
        {
            "chunk": chunker.contextualize(piece),
            "file": json_path.stem,
            "metadata": piece.meta.export_json_dict(),
        }
        for piece in document_chunks
    )

# Write one JSON object per line.
chunks_file_path = OUTPUT_DIR / "chunks.jsonl"
with open(chunks_file_path, "w", encoding="utf-8") as chunks_out:
    for record in all_chunks:
        chunks_out.write(json.dumps(record) + "\n")
    print(f"Path of chunks JSON is: {chunks_file_path.resolve()}")

View random chunks

In [None]:
NUM_CHUNKS_TO_VIEW = 5

# Draw at most NUM_CHUNKS_TO_VIEW chunks; the size is clamped to the number of
# chunks available so small corpora do not raise.
sample = random.sample(all_chunks, min(len(all_chunks), NUM_CHUNKS_TO_VIEW))

# Number the displayed chunks starting from 1.
for position, picked in enumerate(sample, start=1):
    print(f"== Randomly selected chunk {position}: ==========\n\n{picked['chunk']}\n\n")

Read the chunks back from the saved file and then randomly select the chunks to use as seed data for SDG.

In [None]:
# Read the chunks back from the saved JSONL file (one JSON object per line).
with open(chunks_file_path, encoding="utf-8") as chunks_in:
    chunks = [json.loads(line) for line in chunks_in]

NUM_SEED_EXAMPLES = 5  # Number of chunks to select as seed examples

# random.sample raises ValueError when asked for more items than the population
# holds, so clamp the sample size to the number of chunks actually available.
num_selected = min(len(chunks), NUM_SEED_EXAMPLES)
if num_selected < NUM_SEED_EXAMPLES:
    print(
        f"Only {len(chunks)} chunks available; selecting {num_selected} "
        f"instead of {NUM_SEED_EXAMPLES}."
    )
selected_chunks = random.sample(chunks, num_selected)

# Persist the selection as JSONL.
selected_chunks_path = OUTPUT_DIR / "selected_chunks.jsonl"
with open(selected_chunks_path, "w", encoding="utf-8") as selected_out:
    for chunk in selected_chunks:
        json.dump(chunk, selected_out)
        selected_out.write("\n")
    print(f"Path of selected chunks JSON is: {Path(selected_chunks_path).resolve()}")

## QnA.yaml file Generation

Generate QnA for each chunk selected above

In [None]:
# API credentials and model details.
#
# The key is read from the API_KEY environment variable so that the secret does
# not have to be typed into (and saved with) the notebook. When the variable is
# not set, the value falls back to an empty string.
API_KEY = os.environ.get("API_KEY", "")
ENDPOINT = "https://granite-3-3-8b-instruct-maas-apicast-production.apps.prod.rhoai.rh-aiservices-bu.com:443/v1"
MODEL_NAME = "granite-3-3-8b-instruct"

In [None]:
# Extra instruction text, passed as the last argument to generate_seed_examples.
CUSTOMISATION_PROMPT = (
    "Generate atleast 5 seed examples in the format specified below."
)

For every chunk in the randomly selected chunks, we will create a QnA pair in the `QnA.yaml` file. 

The generation is done by an LLM, driven by a prompt.

In [None]:
# Generate seed examples for the selected chunks via the configured endpoint.
# All arguments are positional; their exact meaning is defined by the imported
# helper, whose signature is not visible in this notebook.
generate_seed_examples(
    "",  # NOTE(review): purpose of this empty first argument is not visible here — confirm
    selected_chunks_path,  # JSONL file of randomly selected chunks written above
    OUTPUT_DIR,  # presumably where the generated QnA file is written — verify against the helper
    API_KEY,
    ENDPOINT,
    MODEL_NAME,
    "DOMAIN",  # NOTE(review): looks like a placeholder for the real domain name — confirm
    "SUMMARY",  # NOTE(review): looks like a placeholder for a document summary — confirm
    CUSTOMISATION_PROMPT,
)

View a seed example from the `QnA.yaml` file generated above.

In [None]:
# Show one seed example from the generated qna.yaml.
# NOTE(review): the second argument (0) is presumably the index of the example to show — confirm.
view_seed_example(OUTPUT_DIR / "qna.yaml", 0)

Review the generated QnA pairs.

- Check for the presence of required fields
- Check the number of seed examples generated
- Check the number of QnA pairs generated for each seed example

In [None]:
# Review the generated qna.yaml with the thresholds given as keyword arguments.
# NOTE(review): presumably requires at least 4 seed examples and 3 QnA pairs per
# example — confirm against the helper's implementation.
review_seed_examples_file(OUTPUT_DIR / "qna.yaml", min_seed_examples=4, num_qa_pairs=3)

In [None]:
# Build the final seed dataset and save it as JSON Lines.
# NOTE(review): OUTPUT_DIR is passed for both arguments; presumably one is the
# input location and the other the output location — confirm against the helper.
seed_data = get_seed_dataset(OUTPUT_DIR, OUTPUT_DIR)
seed_data.to_json(OUTPUT_DIR / "final_seed_data.jsonl", orient="records", lines=True)

# Kept as the LAST expression of the cell: a notebook only renders the final
# expression, so a bare `seed_data` placed before `to_json` displays nothing.
seed_data