In [None]:
from os import mkdir, makedirs
!pip install -r requirements.txt

In [None]:
from dotenv import load_dotenv
from pathlib import Path
import os

load_dotenv(override=True)

# Resolve the input and output locations from the environment,
# falling back to the documented defaults when a variable is not set.
input_dir = Path(os.environ.get('INPUT_DIR', 'document_collection'))
output_dir = Path(os.environ.get('OUTPUT_DIR', 'converted_documents'))
# Create the output directory (and any missing parents) if it does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

### Setup Instructions

This notebook demonstrates how to convert raw PDF files into InstructLab Synthetic Knowledge Infusion data, using the RBC POC as an example. Follow these steps to get started with your own data.

#### Steps to Get Started:

1. **Organize Your Documents:**
   - Create a new directory under the `document_collection` directory for your specific project. For example, if your project is named "my_subject", your directory structure should look like this:
     ```
     |-- document_collection
     |   `-- my_subject
     |       `-- my_subject_data.pdf
     ```
   - Place all your PDF files into this directory.

2. **Create Your qna.yaml:**
    - Create a taxonomy folder with qna.yaml files
    - Ensure your `qna.yaml` files (the ICL files) contain sufficient context and question-answer pairs. We recommend including at least 5 distinct contexts, each with a minimum of 3 sets of questions and answers. More entries will improve the robustness of your data.
    - The ICL file should be in the following format (refer to the `document_collection/my_subject/qna.yaml` file for an example):

    ```yaml
    domain: 
    document_outline: A one to two line description of the document
    seed_examples:
      - context: <context 1 goes here>
        question_and_answers:
          - question: <question 1 goes here>
            answer: <answer 1 goes here>
          - question: <question 2 goes here>
            answer: <answer 2 goes here>
          - question: <question 3 goes here>
            answer: <answer 3 goes here>
    ...
    ```


   - **Note:** Replace placeholders with actual content relevant to your documents. Ensure the contexts are clear and questions are well-formulated to extract meaningful answers.

3. **Update the Data Directory Path:**
   - In the .env file, customize the `INPUT_DIR` variable to reflect the path to your directory.  For example:
     ```shell
     INPUT_DIR = "document_collection"
     ```
     If you do not customize this variable, it will default to `document_collection`.
4. **Update the Output Directory Path:**
   - In the .env file, customize the `OUTPUT_DIR` variable to reflect the path to your output directory. For example:
     ```shell
     OUTPUT_DIR = "converted_documents"
     ```
     If you do not customize this variable, it will default to `converted_documents`.
---

### Convert PDF Documents to Markdown and JSON and inspect

In order to understand how the PDF documents will be used for Synthetic Data Generation (SDG), you may want to view the converted documents.  We employ [Docling](https://github.com/DS4SD/docling), which is also used by the SDG conversion process of InstructLab.  If the converted documents are not well formed and accurate, you can modify the converted markdown as needed and use that as your source document instead of the PDF.



In [None]:
from pathlib import Path
from typing import Iterable
import json
import time
import os
import re

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter


def write_doc_json(conv_res, filename):
    """Serialize a converted document to an indented JSON file.

    Args:
        conv_res: Conversion result whose ``document`` provides
            ``export_to_dict()``.
        filename: Path-like object exposing ``open``; an existing file is
            overwritten.

    Returns:
        The ``filename`` that was passed in.
    """
    with filename.open("w") as handle:
        payload = conv_res.document.export_to_dict()
        serialized = json.dumps(payload, indent=2)
        handle.write(serialized)
    print(f"Exported: {filename}")
    return filename


def write_doc_md(conv_res, filename):
    """Write a converted document to ``filename`` as UTF-8 encoded Markdown.

    Args:
        conv_res: Conversion result whose ``document`` provides
            ``export_to_markdown()``.
        filename: Path-like object exposing ``open``; an existing file is
            overwritten.

    Returns:
        The ``filename`` that was passed in.
    """
    # Export before opening so a failing export does not truncate an existing file.
    markdown = conv_res.document.export_to_markdown()
    # Explicit UTF-8: the platform default encoding may be unable to represent
    # non-ASCII characters contained in the exported text.
    with filename.open("w", encoding="utf-8") as fp:
        fp.write(markdown)
    print(f"Exported: {filename}")
    return filename


def _mirrored_output_dir(doc_directory, input_dir, output_dir):
    """Return the directory under ``output_dir`` mirroring ``doc_directory``'s place under ``input_dir``."""
    try:
        relative_dir = Path(doc_directory).relative_to(input_dir)
    except ValueError:
        # The document does not live under input_dir; write to the output root
        # rather than guessing a location.
        return Path(output_dir)
    return Path(output_dir) / relative_dir


def process_directory(input_dir, output_dir):
    """Convert every PDF found under ``input_dir`` to Markdown in ``output_dir``.

    PDFs are discovered recursively with ``rglob("*.pdf")``. Each successfully
    converted document is written as ``<stem>.md`` into a directory under
    ``output_dir`` that mirrors the document's sub-directory under
    ``input_dir``. JSON export (``write_doc_json``) is currently not performed.

    Args:
        input_dir: ``Path`` of the directory tree to scan for PDF files.
        output_dir: ``Path`` of the directory that receives the Markdown files.

    Returns:
        A ``(success_count, failure_count)`` tuple.
    """
    file_paths = list(input_dir.rglob("*.pdf"))
    doc_converter = DocumentConverter()
    start_time = time.time()
    # raises_on_error=False lets the batch run to completion so that failed
    # documents are counted below instead of aborting the whole run.
    # NOTE(review): confirm the pinned docling version accepts this keyword.
    conversion_results = doc_converter.convert_all(file_paths, raises_on_error=False)

    success_count = 0
    failure_count = 0

    for conv_res in conversion_results:
        source_file = conv_res.input.file
        if conv_res.status != ConversionStatus.SUCCESS:
            print(f"Document {source_file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        print(f"Exporting: {source_file}")
        # Map the source directory onto the output tree with path arithmetic;
        # a regex substitution would misread characters such as "." or "\" in
        # the path and would replace every occurrence, not only the prefix.
        doc_output_dir = _mirrored_output_dir(source_file.parent, input_dir, output_dir)
        doc_output_dir.mkdir(parents=True, exist_ok=True)
        write_doc_md(conv_res, doc_output_dir / f"{source_file.stem}.md")

    print(
        f"Successfully processed {success_count} docs. "
        f"Failed to convert {failure_count} docs. "
        f"Elapsed time: {time.time() - start_time:.2f} seconds."
    )

    return success_count, failure_count

# Convert the PDFs found under input_dir and write the Markdown results under output_dir.
process_directory(input_dir, output_dir)