In [None]:
!pip install -r requirements.txt

In [None]:
# Standard
from pathlib import Path
from typing import Iterable
import json
import time
import os

# Third Party
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter
from utils.logger_config import setup_logger
import click
import pandas as pd

# Local
from utils.docprocessor import DocProcessor

def export_documents(
        converted_docs: "Iterable[ConvertedDocument]",
        output_dir: Path,
):
    """Write every successfully converted document to ``output_dir``.

    For each document whose status is ``ConversionStatus.SUCCESS`` two files
    named after the input file's stem are written: ``<stem>.json`` (the dict
    rendering serialized as JSON) and ``<stem>.md`` (the Markdown rendering).
    Documents with any other status are reported on stdout and skipped. A
    summary line with the processed/failed counts is printed at the end.

    Args:
        converted_docs: Conversion results to export.
        output_dir: Directory receiving the exported files; it is created
            (including parents) if it does not exist.

    Returns:
        The stem of the last successfully exported document, or ``None`` when
        no document was converted successfully.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    # Pre-initialised so that an empty input, or a run in which every
    # conversion failed, returns None instead of raising UnboundLocalError.
    doc_filename = None

    for doc in converted_docs:
        if doc.status != ConversionStatus.SUCCESS:
            print(f"Document {doc.input.file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        doc_filename = doc.input.file.stem

        # Export Deep Search document JSON format:
        with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc.render_as_dict()))

        # Export Markdown format (explicit UTF-8 so non-ASCII text does not
        # depend on the platform's default encoding):
        with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
            fp.write(doc.render_as_markdown())

    print(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )

    return doc_filename

In [None]:
from dotenv import load_dotenv
import os

# Load settings from the .env file; override=True lets values from that file
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Fail early with a readable message instead of the opaque TypeError that
# Path(None) raises when one of the variables is not set.
_missing_vars = [
    name for name in ("INPUT_DIR", "OUTPUT_DIR") if os.getenv(name) is None
]
if _missing_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_vars)}. "
        "Define them in your .env file or in the environment."
    )

# Access the variables
input_dir = Path(os.getenv('INPUT_DIR'))
output_dir = Path(os.getenv('OUTPUT_DIR'))
# Make sure the output directory exists before any cell writes into it.
os.makedirs(output_dir, exist_ok=True)

### Setup Instructions

This demo walks through converting raw PDF files into InstructLab Synthetic Knowledge Infusion Data, using the RBC POC as an example. Follow these steps to get started with your own data.

#### Steps to Get Started:

1. **Organize Your Documents:**
   - Create a new directory under the `document_collection` directory for your specific project. For example, if your project is named "my_org," your directory structure should look like this:
     ```
     |-- document_collection
     |   `-- my_org
     |       |-- my_org_data.pdf
     |       `-- qna.yaml
     ```
   - Place all your PDF files and ICL files (like `qna.yaml`) into this directory.

2. **Format Your ICLs:**
   - Ensure your ICL files contain sufficient context and question-answer pairs. We recommend including at least 5 distinct contexts, each with a minimum of 3 sets of questions and answers. More entries will improve the robustness of your data.
   - The ICL file should be in the following format (refer to the `document_collection/my_org/qna.yaml` file for an example):

     ```yaml
     domain:
     document_outline: A one to two line description of the document
     seed_examples:
       - context: <context 1 goes here>
         question_and_answers:
           - question: <question 1 goes here>
             answer: <answer 1 goes here>
           - question: <question 2 goes here>
             answer: <answer 2 goes here>
           - question: <question 3 goes here>
             answer: <answer 3 goes here>
     ...
     ```

   - **Note:** Replace placeholders with actual content relevant to your documents. Ensure the contexts are clear and questions are well-formulated to extract meaningful answers.

3. **Update the Data Directory Path:**
   - The notebook reads the input directory from the `INPUT_DIR` environment variable (loaded from your `.env` file) and stores it in the `input_dir` variable. Set `INPUT_DIR` to the path of your new directory. For example:
     ```
     INPUT_DIR=document_collection/my_org
     ```
4. **Update the Output Directory Path:**
   - The notebook reads the output directory from the `OUTPUT_DIR` environment variable (loaded from your `.env` file) and stores it in the `output_dir` variable. Set `OUTPUT_DIR` to the directory where the generated files should be written. For example:
     ```
     OUTPUT_DIR=output/my_org
     ```
---

### PDF Documents to Seed Dataset

To convert PDF documents into a usable seed dataset, we employ [Docling](https://github.com/DS4SD/docling), a tool designed for extracting and processing text from PDF files. The text extraction process involves parsing the PDF documents and saving the extracted text into a structured JSON file. The extracted text in JSON format can be used to generate InstructLab Synthetic Knowledge Infusion Data.


#### Step 1: 

Run the following cell to extract text from the PDF documents and save it in JSON format:

⚠️ **Note:** This process takes about 5 minutes to run for this example.


In [None]:
# Collect the PDFs in a stable order (glob order is otherwise arbitrary) and
# stop right away with a clear message when there is nothing to convert.
file_paths = sorted(input_dir.glob("*.pdf"))
if not file_paths:
    raise FileNotFoundError(f"No PDF files found in {input_dir}")

# NOTE(review): presumably fetches the Docling model artifacts and returns
# their local path — confirm against the Docling documentation.
artifacts_path = DocumentConverter.download_models_hf()
doc_converter = DocumentConverter(artifacts_path=artifacts_path)
inputs = DocumentConversionInput.from_paths(file_paths)

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments. The export is timed together with the conversion.
start_time = time.perf_counter()
converted_docs = doc_converter.convert(inputs)
doc_filename = export_documents(converted_docs, output_dir)
end_time = time.perf_counter()

print(f"Parsing documents took {end_time - start_time:.2f} seconds")

# Build the seed dataset from the exported files in output_dir, using the
# qna.yaml that sits next to the input PDFs as the user configuration.
dp = DocProcessor(output_dir, user_config_path=f'{input_dir}/qna.yaml')
seed_data = dp.get_processed_dataset()

# Single definition of the JSONL path, used for writing here and available to
# the cells that read the file back.
jsonl_file_path = f"{output_dir}/seed_data.jsonl"
seed_data.to_json(jsonl_file_path, orient='records', lines=True)

# Directory that receives the per-document Markdown files.
md_output_dir = f"{output_dir}/md"
os.makedirs(md_output_dir, exist_ok=True)

### Convert JSONL to markdown files

In [None]:
import pandas as pd
import os
import json

# Markdown exports go into an "md" sub-folder of the output directory;
# make sure that folder is present before anything is written to it.
md_output_dir = f"{output_dir}/md"
Path(md_output_dir).mkdir(parents=True, exist_ok=True)


In [None]:
def save_document(index, document_text):
    """Write ``document_text`` to a numbered Markdown file in ``md_output_dir``.

    The file is named ``document_<index + 1>.md`` (so a zero-based ``index``
    yields one-based file names) and an existing file of that name is
    overwritten. The saved path is printed to stdout.

    Args:
        index: Zero-based position of the document.
        document_text: Text to write to the file.
    """
    file_name = f"document_{index+1}.md"
    file_path = os.path.join(md_output_dir, file_name)

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # non-ASCII characters in the document text.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(document_text)

    print(f"Saved {file_path}")


In [None]:
# Path of the seed_data.jsonl file inside the output directory.
jsonl_file_path = f"{output_dir}/seed_data.jsonl"

In [None]:
# Write every distinct 'document' text found in the JSONL file to its own
# Markdown file, numbering the files in order of first appearance.
with open(jsonl_file_path, 'r', encoding='utf-8') as f:
    # Track the texts themselves rather than their hash() values: two different
    # documents may share a hash, and the second one would then be dropped.
    seen_documents = set()
    doc_index = 0
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing in json.loads.
            continue
        entry = json.loads(line)
        document_text = entry.get('document', '')
        if document_text in seen_documents:
            continue
        seen_documents.add(document_text)
        save_document(doc_index, document_text)
        doc_index += 1