In [46]:
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions


In [47]:

source = r"sample.pdf"  # document per local path or URL
converter = DocumentConverter()
result = converter.convert(source)
# print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"


Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAt

In [48]:
with open('sample.md', 'w', encoding='utf-8') as f:
    f.write(result.document.export_to_markdown())

In [49]:
from collections import defaultdict

element_types = defaultdict(list)

# Iterate through all document elements and group them by label
for item, _ in result.document.iterate_items():
    element_type = item.label
    element_types[element_type].append(item)

# Display the breakdown of document structure
print("Document structure breakdown:")
for element_type, items in element_types.items():
    print(f"  {element_type}: {len(items)} elements")

Document structure breakdown:
  picture: 1 elements
  section_header: 4 elements
  text: 15 elements
  list_item: 14 elements


In [50]:
doc = result.document # DoclingDocument

In [51]:
json_dict = doc.export_to_dict()

json_dict.keys()

dict_keys(['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'form_items', 'pages'])

In [52]:
# Optimized for large documents
pipeline_options = PdfPipelineOptions(
    max_num_pages=4,  # Limit processing to first 4 pages
    page_range=[1, 3],  # Process specific page range
    generate_page_images=False,  # Skip page images to save memory
    do_table_structure=False,  # Skip table structure extraction
    enable_parallel_processing=True  # Use multiple cores
)

In [53]:
from docling.chunking import HybridChunker

# Process with HybridChunker (token-aware)
hybrid_chunker = HybridChunker(max_tokens=512, overlap_tokens=50)
hybrid_chunks = list(hybrid_chunker.chunk(doc))

print(f"HybridChunker: {len(hybrid_chunks)} chunks")

def print_chunk(chunk):
    print(f"Chunk length: {len(chunk.text)} characters")
    if len(chunk.text) > 30:
        print(f"Chunk content: {chunk.text[:30]}...{chunk.text[-30:]}")
    else:
        print(f"Chunk content: {chunk.text}")
    print("-" * 50)

# Print the first 3 chunks
for chunk in hybrid_chunks[:5]:
    print_chunk(chunk)


Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors


HybridChunker: 6 chunks
Chunk length: 831 characters
Chunk content: TORONTO, July 15, 2025 /CNW/ I....isoenergy.ca/sustainability/.
--------------------------------------------------
Chunk length: 2241 characters
Chunk content: - Environmental achievements i...sustainability policy in 2025.
--------------------------------------------------
Chunk length: 876 characters
Chunk content: IsoEnergy (NYSE American: ISOU...@IsoEnergyLtd www.isoenergy.ca
--------------------------------------------------
Chunk length: 956 characters
Chunk content: This press release contains "f...ll or may occur in the future.
--------------------------------------------------
Chunk length: 1443 characters
Chunk content: Forw ard-looking statements ar...n forw ard-looking statements.
--------------------------------------------------


### Conclusion

- Docling gives great results 
- the advanced pipeline options should be explored for larger docs
- hybrid chunking seems to be the right approach

### NLP based approach for data extraction
- faster, no-cost soln as compared to llm-based extraction

In [54]:
import spacy
import re

In [55]:
# ---------- NLP SETUP ----------
nlp = spacy.load("en_core_web_trf")

# ESG keywords & patterns
ESG_KEYWORDS = [
    "emissions", "carbon", "CO2", "renewable", "sustainable", "ESG",
    "waste", "diversity", "gender", "compliance", "regulation", "ISO",
    "investment", "energy", "greenhouse", "climate"
]

PATTERNS = [
    re.compile(r"reduced\s+\w+\s+emissions\s+by\s+\d+%.*", re.IGNORECASE),
    re.compile(r"complies\s+with\s+.+", re.IGNORECASE),
    re.compile(r"invest(ed|ment)\s+in\s+.+", re.IGNORECASE),
    re.compile(r"achiev(ed|ement)\s+of\s+\d+%.*", re.IGNORECASE)
]

In [56]:
def rule_based_extract(text):
    """Apply keyword filtering and regex to extract candidate ESG sentences."""
    doc = nlp(text)
    candidates = []
    uncertain = []

    for sent in doc.sents:
        sent_text = sent.text.strip()
        if any(k in sent_text.lower() for k in ESG_KEYWORDS):
            if any(p.search(sent_text) for p in PATTERNS):
                # Direct extraction from rule
                candidates.append({
                    "subject": "Unknown",  # Will refine later
                    "predicate": "esg_fact",
                    "object": sent_text
                })
            else:
                # Keyword found but no exact match → send to LLM
                uncertain.append(sent_text)

    return candidates, uncertain

In [57]:
text = hybrid_chunks[1].text

In [58]:
hits, uncertain = rule_based_extract(text)

In [59]:
text

'- Environmental achievements include reducing impact and advancing operational readiness.\n- Achieved zero significant environmental incidents across all projects in 2024.\n- Initiated baseline environmental studies at Larocque East to guide future permitting and project design with a view to minimizing ecological impacts.\n- Reclaimed all active exploration sites, and enhanced waste management practices with the goal of reducing material waste.\n- Improved water efficiency and reduced emissions at Tony M Mine through targeted infrastructure upgrades.\n- Partnered with communities, establishing investments in people.\n- Maintained strong Indigenous representation in the workforce with 63% at Matoush and 36% at Larocque East.\n- Supported community well-being initiatives, including the Northlands College Scholarship Foundation and JZ Memorial Fund.\n- Implemented Company-wide policies on Health and Safety, Respectful Workplace, and Diversity.\n- Standardized incident investigation repo

In [60]:
uncertain

['- Reclaimed all active exploration sites, and enhanced waste management practices with the goal of reducing material waste.\n- Improved water efficiency and reduced emissions at Tony M Mine through targeted infrastructure upgrades.\n- Partnered with communities, establishing investments in people.\n- Maintained strong Indigenous representation in the workforce with 63% at Matoush and 36% at Larocque East.\n- Supported community well-being initiatives, including the Northlands College Scholarship Foundation and JZ Memorial Fund.\n- Implemented Company-wide policies on Health and Safety, Respectful Workplace, and Diversity.\n- Standardized incident investigation reporting across all exploration projects.\n- Strengthened governance with new policies for sustainable growth.\n- Adopted new oversight measures, including Corporate Governance Guidelines and Majority Voting Policy.',
 '- Achieved 100% Code of Ethics compliance, reinforcing a culture of integrity.\n- Continued strong Board ove

spacy has failed miserably..

trying out specialized ESGBert models

In [61]:
### MAKE SURE TO INSTALL THIS LIB: !pip install transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline # for using the models

### Load the models (takes ca. 1 min)
# Environmental model.
name = "ESGBERT/EnvironmentalBERT-environmental" # path to download from HuggingFace
# In simple words, the tokenizer prepares the text for the model and the model classifies the text-
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)
# The pipeline combines tokenizer and model to one process.
pipe_env = pipeline("token-classification", model=model, tokenizer=tokenizer)

# Also load the social and governance model.
# Social model.
name = "ESGBERT/SocialBERT-social"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)
pipe_soc = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Governance model.
name = "ESGBERT/GovernanceBERT-governance"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)
pipe_gov = pipeline("text-classification", model=model, tokenizer=tokenizer)

Device set to use cuda:0
The model 'RobertaForSequenceClassification' is not supported for token-classification. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DiffLlamaForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GemmaForTokenClassification', 'Gemma2ForTokenClassification', 'GlmForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassificat