# Project Scope

# Setup Environment

## a. Folder structure

In [95]:
import os, sys, subprocess
import warnings
import json
from pathlib import Path
from datetime import datetime

# Set enviroment varaible
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# Jupyter-specific
from IPython.display import display, Markdown


In [2]:
# Project structure
PROJECT_ROOT = Path.cwd().parent
DATA_DIR     = PROJECT_ROOT / "data"
SRC_DIR      = PROJECT_ROOT / "src"

# Add project src/ directory to PYTHONPATH
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

## b. Load Processor

In [3]:
# IMPORTANT: This section is required to setup a hybrid workflow between Google Colab or local venv
import curl_cffi
print('curl session var =', curl_cffi.requests.Session(impersonate="chrome"), flush=True)

warnings.filterwarnings("ignore")

curl session var = <curl_cffi.requests.session.Session object at 0x10893a0f0>


In [98]:
# --- Detect Colab vs Local ---
def _is_colab() -> bool:
    """Return True when running inside Google Colab."""
    try:
        import google.colab  # type: ignore
        return True
    except Exception:
        return False

IN_COLAB = _is_colab()

print("IN_COLAB =", IN_COLAB)
print("Python   =", sys.version.split()[0])
print("Platform =", sys.platform)

IN_COLAB = False
Python   = 3.12.8
Platform = darwin


In [99]:
# --- Mount Drive first (Colab only) ---
if IN_COLAB:
    from google.colab import drive
    COLAB_DRIVE_MOUNT = "/content/drive"
    drive.mount(COLAB_DRIVE_MOUNT, force_remount=True)

    os.chdir(f"{COLAB_DRIVE_MOUNT}/{NOTEBOOK_DIR}")

In [100]:
# --- Device detection --
DEVICE = "cpu"
DEVICE_INFO = {
    "runtime": runtime_env,
    "framework": None,
    "name": None,
    "count": 0,
}

# Try PyTorch first
try:
    import torch
    if torch.cuda.is_available():
        DEVICE = "cuda"
        DEVICE_INFO.update({
            "framework": "torch",
            "name": torch.cuda.get_device_name(0),
            "count": torch.cuda.device_count(),
        })
    elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        DEVICE = "mps"
        DEVICE_INFO.update({
            "framework": "torch",
            "name": "Apple Metal (M1/M2)",
            "count": 1,
        })
except ImportError:
    pass

# If still CPU, try TensorFlow
if DEVICE == "cpu":
    try:
        import tensorflow as tf
        gpus = tf.config.list_physical_devices("GPU")
        if gpus:
            DEVICE = "gpu"
            DEVICE_INFO.update({
                "framework": "tensorflow",
                "name": gpus[0].device_type,
                "count": len(gpus),
            })
    except ImportError:
        pass

# --- Log the result ---
print(f"üîé Runtime   : \t{DEVICE_INFO['runtime']}")
print(f"üíª Framework : \t{DEVICE_INFO['framework'] or 'none'}")
print(f"üñ•Ô∏è Device    : \t{DEVICE} ({DEVICE_INFO['name'] or 'generic CPU'})")
print(f"üî¢ Count     : \t{DEVICE_INFO['count']} \n")
print(f"... happy coding! \n")

üîé Runtime   : 	Local
üíª Framework : 	torch
üñ•Ô∏è Device    : 	mps (Apple Metal (M1/M2))
üî¢ Count     : 	1 

... happy coding! 



## c. Import libraries

In [102]:
# --- Dependency install (only in Colab) ---
if IN_COLAB:
    required_pkgs = [
        "docling==2.70.0",
        "chromadb==1.4.1",
        "langchain==1.2.7",
        "langchain-core==1.2.7",
        "langchain-community==0.4.1",
        "langchain-text-splitters==1.1.0",
        "langchain-ollama==1.0.1",
        "ollama==0.6.1",
        "requests==2.32.5",
        "pandas==2.3.3",
        "tqdm==4.67.1",
    ]

    # Install quietly but show errors if anything fails
    print("\nInstalling core dependencies for Colab...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *required_pkgs])

    print("Core dependencies installed.")

    # NOTE:
    # Ollama server is not typically available in Colab by default.
    # If your notebook uses ChatOllama/OllamaEmbeddings, you‚Äôll need a reachable Ollama endpoint.
    # For Colab demos, consider switching to a hosted model provider or a local embedding model.
    
else:
    print("\nLocal environment detected ‚Äî skipping pip install.")
    print("Use Poetry/venv to manage dependencies (requirements.txt is for reproducibility).\n")


Local environment detected ‚Äî skipping pip install.
Use Poetry/venv to manage dependencies (requirements.txt is for reproducibility).



In [103]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [104]:
# data libraries
import pandas as pd
import numpy as np

from typing import Dict, Tuple, List, Optional, Literal, Any

In [105]:
# LLM
import textwrap

# LanchChain
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# 1. Ingesting and Indexing Data

**Workflow Overview**

This section describes the ingestion and indexing workflow, where raw PDF documents are converted into an index-ready representation for downstream retrieval, metadata extraction, and RAG-based question answering.

The goal at this stage is grounding, not interpretation. Documents are normalized into text with stable identifiers and provenance, regardless of whether they are digitally generated PDFs or scanned reports requiring OCR. Each document is processed independently to ensure robustness, traceability, and reproducibility.

The output of this step is a collection of documents that:

- have a deterministic `document ID`

- expose a canonical text representation

- retain source-level metadata

- are ready for chunking, embedding, and vector indexing

This creates a clean boundary between data ingestion and semantic reasoning, allowing later stages of the pipeline to remain decoupled from document format and extraction details.

In [55]:
# import helper functions
import pdf_ingestor.ingest as ingest
import pdf_ingestor.retrieval as retrieval

In [57]:
retrieval.assert_ollama_running()

Ollama is running!


## Step 1.1 ‚Äî Document Discovery & Normalisation

We begin by discovering PDF files under a dataset directory and establishing stable identity and provenance.

Key outputs:
- a batch of `LoadedDocument` objects
- stable `doc_id` per file (hash-based)
- path-derived metadata (e.g., `year`)
- ingestion summary (success/failure) without stopping the batch


In [16]:
# Define the dataset subset to ingest (keeps notebook runs fast and focused)
DATASET_DIR = DATA_DIR / "WEF" / "2026"

# Show a quick preview of what's inside the dataset folder (first ~20 items)
print("Dataset path:", DATASET_DIR)
for p in sorted(DATASET_DIR.iterdir())[:20]:
    print(" -", p.name)

# Ingest PDFs recursively from the dataset directory
documents = ingest.load_files_from_path(str(DATASET_DIR))

# Summarize ingestion outcome
total = len(documents)
successful = sum(doc.ok for doc in documents)
failed = total - successful

print("\nIngestion summary")
print(f"Loaded {total} documents")
print(f"‚úì Successful: {successful}")
print(f"‚úó Failed: {failed}")


Dataset path: /Users/pepetavo/Codes/Projects/datascience/portfolio/genai/pdf_ingestor/data/WEF/2026
 - .DS_Store
 - WEF_A_New_Era_for_Digital_Health_2026.pdf
 - WEF_Chief_Economists_Outlook_January_2026.pdf
 - WEF_From_Blueprint_to_Reality_2026.pdf
 - WEF_Fuelling_the_Future_2026.pdf
 - WEF_Industrial_Transformation_in_ASEAN_A_Cluster-Driven_Model_for_Regional_and_Global_Collaboration_2026.pdf


Processing files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [02:50<00:00, 34.16s/file]


Ingestion summary
Loaded 5 documents
‚úì Successful: 5
‚úó Failed: 0





In [19]:
# Print failure reasons (if any) to debug ingestion reliability
failed_docs = [d for d in documents if not d.ok]

if failed_docs:
    print("Failed documents (showing up to 10):")
    for d in failed_docs[:10]:
        print(f"\n[FAILED] {Path(d.source_path).name}")
        print("Path:", d.source_path)
        print("Status:", d.status)
        print("Reason:", d.error)
else:
    print("No failed documents [-Ok!-]")


No failed documents [-Ok!-]


## Step 1.2 ‚Äî Content Extraction via Docling

Each PDF is converted via Docling into a structured document representation and exported into a canonical text form (Markdown).

In this notebook, we validate extraction quality by:
- printing filename + metadata (e.g., year)
- showing a short text preview
- checking extracted character lengths


In [21]:
# Preview a few successfully extracted documents
ok_docs = [d for d in documents if d.ok]

print(f"Successful documents: {len(ok_docs)}")

for doc in ok_docs[:3]:
    # Extract a readable filename (avoid long absolute path)
    filename = Path(doc.source_path).name

    # Optional-safe access to extracted content
    content: Optional[str] = doc.content
    if content is None or not content.strip():
        preview = "[Empty content]"
        content_len = 0
    else:
        content_len = len(content)
        preview = textwrap.fill(content[:500].replace("\n", " "), width=95)

    # Optional metadata (path-derived)
    year = doc.metadata.get("year")

    print(f"\nüìÑ {filename}")
    print(f" - year: {year if year is not None else 'Unknown'}")
    print(f" - extracted chars: {content_len}")
    print("-" * 100)
    print(preview)


Successful documents: 5

üìÑ WEF_A_New_Era_for_Digital_Health_2026.pdf
 - year: 2026
 - extracted chars: 98326
----------------------------------------------------------------------------------------------------
In collaboration with Department of Health - Abu Dhabi  ## A New Era for Digital Health: Abu
Dhabi's Leap to Health Intelligence  WHI T E   P A P E R J A N U A R Y   2 0 2 6  <!-- image
-->  ## Contents  | Forewords

üìÑ WEF_From_Blueprint_to_Reality_2026.pdf
 - year: 2026
 - extracted chars: 122383
----------------------------------------------------------------------------------------------------
In collaboration with Oliver Wyman  ## From Blueprint to Reality: A Stronger Business Case for
Shared Energy Infrastructure  L E A R N I N G S   F R O M   T R A N S I T I O N I N G I N D U S
T R I A L   C L U S T E R S  WHI T E   P A P E R J A N U A R Y   2 0 2 6  <!-- image -->  ##
Contents  | Foreword          | Foreword
| Foreword                                                 

## Step 1.3 ‚Äî Wrapper Abstraction (`LoadedDocument`)

We wrap Docling outputs into a stable `LoadedDocument` interface so downstream code does not depend on Docling internals.

Benefits:
- consistent `.ok` contract for batch pipelines
- stable `.text()` interface for indexing
- chunk generator `.iter_chunks()` with provenance


In [22]:
# Inspect the wrapper object for one successful document
if ok_docs:
    sample_doc = ok_docs[0]

    print("Wrapper inspection")
    print("doc_id:", sample_doc.doc_id)
    print("source:", sample_doc.source_path)
    print("ok:", sample_doc.ok)
    print("metadata keys:", list(sample_doc.metadata.keys()))

    # Canonical text view (used for chunking/indexing)
    canonical_text = sample_doc.text()
    print("\nCanonical text preview (first 300 chars):")
    print(canonical_text[:300].replace("\n", " "))
else:
    print("No successful documents to inspect.")


Wrapper inspection
doc_id: 20b09c78e25cc45d
source: /Users/pepetavo/Codes/Projects/datascience/portfolio/genai/pdf_ingestor/data/WEF/2026/WEF_A_New_Era_for_Digital_Health_2026.pdf
ok: True
metadata keys: ['doc_id', 'source_path', 'year', 'ocr_performed', 'processing_notes', 'conversion_status']

Canonical text preview (first 300 chars):
In collaboration with Department of Health - Abu Dhabi  ## A New Era for Digital Health: Abu Dhabi's Leap to Health Intelligence  WHI T E   P A P E R J A N U A R Y   2 0 2 6  <!-- image -->  ## Contents  | Forewords                                                                                     


## Step 1.4 ‚Äî Converting to LangChain Documents (LangChain Adapter)

At this stage, PDFs have already been ingested and normalized into a framework-agnostic `LoadedDocument` representation.

To use the LangChain ecosystem (splitters, vector stores, retrievers), we now convert each successful `LoadedDocument` into a LangChain `Document`.

This step is intentionally performed at the notebook level to:
- keep ingestion independent of orchestration frameworks
- make framework choices explicit and auditable
- enable side-by-side comparison between custom chunking and LangChain-based pipelines


In [41]:
# Container for LangChain documents (each represents one ingested PDF as a "document")
lc_documents: List[Document] = []

# Iterate over ingestion outputs (LoadedDocument wrappers)
for doc in documents:
    # Skip documents that failed ingestion/conversion
    if not doc.ok:
        continue

    # Optional-safe: extracted content should be a non-empty string
    # (In our pipeline this is typically Markdown exported from Docling)
    text = doc.content
    if not isinstance(text, str) or not text.strip():
        continue

    # Convert to a LangChain Document with preserved provenance metadata
    lc_documents.append(
        Document(
            page_content=text,
            metadata={
                # Stable ID for de-duplication / traceability
                "doc_id": doc.doc_id,
                # Full path allows full provenance trace-back
                "source_path": doc.source_path,
                # Filename is convenient for notebooks and UI
                "filename": Path(doc.source_path).name,
                # Optional metadata (derived from folder structure in your loader)
                "year": doc.metadata.get("year"),
            },
        )
    )

print(f"Converted {len(lc_documents)} documents to LangChain format.")


Converted 5 documents to LangChain format.


## Step 1.5 ‚Äî Chunk Preparation for Indexing (LangChain Splitter)

Vector search operates on **chunks**, not full documents.  
We therefore split each LangChain `Document` into overlapping text chunks.

Design goals:
- deterministic chunking (reproducible results)
- overlap to preserve context across chunk boundaries
- provenance preserved through LangChain `metadata` propagation
- chunk size tuned for embedding + retrieval (not for summarization)


In [42]:
# Define a deterministic chunking strategy
# - separators define preferred breakpoints (paragraphs ‚Üí lines ‚Üí spaces ‚Üí characters)
# - overlap preserves context across boundaries
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=150,
    separators=["\n\n", "\n", " ", ""],
)

# Split LangChain documents into chunk-level LangChain Documents
lc_chunks = splitter.split_documents(lc_documents)

# Basic sanity checks to confirm chunking worked
print("Chunks:", len(lc_chunks))

# Optional-safe: only print example metadata if at least one chunk exists
if lc_chunks:
    print("Example chunk metadata:", lc_chunks[0].metadata)
    print("Example chunk preview:", lc_chunks[0].page_content[:250].replace("\n", " "))
else:
    print("No chunks produced (check upstream extraction).")


Chunks: 817
Example chunk metadata: {'doc_id': '20b09c78e25cc45d', 'source_path': '/Users/pepetavo/Codes/Projects/datascience/portfolio/genai/pdf_ingestor/data/WEF/2026/WEF_A_New_Era_for_Digital_Health_2026.pdf', 'filename': 'WEF_A_New_Era_for_Digital_Health_2026.pdf', 'year': 2026}
Example chunk preview: In collaboration with Department of Health - Abu Dhabi  ## A New Era for Digital Health: Abu Dhabi's Leap to Health Intelligence  WHI T E   P A P E R J A N U A R Y   2 0 2 6  <!-- image -->  ## Contents


## Step 1.6 ‚Äî Index-Ready Output (Handoff Boundary)

At this stage the pipeline has produced a **normalized, index-ready representation** of the document corpus:

- per-document text in LangChain `Document` format (`lc_documents`)
- per-chunk text units (`lc_chunks`) with stable provenance and metadata
  (doc_id, filename, source_path, year, etc.)

No embeddings or vector indexing are performed yet. 

This step is a **clean handoff boundary** between document preparation and semantic processing. Downstream stages can now safely perform:

- embedding generation
- vector database construction (e.g., Chroma)
- retrieval validation
- metadata extraction and RAG-based question answering


## Step 1.7 ‚Äî Embedding Model (Local)

We now select an embedding model to convert each chunk into a vector representation.

In this project we use **local embeddings via Ollama** to keep the pipeline:
- offline-friendly
- reproducible
- aligned with a local-first deployment scenario


In [58]:
# Create an embeddings client backed by Ollama (local)
# Ensure the embedding model is available in Ollama (e.g., `nomic-embed-text`)
embeddings = OllamaEmbeddings(model="nomic-embed-text")

print("Embeddings initialized:", embeddings)


Embeddings initialized: base_url='http://localhost:11434' model='nomic-embed-text' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None show_progress=False headers=None model_kwargs=None


## Step 1.8 ‚Äî Vector Database (Chroma)

We build a local Chroma vector database from the chunked documents.

What this stores:
- chunk embeddings (vectors)
- the chunk text (`page_content`)
- metadata for provenance and filtering (year, filename, doc_id, etc.)

We also persist the database to disk so it can be reused without re-embedding.


In [60]:
# Define where Chroma should persist on disk (local-first)
CHROMA_DIR = f"{DATA_DIR}/index/chroma_wef"

# Build the vector store from chunk documents
# - Chroma will embed each chunk using the embeddings model
# - metadata is stored alongside vectors for filtering and traceability
vectordb = Chroma.from_documents(
    documents=lc_chunks,
    embedding=embeddings,
    persist_directory=CHROMA_DIR,
    collection_name="wef_reports",
)

# Persist to disk to avoid rebuilding the index in future runs
vectordb.persist()

print("Chroma saved to:", CHROMA_DIR)
print("Collection:", "wef_reports")
print("Chunks indexed:", len(lc_chunks))


Chroma saved to: /Users/pepetavo/Codes/Projects/datascience/portfolio/genai/pdf_ingestor/data/index/chroma_wef
Collection: wef_reports
Chunks indexed: 817


# 2. Retrieve Information

In [79]:
# retriever libraries
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_classic.retrievers.multi_query import MultiQueryRetriever

# chain composition
from langchain_classic.schema.runnable import RunnablePassthrough
from langchain_classic.schema.output_parser import StrOutputParser

## Step 2.1 ‚Äî Language Model Setup (Local LLM)

In this section we configure the local Large Language Model (LLM) used for:
- query rewriting (to improve retrieval recall)
- answer generation in the RAG pipeline

The model is served locally via **Ollama**, keeping the workflow:
- offline-friendly
- reproducible
- suitable for on-prem or air-gapped environments


In [84]:
# Define the local LLM model to use
MODEL_LLAMA = "llama3.2"

# Preflight check: ensure Ollama + model are available
retrieval.assert_ollama_model_available(MODEL_LLAMA)

# Instantiate the chat-based LLM
llm = ChatOllama(
    model=MODEL_LLAMA,
    temperature=0.0,  # deterministic output for retrieval + QA
)

print("LLM initialized successfully:", MODEL_LLAMA)


‚úì Ollama model 'llama3.2' resolved to 'llama3.2:latest'.
LLM initialized successfully: llama3.2


## Step 2.2 ‚Äî Query Rewriting for Improved Retrieval

Vector similarity search can miss relevant documents if the query wording
does not align with the indexed text.

To mitigate this, we use **Multi-Query Retrieval**, where the LLM generates
multiple alternative phrasings of the user question.

These alternative queries are used to retrieve a broader and more diverse
set of relevant chunks from the vector database.


In [67]:


# Prompt that instructs the LLM to generate alternative versions of a query
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""
You are an AI assistant tasked with improving document retrieval.

Generate 3 alternative versions of the user's question that capture
different perspectives or phrasings, while preserving the original intent.

Return each alternative on a new line.

Original question: {question}
""",
)

print("Multi-query prompt ready.")


Multi-query prompt ready.


## Step 2.3 ‚Äî Retriever Configuration

We now configure the retriever that connects the vector database to the LLM.

The **MultiQueryRetriever**:
- takes a user question
- generates alternative queries using the LLM
- retrieves relevant chunks for each query
- merges and deduplicates the results

This improves recall compared to single-query similarity search.


In [71]:
# Convert Chroma vector store into a retriever interface
base_retriever = vectordb.as_retriever(
    search_kwargs={"k": 5}  # number of chunks per query
)

# Wrap the base retriever with multi-query logic
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

print("Retriever configured (MultiQueryRetriever).")


Retriever configured (MultiQueryRetriever).


## Step 2.4 ‚Äî RAG Prompt Template

The RAG prompt constrains the LLM to:
- answer **only** using retrieved context
- avoid hallucinations
- ground responses in indexed documents

This is critical for trust, traceability, and evaluation.


In [76]:
# Prompt template for RAG-based answering
rag_template = """
Answer the question using ONLY the context provided below.

<context>
{context}
</context>

Question: {question}

If the answer cannot be found in the context, say so explicitly.
"""

# Convert to a chat-compatible prompt
rag_prompt = ChatPromptTemplate.from_template(rag_template)

print("RAG prompt template ready.")


RAG prompt template ready.


## Step 2.5 ‚Äî Chain Composition

We now assemble the full Retrieval-Augmented Generation (RAG) chain.

The chain performs:
1. retrieval of relevant chunks (`retriever`)
2. injection of retrieved context into the prompt
3. answer generation via the LLM
4. output parsing into a clean string

This composition uses LangChain's Runnable interface for clarity and modularity.


In [80]:
# Compose the RAG chain
rag_chain = (
    {
        # Retrieved context comes from the retriever
        "context": retriever,
        # The original question is passed through unchanged
        "question": RunnablePassthrough(),
    }
    | rag_prompt   # Inject context + question into prompt
    | llm          # Generate answer using local LLM
    | StrOutputParser()  # Parse output to plain string
)

print("RAG chain assembled.")


RAG chain assembled.


# 3. Generate Responses

Finally, we execute the RAG pipeline with a sample question and inspect
the generated answer.

This step validates:
- retrieval quality
- grounding in document context
- end-to-end pipeline integrity


In [88]:
# helper function to wrap up discussion

def ask_me_about(question: str):
    """
    Chat with the agent using the RAG chain
    """
    return display(Markdown(rag_chain.invoke(question)))

In [89]:
# Example query relevant to the WEF corpus
question = """
    What are the main themes in the future of energy discussed in these reports?"
    In your response, please include the reference of the files evaluated for your
    response
"""

ask_me_about(question)

Based on the provided context, the main themes in the future of energy discussed in these reports are:

1. **Clean Fuels**: The reports emphasize the importance of clean fuels in reducing emissions and diversifying energy supply. They discuss various pathways for producing clean fuels, including liquid biofuels, biogases, lower-carbon fossil fuels, synthetic fuels, and hydrogen derivatives (WEF_Fuelling_the_Future_2026.pdf).
2. **Energy Transition**: The reports highlight the need for a rapid transition to clean energy sources, with a focus on reducing greenhouse gas emissions and mitigating climate change. They discuss the role of clean fuels in this transition and the importance of investing in emerging technologies (WEF_From_Blueprint_to_Reality_2026.pdf).
3. **Regional Realities**: The reports acknowledge that the role of clean fuels will differ by region, depending on local strengths, resources, and demand dynamics. They emphasize the need for a regional approach to evaluating competitiveness and guiding investment towards the most competitive and high-impact options (WEF_Fuelling_the_Future_2026.pdf).
4. **Industrial Transformation**: The reports discuss the importance of industrial transformation in energy ecosystems as a driver of competitiveness, innovation, and job creation. They highlight the need for innovative business models that mobilize capital at scale, direct resources efficiently, and deliver projects with widespread commercial and societal value (WEF_From_Blueprint_to_Reality_2026.pdf).

These themes are discussed across multiple reports, including WEF_Fuelling_the_Future_2026.pdf, WEF_A_New_Era_for_Digital_Health_2026.pdf, and WEF_From_Blueprint_to_Reality_2026.pdf.

References:

* WEF_Fuelling_the_Future_2026.pdf
* WEF_A_New_Era_for_Digital_Health_2026.pdf
* WEF_From_Blueprint_to_Reality_2026.pdf

In [91]:
# Cross-cutting strategic themes
question = """
What are the key strategic themes shaping the global energy transition across
the 2026 World Economic Forum reports?

Please synthesize insights across the documents and explicitly reference
which reports informed each theme in your response.
"""

ask_me_about(question)


Based on the provided context, the key strategic themes shaping the global energy transition across the 2026 World Economic Forum reports can be synthesized as follows:

1. **Clean Fuels and Energy Transition**: This theme is prominent across multiple reports, including "WEF_Fuelling_the_Future_2026.pdf", "WEF_From_Blueprint_to_Reality_2026.pdf", and "WEF_Industrial_Transformation_in_ASEAN_A_Cluster-Driven_Model_for_Regional_and_Global_Collaboration_2026.pdf". These reports emphasize the importance of clean fuels, such as biofuels, hydrogen derivatives, and lower-carbon fossil fuels, in driving a more secure, affordable, and sustainable energy system. They also highlight the need for industrial transformation to unlock competitiveness, innovation, and job creation.

2. **Industrial Transformation and Cluster Development**: This theme is explored in "WEF_From_Blueprint_to_Reality_2026.pdf" and "WEF_Industrial_Transformation_in_ASEAN_A_Cluster-Driven_Model_for_Regional_and_Global_Collaboration_2026.pdf". The reports discuss the need for innovative business models, mobilizing capital at scale, and directing resources efficiently to deliver projects that provide widespread commercial and societal value. They also highlight the importance of transitioning industrial clusters to drive competitiveness, innovation, and job creation.

3. **Global Cooperation and Collaboration**: This theme is evident in "WEF_Fuelling_the_Future_2026.pdf" and "WEF_Industrial_Transformation_in_ASEAN_A_Cluster-Driven_Model_for_Regional_and_Global_Collaboration_2026.pdf". The reports emphasize the need for greater alignment, coordination, and practical mechanisms for collective action to address the global energy transition. They also highlight the importance of partnerships across the value chain, engagement with financiers early on, and adopting new investment models.

4. **Sustainability and Emissions Reduction**: This theme is present in "WEF_Fuelling_the_Future_2026.pdf" and "WEF_From_Blueprint_to_Reality_2026.pdf". The reports discuss the potential of clean fuel pathways to reduce emissions by at least 50% compared to conventional fuels, with some optimal set-ups achieving up to 90%. They also highlight the importance of full lifecycle carbon intensity, feedstock type, crop yield, and agricultural practices in driving emissions.

While these themes are not explicitly stated as "key strategic themes" in a single report, they are consistently discussed across multiple reports, providing a comprehensive understanding of the global energy transition in 2026.

In [92]:
# Energy √ó Industrial transformation

question = """
How is the energy transition linked to industrial transformation and regional
competitiveness in the 2026 World Economic Forum reports?

In your answer, explain the role of energy systems in enabling industrial
change and cite the specific WEF reports used.
"""

ask_me_about(question)


According to the 2026 World Economic Forum reports, the energy transition is linked to industrial transformation and regional competitiveness. The reports highlight that industry is the economic engine of ASEAN, the largest energy consumer among the end-use sector, and the second-largest CO2 emitter.

The reports emphasize that industrial clusters play a crucial role in advancing both ASEAN's industrial transformation and overall energy transition. By examining the factors shaping ASEAN's energy transition, it becomes clear what influences progress and enables cluster development.

Specifically, the reports mention that:

* Industry accounts for around 47% of South-East Asia's total final energy consumption, with industrial energy demand expected to rise by 65% by 2050 from 2023.
* Industrial clusters are key levers to advance both ASEAN's industrial transformation and overall energy transition.
* The region's next leap requires moving from project-by-project deals to coordinated, system-level investment - where grids, industrial clusters, and supply chains work in sync to make low-carbon growth bankable.

These points are mentioned in the following WEF reports:

* "Industrial Transformation in ASEAN: A Cluster-Driven Approach" (2026)
* "ASEAN's Energy Transition: A Catalyst for Industrial Transformation" (2026)
* "Fuelling the Future of Industry in Southeast Asia" (2026)

The reports also highlight that ASEAN's energy transition will not hinge solely on technology availability, but on the credibility and predictability of its financial and policy architecture.

Sources:

* World Economic Forum. (2026). Industrial Transformation in ASEAN: A Cluster-Driven Approach.
* World Economic Forum. (2026). ASEAN's Energy Transition: A Catalyst for Industrial Transformation.
* World Economic Forum. (2026). Fuelling the Future of Industry in Southeast Asia.

Note: The exact titles and sources may vary depending on the specific report and edition.

In [93]:
# Policy, economics, and execution gap

question = """
According to the 2026 World Economic Forum reports, what are the main economic
and policy barriers to accelerating the energy transition, and how do these
reports suggest bridging the gap between strategy and implementation?

Please reference the specific documents that support each point.
"""

ask_me_about(question)

Based on the provided context, the 2026 World Economic Forum reports identify the following main economic and policy barriers to accelerating the energy transition:

1. **Infrastructure barriers**: The reports highlight the need for significant investment in infrastructure to support the transition to a low-carbon economy. This includes upgrading existing infrastructure, building new green infrastructure, and developing smart grids.

Document: "ASEAN will need $11.9 trillion by 2050 to fully transition across energy sectors." (WEF_Industrial_Transformation_in_ASEAN_A_Cluster-Driven_Model_for_Regional_and_Global_Collaboration_2026.pdf)

2. **Human capital and workforce transition**: The reports note that the education system in ASEAN countries lags behind industry needs, leading to a skills gap in the energy sector.

Document: "Human capital and workforce transition: As education systems lag behind industry needs, with limited curricula, instructor shortages and weak academia-industry-government coordination, fragmented funding continues to widen the region's energy skills gap." (WEF_Industrial_Transformation_in_ASEAN_A_Cluster-Driven_Model_for_Regional_and_Global_Collaboration_2026.pdf)

3. **Financial barriers**: The reports highlight the need for stronger policy alignment and market coherence across member states, as well as the importance of mobilizing finance to scale up low-carbon industrial projects.

Document: "Addressing these barriers is an economic and competitiveness imperative. Investment will ultimately follow returns, so scaling-up low-carbon industrial projects depends on providing credible profitability and risk-adjusted performance." (WEF_From_Blueprint_to_Reality_2026.pdf)

4. **Policy alignment and market coherence**: The reports emphasize the need for stronger policy alignment and market coherence across member states to support the energy transition.

Document: "Achieving ASEAN's energy transition requires stronger policy alignment and market coherence across member states. Harmonizing technical and market standards such as grid codes, tariff structures and REC mechanisms while gradually rebalancing fossil fuel incentives and developing a common carbon pricing and disclosure framework will strengthen investor confidence." (WEF_Industrial_Transformation_in_ASEAN_A_Cluster-Driven_Model_for_Regional_and_Global_Collaboration_2026.pdf)

To bridge the gap between strategy and implementation, the reports suggest:

1. **Building collaborative ecosystems**: The reports propose building industrial clusters as collaborative ecosystems that can test and demonstrate clean technologies, reduce risks, and enhance project bankability.

Document: "Industrial clusters can serve as collaborative ecosystems and testbeds for clean technologies like renewables, hydrogen and shared carbon capture networks reducing risks and cost." (WEF_Industrial_Transformation_in_ASEAN_A_Cluster-Driven_Model_for_Regional_and_Global_Collaboration_2026.pdf)

2. **Mobilizing finance**: The reports suggest mobilizing finance to scale up low-carbon industrial projects, including through innovative financing mechanisms such as the CFIL (Climate Finance Innovation Lab).

Document: "Well-designed financial mechanisms connect global capital with local opportunities. Clusters provide the proof-of-concept platforms financiers need to translate roadmaps into investable projects." (SARABURI SANDB@X)

3. **Institutionalizing climate finance innovation**: The reports propose institutionalizing climate finance innovation in Malaysia through the CFIL, which can help bridge the gap between strategy and implementation.

Document: "Bank Negara Malaysia: institutionalizing climate finance innovation in Malaysia" (CASE STUDY 9)

# End of session

The code below is meant to clean up the database when done.

In [94]:
vectordb.delete_collection()
print("Vector db successfully deleted!")

Vector db successfully deleted!
