In [1]:
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.chunking import HybridChunker

Knowledge Base

- Textual content
- Document Outlines
- Table content

In [2]:
import torch

# Check if GPU or MPS is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"CUDA GPU is enabled: {torch.cuda.get_device_name(0)}")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS GPU is enabled.")
else:
    raise OSError(
        "No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured."
    )

CUDA GPU is enabled: NVIDIA GeForce GTX 1050 Ti


In [3]:
import ollama

test_embeddings = ollama.embed(
  model='mxbai-embed-large',
  input='Llamas are members of the camelid family',
)

test_embeddings

EmbedResponse(model='mxbai-embed-large', created_at=None, done=None, done_reason=None, total_duration=25370936300, load_duration=25103127000, prompt_eval_count=10, prompt_eval_duration=None, eval_count=None, eval_duration=None, embeddings=[[0.03286565, 0.06610957, 0.0360856, 0.045056015, -0.007489444, 0.036489364, -0.025019486, 0.050013043, 0.036570687, -0.013854629, 0.035655916, 0.019339444, -0.035473242, 0.007771385, -0.029357782, 0.039437715, 0.012204067, -0.009387629, -0.008070199, -0.016061394, -0.046638295, 0.042092364, -0.046672374, 0.009753022, 0.068648085, 0.056318358, 0.06935988, 0.008108562, 0.05684624, 0.0035377643, 0.00082494074, 0.009582719, 0.105082005, -0.06520899, -0.012078525, -0.012814953, 0.017257271, 0.00032923312, -0.010861294, -0.084993, 0.05509558, -0.0018817299, 0.028186284, 0.00052878936, -0.032144602, -0.023194235, -0.0026011458, -0.030285371, 0.0053186137, -0.057683147, -0.021915201, -0.020089097, 0.0036510772, -0.014111891, -0.041652936, -0.03654601, 0.0042

In [4]:
def embed(text_chunk: str, model_name='mxbai-embed-large'):
    return ollama.embed(
        model=model_name,
        input=text_chunk
    )['embeddings'][0]

In [37]:
converter = DocumentConverter()
chunker = HybridChunker()

doc = converter.convert(r"..\..\data\raw\2019_Salzgitter AG_ESG.pdf").document

texts = [chunk.text for chunk in chunker.chunk(doc)]

Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAt

In [6]:
# x = doc.tables[0].export_to_dataframe()

# x.to_csv('..\..\config\llm_prompts.csv')

In [42]:
from pymilvus import Collection, connections, DataType, FieldSchema, CollectionSchema, utility
import os

MILVUS_HOST = os.getenv('MILVUS_HOST', "localhost")
MILVUS_PORT = os.getenv('MILVUS_PORT', '19530')
COLLECTION_NAME = os.getenv('COLLECTION_NAME', 'report_chunks')
EMBED_DIM = os.getenv('EMBED_DIM', 1024)   

test_embeddings = ollama.embed(
  model='nomic-embed-text:v1.5',
  input='Llamas are members of the camelid family',
)

EMBED_DIM = len(test_embeddings.embeddings[0])
print(EMBED_DIM)

model_name = 'nomic-embed-text:v1.5'

class MilvusManager:
    _instance = None
    _collection = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(MilvusManager, cls).__new__(cls)
            cls._instance._connect()
            cls._instance._init_collection()
        return cls._instance

    def _connect(self):
        '''Establish a single connection to Milvus.'''
        connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
        print(connections.list_connections())

    def _init_collection(self):
        '''Initialize or load collection schema once.'''
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBED_DIM),
            FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2000)
        ]
        schema = CollectionSchema(fields, description="Report text chunks with embeddings")

        try:
            self._collection = Collection(COLLECTION_NAME)
        except Exception:
            self._collection = Collection(name=COLLECTION_NAME, schema=schema)

        # Create index if not already present
        if not self._collection.has_index():
            self._collection.create_index(
                field_name="embedding",
                index_params={"metric_type": "COSINE", "index_type": "IVF_FLAT", "params": {"nlist": 128}}
            )

    def get_collection(self) -> Collection:
        '''Return the active collection object.'''
        return self._collection
    
    def semantic_search(self, query: str, top_k: int = 5):
        '''
        
        '''
        vector = embed(query, model_name)

        collection = self.get_collection()
        collection.load()

        results = collection.search(
            data=[vector],
            anns_field="embedding",
            param={"metric_type": "COSINE", "params": {"nprobe": 10}},
            limit=top_k,
            output_fields=["text"]
        )

        return [hit.entity.get("text") for hit in results[0]]

768


In [None]:
# # refresh collection schema
# milvus = MilvusManager()

# utility.drop_collection(milvus.get_collection().name)

# milvus._instance = None


In [43]:
model_name = 'nomic-embed-text:v1.5'

embeddings = [embed(chunk, model_name) for chunk in texts]

milvus = MilvusManager()
collection = milvus.get_collection()

data = [embeddings, texts]  
collection.insert(data)
collection.flush()

[('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x000002066249AE50>)]


## Fetching search term from ESG_Metadata

In [31]:
ESG_METADATA = "..\..\config\ESG_Metadata.xlsx"

import pandas as pd
import json

esg_meta = pd.read_excel(ESG_METADATA)

esg_meta.head()


  ESG_METADATA = "..\..\config\ESG_Metadata.xlsx"


Unnamed: 0,Aspect,KPI,Topic,Quantity,SearchTerm,Knowledge
0,Emissions,Direct (Scope 1) and energy indirect (Scope 2)...,"[Direct (Scope 1), Energy Indirect (Scope 2)]",[Absolute Values],"Greenhouse gases, carbon dioxide, CO2, carbon ...","Greenhouse gases include carbon dioxide, metha..."
1,Emissions,"Total waste produced (in tonnes) and, where ap...","[Non-hazardous Waste, Hazardous Waste]",[Absolute Values],"Construction waste, organic waste, recyclable ...","Non-hazardous waste includes broken bricks, wa..."
2,Emissions,The types of emissions and respective emission...,"[Nitrogen oxides (NOx), Sulfur Oxides (SOx), P...",[Absolute Values],"Nitrogen oxides, NOx, sulfur oxides, SOx, sulf...",Nitrogen oxides (NOx) refer to compounds consi...
3,Emissions,Emissions target(s) and steps taken to achieve...,"[Waste, Exhaust and Greenhouse Gases, Wastewat...",[Key Actions],"Exhaust and Greenhouse Gases: Gas emissions, g...",Waste: Waste can be categorized into three phy...


In [44]:
search_term_quant = esg_meta.loc[0, 'SearchTerm']
search_term_qual = esg_meta.loc[3, 'SearchTerm']

search_term_quant, search_term_qual

('Greenhouse gases, carbon dioxide, CO2, carbon dioxide equivalents',
 'Exhaust and Greenhouse Gases: Gas emissions, greenhouse gases, carbon dioxide. Wastewater: wastewater, sewage, water consumption. Dust: dust emission. Noise: noise pollution.')

In [48]:
context = milvus.semantic_search(query=search_term_quant)

In [49]:
context

["- \uf0a7 Scope 1 = direct GHG emissions from operating plants\n- \uf0a7 Scope 2 = indirect, energy-related GHG emissions\n- \uf0a7 Scope 3 = further indirect GHG emissions from the purchase of raw materials\nWe have therefore used the approach of the Greenhouse Gas Protocol Corporate Standard for calculating the greenhouse gas emissions in Scope 1 to Scope 3. The majority of our direct emissions are also subject to the European  Emissions  Trading  System  (ETS)  with  the  result  that  the  strict  specifications  contained  in  the Monitoring Regulation are applied in this area. The data for Scope 1 and 2 result from energy consumption figures shown in Section 'Resources Management' as well as details from ETS monitoring.\nAs an integrated steel mill, Salzgitter Flachstahl GmbH (SZFG) occupies a special position with regard to GHG emissions, and it therefore operates its own energy savings scheme and GHG emissions reduction program. In 2018,  these  programs  achieved  a  cumulati

In [None]:
# def retrieve(search_term: tuple[str]):
#     search_res = milvus_client.search(
#         collection_name=collection_name,
#         data=[embed(search_term)],
#         limit=5,
#         search_params={"metric_type": "IP", "params": {}},
#         output_fields=["text"],
#     )
    
#     retrieved_lines_with_distances = [
#         (res["entity"]["text"], res["distance"]) for res in search_res[0]
#     ]
#     print(json.dumps(retrieved_lines_with_distances, indent=4))

#     return retrieved_lines_with_distances

In [None]:


# context = "\n".join(
#     [line_with_distance[0] for line_with_distance in retrieve(search_term_quant)]
# )

## Defining model for output

In [62]:
from pydantic import BaseModel

# {{"Disclosure": <0 or 1>, "Data": {{"KPI": "{kpi}", "Topic": "{topic}", "Value":"<value>", "Unit": "<unit>"}}}}

class ESGQuantData(BaseModel):
  kpi: str
  topic: str
  value: float
  unit: str

class ESGResponse(BaseModel):
  disclosure: bool
  data: list[ESGQuantData]

In [63]:
knowledge = esg_meta.loc[0, 'Knowledge']
aspect = esg_meta.loc[0, 'Aspect']
topic = esg_meta.loc[0, 'Topic']
kpi = esg_meta.loc[0, 'KPI']
quantity = esg_meta.loc[0, 'Quantity']

quantity


'[Absolute Values]'

In [64]:
topic, kpi

('[Direct (Scope 1), Energy Indirect (Scope 2)]',
 'Direct (Scope 1) and energy indirect (Scope 2) greenhouse gas emissions (in tonnes) and, where appropriate, intensity (e.g. per unit of production volume, per facility).')

In [None]:
# following format:

# Response: {{"Disclosure": <0 or 1>, "Data": {{"KPI": "{kpi}", "Topic": "{topic}", "Value":"<value>", "Unit": "<unit>"}}}}

# An example response for reference:
# {{"Disclosure": 1,
# "Data":{{"KPI": "Total waste produced (in tonnes) and, where appropriate, intensity (e.g. per unit of production volume, per facility).", 
# "Topic": "Non-hazardous Waste",
# "Value": "77",
# "Unit": "Metric Tons"}}
# }}

In [None]:
SYSTEM_PROMPT = f"""
You are an expert in the field of ESG (Environmental, Social, and Governance). Your task is to analyze reference content in both text and CSV table formats to answer questions, providing your responses in JSON format.
Answer in the specified format only.

"""
USER_PROMPT_QUANTITATIVE = f"""
Please follow these steps for your analysis:

Begin by interpreting the meaning of the data disclosed in the table, summarizing it in brief terms.

Then, be aware that the provided reference content may not be related to the question. Assess whether the reference content is relevant to the question. If it is, extract all the data related to the question and provide your answer. 

Your response should include: (1) Whether the reference content discloses data relevant to the question, indicated by a 'disclosure' field with a value of true or false. (2) If relevant data exists, provide the disclosed data in the 'data' field.

The reference content is as follows: {context}

Supplementary ESG expertise is as follows: {knowledge}

The question is: Please answer based on the above information and do not strip away the given materials. In terms of {aspect}, extract the {topic} about {kpi}, and output {quantity}. 
"""


In [65]:
response = ollama.chat(
    model="deepseek-r1:7b",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT_QUANTITATIVE},
    ],
    format=ESGResponse.model_json_schema()
)
print(response)

model='deepseek-r1:7b' created_at='2025-09-28T10:33:15.4425119Z' done=True done_reason='stop' total_duration=70207562700 load_duration=6060355600 prompt_eval_count=1776 prompt_eval_duration=9973833900 eval_count=150 eval_duration=54157334100 message=Message(role='assistant', content='{ "disclosure": true, "data": [ { "kpi": "Direct (Scope 1) and energy indirect (Scope 2) greenhouse gas emissions (in tonnes)", "topic": "[Direct (Scope 1), Energy Indirect (Scope 2)]", "value": 208, "unit": "kt CO2" }, { "kpi": "Direct (Scope 1) and energy indirect (Scope 2) greenhouse gas emissions (in tonnes)", "topic": "[Direct (Scope 1), Energy Indirect (Scope 2)]", "value": 211, "unit": "kt CO2" } ] }\n\n  \t \t\t\t\t\t \t\t\t\t\t \t\t\t\t', thinking=None, images=None, tool_name=None, tool_calls=None)


In [67]:
print(response.message['content'])

{ "disclosure": true, "data": [ { "kpi": "Direct (Scope 1) and energy indirect (Scope 2) greenhouse gas emissions (in tonnes)", "topic": "[Direct (Scope 1), Energy Indirect (Scope 2)]", "value": 208, "unit": "kt CO2" }, { "kpi": "Direct (Scope 1) and energy indirect (Scope 2) greenhouse gas emissions (in tonnes)", "topic": "[Direct (Scope 1), Energy Indirect (Scope 2)]", "value": 211, "unit": "kt CO2" } ] }

  	 					 					 				


In [None]:
knowledge = esg_meta.loc[3, 'Knowledge']
aspect = esg_meta.loc[3, 'Aspect']
topic = esg_meta.loc[3, 'Topic']
kpi = esg_meta.loc[3, 'KPI']
quantity = esg_meta.loc[3, 'Quantity']
context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieve(search_term_qual)]
)



USER_PROMPT_QUALITATIVE = f"""
Please follow these steps for your analysis:

Begin by interpreting the meaning of the data disclosed in the table, summarizing it in brief terms.

Then, be aware that the provided reference content may not be related to the question. Assess whether the reference content is relevant to the question. If it is, extract all the text content related to the question and provide your answer. 

Your response should include: (1) Whether the reference content covers text relevant to the question, indicated by a 'disclosure' field with a value of 0 or 1. (2) If the reference content does cover relevant text, respond with the related text content in the 'data' field.

The reference content is as follows: {context}

Supplementary ESG expertise is as follows: {knowledge}

The question is: Please answer based on the above information and do not strip away the given materials. In terms of {aspect}, extract the {topic} about {kpi}, and output {quantity}. 

Answer in the following format:
Analysis: <Your Concise Analytical Process>

Response: {{"KPI": "{kpi}", "Topic": "{topic}", "Target":"<target>", "Action": "<action>"}}

An example response for reference:
{{"Disclosure: 1,
Data:{{"KPI": "Hazardous or Non-hazardous wastes reduction target(s) and steps taken to achieve them.", 
"Topic": "Hazardous or Non-hazardous Waste Management",
"Target": "Reduce carbon emission intensity by 1% annually",
"Action": ["Conduct a comprehensive inventory and verification of carbon emissions data, manage and reserve carbon assets","Continuously strengthen the operational management and research support capacity building of CCUS (Carbon Capture, Utilization, and Storage) demonstration units","Focus on carbon offsetting efforts, promote ecological forest construction, and leverage carbon reduction and sequestration through tree planting, reforestation, and mine reclamation"]}}
}}
"""

In [None]:
response = ollama.chat(
    model="gemma3:1b",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT_QUALITATIVE},
    ],
)
print(response)

In [None]:
response.message['content']