In [1]:
# Line magic functions that will allow for imports to be reloaded and not cached
%load_ext autoreload
%autoreload 2

# Imports
import os
import pandas as pd
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from datetime import date
from pathlib import Path
from json import dump
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm

# Local
from scripts.extractmd import Extractor
from scripts.vectorindex import QnAEngine
from scripts.utilities import get_prompt_dict, get_questions, get_answers, get_procurement_content, get_config_data, get_ini_files
from scripts.gen_results import gen_results
from scripts.gen_reports import generate_precision_report




**Global config**

In [2]:
# Retrieval / embedding settings. chunk_size and chunk_overlap are used when the
# index is built; the whole dict is handed to gen_results() in the main loop.
# Other embedding models that can be put in "embeddingmodel":
#   "nomic-ai/nomic-embed-text-v2-moe"
#   "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
embedding_conf = dict(
    embeddingmodel="BAAI/bge-m3",
    chunk_size=1536,
    chunk_overlap=0,
    top_similar=5,
    n4rerank=0,  # how many nodes to retrieve for reranking; 0 means the reranker is not used
    use_similar_chunks=True,  # use the similar chunks (True) or the whole document as the context
    prevnext=True,  # also include the previous and the next chunk of each similar chunk
)

embedding = HuggingFaceEmbedding(
    model_name=embedding_conf["embeddingmodel"],
    trust_remote_code=True,
)

# For nomic-embed-text-v2-moe use this constructor instead:
#embedding=HuggingFaceEmbedding(model_name=embedding_conf["embeddingmodel"],trust_remote_code=True,query_instruction="search_query: ",text_instruction="search_document: ")

In [3]:
# LLM setup.
# Connection settings come from the environment variables AZURE_OPENAI_VERSION,
# AZURE_ENDPOINT and AZURE_OPENAI_KEY; a missing variable yields an empty string.
llmmodelAzure = {
    "model": "gpt-4o",
    "version": os.environ.get('AZURE_OPENAI_VERSION', ''),
    "azure_deployment": "gpt-4o",
    "azure_endpoint": os.environ.get('AZURE_ENDPOINT', ''),
    "api_key": os.environ.get('AZURE_OPENAI_KEY', ''),
}

# NOTE(review): top_p is passed as a plain constructor keyword here — confirm the
# client actually forwards it to the API.
llm = AzureOpenAI(
    azure_deployment=llmmodelAzure["azure_deployment"],
    azure_endpoint=llmmodelAzure["azure_endpoint"],
    temperature=0.0,
    api_version=llmmodelAzure["version"],
    api_key=llmmodelAzure["api_key"],
    timeout=120,
    max_retries=3,
    top_p=0.0001,
)

# Show the effective settings, but never the secret itself: cell output is stored
# inside the notebook file and would leak the key to anyone who can read it.
print({**llmmodelAzure, "api_key": "***" if llmmodelAzure["api_key"] else "<not set>"})

{'model': 'gpt-4o', 'version': '2025-01-01-preview', 'azure_deployment': 'gpt-4o', 'azure_endpoint': 'https://atlernativellmqna.openai.azure.com/', 'api_key': '***'}


In [4]:
# Markdown document extractor; handed to get_procurement_content() in the main loop
extractor = Extractor()

In [5]:
# Optional alternative LLM backend: Ollama model (disabled). Uncommenting the lines below replaces `llm`.
# llmmodelOllama = { "model": "gemma3:27b",
#                 "url":os.environ.get('OLLAMA_ENDPOINT',''),
#                 "context_window":"128000"}

#from llama_index.llms.ollama import Ollama
#llm = Ollama(base_url=llmmodelOllama["url"],
#             model=llmmodelOllama["model"], 
#             context_window=int(llmmodelOllama["context_window"]),
#            request_timeout=300.0,
#            temperature=0.0,
#            additional_kwargs={"seed":1337})

**PROCUREMENT FILE SETTINGS**

In [6]:
# Base directory for all relative paths used by the notebook.
# IPython exposes its directory history as `_dh`; the first entry is used as the
# notebook directory. The entry is wrapped in Path() because it may be a plain
# string in some IPython versions, and the current working directory is used when
# `_dh` is not available at all (e.g. when the code runs outside IPython).
_dir_history = globals().get('_dh') or [Path.cwd()]
script_dir = Path(_dir_history[0])

# Document paths
question_file_path = script_dir / "questions" / "questions.yaml"
prompt_file = script_dir / "questions" / "prompts.tsv"
report_dir = script_dir / "reports"
config_dir = script_dir / "dev_config" # "dev_config" # "config"
procurement_file_dir = script_dir / "cfla_files" # "cfla_files"
answer_file_dir = script_dir / "answers"

# TODO perhaps prompt user to define unique report name; some types - all; one etc?
report_identifier = "final"
# Report name is "<identifier>_<day>.<month>" of today's date
# TODO maybe add report as a subdirectory as there are 2 files per report; might be even more with histograms etc.
report_name = f"{report_identifier}_{date.today():%d.%m}"

# All artefacts of one report run live in their own directory
report_dir_path = report_dir / report_name
report_path_htm = report_dir_path / "report.htm"
report_path_csv = report_dir_path / "report.csv"
report_path_config = report_dir_path / "config.json"

In [7]:
# Static inputs for the run.
#   overwrite = True  -> the existing report is deleted and a new one is generated
#   overwrite = False -> new data is appended only if it isn't in the CSV file yet
overwrite = False

# Question and prompt definitions
question_dictionary = get_questions(question_file_path)
prompt_dictionary = get_prompt_dict(prompt_file)

# Procurement config files to process in this run
ini_files = get_ini_files(
    config_dir,
    overwrite,
    report_path_csv,
)
#ini_files =['LNP_202050ERAF']
print(f"Processing {len(ini_files)} procurement files: {sorted(ini_files)}")

Questions loaded
Found 10 config files in C:\Repos\vpp-cfla\dev_config
Skipping 10 already-processed files: ['KND-2020_07', 'KND-2020_20', 'KNP202134', 'LNP_202050ERAF', 'LU_CFI_201935ERAF', 'RPNC202122', 'SNP-2021_07_AK', 'SNP202131', 'SNP_20213ERAF', 'VNIP_2020_036_ERAF']
Processing 1 procurement files: ['LNP_202050ERAF']


**MAIN Q/A GENERATION SCRIPT**

In [None]:
# TODO add parallel prompting

if overwrite: # overwrtitting report; Delete and create new
        if report_path_htm.exists():
                report_path_htm.unlink()
        if report_path_csv.exists():
                report_path_csv.unlink()
        if report_path_config.exists():
                report_path_config.unlink()
                
if not os.path.exists(report_dir_path):
        os.makedirs(report_dir_path)

# Make config dictionary and save as json
config_dict = embedding_conf
config_dict["model"] = llmmodelAzure["model"]
config_dict["temperature"] = llm.temperature
config_dict["temperature"] = llm.timeout
config_dict["max_retries"] = llm.max_retries
#config_dict["top_p"] = 0.0001

with open(report_path_config, "w") as file:
        dump(config_dict, file) 


for file in tqdm(ini_files, desc="Config files", unit="file"): # TODO fix TQDM; expected that after first file finishes it shows the aproximation of all the other files left to process
        configfile = config_dir / f"{file}.ini"

        #if f"{file}.ini" == "RTK_20209.ini": continue
        #if f"{file}.ini" == "VeA_202012ERAFGroz.ini": continue
        
        print(f"Processing config file: {configfile}")
        procurement_id, procurement_file, agreement_file, answer_file = get_config_data(configfile, procurement_file_dir, answer_file_dir)
        # Open CSV file, maybe as pandas dataframe
        answer_dictionary = get_answers(answer_file)
        print(f"Processing config file: {configfile}")

        # Getting markdown text from procurement doc
        procurement_content = get_procurement_content(extractor, procurement_file, agreement_file)
    
        # Creating FAISS vector index for the procurement document
        qnaengine = QnAEngine(embedding,llm)
        await qnaengine.createIndex(
                procurement_content,
                "Procurement",
                chunk_size=embedding_conf["chunk_size"],
                chunk_overlap=embedding_conf["chunk_overlap"]
                )

        ### Generating results
        results_table = gen_results(qnaengine, configfile, embedding_conf, question_dictionary, answer_dictionary, prompt_dictionary)
        
        # add "Iepirkuma ID" as procurement_id to results table
        # TODO move this inside gen results function once it has been refactored
        for row in results_table:
                row.insert(0, file)
        
        ### Save output
        data = pd.DataFrame(results_table, columns=["Iepirkuma ID", "Nr", "Atbilde", "Sagaidāmā atbilde", "Pamatojums"])
        precision = (data['Atbilde'] == data['Sagaidāmā atbilde']).sum()/len(data)
        print(f"PRECIZITĀTE: {precision*100}%")

        with report_path_htm.open('a', encoding='utf-8') as ofile:
                # TODO Create a dropdown menu that lets the user select file by "Iepirkuma ID" - in each page only information that has that "Iepirkuma ID" is displayed
                print(procurement_id,file=ofile)
                print(data.to_html(index=False).replace('\\n','<br>'),file=ofile)
                print(f"PRECIZITĀTE: {precision*100}%",file=ofile)

        data.to_csv(report_path_csv, 
                    mode='a', 
                    index=False, 
                    header=not report_path_csv.exists(), # only adding one header
                    encoding='utf-8')
        
        
                

Config files:   0%|          | 0/1 [00:00<?, ?file/s]

Processing config file: C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini
Processing config file: C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini



Generating embeddings:   0%|                                                                           [ time left: ? ][A
Generating embeddings:  28%|███████████████████▋                                                   [ time left: 00:30 ][A
Generating embeddings:  54%|█████████████████████████████████████▋                                 [ time left: 00:25 ][A
Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ][A


39 segments created and vectorized.
Index is ready.



Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:   0%|                              | 0/68 [00:00<?, ?q/s][A

2 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:   3%|▋                     | 2/68 [00:02<01:23,  1.27s/q][A

4 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:   6%|█▎                    | 4/68 [00:06<01:53,  1.77s/q][A

6 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:   9%|█▉                    | 6/68 [00:08<01:28,  1.43s/q][A

7 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  10%|██▎                   | 7/68 [00:11<01:43,  1.69s/q][A

9 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  13%|██▉                   | 9/68 [00:14<01:36,  1.64s/q][A

10 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  15%|███                  | 10/68 [00:17<01:54,  1.98s/q][A

15 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  22%|████▋                | 15/68 [00:20<00:59,  1.11s/q][A

16 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  24%|████▉                | 16/68 [00:23<01:13,  1.41s/q][A

17 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  25%|█████▎               | 17/68 [00:28<01:40,  1.97s/q][A

18 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  26%|█████▌               | 18/68 [00:30<01:44,  2.09s/q][A

19 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  28%|█████▊               | 19/68 [00:35<02:11,  2.69s/q][A

22 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  32%|██████▊              | 22/68 [00:42<01:56,  2.52s/q][A

23 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  34%|███████              | 23/68 [00:48<02:18,  3.09s/q][A

24 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  35%|███████▍             | 24/68 [00:54<02:47,  3.82s/q][A

26 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  38%|████████             | 26/68 [00:57<01:59,  2.85s/q][A

27 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  40%|████████▎            | 27/68 [01:02<02:23,  3.50s/q][A

28 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  41%|████████▋            | 28/68 [01:06<02:18,  3.45s/q][A

29 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  43%|████████▉            | 29/68 [01:08<02:02,  3.13s/q][A

31 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  46%|█████████▌           | 31/68 [01:09<01:17,  2.10s/q][A

32 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  47%|█████████▉           | 32/68 [01:13<01:32,  2.56s/q][A

33 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  49%|██████████▏          | 33/68 [01:15<01:25,  2.44s/q][A

34 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  50%|██████████▌          | 34/68 [01:18<01:25,  2.52s/q][A

35 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  51%|██████████▊          | 35/68 [01:20<01:14,  2.25s/q][A

36 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  53%|███████████          | 36/68 [01:22<01:13,  2.28s/q][A

37 37.1 37.2 37.3 37.4 37.5 37.6 37.7 37.8 37.9 37.10 37.11 37.12 37.13 


Questions in C:\Repos\vpp-cfla\dev_config\LNP_202050ERAF.ini:  54%|███████████▍         | 37/68 [02:07<07:27, 14.43s/q][A

38 38.1 38.2 38.3 38.4 38.5 

# Question precision data report

In [None]:
# Select the report run to analyze
csv_filename = "report.csv"
analyzed_report_dir = report_dir / "all_01.07"
input_csv = analyzed_report_dir / csv_filename

# Build the HTML precision report from that CSV
precison_report_html = generate_precision_report(input_csv)

# Store the report in the same directory as the CSV it was generated from
output_html = analyzed_report_dir / "precision_report.html"
with output_html.open("w", encoding="utf-8") as html_file:
    html_file.write(precison_report_html)

print(f"HTML report saved to: {output_html}")

In [None]:
# TODO confusion matrix (false positives, true positives, etc.)

# TODO Make a view for a specific procurement:
# report.htm gets generated afterwards from the CSV.
# There is a dropdown that can be used to select which report we want to see