In [1]:
# Line magic functions that will allow for imports to be reloaded and not cached
%load_ext autoreload
%autoreload 2

# Imports
import os
import pandas as pd
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from datetime import date
from pathlib import Path
from json import dump
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm

# Local
from scripts.extractmd import Extractor
from scripts.vectorindex import QnAEngine
from scripts.utilities import get_prompt_dict, get_questions, get_answers, get_procurement_content, get_config_data, get_ini_files
from scripts.gen_results import gen_results

**Global config**

In [2]:
# Embedding / retrieval settings used when building the index and generating results.
# Alternative embedding model names:
#   "nomic-ai/nomic-embed-text-v2-moe"
#   "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
embedding_conf = dict(
    embeddingmodel="BAAI/bge-m3",
    chunk_size=1536,
    chunk_overlap=0,
    top_similar=5,
    n4rerank=0,  # how many nodes to retrieve for reranking; 0 means the reranker is not used
    use_similar_chunks=True,  # True: use similar chunks as the context; False: use the whole document
    prevnext=True,  # also include the previous and the next chunk of each similar chunk
)

# Embedding model instance built from the model name configured above.
embedding = HuggingFaceEmbedding(
    model_name=embedding_conf["embeddingmodel"],
    trust_remote_code=True,
)

# Variant for nomic-embed-text-v2-moe (adds query/document instruction prefixes):
#embedding=HuggingFaceEmbedding(model_name=embedding_conf["embeddingmodel"],trust_remote_code=True,query_instruction="search_query: ",text_instruction="search_document: ")

In [3]:
# LLM setup: Azure OpenAI connection settings.
# API version, endpoint and key are read from environment variables; each falls
# back to an empty string when the variable is not set.
llmmodelAzure = {
    "model": "gpt-4o",
    "version": os.environ.get('AZURE_OPENAI_VERSION', ''),
    "azure_deployment": "gpt-4o",
    "azure_endpoint": os.environ.get('AZURE_ENDPOINT', ''),
    "api_key": os.environ.get('AZURE_OPENAI_KEY', ''),
}

# `model` is forwarded explicitly so the client is configured for the same model
# name that is recorded in the report config; previously only the deployment name
# was passed and the client kept the library's default model name.
# temperature=0.0 and a very small top_p are presumably chosen to keep answers
# as repeatable as possible between runs.
llm = AzureOpenAI(
    model=llmmodelAzure["model"],
    azure_deployment=llmmodelAzure["azure_deployment"],
    azure_endpoint=llmmodelAzure["azure_endpoint"],
    api_version=llmmodelAzure["version"],
    api_key=llmmodelAzure["api_key"],
    temperature=0.0,
    top_p=0.0001,
    timeout=120,
    max_retries=3,
)

In [4]:
# Single shared extractor instance; it is passed to get_procurement_content
# for every procurement file processed in the main loop.
extractor = Extractor() # Markdown doc extractor

In [5]:
# Alternative LLM backend: Ollama model (disabled; uncomment to use instead of Azure OpenAI)
# llmmodelOllama = { "model": "gemma3:27b",
#                 "url":os.environ.get('OLLAMA_ENDPOINT',''),
#                 "context_window":"128000"}

#from llama_index.llms.ollama import Ollama
#llm = Ollama(base_url=llmmodelOllama["url"],
#             model=llmmodelOllama["model"], 
#             context_window=int(llmmodelOllama["context_window"]),
#            request_timeout=300.0,
#            temperature=0.0,
#            additional_kwargs={"seed":1337})

**PROCUREMENT FILE SETTINGS**

In [None]:
overwrite = False  # True: delete the existing report and generate a new one.
                   # False: new data is appended only if it isn't in the CSV file yet.

# Base directory for all relative paths used by this notebook.
# `_dh` is IPython's directory history; its first entry is used as the notebook
# directory. It is wrapped in Path so the `/` joins below work even when the
# entry is a plain string rather than a Path object.
script_dir = Path(globals()['_dh'][0])

# Document paths
question_file_path = script_dir / "questions" / "questions.yaml"
prompt_file = script_dir / "questions" / "prompts.tsv"
report_dir = script_dir / "reports"
config_dir = script_dir / "config"  # alternative: "dev_config"
procurement_file_dir = script_dir / "cfla_files"
answer_file_dir = script_dir / "answers"

# TODO perhaps prompt user to define unique report name; some types - all; one etc?
report_identifier = "final"
# Report name is the identifier plus today's day and month (no year).
report_name = f"{report_identifier}_{date.today():%d.%m}"

# Each report lives in its own subdirectory holding the HTML report, the CSV
# with the raw results and the JSON dump of the run configuration.
report_dir_path = report_dir / report_name
report_path_htm = report_dir_path / "report.htm"
report_path_csv = report_dir_path / "report.csv"
report_path_config = report_dir_path / "config.json"

# Loading static information
question_dictionary = get_questions(question_file_path)
prompt_dictionary = get_prompt_dict(prompt_file)

# Config files to process in this run; `overwrite` and the existing CSV are
# passed along (the captured output shows already-processed files being skipped).
ini_files = get_ini_files(config_dir, overwrite, report_path_csv)
print(f"Processing {len(ini_files)} procurement files: {sorted(ini_files)}")

Questions loaded
Found 27 config files in c:\Users\Admin\Desktop\Programming\Work\vpp-cfla\config
Skipping 25 already-processed files: ['APP_DI_20202ERAF_AK', 'DND_20206', 'EDI_20203AK', 'IKVD_20213_ESF_SAM', 'IP2020_01_ERAF', 'IeM_IC_201913', 'KNP202134', 'KNP_202215', 'KP2020016A-KF', 'LNP_201976', 'LNP_202037ERAF', 'LNP_202050ERAF', 'LU_CFI_201935ERAF', 'MNP_202121_ERAF', 'PND_2019_15-ERAF', 'RTK_2019_12', 'SND_202015-ERAF', 'SNP202131', 'SNP_202001', 'SNP_202067', 'SNP_20213ERAF', 'VND_20201', 'VNIP_2020_036_ERAF', 'VNP_2023057AK', 'VeA_202012ERAF']
Processing 2 procurement files: ['LU_202049_ERAF', 'RPNC202122']


**MAIN Q/A GENERATION SCRIPT**

In [None]:
# TODO add parallel prompting

if overwrite: # overwrtitting report; Delete and create new
        if report_path_htm.exists():
                report_path_htm.unlink()
        if report_path_csv.exists():
                report_path_csv.unlink()
        if report_path_config.exists():
                report_path_config.unlink()
                
if not os.path.exists(report_dir_path):
        os.makedirs(report_dir_path)

# Make config dictionary and save as json
config_dict = embedding_conf
config_dict["model"] = llmmodelAzure["model"]
config_dict["temperature"] = llm.temperature
config_dict["temperature"] = llm.timeout
config_dict["max_retries"] = llm.max_retries
#config_dict["top_p"] = 0.0001

with open(report_path_config, "w") as file:
        dump(config_dict, file) 


for file in tqdm(ini_files, desc="Config files", unit="file"): # TODO fix TQDM; expected that after first file finishes it shows the aproximation of all the other files left to process
        configfile = config_dir / f"{file}.ini"

        #if f"{file}.ini" == "RTK_20209.ini": continue
        #if f"{file}.ini" == "VeA_202012ERAFGroz.ini": continue
        
        print(f"Processing config file: {configfile}")
        procurement_id, procurement_file, agreement_file, answer_file = get_config_data(configfile, procurement_file_dir, answer_file_dir)
        # Open CSV file, maybe as pandas dataframe
        answer_dictionary = get_answers(answer_file)
        print(f"Processing config file: {configfile}")

        # Getting markdown text from procurement doc
        procurement_content = get_procurement_content(extractor, procurement_file, agreement_file)
    
        # Creating FAISS vector index for the procurement document
        qnaengine = QnAEngine(embedding,llm)
        await qnaengine.createIndex(
                procurement_content,
                "Procurement",
                chunk_size=embedding_conf["chunk_size"],
                chunk_overlap=embedding_conf["chunk_overlap"]
                )

        ### Generating results
        results_table = gen_results(qnaengine, configfile, embedding_conf, question_dictionary, answer_dictionary, prompt_dictionary)
        
        # add "Iepirkuma ID" as procurement_id to results table
        # TODO move this inside gen results function once it has been refactored
        for row in results_table:
                row.insert(0, file)
        
        ### Save output
        data = pd.DataFrame(results_table, columns=["Iepirkuma ID", "Nr", "Atbilde", "Sagaidāmā atbilde", "Pamatojums"])
        precision = (data['Atbilde'] == data['Sagaidāmā atbilde']).sum()/len(data)
        print(f"PRECIZITĀTE: {precision*100}%")

        with report_path_htm.open('a', encoding='utf-8') as ofile:
                # TODO Create a dropdown menu that lets the user select file by "Iepirkuma ID" - in each page only information that has that "Iepirkuma ID" is displayed
                print(procurement_id,file=ofile)
                print(data.to_html(index=False).replace('\\n','<br>'),file=ofile)
                print(f"PRECIZITĀTE: {precision*100}%",file=ofile)

        data.to_csv(report_path_csv, 
                    mode='a', 
                    index=False, 
                    header=not report_path_csv.exists(), # only adding one header
                    encoding='utf-8')
        
        
                

Config files:   0%|          | 0/2 [00:00<?, ?file/s]

Processing config file: c:\Users\Admin\Desktop\Programming\Work\vpp-cfla\config\RPNC202122.ini
Loaded layout model s3://layout/2025_02_18 on device cpu with dtype torch.float32
Loaded texify model s3://texify/2025_02_18 on device cpu with dtype torch.float32
Loaded recognition model s3://text_recognition/2025_02_18 on device cpu with dtype torch.float32
Loaded table recognition model s3://table_recognition/2025_02_18 on device cpu with dtype torch.float32
Loaded detection model s3://text_detection/2025_02_28 on device cpu with dtype torch.float32
Loaded detection model s3://inline_math_detection/2025_02_24 on device cpu with dtype torch.float32


Recognizing layout: 100%|██████████| 4/4 [00:21<00:00,  5.33s/it]
Running OCR Error Detection: 100%|██████████| 6/6 [00:02<00:00,  2.54it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 5/5 [00:34<00:00,  6.93s/it]
Generating embeddings: 100%|██████████ [ time left: 00:00 ]


31 segments created and vectorized.
Index is ready.




2 



4 



6 

Table for Questions

In [None]:
# TODO remake using CSV; no need for loops just use pandas dataframe to get all the data
# Get a dataframe that contains the following:
# procurement_id
# Question_nr, 
# Answers_total(this will contain the count of the answers where there were no missing - "?" answers),
# TODO fix up the table values
# Expected_answer,
# Yes_count,
# No_count,
# n_a_count, (Having all of this data could be useful for seeing maybe the llm likes to frequently say there is not enough context for example - problem is with RAG)

# TODO Generate final report:
# Using above data get the amount of correct answers, better would be a fraction like 7/10 instead of % for readability
# Get accuracy for each file (%)
# Get total accuracy on all questions on all files as %; 

# Later TODO use the CSVs and dataframes to generate a html file as a nice clean readable report - we still need to discuss this


In [None]:



# # ── Containers ──
# all_correctness = {}   # filename → { Nr_str: 0|1, … }
# master_qids = []       # list of all Nr’s in the order first seen

# # ── Reloop, but capture correctness per Nr ──
# for file in ini_files:
    
#     per_file = {}
#     for nr, grp in data.groupby("nr"):
#         nr_str = str(nr)                # e.g. "4", "9.1", "9.2"
#         if nr_str not in master_qids:
#             master_qids.append(nr_str)

#         # 1 only if every row in grp matches
#         is_correct = (grp["Atbilde"] == grp["Sagaidāmā atbilde"]).all()
#         per_file[nr_str] = int(is_correct)

#     all_correctness[file] = per_file

# # ── Build the matrix ──
# df = pd.DataFrame.from_dict(all_correctness, orient="index")  # rows=files, cols=Nr
# df = df.T                                                 
# df = df.reindex(master_qids)                               
# df.index.name = "Question Nr"


# print("Question correctness (1=all sub-rows(q0 and q) correct, 0=any wrong):")
# print(df)



Correctness % and plot

In [None]:

# import matplotlib.pyplot as plt

# # ── 1) Performance metrics ──
# # df: rows=Question Nr, cols=filenames, values=0/1
# correct_counts = df.sum(axis=1)                   # how many files got each question right
# num_files     = df.shape[1]                       # total number of files
# percentages   = (correct_counts / num_files) * 100

# perf_df = pd.DataFrame({
#     "CorrectCount": correct_counts,
#     "Percentage":  percentages.round(1)
# })

# # ── 2) Display stats ──
# print("Per‐question performance:")
# print(perf_df)

# # ── 3) Bar chart ──
# plt.figure()
# plt.bar(perf_df.index.astype(str), perf_df["CorrectCount"])
# plt.xlabel("Question Nr")
# plt.ylabel("Number of Correct Answers")
# plt.ylim(0, num_files)
# plt.xticks(rotation=90)
# plt.title("Quesiton correctnes")
# plt.tight_layout()
# plt.show()
