In [1]:
# Line magic functions that will allow for imports to be reloaded and not cached
%load_ext autoreload
%autoreload 2

# Imports
import os
import pandas as pd
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from datetime import date
from json import dump
from tqdm import tqdm

# Local
from scripts.extractmd import Extractor
from scripts.vectorindex import QnAEngine
from scripts.utilities import get_prompt_dict, get_questions, get_answers, get_procurement_content, get_config_data, get_ini_files, get_supplementary_info, get_questions_without_q0
from scripts.gen_results import gen_results
from scripts.gen_precision_report import generate_precision_report
from scripts.main_report import build_main_report_html




**My Config:**
[my_config.py](scripts/my_config.py)

In [2]:
#  with open("scripts/my_config.py") as f:
#      code = f.read()
#      print(code) 
#      exec(code)  

**Global config**

In [3]:
# Retrieval / embedding settings shared by the indexing and answering steps below.
# "embeddingmodel" may also be a filesystem path to a locally stored embedding model.
# Other models that have been tried:
#   "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "BAAI/bge-m3", "nomic-ai/nomic-embed-text-v2-moe"
embedding_conf = dict(
    embeddingmodel=globals().get('my_embeddingmodel', "BAAI/bge-m3"),
    chunk_size=1536,
    chunk_overlap=0,
    top_similar=5,
    n4rerank=0,               # how many nodes to retrieve for reranking; 0 disables the reranker
    use_similar_chunks=True,  # use the similar chunks (True) or the whole document (False) as context
    prevnext=True,            # also include the previous and the next chunk of each similar chunk
)

embedding = HuggingFaceEmbedding(
    model_name=embedding_conf["embeddingmodel"],
    trust_remote_code=True,
)

# For nomic-embed-text-v2-moe use instead:
#embedding=HuggingFaceEmbedding(model_name=embedding_conf["embeddingmodel"],trust_remote_code=True,query_instruction="search_query: ",text_instruction="search_document: ")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [4]:
# LLM setup (Azure OpenAI).
# Endpoint, API version and key are read from environment variables; a missing
# variable falls back to an empty string, so a misconfigured environment only
# surfaces once the client is constructed or first used.
llmmodelAzure = {
    "model": "gpt-4o",
    "version": os.environ.get('AZURE_OPENAI_VERSION', ''),
    "azure_deployment": "gpt-4o",
    "azure_endpoint": os.environ.get('AZURE_ENDPOINT', ''),
    "api_key": os.environ.get('AZURE_OPENAI_KEY', ''),
}

# `model` is passed explicitly so the client knows which model backs the
# deployment; previously only the deployment name was given, and the "model"
# entry above was recorded in config.json but never reached the client.
# NOTE(review): llama_index derives model metadata (e.g. context window) from
# `model` and otherwise uses its class default - confirm against the installed version.
llm = AzureOpenAI(
    model=llmmodelAzure["model"],
    azure_deployment=llmmodelAzure["azure_deployment"],
    azure_endpoint=llmmodelAzure["azure_endpoint"],
    api_version=llmmodelAzure["version"],
    api_key=llmmodelAzure["api_key"],
    temperature=0.0,
    top_p=0.0001,
    timeout=120,
    max_retries=3,
)

In [5]:
# Shared document extractor; it is handed to get_procurement_content() in the
# main loop to obtain the markdown text of each procurement document.
extractor = Extractor() # Markdown doc extractor

In [6]:
#Ollama model
# llmmodelOllama = { "model": "gemma3:27b",
#                 "url":os.environ.get('OLLAMA_ENDPOINT',''),
#                 "context_window":"128000"}

#from llama_index.llms.ollama import Ollama
#llm = Ollama(base_url=llmmodelOllama["url"],
#             model=llmmodelOllama["model"], 
#             context_window=int(llmmodelOllama["context_window"]),
#            request_timeout=300.0,
#            temperature=0.0,
#            additional_kwargs={"seed":1337})

**PROCUREMENT FILE SETTINGS**

In [7]:
# Base directory used to resolve every relative path in this notebook.
# `_dh` is IPython's directory history; its first entry is taken as the notebook directory.
# NOTE(review): the `/` operators below require that entry to be a pathlib.Path - confirm for the IPython version in use.
script_dir = globals()['_dh'][0]

# Input locations
questions_dir = script_dir / "questions"
question_file_path = questions_dir / "questions.yaml"  # alternative: original.yaml
prompt_file = questions_dir / "prompts.yaml"
config_dir = script_dir / globals().get('my_config_dir', "config")  # alternative: "dev_config"
procurement_file_dir = script_dir / "cfla_files"
answer_file_dir = script_dir / "answers"

# Output location
report_dir = script_dir / "reports"

# TODO perhaps prompt user to define unique report name; some types - all; one etc?
report_identifier = globals().get('my_report_identifier', "dev-test")
# TODO maybe add report as a subdirectory as there are 2 files per report; might be even more with histograms etc.
report_today = date.today().strftime("%d.%m")
report_date = globals().get('my_report_date', report_today)
report_name = f"{report_identifier}_{report_date}"

# All artefacts of one run live side by side in <reports>/<report_name>/
report_dir_path = report_dir / report_name
report_path_htm = report_dir_path.joinpath("report.htm")
report_path_csv = report_dir_path.joinpath("report.csv")
report_path_config = report_dir_path.joinpath("config.json")

In [8]:
# Loading static information
# overwrite=True  -> the existing report is deleted and generated anew;
# overwrite=False -> new data is appended only if it isn't in the CSV file yet.
overwrite = globals().get('my_overwrite', False)

question_dictionary = get_questions(question_file_path)
prompt_dictionary = get_prompt_dict(prompt_file, question_dictionary)
supplementary_info = get_supplementary_info()

# Only scan the config directory when no explicit override is given.
# `globals().get(key, get_ini_files(...))` would evaluate the default eagerly,
# running the scan (and its log output) even when `my_ini_files` is defined.
if 'my_ini_files' in globals():
    ini_files = globals()['my_ini_files']
else:
    ini_files = get_ini_files(config_dir, overwrite, report_path_csv)
print(f"Processing {len(ini_files)} procurement files: {sorted(ini_files)}")

Questions loaded
Prompts loaded
Found 20 config files in C:\Repos\vpp-cfla\config
Processing 20 procurement files: ['APP_DI_20202ERAF_AK', 'DND_20206', 'EDI_20203AK', 'IKVD_20213_ESF_SAM', 'IP2020_01_ERAF', 'IeM_IC_201913', 'KNP_202215', 'KP2020016A-KF', 'LNP_201976', 'LNP_202037ERAF', 'LU_202049_ERAF', 'MNP_202121_ERAF', 'PND_2019_15-ERAF', 'RTK_2019_12', 'SND_202015-ERAF', 'SNP_202001', 'SNP_202067', 'VND_20201', 'VNP_2023057AK', 'VeA_202012ERAF']


**MAIN Q/A GENERATION SCRIPT**

In [None]:
# TODO add parallel prompting
print(report_path_htm)
print(report_path_csv)
print(overwrite)
if overwrite: # overwrtitting report; Delete and create new
        if report_path_htm.exists():
                report_path_htm.unlink()
        if report_path_csv.exists():
                report_path_csv.unlink()
        if report_path_config.exists():
                report_path_config.unlink()
                
if not os.path.exists(report_dir_path):
        os.makedirs(report_dir_path)

# Make config dictionary and save as json
config_dict = embedding_conf
config_dict["model"] = llmmodelAzure["model"]
config_dict["temperature"] = llm.temperature
config_dict["timeout"] = llm.timeout
config_dict["max_retries"] = llm.max_retries
#config_dict["top_p"] = 0.0001

with open(report_path_config, "w") as file:
        dump(config_dict, file) 

# To process all questions, leave this empty. Otherwise, specify the exact numbers.
# To process a sub-question, include the parent question number as well, e.g., ["9", "9.2"].
questions_to_process = globals().get('my_questions_to_process', [])

for file in tqdm(ini_files, desc="Config files", unit="file"): # TODO fix TQDM; expected that after first file finishes it shows the aproximation of all the other files left to process
        configfile = config_dir / f"{file}.ini"
        
        tqdm.write(f"Processing config file: {configfile}")
        procurement_id, procurement_file, agreement_file, answer_file = get_config_data(configfile, procurement_file_dir, answer_file_dir)
        # Open CSV file, maybe as pandas dataframe
        answer_dictionary = get_answers(answer_file)
        # Getting markdown text from procurement doc
        procurement_content = get_procurement_content(extractor, procurement_file, agreement_file)
    
        # Creating FAISS vector index for the procurement document
        qnaengine = QnAEngine(embedding,llm)
        await qnaengine.createIndex(
                procurement_content,
                "Procurement",
                chunk_size=embedding_conf["chunk_size"],
                chunk_overlap=embedding_conf["chunk_overlap"]
                )

        ### Generating results
        results_table = gen_results(qnaengine, configfile, embedding_conf, question_dictionary, answer_dictionary, prompt_dictionary, supplementary_info, questions_to_process)
        
        # add "Iepirkuma ID" as procurement_id to results table
        # TODO move this inside gen results function once it has been refactored
        for row in results_table:
                row.insert(0, file)
        
        ### Save output
        data = pd.DataFrame(results_table, columns=["Iepirkuma ID", "Nr", "Atbilde", "Sagaidāmā atbilde", "Pamatojums", "Uzvedne"])
        precision = (data['Atbilde'] == data['Sagaidāmā atbilde']).sum()/len(data)
        print(f"PRECIZITĀTE: {precision*100}%")

        with report_path_htm.open('a', encoding='utf-8') as ofile:  
                # TODO Create a dropdown menu that lets the user select file by "Iepirkuma ID" - in each page only information that has that "Iepirkuma ID" is displayed
                print(f"{procurement_id} PRECIZITĀTE: {precision*100}%" ,file=ofile)
                print(data.to_html(index=False).replace('\\n','<br>'),file=ofile)

        data.to_csv(report_path_csv, 
                    mode='a', 
                    index=False, 
                    header=not report_path_csv.exists(), # only adding one header
                    encoding='utf-8')
        
        
                

C:\Repos\vpp-cfla\reports\dev-test_08.09\report.htm
C:\Repos\vpp-cfla\reports\dev-test_08.09\report.csv
False



Config files:   0%|                                                                           | 0/20 [00:00<?, ?file/s][A
[A                                                                                                                    
Config files:   0%|                                                                           | 0/20 [00:00<?, ?file/s][A

Processing config file: C:\Repos\vpp-cfla\config\APP_DI_20202ERAF_AK.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\APP_DI_20202ERAF_AK\Konsolidets_Konkursa_Nolikums_ar grozijumiem_12.03.2020. (1).docx
Processing file: C:\Repos\vpp-cfla\cfla_files\APP_DI_20202ERAF_AK\Konsolidets_Iepirkuma_ligums_ar grozijumiem_12.03.2020 (1).docx




Generating embeddings:   0%|                                                                           [ time left: ? ][A[A

Generating embeddings:  19%|█████████████                                                          [ time left: 01:18 ][A[A

Generating embeddings:  36%|████████████████████████▉                                              [ time left: 01:03 ][A[A

Generating embeddings:  53%|████████████████████████████████████▊                                  [ time left: 00:57 ][A[A

Generating embeddings:  69%|████████████████████████████████████████████████▋                      [ time left: 00:36 ][A[A

Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ][A[A


59 segments created and vectorized.
Index is ready.
2 4 6 7 9 10 15 16 17 18 19 23 24 26 27 28 29 31 32 33 
34 35 35.1 35.2 35.3 35.5 35.6 35.7 35.8 36 37 37.2 37.4 37.5 37.6 37.7 37.9 37.10 37.11 37.13 
38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 
39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 39.23 39.24 39.25 39.26 39.27 39.28 39.29 
39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 
40.10 40.11 40.12 41 43 44 46 47 48 50 51 51.1 51.2 51.4 52 53 54 56 57 58 
60 61 62 63 65 


Config files:   5%|███▏                                                            | 1/20 [09:50<3:06:58, 590.43s/file][A
[A                                                                                                                    
Config files:   5%|███▏                                                            | 1/20 [09:50<3:06:58, 590.43s/file][A

PRECIZITĀTE: 68.67469879518072%
Processing config file: C:\Repos\vpp-cfla\config\DND_20206.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\DND_20206\NOLIKUMS_2020_6.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


64 segments created and vectorized.
Index is ready.
2 4 6 7 9 10 15 16 17 18 19 22 23 24 26 
27 28 29 31 32 33 34 35 35.1 35.2 35.3 35.5 35.6 35.7 35.8 36 37 37.2 37.4 37.5 
37.6 37.7 37.9 37.10 37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 39.3 
39.4 39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 39.23 
39.24 39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 40.2 40.3 
40.4 40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 43 44 46 47 48 50 51 51.1 51.2 51.4 
52 53 54 56 57 58 60 61 62 63 65 


Config files:  10%|██████▍                                                         | 2/20 [20:23<3:04:39, 615.51s/file][A
[A                                                                                                                    
Config files:  10%|██████▍                                                         | 2/20 [20:23<3:04:39, 615.51s/file][A

PRECIZITĀTE: 69.04761904761905%
Processing config file: C:\Repos\vpp-cfla\config\EDI_20203AK.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\EDI_20203AK\Nolikums EDI_2020_3_AK_EIS.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


79 segments created and vectorized.
Index is ready.
2 4 6 7 9 10 15 16 17 
18 19 22 23 24 26 27 28 29 31 32 33 34 35 36 37 37.2 37.4 37.5 37.6 
37.7 37.9 37.10 37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 39.3 39.4 
39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 39.23 39.24 
39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 40.2 40.3 40.4 
40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 43 44 46 47 48 50 51 51.1 51.2 51.4 52 
53 54 56 57 58 60 61 62 63 65 


Config files:  15%|█████████▌                                                      | 3/20 [31:04<2:57:43, 627.26s/file][A
[A                                                                                                                    
Config files:  15%|█████████▌                                                      | 3/20 [31:04<2:57:43, 627.26s/file][A

PRECIZITĀTE: 77.97619047619048%
Processing config file: C:\Repos\vpp-cfla\config\IKVD_20213_ESF_SAM.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\IKVD_20213_ESF_SAM\Nolikums_22.12.2021..docx
Processing file: C:\Repos\vpp-cfla\cfla_files\IKVD_20213_ESF_SAM\10.pielikums_liguma_projekts.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


40 segments created and vectorized.
Index is ready.
2 4 6 9 10 15 16 17 19 22 
23 24 26 27 28 29 31 32 33 34 35 36 37 37.2 37.4 37.5 37.6 37.7 37.9 37.10 
37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 
39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 39.23 39.24 39.25 39.26 39.27 
39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 40.2 40.3 40.4 40.5 40.6 40.7 
40.8 40.9 40.10 40.11 40.12 41 43 44 46 47 48 50 51 51.1 51.2 51.4 52 53 54 56 
57 58 60 61 62 63 65 


Config files:  20%|████████████▊                                                   | 4/20 [42:46<2:55:08, 656.77s/file][A
[A                                                                                                                    
Config files:  20%|████████████▊                                                   | 4/20 [42:46<2:55:08, 656.77s/file][A

PRECIZITĀTE: 67.07317073170732%
Processing config file: C:\Repos\vpp-cfla\config\IP2020_01_ERAF.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\IP2020_01_ERAF\Nolikums_IP_2020_01_ERAF_pielikumi.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


49 segments created and vectorized.
Index is ready.
2 4 6 7 9 10 15 16 17 18 19 22 23 
24 26 27 28 29 31 32 33 34 35 35.1 35.2 35.3 35.5 35.6 35.7 35.8 36 37 37.2 
37.4 37.5 37.6 37.7 37.9 37.10 37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 
39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 
39.22 39.23 39.24 39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 
40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 43 44 46 47 48 50 51 51.1 
51.2 51.4 52 53 54 56 57 58 60 61 62 63 65 


Config files:  25%|████████████████                                                | 5/20 [53:06<2:40:52, 643.47s/file][A
[A                                                                                                                    
Config files:  25%|████████████████                                                | 5/20 [53:06<2:40:52, 643.47s/file][A

PRECIZITĀTE: 59.523809523809526%
Processing config file: C:\Repos\vpp-cfla\config\IeM_IC_201913.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\IeM_IC_201913\NOLIKUMS vec.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


86 segments created and vectorized.
Index is ready.
2 4 6 7 9 10 15 
16 17 18 19 22 23 24 26 27 28 29 31 32 33 34 35 36 37 37.2 37.4 
37.5 37.6 37.7 37.9 37.10 37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.7 38.8 38.9 39 39.1 39.2 39.3 
39.4 39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 39.23 
39.24 39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 40.2 40.3 
40.4 40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 44 46 47 48 50 51 51.4 52 53 54 
56 57 58 60 61 62 63 65 


Config files:  30%|██████████████████▌                                           | 6/20 [1:01:51<2:20:42, 603.04s/file][A
[A                                                                                                                    
Config files:  30%|██████████████████▌                                           | 6/20 [1:01:51<2:20:42, 603.04s/file][A

PRECIZITĀTE: 68.51851851851852%
Processing config file: C:\Repos\vpp-cfla\config\KNP_202215.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\KNP_202215\Nolikums_2022_15_ar_grozijumiem.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


50 segments created and vectorized.
Index is ready.
2 4 6 9 10 15 16 17 19 22 23 24 
26 27 28 29 31 32 33 34 35 35.1 35.2 35.3 35.5 35.6 35.7 35.8 36 37 37.2 37.4 
37.5 37.6 37.7 37.9 37.10 37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 
39.3 39.4 39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 
39.23 39.24 39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 40.2 
40.3 40.4 40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 43 44 46 47 48 50 51 51.1 51.2 
51.4 52 53 54 56 57 58 60 61 62 63 65 


Config files:  35%|█████████████████████▋                                        | 7/20 [1:10:34<2:04:59, 576.87s/file][A
[A                                                                                                                    
Config files:  35%|█████████████████████▋                                        | 7/20 [1:10:34<2:04:59, 576.87s/file][A

PRECIZITĀTE: 73.78048780487805%
Processing config file: C:\Repos\vpp-cfla\config\KP2020016A-KF.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\KP2020016A-KF\Konkursa_nolikums_TENT_3_posms_GROZIJUMI_16.03.2020.pdf
Processing file: C:\Repos\vpp-cfla\cfla_files\KP2020016A-KF\7_Liguma_projekts_GROZIJUMI_16.03.2020.pdf


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


64 segments created and vectorized.
Index is ready.
2 4 6 7 9 10 15 16 
17 18 19 22 23 24 26 27 28 29 31 32 33 34 35 35.1 35.2 35.3 35.5 35.6 
35.7 35.8 36 37 37.2 37.4 37.5 37.6 37.7 37.9 37.10 37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 
38.7 38.8 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 
39.17 39.18 39.19 39.20 39.21 39.22 39.23 39.24 39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 
39.37 39.38 39.39 40 40.1 40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 43 44 46 
47 48 50 51 51.1 51.2 51.4 52 53 54 56 57 58 60 61 62 63 65 PRECIZITĀTE: 55.952380952380956%



Config files:  40%|████████████████████████▊                                     | 8/20 [1:30:14<2:33:47, 768.92s/file][A
[A                                                                                                                    
Config files:  40%|████████████████████████▊                                     | 8/20 [1:30:14<2:33:47, 768.92s/file][A

Processing config file: C:\Repos\vpp-cfla\config\LNP_201976.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\LNP_201976\Apstiprinats.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


62 segments created and vectorized.
Index is ready.
2 4 
6 7 9 10 15 16 17 18 19 22 23 24 26 27 28 29 31 32 33 34 
35 35.1 35.2 35.3 35.5 35.6 35.7 35.8 36 37 37.2 37.4 37.5 37.6 37.7 37.9 37.10 37.11 37.13 38 
38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 39.10 
39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 39.23 39.24 39.25 39.26 39.27 39.28 39.29 39.30 
39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 40.10 
40.11 40.12 41 43 44 46 47 48 50 51 52 53 54 56 57 58 60 61 62 63 
65 


Config files:  45%|███████████████████████████▉                                  | 9/20 [1:43:53<2:23:52, 784.73s/file][A
[A                                                                                                                    
Config files:  45%|███████████████████████████▉                                  | 9/20 [1:43:53<2:23:52, 784.73s/file][A

PRECIZITĀTE: 68.45238095238095%
Processing config file: C:\Repos\vpp-cfla\config\LNP_202037ERAF.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\LNP_202037ERAF\Nolikums_pilsdrupas_1.karta.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


51 segments created and vectorized.
Index is ready.
2 4 6 7 9 10 15 16 17 18 19 22 23 24 26 27 28 29 31 
32 33 34 35 35.1 35.2 35.3 35.5 35.6 35.7 35.8 36 37 37.2 37.4 37.5 37.6 37.7 37.9 37.10 
37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 
39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 39.23 39.24 39.25 39.26 39.27 
39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 40.2 40.3 40.4 40.5 40.6 40.7 
40.8 40.9 40.10 40.11 40.12 41 43 44 46 47 48 50 51 51.1 51.2 51.4 52 53 54 56 
57 58 60 61 62 63 65 


Config files:  50%|██████████████████████████████▌                              | 10/20 [1:55:03<2:04:53, 749.31s/file][A
[A                                                                                                                    
Config files:  50%|██████████████████████████████▌                              | 10/20 [1:55:03<2:04:53, 749.31s/file][A

PRECIZITĀTE: 73.80952380952381%
Processing config file: C:\Repos\vpp-cfla\config\LU_202049_ERAF.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\LU_202049_ERAF\Nolikums.docx
Processing file: C:\Repos\vpp-cfla\cfla_files\LU_202049_ERAF\10pielikums_lîgums.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


48 segments created and vectorized.
Index is ready.
2 4 6 7 9 10 15 16 17 18 19 22 23 
24 26 27 28 29 31 32 33 34 35 35.1 35.2 35.3 35.5 35.6 35.7 35.8 36 37 37.2 
37.4 37.5 37.6 37.7 37.9 37.10 37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 
39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 
39.22 39.23 39.24 39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 
40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 43 44 46 47 48 50 51 51.1 
51.2 51.4 52 53 54 56 57 58 60 61 62 63 65 


Config files:  55%|█████████████████████████████████▌                           | 11/20 [2:07:33<1:52:26, 749.57s/file][A
[A                                                                                                                    
Config files:  55%|█████████████████████████████████▌                           | 11/20 [2:07:33<1:52:26, 749.57s/file][A

PRECIZITĀTE: 72.02380952380952%
Processing config file: C:\Repos\vpp-cfla\config\MNP_202121_ERAF.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\MNP_202121_ERAF\MNP2021_21_ERAF_nolikums_12-07-2021.doc.pdf




Processing file: C:\Repos\vpp-cfla\cfla_files\MNP_202121_ERAF\2-4-6_21_47_Ligums.pdf


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


119 segments created and vectorized.
Index is ready.
2 4 6 9 10 15 16 
17 19 22 23 24 26 27 28 29 31 32 33 34 35 36 37 37.2 37.4 37.5 37.6 
37.7 37.9 37.10 37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 39.3 39.4 
39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 39.23 39.24 
39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 40.1 40.2 40.3 40.4 
40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 43 44 46 47 48 50 51 51.1 51.2 51.4 52 
53 54 56 57 58 60 61 62 63 65 


Config files:  60%|████████████████████████████████████▌                        | 12/20 [2:19:58<1:39:43, 747.97s/file][A
[A                                                                                                                    
Config files:  60%|████████████████████████████████████▌                        | 12/20 [2:19:58<1:39:43, 747.97s/file][A

PRECIZITĀTE: 61.58536585365854%
Processing config file: C:\Repos\vpp-cfla\config\PND_2019_15-ERAF.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\PND_2019_15-ERAF\Nolikums_ar_grozijumiem_2.docx


Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]


58 segments created and vectorized.
Index is ready.
2 4 6 7 9 10 15 16 17 18 
19 22 23 24 26 27 28 29 31 32 33 34 35 35.1 35.2 35.5 35.6 35.7 35.8 36 
37 37.2 37.4 37.5 37.6 37.7 37.9 37.10 37.11 37.13 38 38.1 38.2 38.3 38.4 38.5 38.7 38.8 38.9 39 
39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 
39.21 39.22 39.23 39.24 39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 40 
40.1 40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 44 46 47 48 50 51 52 
53 54 56 57 58 60 61 62 63 65 


Config files:  65%|███████████████████████████████████████▋                     | 13/20 [2:28:24<1:18:42, 674.68s/file][A
[A                                                                                                                    
Config files:  65%|███████████████████████████████████████▋                     | 13/20 [2:28:24<1:18:42, 674.68s/file][A

PRECIZITĀTE: 72.22222222222221%
Processing config file: C:\Repos\vpp-cfla\config\RTK_2019_12.ini
Processing file: C:\Repos\vpp-cfla\cfla_files\RTK_2019_12\Nolikums.docx


Generating embeddings:  93%|████████████████████████████████████████████████████████████████▉      [ time left: 00:07 ]

# Report generation

In [None]:
# Select the report to analyze: the CSV written by the main Q/A loop
analyzed_report_dir = report_dir_path
csv_filename = "report.csv"
input_csv = analyzed_report_dir / csv_filename

# --- Main report ---
output_html_path = analyzed_report_dir / "main_report.html"
html_output = build_main_report_html(input_csv, question_dictionary)
output_html_path.write_text(html_output, encoding="utf-8")

print(f"Main HTML report saved to: {output_html_path}")

# --- Precision report ---
# Built from the question set returned by get_questions_without_q0
questions_wout_0q = get_questions_without_q0(question_dictionary)
precison_report_html = generate_precision_report(input_csv, questions_wout_0q)

# Stored next to the CSV it was computed from
output_html = analyzed_report_dir / "precision_report.html"
with output_html.open("w", encoding="utf-8") as f:
    f.write(precison_report_html)

print(f"HTML report saved to: {output_html}")


**Count of questions that cannot be answered**

In [None]:
# Count rows whose expected answer is missing ("n/a" is parsed as NaN by read_csv
# and therefore shows up as the string "nan" after astype(str)).
# "Nr" is read as text: with default type inference it becomes a float column, so
# "39.10" collapses into 39.1 and "2" turns into "2.0", and neither would match
# the string question numbers in questions_wout_0q.
df = pd.read_csv(input_csv, dtype={"Nr": str})
df["Sagaidāmā atbilde"] = df["Sagaidāmā atbilde"].astype(str).str.strip().str.lower()
filtered_df = df[
    (df["Sagaidāmā atbilde"] == "nan") &
    (df["Nr"].astype(str).isin(questions_wout_0q))
]
print(f"\nCount of rows where 'Sagaidāmā atbilde' == 'n/a' and it doesn't have q0: {len(filtered_df)}")

**Confusion Matrix**

In [None]:
# Reload the report; "n/a" cells come back from read_csv as NaN, so both answer
# columns are normalized to lower-case text and the missing markers mapped back to "n/a".
df = pd.read_csv(input_csv)

missing_markers = ["nan", "none", ""]
for answer_column in ("Sagaidāmā atbilde", "Atbilde"):
    normalized = df[answer_column].astype(str).str.strip().str.lower()
    df[answer_column] = normalized.replace(missing_markers, "n/a")

# Fold the short "x" marker into the explicit "no information in context" answer
df["Atbilde"] = df["Atbilde"].replace("x", "kontekstā nav informācijas")

# Row / column order of the matrix
expected = ["jā", "nē", "n/a"]
actual = ["jā", "nē", "n/a", "kontekstā nav informācijas"]

raw_counts = pd.crosstab(
    df["Sagaidāmā atbilde"],
    df["Atbilde"],
    rownames=["Expected ↓"],
    colnames=["Actual →"],
    dropna=False,
)
# reindex fixes the ordering and fills categories that never occurred with 0
conf_matrix = raw_counts.reindex(index=expected, columns=actual, fill_value=0)
conf_matrix.index.name = "Expected ↓ / Actual →"

print("\nConfusion Matrix:\n")
print(conf_matrix.to_markdown(tablefmt="grid"))


In [None]:
def _class_counts(matrix, label):
    """Return (correct, total) for one expected label: the diagonal cell and the row sum."""
    return matrix.loc[label, label], matrix.loc[label].sum()


def _accuracy_pct(correct, total):
    """Percentage of correct answers; 0 when the class has no rows."""
    return (correct / total) * 100 if total > 0 else 0


# Correct / total counts per expected answer class
correct_yes, total_yes = _class_counts(conf_matrix, "jā")
print(f"total 'jā' count: {total_yes}")

correct_no, total_no = _class_counts(conf_matrix, "nē")
print(f"total 'nē' count: {total_no}")

correct_na, total_na = _class_counts(conf_matrix, "n/a")
print(f"total 'n/a' count: {total_na}")

# Per-class accuracy in percent
yes_accuracy = _accuracy_pct(correct_yes, total_yes)
no_accuracy = _accuracy_pct(correct_no, total_no)
na_accuracy = _accuracy_pct(correct_na, total_na)

print(f"\nAccuracy for 'jā': {yes_accuracy:.2f}%")
print(f"Accuracy for 'nē': {no_accuracy:.2f}%")
print(f"Accuracy for 'n/a': {na_accuracy:.2f}%")


In [None]:
# Overall precision over all expected classes
total_q = total_yes + total_no + total_na
print(f"Total question count: {total_q}")
correct_q = correct_yes + correct_no + correct_na

precision = round((correct_q / total_q) * 100, 2)
print(f"Total precision: {precision}%")

# Same numerator, but rows counted as unanswerable (filtered_df) are removed from the denominator.
# NOTE(review): correct_q still includes correctly answered "n/a" rows - confirm this is the intended metric.
unanswerable_count = len(filtered_df)
answerable_total = total_q - unanswerable_count
precison_wout_unanswerable_qs = round((correct_q / answerable_total) * 100, 2)
print(f"Total precision without unanswerable (n/a) questions: {precison_wout_unanswerable_qs}%")

# Additionally remove every answer where the model reported that the context lacks the information
context_missing_total = conf_matrix["kontekstā nav informācijas"].sum()
confident_answers = answerable_total - context_missing_total
precision_wout_unanswerable_qs_nocontext = round((correct_q / confident_answers) * 100, 2)
print(f"Total precision without unanswerable (n/a) questions and when LLM is not unsure: {precision_wout_unanswerable_qs_nocontext}%")
print(f"Total question count without unanswerable (n/a) questions and when LLM is not unsure: {confident_answers}")