In [1]:
# Line magic functions that will allow for imports to be reloaded and not cached
%load_ext autoreload
%autoreload 2

# Imports
import os
import pandas as pd
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from datetime import date

# Local
from scripts.extractmd import Extractor
from scripts.vectorindex import QnAEngine
from scripts.utilities import get_prompt_dict, get_questions, get_answers, get_procurement_content, get_config_data
from scripts.gen_results import gen_results




**Global config**

In [2]:
# Embedding / retrieval settings.
# Other embedding model candidates:
#   "nomic-ai/nomic-embed-text-v2-moe"
#   "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
embedding_conf = dict(
    embeddingmodel="BAAI/bge-m3",
    chunk_size=1536,
    chunk_overlap=0,
    top_similar=5,
    # How many nodes to retrieve for reranking. If 0, the reranker is not used.
    n4rerank=20,
    # True: use similar chunks as the context; False: use the whole document.
    use_similar_chunks=True,
    # Also include the previous and the next chunk of each similar chunk in the context.
    prevnext=True,
)

# Embedding model instance built from the configured model name.
embedding = HuggingFaceEmbedding(
    model_name=embedding_conf["embeddingmodel"],
    trust_remote_code=True,
)

# Variant for nomic-embed-text-v2-moe (adds query/document instruction prefixes):
# embedding = HuggingFaceEmbedding(model_name=embedding_conf["embeddingmodel"], trust_remote_code=True, query_instruction="search_query: ", text_instruction="search_document: ")

In [3]:
# LLM setup: Azure OpenAI connection settings.
# API version, endpoint and key are read from environment variables; an unset
# variable is passed on as an empty string.
llmmodelAzure = {
    "model": "gpt-4o",
    "version": os.environ.get('AZURE_OPENAI_VERSION', ''),
    "azure_deployment": "gpt-4o",
    "azure_endpoint": os.environ.get('AZURE_ENDPOINT', ''),
    "api_key": os.environ.get('AZURE_OPENAI_KEY', ''),
}

llm = AzureOpenAI(
    # Pass the model name explicitly: the "model" entry was defined above but never
    # handed to the client, so the wrapper would otherwise use its own default model name.
    model=llmmodelAzure["model"],
    azure_deployment=llmmodelAzure["azure_deployment"],
    azure_endpoint=llmmodelAzure["azure_endpoint"],
    api_version=llmmodelAzure["version"],
    api_key=llmmodelAzure["api_key"],
    temperature=0.0,
    timeout=120,
    max_retries=3,
    # NOTE(review): confirm that top_p is honoured as a constructor keyword by this client.
    top_p=0.0001,
)

In [4]:
# Markdown document extractor
extractor = Extractor()

# Q&A engine built from the embedding model and the LLM configured above
qnaengine = QnAEngine(embedding, llm)

In [5]:
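# Optional alternative: Ollama model instead of Azure OpenAI.
# If uncommented, re-run the cell that creates `qnaengine` afterwards so it picks up the new `llm`.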
# llmmodelOllama = { "model": "gemma3:27b",
#                 "url":os.environ.get('OLLAMA_ENDPOINT',''),
#                 "context_window":"128000"}

#from llama_index.llms.ollama import Ollama
#llm = Ollama(base_url=llmmodelOllama["url"],
#             model=llmmodelOllama["model"], 
#             context_window=int(llmmodelOllama["context_window"]),
#            request_timeout=300.0,
#            temperature=0.0,
#            additional_kwargs={"seed":1337})

**PROCUREMENT FILE SETTINGS**

In [6]:
# Base directory for all relative paths: first entry of IPython's `_dh` directory history
script_dir = globals()["_dh"][0]

# Question and prompt files
question_file_path = script_dir.joinpath("questions", "questions.yaml")
prompt_file = script_dir.joinpath("questions", "prompts.tsv")

# Working directories
report_dir = script_dir.joinpath("reports")
config_dir = script_dir.joinpath("dev_config")  # alternatives: "config", "demo_config"
procurement_file_dir = script_dir.joinpath("cfla_files")  # alternative: "demo"
answer_file_dir = script_dir.joinpath("answers")

# Loading static information TODO
question_dictionary = get_questions(question_file_path)
promptdict = get_prompt_dict(prompt_file)

Questions loaded


**MAIN Q/A GENERATION SCRIPT**

In [7]:
# TODO config file loop; For each config in config_folder_path; # TODO add parallel prompting
# Create an array of ini files and then call them in a loop; This will give us more control over which files to call/exclude

ini_files = [f for f in os.listdir(config_dir) if f.endswith('.ini')]
print(f"Found {len(ini_files)} config files in {config_dir}")

# TODO add a little indicator how many files are to be processes; + aproximate time to finish each file
for file in ini_files:
        configfile = config_dir / file
        print(f"Processing config file: {configfile}")
        procurement_id, procurement_file, agreement_file, answer_file = get_config_data(configfile, procurement_file_dir, answer_file_dir)
        answer_dictionary = get_answers(answer_file)

        # Getting markdown text from procurement doc
        procurement_content = get_procurement_content(extractor, procurement_file, agreement_file)
    
        # Creating FAISS vector index for the procurement document
        await qnaengine.createIndex(
                procurement_content,
                "Procurement",
                chunk_size=embedding_conf["chunk_size"],
                chunk_overlap=embedding_conf["chunk_overlap"]
                )

        # Generating results
        results_table = gen_results(qnaengine, configfile, embedding_conf, question_dictionary, answer_dictionary, promptdict)

        # Save output

        # TODO probably should merge the output file together; One output file for the entire run, that contains each specific procurement results
        # TODO maybe implement some crash prevention strategy, so if some file breaks it doesn't break the entire run
        data = pd.DataFrame(results_table, columns=["Nr", "Atbilde", "Sagaidāmā atbilde", "Pamatojums"])
        precision = (data['Atbilde'] == data['Sagaidāmā atbilde']).sum()/len(data)
        print(f"PRECIZITĀTE: {precision*100}%")

        # Save final output file
        with open(f"{report_dir}\{date.today():%d.%m}_{procurement_id.replace('/','_')}.htm", 'w', encoding='utf-8') as ofile:
                print(data.to_html(index=False).replace('\\n','<br>'),file=ofile)
                print(f"PRECIZITĀTE: {precision*100}%",file=ofile)

Found 2 config files in C:\Repos\vpp-cfla\dev_config
Processing config file: C:\Repos\vpp-cfla\dev_config\KND-2020_20.ini


Generating embeddings: 100%|██████████████████████████████████████████████ [ time left: 00:00 ]


99 segments created and vectorized.
Index is ready.
2 

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


4 6 {'text': ['1. EIROPAS VIENOTAIS IEPIRKUMA PROCEDŪRAS DOKUMENTS\n   1. Pretendents var iesniegt Eiropas vienoto iepirkuma procedūras dokumentu kā sākotnējo pierādījumu atbilstībai paziņojumā par līgumu vai iepirkuma procedūras dokumentos noteiktajām pretendentu atlases prasībām. Ja pretendents izvēlēsies iesniegt Eiropas vienoto iepirkuma procedūras dokumentu, lai apliecinātu, ka tas atbilst paziņojumā par līgumu vai iepirkuma procedūras dokumentos noteiktajām pretendentu atlases prasībām, tam jāiesniedz šo dokumentu arī par katru personu, uz kuras iespējām pretendents balstās, lai apliecinātu, ka tā kvalifikācija atbilst paziņojumā par līgumu vai iepirkuma procedūras dokumentos noteiktajām prasībām, un par tā norādīto apakšuzņēmēju, kura vērtība ir vismaz 10 procenti no iepirkuma līguma vērtības. Piegādātāju apvienībai jāiesniedz atsevišķu Eiropas vienoto iepirkuma procedūras dokumentu par katru tās dalībnieku.\n2. Pretendents var pasūtītājam iesniegt Eiropas vienoto iepirkuma proc

Recognizing layout: 100%|████████████████████████████████████████| 2/2 [00:16<00:00,  8.08s/it]
Running OCR Error Detection: 100%|███████████████████████████████| 3/3 [00:01<00:00,  1.96it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|████████████████████████████████████████| 1/1 [00:02<00:00,  2.53s/it]


An exception occurred: AttributeError can't set attribute 'text'


Generating embeddings: 100%|██████████████████████████████████████████████ [ time left: 00:00 ]


64 segments created and vectorized.
Index is ready.
2 4 6 An exception occurred: KeyError 14
[]


TypeError: list indices must be integers or slices, not str