In [1]:
import os
import sys
import configparser
from datetime import date

**CURRENT SETTINGS: Embedding and language models, Chunk size, etc.**

In [2]:
embeddingmodel = "BAAI/bge-m3"
#embeddingmodel = "nomic-ai/nomic-embed-text-v2-moe"
#embeddingmodel = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

In [3]:
#Azure model
llmmodelAzure = { "model": "gpt-4o",
                "version":os.environ.get('AZURE_OPENAI_VERSION','2024-08-01-preview'),
                "azure_deployment":"gpt-4o",
                "azure_endpoint":os.environ.get('AZURE_ENDPOINT',''),
                "api_key":os.environ.get('AZURE_OPENAI_KEY','')}

In [4]:
#Ollama model
llmmodelOllama = { "model": "gemma3:27b",
                "url":os.environ.get('OLLAMA_ENDPOINT',''),
                "context_window":"128000"}
print(llmmodelOllama)

{'model': 'gemma3:27b', 'url': 'http://192.168.2.128:11434', 'context_window': '128000'}


In [5]:
question_file_path = "C:\\Repos\\vpp-cfla\\questions\\questions.yaml"
prompt_file = 'C:\\Repos\\vpp-cfla\\questions\\prompts.tsv'
report_dir = 'C:\\Repos\\vpp-cfla\\reports'

**PROCUREMENT SETTINGS**

In [6]:
#configfile = 'C:\\Repos\\vpp-cfla\\config\\SNP-2021_07_AK.ini'
#configfile = 'C:\\Repos\\vpp-cfla\\config\\KND-2020_07.ini'
configfile = 'C:\\Repos\\vpp-cfla\\config\\KND-2020_20.ini'

In [7]:
config = configparser.ConfigParser()
config.read(configfile)

['C:\\Repos\\vpp-cfla\\config\\KND-2020_20.ini']

In [8]:
EIS_URL = config.get('Procurement', 'EIS_URL')
procurement_id = config.get('Procurement', 'procurement_id')
procurement_file = config.get('Procurement', 'procurement_file')
answer_file = config.get('Procurement', 'answer_file')
if 'agreement_file' in config['Procurement']:
    agreement_file = config.get('Procurement', 'agreement_file')
else:
    agreement_file = ''

In [9]:
chunk_size = 1536
chunk_overlap = 0
top_similar = 5

**STEP 1: Getting markdown text from the _Project Procurement document_**

In [10]:
from scripts.extractmd import Extractor

In [11]:
ex = Extractor()

In [12]:
procurement_content = ex.convert2markdown(procurement_file)

In [13]:
if len(agreement_file) > 0:
    agreement_content = ex.convert2markdown(agreement_file)
    procurement_content = procurement_content + "\n\n# IEPIRKUMA LĪGUMA PROJEKTS\n\n" + agreement_content
    with open("tmp3.md", 'w', encoding='utf-8') as fout:
        print(procurement_content,file=fout)

**STEP 2: Initializing embedding and llm object**

In [14]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding




In [15]:
embedding=HuggingFaceEmbedding(model_name=embeddingmodel,trust_remote_code=True)

#For nomic-embed-text-v2-moe
#embedding=HuggingFaceEmbedding(model_name=embeddingmodel,trust_remote_code=True,query_instruction="search_query: ",text_instruction="search_document: ")

In [16]:
print(len(embedding._get_text_embedding("Test string")))

1024


In [17]:
from llama_index.llms.ollama import Ollama
llm = Ollama(base_url=llmmodelOllama["url"],
             model=llmmodelOllama["model"], 
             context_window=int(llmmodelOllama["context_window"]),
            request_timeout=300.0,
            temperature=0.0,
            additional_kwargs={"seed":1337})

In [18]:
#from llama_index.llms.azure_openai import AzureOpenAI
#llm=AzureOpenAI(azure_deployment=llmmodelAzure["azure_deployment"],
#                azure_endpoint=llmmodelAzure["azure_endpoint"],temperature=0.0,
#                api_version=llmmodelAzure["version"], api_key=llmmodelAzure["api_key"],
#                timeout=120,max_retries=3,top_p=0.0001)

In [19]:
#LLM test
#response = llm.complete("What day is today?")
#response
print(llm.metadata.context_window)

128000


**STEP 3: Creating FAISS vector index for the procurement document**

In [20]:
from scripts.vectorindex import QnAEngine

In [21]:
qnaengine = QnAEngine(embedding,llm)

In [22]:
ret = await qnaengine.createIndex(procurement_content,"Procurement",chunk_size=chunk_size,chunk_overlap=chunk_overlap)

Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]

99 segments created and vectorized.





In [23]:
if ret == False:
    print("**Failed to create index!**")
    exit
else:
    print("Index is ready.")

Index is ready.


In [24]:
#Retrieval test
#similarsegments = qnaengine.getSimilarNodes("Vai iepirkums ir sadalīts daļās?")
#print(similarsegments)

In [25]:
#Generation test
#result = qnaengine.askQuestion("Noskaidro, vai lietotāja apgalvojums ir patiess? Domā soli pa soli, katram solim neveltot vairāk par pieciem vārdiem. Pamato savu atbildi, citējot fragmentu no konteksta. Atbildei jābūt 'jā', 'nē' vai 'kontekstā nav informācijas'. Atbildi ietver kvadrātiekavās '[]'. Ja kontekstā nav informācijas, paskaidro, kāda informācija ir nepieciešama, lai novērtētu apgalvojumu.",
#                              "Pasūtītājs ir nodrošinājis iespēju piegādātājiem iepazīties uz vietas ar iepirkuma papildu dokumentiem, kam tehnisku iemeslu dēļ vai tajos iekļautās informācijas vai komerciālu interešu aizsardzības dēļ nav nodrošināma brīva un tieša elektroniska piekļuve, sākot ar attiecīgā iepirkuma izsludināšanas brīdi",
#                              usecontext=True)
#result

#### **STEP 4: Acquiring supplementary information**

In [26]:
from scripts.utilities import *

In [27]:
with open('PIL.txt','r',encoding='utf-8') as file:
    piltxt = file.read().strip()

In [28]:
pattern = r'^(?P<key>(\d+\.\s+(pants|pielikums))|Pārejas noteikumi)'
pilchapters = extract_chapters(piltxt, pattern)

In [29]:
with open('MK107.md','r',encoding='utf-8') as file:
    mk107txt = file.read().strip()

In [30]:
pattern = r'^(?P<key>[# ]*\d+)\.\s+'
mk107chapters = extract_chapters(mk107txt, pattern)

In [31]:
with open('S_LR_NSL.txt','r',encoding='utf-8') as file:
    nsltxt = file.read().strip()

In [32]:
pattern = r'^(?P<key>(\d+\.(\d+)?\s+pants))'
nslchapters = extract_chapters(nsltxt, pattern)

In [33]:
with open('MK_I3.txt','r',encoding='utf-8') as file:
    mki3txt = file.read().strip()

In [34]:
pattern = r'^\*\* (?P<key>\d+(\.\d+)?)\.\s+'
mki3chapters = extract_chapters(mki3txt, pattern)

In [35]:
#eiscontent = await text_from_url(EIS_URL)
#print(eiscontent)

In [47]:
promptdict = {}

with open(prompt_file,'r',encoding='utf-8') as file:
    for line in file:
        lineparts = line.strip().split('\t')
        if len(lineparts)==2:
            for q in lineparts[1].split(','):
                promptdict[str(q)] = lineparts[0] 

##### **STEP 5: Processing questions**

In [48]:
import yaml
import re

In [58]:
try:
    with open(question_file_path, 'r', encoding='utf-8') as file:
        question_dictonary = yaml.load(file, Loader=yaml.BaseLoader)

except FileNotFoundError:
    print(f"Error: File '{question_file_path}' not found.")
    exit
except yaml.YAMLError as e:
    print(f"Error parsing YAML file: {e}")
    exit
    

In [59]:
try:
    with open(answer_file, 'r', encoding='utf-8') as file:
        answer_dictonary = yaml.load(file, Loader=yaml.BaseLoader)

except FileNotFoundError:
    print(f"Error: File '{answer_file}' not found.")
    exit
except yaml.YAMLError as e:
    print(f"Error parsing YAML file: {e}")
    exit

In [60]:
table = []

In [61]:
import json
def askQuestionSaveAnswer(prompt, question, nr, expectedanswer=''):
    result = qnaengine.askQuestion(prompt, question, usecontext=True,n=top_similar)
    result = re.sub(r'\n\n+',r'\n',result).strip()

    answer = re.search(r'\{[^\{\}]+\}',result, re.IGNORECASE)
    if answer:
        try:
            jsonanswer=json.loads(answer.group(1))
            llmanswer = jsonanswer.get('answer','')
            record = [nr, llmanswer, expectedanswer, result]
            return record
        except:
            pass
            
    answer = re.search(r'\[\**(jā|nē|kontekstā nav informācijas|n/a)\**\]',result, re.IGNORECASE)
    
    if answer:
        llmanswer=answer.group(1)
        #result = result.replace(f"[{llmanswer}]","").replace(f"Atbilde:","")
        record = [nr, llmanswer, expectedanswer, result]
    else:
        answer = re.search(r"'?(jā|nē|kontekstā nav informācijas|n/a)'?", result, re.IGNORECASE)
        if not answer:
            answer = re.search(r'\[(ja|ne)\]', result, re.IGNORECASE)
        if answer:
            record = [nr, answer.group(1).lower(), expectedanswer, result]
        else:
            record = [nr, '', expectedanswer, result] 
    return record

In [62]:
#for singleq in question_dictonary:
#    extrainfo = get_extra_info(singleq, pilchapters, mk107chapters)    
#    if len(extrainfo) > 0:
#        print(singleq)
#        print(extrainfo)

In [63]:
with open("nodes.log", 'a', encoding='utf-8') as ofile:
    print(f"\n*********************\n{configfile}, {configfile}\n{embeddingmodel}, top_similar: {top_similar}, chunk-size: {chunk_size}, chunk_overlap: {chunk_overlap}",file=ofile)
    for singleq, singlea in zip(question_dictonary,answer_dictonary):
        print(singleq['nr'],end=' ')
        bcontinue = True
        extrainfo = get_extra_info(singleq, pilchapters, mk107chapters, nslchapters, mki3chapters)    
        extrainfo = qnaengine.compressPrompt(extrainfo,3000)
        
        if 'question0' in singleq:
            result0 = askQuestionSaveAnswer(promptdict['0'] + extrainfo, singleq['question0'], f"{singleq['nr']}-0", singlea['answer0'])
            table.append(result0)
            if table[-1][1] != table[-1][2]:
                nodes = qnaengine.getSimilarNodes(singleq['question0'])
                print(f"\nQ: {singleq['nr']}-0\n{nodes['text']}\n{nodes['metadata']}\n{nodes['score']}",file=ofile) 
    
            if result0[1] == 'nē':
                bcontinue = False
        
        if bcontinue == False:
            if 'question' in singleq:
                table.append([str(singleq['nr']),'n/a',singlea['answer'],''])
            elif 'questions' in singleq:
                for listq, lista in zip(singleq['questions'],singlea['answers']):
                    table.append([str(listq['nr']),'n/a',lista['answer'],''])
        elif 'question' in singleq:
            table.append(askQuestionSaveAnswer(promptdict[str(singleq['nr'])] + extrainfo, singleq['question'], str(singleq['nr']), singlea['answer']))
            if table[-1][1] != table[-1][2]: 
                nodes = qnaengine.getSimilarNodes(singleq['question'])
                print(f"\nQ: {singleq['nr']}\n{nodes['text']}\n{nodes['metadata']}\n{nodes['score']}",file=ofile) 
        elif 'questions' in singleq:
            for listq, lista in zip(singleq['questions'],singlea['answers']):
                print(listq['nr'],end=' ')
                bcontinue = True
                extrainfo = get_extra_info(listq, pilchapters, mk107chapters, nslchapters, mki3chapters) 
                extrainfo = qnaengine.compressPrompt(extrainfo,3000)
                
                if 'question0' in listq:
                    result0 = askQuestionSaveAnswer(promptdict['0'] + extrainfo, listq['question0'], f"{listq['nr']}-0", lista['answer0'])
                    table.append(result0)
                    if table[-1][1] != table[-1][2]:
                        nodes = qnaengine.getSimilarNodes(listq['question0'])
                        print(f"\nQ: {listq['nr']}-0\n{nodes['text']}\n{nodes['metadata']}\n{nodes['score']}",file=ofile) 
            
                    if result0[1] == 'nē':
                        bcontinue = False
                        
                if bcontinue == False:
                    table.append([str(listq['nr']),'n/a',lista['answer0'],''])
                else:
                    table.append(askQuestionSaveAnswer(promptdict[str(listq['nr'])] + extrainfo, listq['question'], str(listq['nr']), lista['answer']))
                    if table[-1][1] != table[-1][2]:
                        nodes = qnaengine.getSimilarNodes(listq['question'])
                        print(f"\nQ: {listq['nr']}\n{nodes['text']}\n{nodes['metadata']}\n{nodes['score']}",file=ofile)


1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 35.1 35.2 35.3 35.4 35.5 35.6 35.7 35.8 36 37 37.1 37.2 37.3 37.4 37.5 37.6 37.7 37.8 37.9 37.10 37.11 37.12 37.13 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 39.10 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.20 39.21 39.22 39.23 39.24 39.25 39.26 39.27 39.28 39.29 39.30 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 39.40 40 40.1 40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 40.10 40.11 40.12 41 42 43 44 45 46 47 48 49 50 51 51.1 51.2 51.3 51.4 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 

In [64]:
#for item in table:
#    print(f"--------------------------------------------------")
#    print(f"NR: {item[0]}\nLLM: {item[1]} | HUMAN: {item[2]}\nFULL LLM ANSWER:\n{item[3]}")

In [65]:
import pandas as pd
data = pd.DataFrame(table, columns=["Nr", "Atbilde", "Sagaidāmā atbilde", "Pamatojums"])
precision = (data['Atbilde'] == data['Sagaidāmā atbilde']).sum()/len(data)
print(f"PRECIZITĀTE: {precision*100}%")

PRECIZITĀTE: 50.73170731707317%


In [66]:
with open(f"{report_dir}\{date.today():%d.%m}_{procurement_id.replace('/','_')}.htm", 'w', encoding='utf-8') as ofile:
    print(data.to_html(index=False).replace('\\n','<br>'),file=ofile)
    print(f"PRECIZITĀTE: {precision*100}%",file=ofile)