In [1]:
import os
import sys
import configparser
from datetime import date

**CURRENT SETTINGS: Embedding and language models, Chunk size, etc.**

In [2]:
#embeddingmodel = "BAAI/bge-m3"
embeddingmodel = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

In [3]:
llmmodel = { "model": "gpt-4o",
                "version":os.environ.get('AZURE_OPENAI_VERSION','2024-08-01-preview'),
                "azure_deployment":"gpt-4o",
                "azure_endpoint":os.environ.get('AZURE_ENDPOINT',''),
                "api_key":os.environ.get('AZURE_OPENAI_KEY','')}

In [4]:
question_file_path = "C:\\Repos\\vpp-cfla\\questions\\questions.yaml"
prompt_file = 'C:\\Repos\\vpp-cfla\\questions\\prompts.tsv'
report_dir = 'C:\\Repos\\vpp-cfla\\reports'

**PROCUREMENT SETTINGS**

In [5]:
configfile = 'C:\\Repos\\vpp-cfla\\config\\SNP-2021_07_AK.ini'

In [6]:
config = configparser.ConfigParser()
config.read(configfile)

['C:\\Repos\\vpp-cfla\\config\\SNP-2021_07_AK.ini']

In [7]:
EIS_URL = config.get('Procurement', 'EIS_URL')
procurement_id = config.get('Procurement', 'procurement_id')
procurement_file = config.get('Procurement', 'procurement_file')
answer_file = config.get('Procurement', 'answer_file')
if 'agreement_file' in config['Procurement']:
    agreement_file = config.get('Procurement', 'agreement_file')
else:
    agreement_file = ''

In [8]:
chunk_size = 1536
chunk_overlap = 0

**STEP 1: Getting markdown text from the _Project Procurement document_**

In [9]:
from scripts.extractmd import Extractor

In [10]:
ex = Extractor()

In [11]:
procurement_content = ex.convert2markdown(procurement_file)

In [12]:
if len(agreement_file) > 0:
    agreement_content = ex.convert2markdown(agreement_file)
    with open("tmp3.md", 'w', encoding='utf-8') as fout:
        print(agreement_content,file=fout)
    procurement_content = procurement_content + "\n\n# IEPIRKUMA LĪGUMA PROJEKTS\n\n" + agreement_content

Processing C:\Repos\CFLA\2021_07_AK\8.pielikums_Liguma_projekts_SNP_2021_07_AK.pdf...


**STEP 2: Initializing embedding and llm object**

In [13]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding




In [14]:
embedding=HuggingFaceEmbedding(model_name=embeddingmodel,trust_remote_code=True)

In [15]:
print(len(embedding._get_text_embedding("Test string")))

768


In [16]:
from llama_index.llms.azure_openai import AzureOpenAI

In [17]:
llm=AzureOpenAI(azure_deployment=llmmodel["azure_deployment"],
                azure_endpoint=llmmodel["azure_endpoint"],temperature=0.0,
                api_version=llmmodel["version"], api_key=llmmodel["api_key"],
                timeout=120,max_retries=3,top_p=0.0001)

In [18]:
#LLM test
#response = llm.complete("What day is today?")
#response

**STEP 3: Creating FAISS vector index for the procurement document**

In [19]:
from scripts.vectorindex import QnAEngine

In [20]:
qnaengine = QnAEngine(embedding,llm)

In [21]:
ret = await qnaengine.createIndex(procurement_content,"Procurement",chunk_size=chunk_size,chunk_overlap=chunk_overlap)

Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]

60 segments created and vectorized.





In [22]:
if ret == False:
    print("**Failed to create index!**")
    exit
else:
    print("Index is ready.")

Index is ready.


In [23]:
#Retrieval test
#similarsegments = qnaengine.getSimilarNodes("Vai iepirkums ir sadalīts daļās?")
#print(similarsegments)

In [24]:
#Generation test
#result = qnaengine.askQuestion("Noskaidro, vai lietotāja apgalvojums ir patiess? Domā soli pa soli, katram solim neveltot vairāk par pieciem vārdiem. Pamato savu atbildi, citējot fragmentu no konteksta. Atbildei jābūt 'jā', 'nē' vai 'kontekstā nav informācijas'. Atbildi ietver kvadrātiekavās '[]'. Ja kontekstā nav informācijas, paskaidro, kāda informācija ir nepieciešama, lai novērtētu apgalvojumu.",
#                              "Pasūtītājs ir nodrošinājis iespēju piegādātājiem iepazīties uz vietas ar iepirkuma papildu dokumentiem, kam tehnisku iemeslu dēļ vai tajos iekļautās informācijas vai komerciālu interešu aizsardzības dēļ nav nodrošināma brīva un tieša elektroniska piekļuve, sākot ar attiecīgā iepirkuma izsludināšanas brīdi",
#                              usecontext=True)
#result

#### **STEP 4: Acquiring supplementary information**

In [25]:
from scripts.utilities import *

In [26]:
with open('PIL.txt','r',encoding='utf-8') as file:
    piltxt = file.read().strip()

In [27]:
pattern = r'^(?P<key>(\d+\.\s+(pants|pielikums))|Pārejas noteikumi)'
pilchapters = extract_chapters(piltxt, pattern)
#print(pilchapters)

In [28]:
with open('MK107.md','r',encoding='utf-8') as file:
    mk107txt = file.read().strip()

In [29]:
pattern = r'^(?P<key>[# ]*\d+)\.\s+'
mk107chapters = extract_chapters(mk107txt, pattern)
#print(mk107chapters)

In [30]:
#eiscontent = await text_from_url(EIS_URL)
#print(eiscontent)

In [31]:
promptdict = {}

with open(prompt_file,'r',encoding='utf-8') as file:
    for line in file:
        lineparts = line.strip().split('\t')
        if len(lineparts)==2:
            for q in lineparts[1].split(','):
                promptdict[str(q)] = lineparts[0] 

##### **STEP 5: Processing questions**

In [32]:
import yaml
import re

In [33]:
try:
    with open(question_file_path, 'r', encoding='utf-8') as file:
        question_dictonary = yaml.safe_load(file)

except FileNotFoundError:
    print(f"Error: File '{question_file_path}' not found.")
    exit
except yaml.YAMLError as e:
    print(f"Error parsing YAML file: {e}")
    exit
    

In [34]:
try:
    with open(answer_file, 'r', encoding='utf-8') as file:
        answer_dictonary = yaml.safe_load(file)

except FileNotFoundError:
    print(f"Error: File '{answer_file}' not found.")
    exit
except yaml.YAMLError as e:
    print(f"Error parsing YAML file: {e}")
    exit

In [35]:
table = []

In [36]:
def askQuestionSaveAnswer(prompt, question, nr, expectedanswer=''):
    #print(f"Q: {nr}")
    #print(qnaengine.getSimilarNodes(question))
    result = qnaengine.askQuestion(prompt, question, usecontext=True)
    result = re.sub(r'\n\n+',r'\n',result)
    answer = re.search(r'\[\**([^\[\]]+)\**\]',result)
    result = result.strip()
    
    if answer:
        llmanswer=answer.group(1)
        #result = result.replace(f"[{llmanswer}]","").replace(f"Atbilde:","")
        record = [nr, llmanswer, expectedanswer, result]
    else:
        answer = re.search(r'(jā|nē|kontekstā nav informācijas)\b', result, re.IGNORECASE)
        if not answer:
            answer = re.search(r'\[(ja|ne)\]', result, re.IGNORECASE)
        if answer:
            record = [nr, answer.group(1).lower(), expectedanswer, result]
        else:
            record = [nr, '', expectedanswer, result] 
    return record

In [37]:
#for singleq in question_dictonary:
#    extrainfo = get_extra_info(singleq, pilchapters, mk107chapters)    
#    if len(extrainfo) > 0:
#        print(singleq)
#        print(extrainfo)

In [38]:
if True:
    for singleq, singlea in zip(question_dictonary,answer_dictonary):
        print(singleq['nr'],end=' ')
        bcontinue = True
        extrainfo = get_extra_info(singleq, pilchapters, mk107chapters)    
        
        if 'question0' in singleq:
            result0 = askQuestionSaveAnswer(promptdict['0'] + extrainfo, singleq['question0'], f"{singleq['nr']}-0", singlea['answer0'])
            table.append(result0)
    
            if result0[1] == 'nē':
                bcontinue = False
        
        if bcontinue == False:
            if 'question' in singleq:
                table.append([str(singleq['nr']),'n/a',singlea['answer'],''])
            elif 'questions' in singleq:
                for listq, lista in zip(singleq['questions'],singlea['answers']):
                    table.append([str(listq['nr']),'n/a',lista['answer'],''])
        elif 'question' in singleq:
            table.append(askQuestionSaveAnswer(promptdict[str(singleq['nr'])] + extrainfo, singleq['question'], str(singleq['nr']), singlea['answer']))
        elif 'questions' in singleq:
            for listq, lista in zip(singleq['questions'],singlea['answers']):
                print(listq['nr'],end=' ')
                bcontinue = True
                extrainfo = get_extra_info(listq, pilchapters, mk107chapters) 
    
                if 'question0' in listq:
                    result0 = askQuestionSaveAnswer(promptdict['0'] + extrainfo, listq['question0'], f"{listq['nr']}-0", lista['answer0'])
                    table.append(result0)
            
                    if result0[1] == 'nē':
                        bcontinue = False
                        
                if bcontinue == False:
                    table.append([str(listq['nr']),'n/a',lista['answer0'],''])
                else:
                    table.append(askQuestionSaveAnswer(promptdict[str(listq['nr'])] + extrainfo, listq['question'], str(listq['nr']), lista['answer']))   

#except Exception as error:
#    print(f"An exception occurred: {type(error).__name__} {error.args[0]}")
#    print(singleq)
#    print(singlea)


1 2 3 4 5 6 7 8 9 9.1 9.2 9.3 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 35.1 35.2 35.3 35.4 35.5 35.6 35.7 35.8 36 37 37.1 37.2 37.3 37.4 37.5 37.6 37.7 37.8 37.9 37.1 37.11 37.12 37.13 38 38.1 38.2 An exception occurred: ValueError Calculated available context size -199 was not non-negative.
38.3 38.4 38.5 38.6 38.7 38.8 38.9 39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 39.1 39.11 39.12 39.13 39.14 39.15 39.16 39.17 39.18 39.19 39.2 39.21 39.22 39.23 39.24 39.25 39.26 39.27 39.28 39.29 39.3 39.31 39.32 39.33 39.34 39.35 39.36 39.37 39.38 39.39 39.4 40 40.1 An exception occurred: ValueError Calculated available context size -186 was not non-negative.
An exception occurred: ValueError Calculated available context size -190 was not non-negative.
40.2 40.3 40.4 40.5 40.6 40.7 40.8 40.9 40.1 40.11 40.12 41 42 43 44 45 46 47 48 49 50 51 51.1 51.2 51.3 51.4 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 

In [39]:
for item in table:
    print(f"--------------------------------------------------")
    print(f"NR: {item[0]}\nLLM: {item[1]} | HUMAN: {item[2]}\nFULL LLM ANSWER:\n{item[3]}")

--------------------------------------------------
NR: 1
LLM: kontekstā nav informācijas | HUMAN: jā
FULL LLM ANSWER:
Kontekstā nav informācijas par apstiprināto iepirkuma plānu vai izmaiņu būtiskumu.
Nepieciešama informācija par apstiprināto iepirkuma plānu un izmaiņu ietekmi uz procedūras izvēli un līgumcenu.
Atbilde: [kontekstā nav informācijas]
Pārliecība: augsta
--------------------------------------------------
NR: 2-0
LLM:  | HUMAN: nē
FULL LLM ANSWER:
Tiek piemērota iepirkuma procedūra, kas atbilst Publisko iepirkumu likuma prasībām, un iepirkuma komisija lemj par piedāvājuma iesniegšanas termiņa pagarināšanu, veicot attiecīgi grozījumus iepirkuma Nolikumā.
--------------------------------------------------
NR: 2
LLM:  | HUMAN: n/a
FULL LLM ANSWER:
Nepieciešama informācija par pasūtītāja vajadzībām, iepirkuma līguma raksturu, tehniskajām specifikācijām vai piedāvājumu atbilstību.
--------------------------------------------------
NR: 3
LLM:  | HUMAN: jā
FULL LLM ANSWER:
Lai not

In [40]:
import pandas as pd
data = pd.DataFrame(table, columns=["Nr", "Atbilde", "Sagaidāmā atbilde", "Pamatojums"])
precision = (data['Atbilde'] == data['Sagaidāmā atbilde']).sum()/len(data)
print(f"PRECIZITĀTE: {precision*100}%")

PRECIZITĀTE: 22.926829268292686%


In [41]:
with open(f"{report_dir}\{date.today():%d.%m}_{procurement_id.replace('/','_')}.htm", 'w', encoding='utf-8') as ofile:
    print(data.to_html(index=False).replace('\\n','<br>'),file=ofile)
    print(f"PRECIZITĀTE: {precision*100}%",file=ofile)