In [1]:
import os
import sys
import configparser
from datetime import date

**CURRENT SETTINGS: Embedding and language models, Chunk size, etc.**

In [2]:
#embeddingmodel = "BAAI/bge-m3"
embeddingmodel = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

In [3]:
llmmodel = { "model": "gpt-4o",
                "version":os.environ.get('AZURE_OPENAI_VERSION','2024-08-01-preview'),
                "azure_deployment":"gpt-4o",
                "azure_endpoint":os.environ.get('AZURE_ENDPOINT',''),
                "api_key":os.environ.get('AZURE_OPENAI_KEY','')}

In [4]:
question_file_path = "C:\\Repos\\vpp-cfla\\questions\\questions.yaml"
prompt_file = 'C:\\Repos\\vpp-cfla\\questions\\prompts.tsv'
report_dir = 'C:\\Repos\\vpp-cfla\\reports'

**PROCUREMENT SETTINGS**

In [5]:
configfile = 'C:\\Repos\\vpp-cfla\\config\\SNP-2021_07_AK.ini'
#configfile = 'C:\\Repos\\vpp-cfla\\config\\KND-2020_07.ini'
#configfile = 'C:\\Repos\\vpp-cfla\\config\\KND-2020_20.ini'

In [6]:
config = configparser.ConfigParser()
config.read(configfile)

['C:\\Repos\\vpp-cfla\\config\\SNP-2021_07_AK.ini']

In [7]:
EIS_URL = config.get('Procurement', 'EIS_URL')
procurement_id = config.get('Procurement', 'procurement_id')
procurement_file = config.get('Procurement', 'procurement_file')
answer_file = config.get('Procurement', 'answer_file')
if 'agreement_file' in config['Procurement']:
    agreement_file = config.get('Procurement', 'agreement_file')
else:
    agreement_file = ''

In [8]:
chunk_size = 1536
chunk_overlap = 0
top_similar = 5

**STEP 1: Getting markdown text from the _Project Procurement document_**

In [9]:
from scripts.extractmd import Extractor

In [10]:
ex = Extractor()

In [11]:
procurement_content = ex.convert2markdown(procurement_file)

In [12]:
if len(agreement_file) > 0:
    agreement_content = ex.convert2markdown(agreement_file)
    procurement_content = procurement_content + "\n\n# IEPIRKUMA LĪGUMA PROJEKTS\n\n" + agreement_content
    with open("tmp3.md", 'w', encoding='utf-8') as fout:
        print(procurement_content,file=fout)

Processing C:\Repos\CFLA\2021_07_AK\8.pielikums_Liguma_projekts_SNP_2021_07_AK.pdf...


**STEP 2: Initializing embedding and llm object**

In [13]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding




In [14]:
embedding=HuggingFaceEmbedding(model_name=embeddingmodel,trust_remote_code=True)

In [15]:
print(len(embedding._get_text_embedding("Test string")))

768


In [16]:
from llama_index.llms.azure_openai import AzureOpenAI

In [17]:
llm=AzureOpenAI(azure_deployment=llmmodel["azure_deployment"],
                azure_endpoint=llmmodel["azure_endpoint"],temperature=0.0,
                api_version=llmmodel["version"], api_key=llmmodel["api_key"],
                timeout=120,max_retries=3,top_p=0.0001)

In [18]:
#LLM test
#response = llm.complete("What day is today?")
#response
print(llm.metadata.context_window)

4096


**STEP 3: Creating FAISS vector index for the procurement document**

In [19]:
from scripts.vectorindex import QnAEngine

In [20]:
qnaengine = QnAEngine(embedding,llm)

In [21]:
ret = await qnaengine.createIndex(procurement_content,"Procurement",chunk_size=chunk_size,chunk_overlap=chunk_overlap)

Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]

63 segments created and vectorized.





In [22]:
if ret == False:
    print("**Failed to create index!**")
    exit
else:
    print("Index is ready.")

Index is ready.


In [23]:
#Retrieval test
#similarsegments = qnaengine.getSimilarNodes("Vai iepirkums ir sadalīts daļās?")
#print(similarsegments)

In [24]:
#Generation test
#result = qnaengine.askQuestion("Noskaidro, vai lietotāja apgalvojums ir patiess? Domā soli pa soli, katram solim neveltot vairāk par pieciem vārdiem. Pamato savu atbildi, citējot fragmentu no konteksta. Atbildei jābūt 'jā', 'nē' vai 'kontekstā nav informācijas'. Atbildi ietver kvadrātiekavās '[]'. Ja kontekstā nav informācijas, paskaidro, kāda informācija ir nepieciešama, lai novērtētu apgalvojumu.",
#                              "Pasūtītājs ir nodrošinājis iespēju piegādātājiem iepazīties uz vietas ar iepirkuma papildu dokumentiem, kam tehnisku iemeslu dēļ vai tajos iekļautās informācijas vai komerciālu interešu aizsardzības dēļ nav nodrošināma brīva un tieša elektroniska piekļuve, sākot ar attiecīgā iepirkuma izsludināšanas brīdi",
#                              usecontext=True)
#result

#### **STEP 4: Acquiring supplementary information**

In [25]:
from scripts.utilities import *

In [26]:
with open('PIL.txt','r',encoding='utf-8') as file:
    piltxt = file.read().strip()

In [27]:
pattern = r'^(?P<key>(\d+\.\s+(pants|pielikums))|Pārejas noteikumi)'
pilchapters = extract_chapters(piltxt, pattern)

In [28]:
with open('MK107.md','r',encoding='utf-8') as file:
    mk107txt = file.read().strip()

In [29]:
pattern = r'^(?P<key>[# ]*\d+)\.\s+'
mk107chapters = extract_chapters(mk107txt, pattern)

In [30]:
with open('S_LR_NSL.txt','r',encoding='utf-8') as file:
    nsltxt = file.read().strip()

In [31]:
pattern = r'^(?P<key>(\d+\.(\d+)?\s+pants))'
nslchapters = extract_chapters(nsltxt, pattern)

In [32]:
with open('MK_I3.txt','r',encoding='utf-8') as file:
    mki3txt = file.read().strip()

In [33]:
pattern = r'^\*\* (?P<key>\d+(\.\d+)?)\.\s+'
mki3chapters = extract_chapters(mki3txt, pattern)

In [34]:
#eiscontent = await text_from_url(EIS_URL)
#print(eiscontent)

In [35]:
promptdict = {}

with open(prompt_file,'r',encoding='utf-8') as file:
    for line in file:
        lineparts = line.strip().split('\t')
        if len(lineparts)==2:
            for q in lineparts[1].split(','):
                promptdict[str(q)] = lineparts[0] 

##### **STEP 5: Processing questions**

In [36]:
import yaml
import re

In [37]:
try:
    with open(question_file_path, 'r', encoding='utf-8') as file:
        question_dictonary = yaml.safe_load(file)

except FileNotFoundError:
    print(f"Error: File '{question_file_path}' not found.")
    exit
except yaml.YAMLError as e:
    print(f"Error parsing YAML file: {e}")
    exit
    

In [38]:
try:
    with open(answer_file, 'r', encoding='utf-8') as file:
        answer_dictonary = yaml.safe_load(file)

except FileNotFoundError:
    print(f"Error: File '{answer_file}' not found.")
    exit
except yaml.YAMLError as e:
    print(f"Error parsing YAML file: {e}")
    exit

In [39]:
table = []

In [40]:
import json
def askQuestionSaveAnswer(prompt, question, nr, expectedanswer=''):
    result = qnaengine.askQuestion(prompt, question, usecontext=True,n=top_similar)
    result = re.sub(r'\n\n+',r'\n',result).strip()

    answer = re.search(r'\{[^\{\}]+\}',result, re.IGNORECASE)
    if answer:
        try:
            jsonanswer=json.loads(answer.group(1))
            llmanswer = jsonanswer.get('answer','')
            record = [nr, llmanswer, expectedanswer, result]
            return record
        except:
            pass
            
    answer = re.search(r'\[\**(jā|nē|kontekstā nav informācijas|n/a)\**\]',result, re.IGNORECASE)
    
    if answer:
        llmanswer=answer.group(1)
        #result = result.replace(f"[{llmanswer}]","").replace(f"Atbilde:","")
        record = [nr, llmanswer, expectedanswer, result]
    else:
        answer = re.search(r"'?(jā|nē|kontekstā nav informācijas|n/a)'?", result, re.IGNORECASE)
        if not answer:
            answer = re.search(r'\[(ja|ne)\]', result, re.IGNORECASE)
        if answer:
            record = [nr, answer.group(1).lower(), expectedanswer, result]
        else:
            record = [nr, '', expectedanswer, result] 
    return record

In [41]:
#for singleq in question_dictonary:
#    extrainfo = get_extra_info(singleq, pilchapters, mk107chapters)    
#    if len(extrainfo) > 0:
#        print(singleq)
#        print(extrainfo)

In [46]:
with open("nodes.log", 'a', encoding='utf-8') as ofile:
    for singleq, singlea in zip(question_dictonary,answer_dictonary):
        print(singleq['nr'],end=' ')
        bcontinue = True
        extrainfo = get_extra_info(singleq, pilchapters, mk107chapters, nslchapters, mki3chapters)    
        extrainfo = qnaengine.compressPrompt(extrainfo,3000)
        
        if 'question0' in singleq:
            result0 = askQuestionSaveAnswer(promptdict['0'] + extrainfo, singleq['question0'], f"{singleq['nr']}-0", singlea['answer0'])
            table.append(result0)
            if table[-1][1] != table[-1][2]:
                print(f"\nQ: {singleq['nr']}-0",file=ofile) 
                print(qnaengine.getSimilarNodes(singleq['question0']),file=ofile)
    
            if result0[1] == 'nē':
                bcontinue = False
        
        if bcontinue == False:
            if 'question' in singleq:
                table.append([str(singleq['nr']),'n/a',singlea['answer'],''])
            elif 'questions' in singleq:
                for listq, lista in zip(singleq['questions'],singlea['answers']):
                    table.append([str(listq['nr']),'n/a',lista['answer'],''])
        elif 'question' in singleq:
            table.append(askQuestionSaveAnswer(promptdict[str(singleq['nr'])] + extrainfo, singleq['question'], str(singleq['nr']), singlea['answer']))
            if table[-1][1] != table[-1][2]: 
                print(f"\nQ: {str(singleq['nr'])}",file=ofile)
                print(qnaengine.getSimilarNodes(singleq['question']),file=ofile)
        elif 'questions' in singleq:
            for listq, lista in zip(singleq['questions'],singlea['answers']):
                print(listq['nr'],end=' ')
                bcontinue = True
                extrainfo = get_extra_info(listq, pilchapters, mk107chapters, nslchapters, mki3chapters) 
                extrainfo = qnaengine.compressPrompt(extrainfo,3000)
                
                if 'question0' in listq:
                    result0 = askQuestionSaveAnswer(promptdict['0'] + extrainfo, listq['question0'], f"{listq['nr']}-0", lista['answer0'])
                    table.append(result0)
                    if table[-1][1] != table[-1][2]:
                        print(f"\nQ: {listq['nr']}-0",file=ofile)
                        print(qnaengine.getSimilarNodes(listq['question0']),file=ofile)
            
                    if result0[1] == 'nē':
                        bcontinue = False
                        
                if bcontinue == False:
                    table.append([str(listq['nr']),'n/a',lista['answer0'],''])
                else:
                    table.append(askQuestionSaveAnswer(promptdict[str(listq['nr'])] + extrainfo, listq['question'], str(listq['nr']), lista['answer']))
                    if table[-1][1] != table[-1][2]:
                        print(f"\nQ: {str(listq['nr'])}",file=ofile)
                        print(qnaengine.getSimilarNodes(listq['question'],n=top_similar),file=ofile)


1 2 3 4 5 6 7 8 9 9.1 9.2 9.3 10 metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>} template_vars=['context_str', 'query_str'] kwargs={} output_parser=None template_var_mappings=None function_mappings=None template="Kontekstā ir fragmenti no nolikuma. Noskaidro, vai lietotāja apgalvojums ir patiess. Atgriez rezultātu JSON formtā, ietverot atslēgas: 1) 'answer' ar iespējamām vērtībām  'jā', 'nē', 'n/a' vai 'kontekstā nav informācijas'; 2) 'rate' - pārliecību par atbildi skalā  'zema', 'vidēja', 'augsta'; 3) 'explanation'. Vienmēr citē fragmentu no konteksta, kas pamato 'jā' vai 'nē' atbildi ('explanation' laukā). Ja kontekstā nav informācijas, paskaidro, kāda informācija ir nepieciešama, lai novērtētu apgalvojumu ('explanation' laukā). Ja kontekstā teikts, ka nolikumam ir nodrošināta brīva un tieša elektroniska piekļuve, atgriez 'n/a'.\nKonteksta informācija ietverta <context> tagos:\n<context>\n{context_str}\n</context>\n\nLietotāja apgalvojums vai jautājums ir šāds:\n{query_str}"

In [47]:
for item in table:
    print(f"--------------------------------------------------")
    print(f"NR: {item[0]}\nLLM: {item[1]} | HUMAN: {item[2]}\nFULL LLM ANSWER:\n{item[3]}")

--------------------------------------------------
NR: 1
LLM: kontekstā nav informācijas | HUMAN: jā
FULL LLM ANSWER:
{
  "answer": "kontekstā nav informācijas",
  "rate": "vidēja",
  "explanation": "Kontekstā nav informācijas par iepirkuma plānu vai par to, vai izmaiņas tajā ir uzskatāmas par būtiskām. Lai novērtētu apgalvojumu, nepieciešama informācija par apstiprināto iepirkuma plānu un konkrētām izmaiņām tajā, kā arī kritēriji, kas nosaka, vai izmaiņas skar procedūras izvēli un līgumcenu."
}
--------------------------------------------------
NR: 2-0
LLM: nē | HUMAN: nē
FULL LLM ANSWER:
```json
{
  "answer": "nē",
  "rate": "augsta",
  "explanation": "Nolikumā ir skaidri norādīts, ka iepirkuma procedūras veids ir atklāts konkurss: 'Iepirkuma procedūras veids - atklāts konkurss.'"
}
```
--------------------------------------------------
NR: 2
LLM: n/a | HUMAN: n/a
FULL LLM ANSWER:

--------------------------------------------------
NR: 3
LLM: jā | HUMAN: jā
FULL LLM ANSWER:
```json
{

In [48]:
import pandas as pd
data = pd.DataFrame(table, columns=["Nr", "Atbilde", "Sagaidāmā atbilde", "Pamatojums"])
precision = (data['Atbilde'] == data['Sagaidāmā atbilde']).sum()/len(data)
print(f"PRECIZITĀTE: {precision*100}%")

PRECIZITĀTE: 34.63414634146341%


In [49]:
with open(f"{report_dir}\{date.today():%d.%m}_{procurement_id.replace('/','_')}.htm", 'w', encoding='utf-8') as ofile:
    print(data.to_html(index=False).replace('\\n','<br>'),file=ofile)
    print(f"PRECIZITĀTE: {precision*100}%",file=ofile)