In [1]:
import os
import sys

**CURRENT SETTINGS: Embedding and language models, Chunk size, etc.**

In [2]:
#embeddingmodel = "BAAI/bge-m3"
embeddingmodel = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

In [3]:
llmmodel = { "model": "gpt-4o",
                "version":os.environ.get('AZURE_OPENAI_VERSION','2024-08-01-preview'),
                "azure_deployment":"gpt-4o",
                "azure_endpoint":os.environ.get('AZURE_ENDPOINT',''),
                "api_key":os.environ.get('AZURE_OPENAI_KEY','')}

In [4]:
procurement_file_dir = 'C:\\Repos\\CFLA\\KND_2020_07_dokumentacijas_pirsmparbaude'
procurement_file = 'Nolikums (90).pdf'
question_file_path = "C:\\Repos\\vpp-cfla\\questions\\questions.yaml"
EIS_URL = "https://www.eis.gov.lv/EKEIS/Supplier/Procurement/34370"

In [5]:
chunk_size = 1024
chunk_overlap = 0

**STEP 1: Getting markdown text from the _Project Procurement document_**

In [6]:
from scripts.extractmd import Extractor

In [7]:
ex = Extractor()

In [8]:
procurement_content = ex.convert2markdown(f"{procurement_file_dir}\\{procurement_file}")

Processing C:\Repos\CFLA\KND_2020_07_dokumentacijas_pirsmparbaude\Nolikums (90).pdf...


In [9]:
with open ("tmp2.md", 'w',encoding='utf-8') as ofile:
    print(procurement_content, file = ofile)

**STEP 2: Initializing embedding and llm object**

In [10]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding




In [11]:
embedding=HuggingFaceEmbedding(model_name=embeddingmodel,trust_remote_code=True)

In [12]:
print(len(embedding._get_text_embedding("Test string")))

768


In [13]:
from llama_index.llms.azure_openai import AzureOpenAI

In [14]:
llm=AzureOpenAI(azure_deployment=llmmodel["azure_deployment"],
                azure_endpoint=llmmodel["azure_endpoint"],temperature=0.0,
                api_version=llmmodel["version"], api_key=llmmodel["api_key"],
                timeout=120,max_retries=3,top_p=0.0001)

In [15]:
#LLM test
#response = llm.complete("What day is today?")

In [16]:
#response

**STEP 3: Creating FAISS vector index for the procurement document**

In [17]:
from scripts.vectorindex import QnAEngine

In [18]:
qnaengine = QnAEngine(embedding,llm)

In [19]:
ret = await qnaengine.createIndex(procurement_content,"Procurement",chunk_size=chunk_size,chunk_overlap=chunk_overlap)

Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]

91 segments created and vectorized.





In [20]:
if ret == False:
    print("**Failed to create index!**")
    exit
else:
    print("Index is ready.")

Index is ready.


In [21]:
#Retrieval test
#similarsegments = qnaengine.getSimilarNodes("Vai iepirkums ir sadalīts daļās?")
#print(similarsegments)

In [22]:
#Generation test
#result = qnaengine.askQuestion("Noskaidro, vai lietotāja apgalvojums ir patiess? Domā soli pa soli, katram solim neveltot vairāk par pieciem vārdiem. Pamato savu atbildi, citējot fragmentu no konteksta. Atbildei jābūt 'jā', 'nē' vai 'kontekstā nav informācijas'. Atbildi ietver kvadrātiekavās '[]'. Ja kontekstā nav informācijas, paskaidro, kāda informācija ir nepieciešama, lai novērtētu apgalvojumu.",
#                              "Pasūtītājs ir nodrošinājis iespēju piegādātājiem iepazīties uz vietas ar iepirkuma papildu dokumentiem, kam tehnisku iemeslu dēļ vai tajos iekļautās informācijas vai komerciālu interešu aizsardzības dēļ nav nodrošināma brīva un tieša elektroniska piekļuve, sākot ar attiecīgā iepirkuma izsludināšanas brīdi",
#                              usecontext=True)
#result

#### **STEP 4: Acquiring supplementary information**

In [23]:
from scripts.utilities import *

In [24]:
with open('PIL.txt','r',encoding='utf-8') as file:
    piltxt = file.read().strip()

In [25]:
pattern = r'^(?P<key>(\d+\.\s+(pants|pielikums))|Pārejas noteikumi)'
pilchapters = extract_chapters(piltxt, pattern)

In [26]:
#eiscontent = await text_from_url(EIS_URL)
#print(eiscontent)

##### **STEP 5: Processing questions**

In [27]:
import yaml
import re

In [28]:
try:
    with open(question_file_path, 'r', encoding='utf-8') as file:
        question_dictonary = yaml.safe_load(file)

except FileNotFoundError:
    print(f"Error: File '{question_file_path}' not found.")
    exit
except yaml.YAMLError as e:
    print(f"Error parsing YAML file: {e}")
    exit
    

In [29]:
table = []

In [30]:
def askQuestionSaveAnswer(prompt, question, nr):

    result = qnaengine.askQuestion(prompt, question, usecontext=True)
        
    result = re.sub(r'\n\n+',r'\n',result)
    answer = re.search(r'\[([^\[\]]+)\]',result)
    result = result.strip()
    
    if answer:
        llmanswer=answer.group(1)
        result = result.replace(f"[{llmanswer}]","").replace(f"Atbilde:","")
        record = [nr, llmanswer, result]
    else:
        answer = re.search(r'\b(jā|nē|kontekstā nav informācijas)\b', result, re.IGNORECASE)
        if answer:
            record = [nr, answer.group(1).lower(), result]
        else:
            record = [nr, '', result]
        
    return record

In [31]:
cod_prompt = "Domā soli pa soli, katram solim neveltot vairāk par pieciem vārdiem. Atbildei jābūt 'jā', 'nē' vai 'kontekstā nav informācijas'."

In [36]:
for singleq in question_dictonary:
    
    bcontinue = True
    extrainfo = get_extra_info(singleq, pilchapters)    
    
    if 'question0' in singleq:
        result0 = askQuestionSaveAnswer(cod_prompt + extrainfo, singleq['question0'], f"{singleq['nr']}-0")
        table.append(result0)

        if result0[1] == 'nē':
            bcontinue = False
            
    if bcontinue == False:
        if 'question' in singleq:
            table.append([singleq['nr'],'N/A',''])
        elif 'questions' in singleq:
            for idx,single in enumerate(singleq['questions']):
                table.append([f"{singleq['nr']}.{idx+1}",'N/A',''])
        continue

    if 'question' in singleq:
        table.append(askQuestionSaveAnswer(singleq['prompt'] + extrainfo, singleq['question'], singleq['nr']))
    elif 'questions' in singleq:
        for idx,single in enumerate(singleq['questions']):
            extrainfo = get_extra_info(single, pilchapters) 
            table.append(askQuestionSaveAnswer(single['prompt'] + extrainfo, single['question'], f"{singleq['nr']}.{idx+1}"))       

In [37]:
import pandas as pd
data = pd.DataFrame(table, columns=["Nr", "Atbilde", "Pamatojums"])

In [38]:
from tabulate import tabulate

In [39]:
with open("report.htm", 'w', encoding='utf-8') as ofile:
    print(data.to_html(index=False),file=ofile)
print(tabulate(data, headers='keys', tablefmt='grid', showindex=False))

+------+----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Nr   | Atbilde                    | Pamatojums                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
| 1    | Kontekstā nav informācija