In [1]:
import os
import sys

**CURRENT SETTINGS: Embedding and language models, chunk size, etc.**

In [2]:
# Alternative embedding model, kept commented out for quick switching:
#embeddingmodel = "BAAI/bge-m3"
# Hugging Face model id; passed as model_name to HuggingFaceEmbedding in STEP 2.
embeddingmodel = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

In [3]:
# Azure OpenAI settings consumed when the LLM client is created in STEP 2.
# API version, endpoint and key come from environment variables; the endpoint
# and key fall back to an empty string when the variable is not set.
llmmodel = {
    "model": "gpt-4o",
    "version": os.getenv("AZURE_OPENAI_VERSION", "2024-08-01-preview"),
    "azure_deployment": "gpt-4o",
    "azure_endpoint": os.getenv("AZURE_ENDPOINT", ""),
    "api_key": os.getenv("AZURE_OPENAI_KEY", ""),
}

In [4]:
# Input locations. The defaults are the original developer-machine paths, so the
# notebook behaves as before when nothing is configured; set the CFLA_*
# environment variables to run it on another machine without editing this cell.
procurement_file_dir = os.environ.get(
    'CFLA_PROCUREMENT_DIR',
    'C:\\Repos\\CFLA\\KND_2020_07_dokumentacijas_pirsmparbaude')
# File name of the procurement document inside procurement_file_dir.
procurement_file = os.environ.get('CFLA_PROCUREMENT_FILE', 'Nolikums (90).docx')
# YAML file with the questions processed in STEP 4.
question_file_path = os.environ.get(
    'CFLA_QUESTION_FILE',
    "C:\\Repos\\CFLA\\questions\\questions.yaml")

In [5]:
# Segmentation settings forwarded to QnAEngine.createIndex in STEP 3.
chunk_size, chunk_overlap = 1024, 0

**STEP 1: Getting markdown text from the _Project Procurement document_**

In [6]:
from scripts.extractmd import Extractor

In [7]:
# Extractor from scripts.extractmd; its convert2markdown() is called in the next cell.
ex = Extractor()

In [8]:
# Convert the procurement document to markdown text.
# The path is assembled with os.path.join rather than a hard-coded "\\"
# separator, so the cell does not depend on Windows-style path syntax.
procurement_content = ex.convert2markdown(
    os.path.join(procurement_file_dir, procurement_file)
)

In [9]:
#print(procurement_content)

**STEP 2: Initializing the embedding model and LLM objects**

In [10]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding




In [11]:
# Embedding model wrapper for the model id chosen in the settings section.
# NOTE(review): trust_remote_code=True presumably permits running code shipped
# with the model repository - confirm that the selected model really needs it.
embedding = HuggingFaceEmbedding(
    model_name=embeddingmodel,
    trust_remote_code=True,
)

In [12]:
# Sanity check: print the length of the vector produced for a sample string.
# Uses the public get_text_embedding() method instead of the private
# _get_text_embedding() implementation hook.
print(len(embedding.get_text_embedding("Test string")))

768


In [13]:
from llama_index.llms.azure_openai import AzureOpenAI

In [14]:
# Azure OpenAI client built from the llmmodel settings.
# temperature=0.0 combined with a very small top_p is presumably meant to make
# the answers as repeatable as possible.
llm = AzureOpenAI(
    azure_deployment=llmmodel["azure_deployment"],
    azure_endpoint=llmmodel["azure_endpoint"],
    temperature=0.0,
    api_version=llmmodel["version"],
    api_key=llmmodel["api_key"],
    timeout=120,
    max_retries=3,
    top_p=0.0001,
)

In [15]:
#LLM test
#response = llm.complete("What day is today?")

In [16]:
#response

**STEP 3: Creating FAISS vector index for the procurement document**

In [17]:
from scripts.vectorindex import QnAEngine

In [18]:
# Question-answering engine from scripts.vectorindex, constructed with the
# embedding model and the LLM client created in STEP 2.
qnaengine = QnAEngine(embedding,llm)

In [19]:
ret = await qnaengine.createIndex(procurement_content,"Procurement",chunk_size=chunk_size,chunk_overlap=chunk_overlap)

Generating embeddings: 100%|██████████████████████████████████████████████████████████████████████ [ time left: 00:00 ]

73 segments created and vectorized.





In [20]:
# A return value of False from createIndex is treated as failure.
# The previous bare `exit` was an expression that did nothing, so the notebook
# carried on without an index; raising stops "Run All" at this cell instead.
if ret == False:  # noqa: E712 - original comparison kept on purpose: only False counts as failure
    print("**Failed to create index!**")
    raise RuntimeError("Failed to create the vector index for the procurement document.")
else:
    print("Index is ready.")

Index is ready.


In [21]:
#Retrieval test
#similarsegments = qnaengine.getSimilarNodes("Pretendenta tiesības un pienākumi")
#print(similarsegments)

In [23]:
#Generation test
#result = qnaengine.askQuestion("Analizē apgalvojumu un, izmantojot konteksta informāciju, atbildi, vai ir spēkā apgalvojums. Atgriez fragmentu no konteksta, kas to apstiprina. Atbildei jābūt 'jā', 'nē' vai 'kontekstā nav informācijas'. Atbildi ietver kvadrātiekavās '[]'. Ja kontekstā nav informācijas, paskaidro, kāda informācija ir nepieciešama, lai novērtētu apgalvojumu.",
#                              "Nav konstatējama iepirkuma priekšmeta nepamatota apvienošana",
#                              usecontext=True)
#print(result)

**STEP 4: Processing questions**

In [24]:
import yaml
import re

In [25]:
# Load the questions from the YAML file into `question_dictonary`
# (name kept as-is because later cells reference it).
# The previous bare `exit` statements were no-ops: the error was printed and the
# notebook continued with `question_dictonary` undefined. The message is still
# printed, but the exception is now re-raised so execution stops here.
try:
    with open(question_file_path, 'r', encoding='utf-8') as file:
        question_dictonary = yaml.safe_load(file)
except FileNotFoundError:
    print(f"Error: File '{question_file_path}' not found.")
    raise
except yaml.YAMLError as e:
    print(f"Error parsing YAML file: {e}")
    raise

# yaml.safe_load returns None for an empty document; fail with a clear message
# instead of a TypeError in the processing loop.
if not question_dictonary:
    raise ValueError(f"No questions found in '{question_file_path}'.")
    

In [66]:
# Accumulates the report rows; the next cell appends one
# [nr, answer, justification] list per question.
table = []

In [67]:
# Ask every question and split each reply into a short answer and its justification.
# The reply is expected to carry the verdict in square brackets (the generation-test
# prompt above asks for that; presumably the YAML prompts do the same).
# `table` is re-created here so that re-running this cell alone does not append
# duplicate rows to an already filled list.
table = []
for singleq in question_dictonary:
    result = qnaengine.askQuestion(singleq['prompt'], singleq['question'], usecontext=True)
    # The first bracketed token that contains no nested brackets is the verdict.
    answer = re.search(r'(\[[^\[\]]+\])', result)
    # Collapse runs of blank lines so the justification stays compact in the report.
    result = re.sub(r'\n\n+', r'\n', result)
    if answer:
        # Drop the surrounding brackets; the pattern guarantees there are none inside.
        llmanswer = answer.group(1)[1:-1].strip()
        result = result.replace(answer.group(1), '')
    else:
        llmanswer = ''
    # Strip the whitespace left behind where the bracketed verdict was removed.
    table.append([singleq['nr'], llmanswer, result.strip()])

In [68]:
import pandas as pd
# Report table built from the collected rows: question number ("Nr"),
# extracted answer ("Atbilde") and justification text ("Pamatojums").
data = pd.DataFrame(table, columns=["Nr", "Atbilde", "Pamatojums"])

In [69]:
from tabulate import tabulate

In [70]:
# Save the report as an HTML table in report.htm (current working directory),
# then echo the same data as a text grid in the cell output.
with open("report.htm", 'w', encoding='utf-8') as ofile:
    ofile.write(data.to_html(index=False))
    ofile.write("\n")
print(
    tabulate(
        data,
        headers='keys',
        tablefmt='grid',
        showindex=False,
    )
)

+------+----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   Nr | Atbilde                    | Pamatojums                                                                                                                                                                          |
|    1 | kontekstā nav informācijas | Nepieciešama informācija par apstiprināto iepirkuma plānu un konkrētām izmaiņām, lai novērtētu apgalvojumu.                                                                         |
+------+----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    2 | Kontekstā nav informācijas | Lai novērtētu apgalvojumu, nepieciešama informācija par konkursa procedūru ar saru