In [1207]:
from pathlib import Path
import re
import fitz
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
from typing import List
import faiss


In [1208]:
# Load variables from a .env file into the process environment before the client
# is built (presumably so OpenAI() can find its API key there — confirm).
load_dotenv()
# Notebook-wide OpenAI client; the embedding helper functions refer to it by the
# global name `client`.
client = OpenAI()

In [1275]:
# PDF that the rest of the notebook parses and indexes.
# NOTE(review): absolute, machine-specific path — the notebook will not run on
# another machine without editing this line.
path = Path("/Users/ozgur.sahin/Documents/ragchat_local/docs/domain1/2024_1689.pdf")

In [1243]:
#extract header and sentences function
def _is_header_span(span):
    """Return True when a PyMuPDF span dict looks like a section header.

    A span counts as a header when its font size is above 8, its font name
    contains one of the weight markers ("Medi", "Bold", "B"), its text is
    longer than 3 characters and the text starts with an upper-case letter.
    """
    font_name = span["font"]
    text = span["text"]
    # NOTE(review): `.find(...) > 0` is kept exactly as the original heuristic.
    # It ignores a marker found at index 0, so a font whose name *starts* with
    # "B"/"Bold"/"Medi" is not treated as bold. Confirm whether that is intended.
    bold_like = (
        font_name.find("Medi") > 0
        or font_name.find("Bold") > 0
        or font_name.find("B") > 0
    )
    return span["size"] > 8 and bold_like and len(text) > 3 and text[0].isupper()


def _clean_sentence(sentence):
    """Remove line-wrap and hyphenation artefacts from one extracted sentence."""
    sentence = re.sub(r'(\b\w+)\s*\n\s*(\w+\b)',r'\1 \2',sentence)
    sentence = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', sentence)
    sentence = re.sub(r'[,()]\s*\n\s*(\w+)',r' \1',sentence)
    sentence = re.sub(r'(\b\w+)\s*-\s*(\w+\b)',r'\1 \2',sentence)
    sentence = re.sub(r'(\w+)\s*[-–]\s*(\w+)',r'\1\2',sentence)
    return sentence.replace('\n','')


def _record_sentence(store, text, is_header, page_number):
    """Append one row to the four parallel lists held in `store`."""
    store["sentence"].append(text)
    store["is_header"].append(is_header)
    store["page_num"].append(page_number)
    store["boost"].append(0)


doc = fitz.open(path)
# Parallel lists: entry i of every list describes the same extracted sentence.
page_sentences = {
    "sentence" : [],
    "is_header" : [],
    "page_num" : [],
    "boost" : []
}
for page_num in range(len(doc)):
    page = doc.load_page(page_num)

    # Fetch this page's blocks *before* filtering them. The original filtered a
    # `blocks` variable that was only assigned afterwards, so it either raised
    # NameError on a fresh kernel or classified the previous page's blocks.
    blocks = page.get_text("dict")["blocks"]
    text_blocks = [block for block in blocks if block["type"] == 0]

    for block in text_blocks:
        if "lines" not in block:
            continue
        lines = block["lines"]

        if 1 <= len(lines) < 3:
            # Short blocks (1-2 lines) are inspected span by span: they are
            # either headers or short stand-alone text fragments.
            for line in lines:
                for span in line["spans"]:
                    text = span["text"]
                    if _is_header_span(span):
                        _record_sentence(page_sentences, text, 1, page_num + 1)
                    elif len(text) > 3:
                        _record_sentence(page_sentences, text, 0, page_num + 1)
        else:
            # Rebuild the block text from the very block being processed, so the
            # text can never belong to a different block (the original indexed a
            # second, unfiltered extraction with an index from the filtered list).
            block_body = "\n".join(
                "".join(span["text"] for span in line["spans"]) for line in lines
            )
            # Split once and visit every piece. The original sized its loop with
            # `split('. ')` but indexed a regex split, which dropped trailing
            # sentences whenever the two splits disagreed.
            for raw_sentence in re.split(r'(?<=[.!?])\s+', block_body):
                sentence = _clean_sentence(raw_sentence.strip())
                if len(sentence) > 15:
                    _record_sentence(page_sentences, sentence, 0, page_num + 1)

In [1292]:
# Show three two-character slices of the PDF's creationDate metadata joined by
# dashes. NOTE(review): assumes the value looks like "D:YYYYMMDD..." so that the
# slices are YY, MM and DD — consistent with the recorded output, but confirm.
creation_stamp = doc.metadata['creationDate']
f"{creation_stamp[4:6]}-{creation_stamp[6:8]}-{creation_stamp[8:10]}"

'24-07-11'

In [1259]:
#extract header from sentences function
# Pair every header text with its position in page_sentences["sentence"], so a
# header hit can later be mapped back to the sentences that follow it.
header_hits = [
    (position, text)
    for position, (text, flag) in enumerate(
        zip(page_sentences["sentence"], page_sentences["is_header"])
    )
    if flag == 1
]
headers_list = {
    "header" : [text for _, text in header_hits],
    "sentence_index" : [position for position, _ in header_hits],
}

print(headers_list["header"])
print(headers_list["sentence_index"])

['REGULATION (EU) 2024/1689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL', 'GENERAL PROVISIONS', 'PROHIBITED AI PRACTICES', 'Prohibited AI practices', 'HIGH-RISK AI SYSTEMS', 'Classification of AI systems as high-risk', 'Classification rules for high-risk AI systems', 'Amendments to Annex III', 'Requirements for high-risk AI systems', 'Compliance with the requirements', 'Risk management system', 'Data and data governance', 'Technical documentation', 'Transparency and provision of information to deployers', 'Accuracy, robustness and cybersecurity', 'Obligations of providers and deployers of high-risk AI systems and other parties', 'Obligations of providers of high-risk AI systems', 'Quality management system', 'Documentation keeping', 'Automatically generated logs', 'Corrective actions and duty of information', 'Cooperation with competent authorities', 'Authorised representatives of providers of high-risk AI systems', 'Obligations of importers', 'Obligations of distributors', 'Responsi

In [1215]:
def create_vector_embeddings_from_sentences(
    sentences: List[str],
    batch_size: int = 2000
):
    """Embed `sentences` with the notebook-level OpenAI `client`.

    The sentences are sent to the "text-embedding-ada-002" model in consecutive
    slices of at most `batch_size` items, one request per slice.

    Args:
        sentences: Texts to embed.
        batch_size: Maximum number of texts per embeddings request.

    Returns:
        A float NumPy array built from the returned embeddings, in request
        order (one row per sentence).
    """
    embedding_rows = []
    for offset in range(0, len(sentences), batch_size):
        response = client.embeddings.create(
            model="text-embedding-ada-002",
            input=sentences[offset:offset + batch_size],
        )
        # Keep only the raw vector from each returned item.
        embedding_rows.extend(item.embedding for item in response.data)

    return np.array(embedding_rows, float)

In [1216]:
def create_vector_embedding_from_query(query):
    """Embed a single query with the notebook-level OpenAI `client`.

    Args:
        query: Text passed as `input` to the "text-embedding-ada-002" model.

    Returns:
        The first returned embedding as a float NumPy array reshaped to a
        single row, i.e. shape (1, n).
    """
    response = client.embeddings.create(
        model="text-embedding-ada-002", input=query
    )
    first_vector = response.data[0].embedding
    # reshape(1, -1) turns the flat vector into a one-row matrix.
    return np.array(first_vector, float).reshape(1, -1)

In [1217]:
def create_flat_index(embeddings: np.ndarray):
    """Build an exact L2 FAISS index over `embeddings`.

    Args:
        embeddings: 2-D array-like with one vector per row. Any numeric dtype
            is accepted; the data is converted to a C-contiguous float32 array
            before it is handed to FAISS.

    Returns:
        A `faiss.IndexFlatL2` containing every row of `embeddings`.

    Raises:
        ValueError: If `embeddings` is not a non-empty 2-D array.
    """
    # FAISS works on contiguous float32 data, but the embedding helpers in this
    # notebook build float64 arrays, so convert explicitly rather than relying
    # on the installed FAISS wrapper to do it.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape (n_vectors, dimension); "
            f"got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

In [1246]:
# Embed every detected header; row i corresponds to headers_list['header'][i].
header_embed = create_vector_embeddings_from_sentences(headers_list['header'])

In [1247]:
# Example user question; it is embedded with the same model as the headers so
# the two can be compared in the same vector space.
query = "As a high-risk AI system provider, what are the information I should give to my users?"
# One-row matrix (shape (1, n)) as returned by create_vector_embedding_from_query.
query_embed = create_vector_embedding_from_query(query)

In [1248]:
# Flat L2 index over the header embeddings only (not over the body sentences).
index = create_flat_index(header_embed)

In [1264]:
#search and boost function
# Find the 10 headers closest to the query, then mark every sentence that lies
# between a matched header and the next header with boost = 1.
# FAISS works on contiguous float32 data, while query_embed is built as float64.
D,I = index.search(np.ascontiguousarray(query_embed, dtype=np.float32),10)

header_positions = headers_list['sentence_index']
sentence_count = len(page_sentences['boost'])

for sentence_num in I[0]:
    # FAISS pads the result with -1 when the index holds fewer than k vectors;
    # without this guard -1 would silently select the last header.
    if sentence_num < 0:
        continue

    start = header_positions[sentence_num]
    # A section ends at the next header, or at the end of the document for the
    # last header. The original compared the next header's sentence index with
    # the sentence count, which raised IndexError for the last header.
    if sentence_num + 1 < len(header_positions):
        end = header_positions[sentence_num + 1]
    else:
        end = sentence_count

    for i in range(start,end):
        page_sentences['boost'][i] = 1
# NOTE(review): boosts are never reset, so re-running this cell with a different
# query keeps the boosts of earlier queries — confirm whether that is intended.