In [45]:
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
import pickle
import faiss
import PyPDF2
import spacy
from pathlib import Path
# Shared spaCy pipeline used by `_process_text` to split text into sentences
# (that helper only reads `doc.sents`). The components named in `disable`
# are switched off when the model is loaded.
# NOTE(review): "custom " (with a trailing space) does not look like a standard
# component name — confirm it is intentional or drop it.
nlp = spacy.load(
            "en_core_web_sm",
            disable=[ "tagger", "attribute_ruler", "lemmatizer", "ner","textcat","custom "]
        )
from typing import List

In [46]:
# Load variables from a local .env file, then create the OpenAI client that the
# embedding helpers in this notebook call through `client.embeddings.create`.
# NOTE(review): presumably the API key is picked up from the environment — confirm.
load_dotenv()
client = OpenAI()

In [65]:
# PDF document that is parsed into sentences and embedded further down.
# NOTE(review): hard-coded, user-specific absolute path — the notebook cannot run
# on another machine as-is; consider a path relative to a configurable data dir.
path = Path("/Users/ozgur.sahin/Documents/ragchat_local/docs/domain1/2024_1689.pdf")

In [49]:
def read_file(file_path: str):
    """Read a document and split its text into sentences.

    Args:
        file_path: Location of the file. The value is wrapped in ``Path``,
            so a ``Path`` object works as well as a string.

    Returns:
        dict with two keys:
            ``"sentences"``: every kept sentence, in reading order.
            ``"page_sentence_amount"``: one count per processed chunk
            (one per PDF page, or a single entry for a text file).

    Errors are not raised to the caller: any failure (including an
    unsupported extension) is printed, and whatever was collected up to
    that point is returned.
    """
    source = Path(file_path)
    suffix = source.suffix.lower()
    collected = {
        "page_sentence_amount": [],
        "sentences": []
    }

    try:
        if suffix == '.pdf':
            with source.open('rb') as handle:
                # Each page is processed separately so per-page counts are kept.
                for page in PyPDF2.PdfReader(handle).pages:
                    _process_text(page.extract_text(), collected)
        elif suffix in ('.txt', '.rtf'):
            # Both extensions are read as plain UTF-8 text; RTF markup is not parsed.
            _process_text(source.read_text(encoding='utf-8'), collected)
        else:
            # .docx support is currently disabled.
            raise ValueError(f"Unsupported file type: {suffix}")
    except PyPDF2.errors.PdfReadError:
        print(f"Error reading PDF file: {source}. The file might be corrupted or incompatible.")
    except Exception as e:
        print(f"Error reading file: {source}. Error: {str(e)}")

    return collected

def _process_text(text, file_data):
    """Split ``text`` into sentences and append the usable ones to ``file_data``.

    Newlines inside a sentence are replaced by spaces and the result is
    stripped; sentences of 15 characters or fewer are dropped.

    Args:
        text: Raw text of one page / one file.
        file_data: dict with ``"sentences"`` and ``"page_sentence_amount"``
            lists; it is modified in place (the kept sentences are appended
            and their count is recorded as one new entry).
    """
    kept = []
    for sent in nlp(text).sents:
        cleaned = sent.text.replace('\n', ' ').strip()
        if len(cleaned) > 15:
            kept.append(cleaned)

    file_data["page_sentence_amount"].append(len(kept))
    file_data["sentences"].extend(kept)

In [66]:
# Parse the PDF into sentences. read_file prints errors instead of raising, so
# check the cell output for an error message if `file_data["sentences"]` is empty.
file_data = read_file(path)

In [50]:
def create_vector_embeddings_from_sentences(
    sentences: List[str],
    batch_size: int = 2000
):
    """Embed a list of sentences with the OpenAI embeddings endpoint.

    The sentences are sent in consecutive slices of ``batch_size`` items,
    one request per slice, using the ``text-embedding-ada-002`` model.

    Args:
        sentences: Texts to embed.
        batch_size: Maximum number of sentences per request.

    Returns:
        Float NumPy array built from the returned embedding vectors, in the
        order the responses were received (an empty 1-D array when
        ``sentences`` is empty).
    """
    vectors = []
    for start in range(0, len(sentences), batch_size):
        response = client.embeddings.create(
            model="text-embedding-ada-002",
            input=sentences[start:start + batch_size],
        )
        vectors.extend(item.embedding for item in response.data)

    return np.array(vectors, float)

In [67]:
# Embed every extracted sentence of the PDF.
# NOTE(review): row i of `embed` is assumed to belong to file_data["sentences"][i]
# (relies on the API returning embeddings in input order) — confirm.
embed = create_vector_embeddings_from_sentences(file_data["sentences"])

In [4]:
def create_vector_embedding_from_query(query):
    """Embed a single query with the ``text-embedding-ada-002`` model.

    Args:
        query: Text passed as ``input`` to the embeddings endpoint.

    Returns:
        Float NumPy array of shape ``(1, dim)`` holding the first returned
        embedding, i.e. a one-row matrix.
    """
    response = client.embeddings.create(
        model="text-embedding-ada-002", input=query
    )
    first_vector = response.data[0].embedding
    as_array = np.array(first_vector, float)
    # Reshape to a single row so the result is a 2-D (1, dim) matrix.
    return as_array.reshape(1, -1)

In [5]:
def load_index(index_path: str):
    """Load and return the object pickled at ``index_path``.

    Args:
        index_path: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file (elsewhere in this
        notebook it is indexed with ``"sentences"`` and ``"embeddings"``).

    Warning:
        Unpickling can execute arbitrary code; only load files you created
        or otherwise trust.
    """
    with open(index_path, "rb") as handle:
        return pickle.load(handle)

In [59]:
def create_IP_index(embeddings: np.ndarray):
    """Build a FAISS inner-product index over L2-normalised embeddings.

    Every row is scaled to unit length before it is added, so an inner
    product against a unit-length query equals the cosine similarity.

    Args:
        embeddings: 2-D array-like of shape ``(n, dim)``. ``n`` may be 0.

    Returns:
        A ``faiss.IndexFlatIP`` of dimension ``dim`` containing the
        normalised rows as float32.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or has zero columns.
    """
    vectors = np.asarray(embeddings, dtype=float)
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim); got shape {vectors.shape}"
        )

    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # An all-zero row would divide by zero and put NaN into the index;
    # dividing by 1 instead leaves it as a zero vector (score 0 with any query).
    norms[norms == 0] = 1.0

    # FAISS stores float32, C-contiguous data; convert explicitly rather than
    # depending on the Python wrapper to do it.
    unit_vectors = np.ascontiguousarray(vectors / norms, dtype=np.float32)

    # Take the dimension from the shape so an empty (0, dim) input still works.
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(unit_vectors)
    return index

def search_index_IP(embeddings, query_embedding, k: int = 10):
    """Return the ``k`` stored vectors most similar to the query (cosine).

    The index built by ``create_IP_index`` holds unit-length vectors, so the
    query is normalised here as well; the returned scores are then cosine
    similarities rather than cosine scaled by the query's length.

    Args:
        embeddings: 2-D array of candidate vectors, shape ``(n, dim)``.
        query_embedding: Query vector(s), shape ``(dim,)`` or ``(q, dim)``.
        k: Number of neighbours to return per query (default 10, the value
            that used to be hard-coded).

    Returns:
        ``(D, I)`` as returned by ``index.search``: scores and row numbers
        into ``embeddings``. FAISS pads ``I`` with -1 when fewer than ``k``
        vectors are indexed, so callers should skip negative entries.
    """
    # NOTE: the index is rebuilt on every call; build it once and reuse it
    # if many queries are run against the same embeddings.
    index = create_IP_index(embeddings)

    query = np.atleast_2d(np.asarray(query_embedding, dtype=float))
    query_norms = np.linalg.norm(query, axis=1, keepdims=True)
    query_norms[query_norms == 0] = 1.0  # keep an all-zero query finite
    unit_query = np.ascontiguousarray(query / query_norms, dtype=np.float32)

    return index.search(unit_query, k)


In [60]:
def create_flat_index(embeddings: np.ndarray):
    """Build a FAISS flat L2 index over the embeddings as given.

    The vectors are added without normalisation.

    Args:
        embeddings: 2-D array-like of shape ``(n, dim)``; nested lists (for
            example straight out of a pickle) are accepted. ``n`` may be 0.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` holding the rows as
        float32.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or has zero columns.
    """
    # FAISS stores float32, C-contiguous data; convert explicitly rather than
    # depending on the Python wrapper to do it.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim); got shape {vectors.shape}"
        )

    # Take the dimension from the shape so an empty (0, dim) input still works.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def search_index_flat(domain, index_list, query_embedding, k: int = 10):
    """Search one domain's stored embeddings with a flat L2 index.

    Args:
        domain: Domain name ending in its 1-based number, e.g. ``"domain1"``
            or ``"domain12"``. All trailing digits are used, not just the
            last character.
        index_list: Sequence of per-domain objects; entry ``number - 1`` is
            selected and its ``"embeddings"`` item is indexed.
        query_embedding: Query vector(s), shape ``(dim,)`` or ``(q, dim)``.
        k: Number of neighbours to return per query (default 10, the value
            that used to be hard-coded).

    Returns:
        ``(D, I)`` as returned by ``index.search``. For this L2 index ``D``
        holds distances, so smaller values mean closer matches.

    Raises:
        ValueError: If ``domain`` does not end in a number.
        IndexError: If the number is outside ``1..len(index_list)``.
    """
    # Read every trailing digit: taking only domain[-1] would turn
    # "domain10" into 0 and silently select index_list[-1].
    digits = domain[len(domain.rstrip("0123456789")):]
    if not digits:
        raise ValueError(
            f"domain must end with its number, e.g. 'domain1'; got {domain!r}"
        )
    domain_number = int(digits)
    if not 1 <= domain_number <= len(index_list):
        raise IndexError(
            f"domain number {domain_number} is outside 1..{len(index_list)}"
        )

    index_object = index_list[domain_number - 1]
    # NOTE: the index is rebuilt on every call; cache it per domain if many
    # queries are run.
    index = create_flat_index(embeddings=index_object["embeddings"])

    # FAISS expects float32, C-contiguous, 2-D query data.
    query = np.ascontiguousarray(np.atleast_2d(query_embedding), dtype=np.float32)
    return index.search(query, k)


In [61]:
# Load the pickled store of each of the five test domains, in order
# (db/test/domain1.pickle ... db/test/domain5.pickle).
index_list = [load_index(f"db/test/domain{number}.pickle") for number in range(1, 6)]
# Keep the individual names available alongside the list.
index1, index2, index3, index4, index5 = index_list

In [72]:
# Query setup: pick the domain, embed the question, and fetch the sentence list
# stored in that domain's pickle.
domain = "domain1"
query = "As a high-risk AI system provider, what are the information I should give to my users?"
query_embedding = create_vector_embedding_from_query(query)
# NOTE(review): int(domain[-1]) reads only the last character, so this lookup is
# only valid for single-digit domain numbers.
# NOTE(review): these sentences come from the pickle, not from `file_data`; they
# line up with `embed` only if both were built from the same document — confirm.
sentences = index_list[int(domain[-1]) - 1]["sentences"]

In [73]:
# Rank the PDF sentences against the query by inner product over `embed`.
D, I = search_index_IP(embeddings=embed, query_embedding=query_embedding)
# The hits in I are row numbers of `embed`, which was built from
# file_data["sentences"], so that is the list they must be looked up in
# (the pickled domain's `sentences` only matches if it holds the same document).
pdf_sentences = file_data["sentences"]
for i, index in enumerate(I[0]):
    if index < 0:
        # FAISS pads the result with -1 when fewer than k vectors are indexed;
        # without this check -1 would silently print the last sentence.
        continue
    print(f"{i+1}. Similaritty:{D[0][i]} || Sentence:{pdf_sentences[index]}\n")

1. Similaritty:0.8812803030014038 || Sentence:High-r isk AI systems should be accom panied by appropr iate information in the form of instr uctions of use.

2. Similaritty:0.8760971426963806 || Sentence:Those  would cover information on possible kno wn and foreseeable circumstances related to the use of the high-r isk AI  syste m, including deplo yer action that may influence syste m behavio ur and perfor mance, under which the AI system  can lead to risks to health, safety, and fundamental rights, on the changes that have been pre-deter mined and  assessed for conf ormity by the provider and on the relevant human oversight measures, including the measures to  facilitate the inter pretation of the outputs of the AI syste m by the deplo yers.

3. Similaritty:0.8646661043167114 || Sentence:The provider of a high-r isk AI system and the third party that supplies an AI syste m, tools, services, components, or  processes that are used or integrat ed in a high-r isk AI system shall, by writt

In [74]:
# Same query against the selected domain's pickled embeddings, via the flat L2 index.
# NOTE(review): create_flat_index builds a faiss.IndexFlatL2, so D presumably holds
# distances (smaller = closer) even though the label printed below says
# "Similaritty" — confirm before comparing these numbers with the IP scores.
D, I = search_index_flat(domain=domain, index_list=index_list, query_embedding=query_embedding)
for i, index in enumerate(I[0]):
    print(f"{i+1}. Similaritty:{D[0][i]} || Sentence:{sentences[index]}\n")

1. Similaritty:0.2374396175146103 || Sentence:High-r isk AI systems should be accom panied by appropr iate information in the form of instr uctions of use.

2. Similaritty:0.24780553579330444 || Sentence:Those  would cover information on possible kno wn and foreseeable circumstances related to the use of the high-r isk AI  syste m, including deplo yer action that may influence syste m behavio ur and perfor mance, under which the AI system  can lead to risks to health, safety, and fundamental rights, on the changes that have been pre-deter mined and  assessed for conf ormity by the provider and on the relevant human oversight measures, including the measures to  facilitate the inter pretation of the outputs of the AI syste m by the deplo yers.

3. Similaritty:0.2706679105758667 || Sentence:The provider of a high-r isk AI system and the third party that supplies an AI syste m, tools, services, components, or  processes that are used or integrat ed in a high-r isk AI system shall, by writ