In [19]:
import ast
import os

import faiss
import numpy as np
import openai
# Read the API key from the environment so it is never hardcoded in the notebook.
# os.environ.get returns None (instead of raising) when PERSONAL_OPENAI_KEY is unset.
OPENAI_KEY = os.environ.get("PERSONAL_OPENAI_KEY")
# Single OpenAI client shared by the embedding and chat-completion helpers.
openai_client = openai.OpenAI(api_key=OPENAI_KEY)

In [20]:
def load_press_release_data(data_folder: str) -> list:
    """
    Load every ``.txt`` press release found directly inside ``data_folder``.

    Args:
        data_folder: Path of the directory that holds the press-release files.

    Returns:
        One string per ``.txt`` file, with every newline replaced by a space.
        Files are read in sorted file-name order, so the position of a
        document in the returned list is stable across runs.

    Raises:
        FileNotFoundError: If ``data_folder`` does not exist.
    """
    release_data: list = []
    # os.listdir gives no ordering guarantee; sort so list positions can be
    # used as stable document ids (e.g. for embeddings cached by position).
    for file_name in sorted(os.listdir(data_folder)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(data_folder, file_name)
        # The context manager closes the handle even if reading fails.
        with open(file_path, encoding='utf-8') as file_object:
            release_data.append(file_object.read().replace("\n", " "))

    return release_data

In [21]:
def generate_embedding(text: str, model: str ="text-embedding-3-small") -> list:
    """
    Return the embedding vector that the OpenAI embeddings API
    produces for ``text``.

    Every newline in ``text`` is replaced by a single space before the
    request is sent. ``model`` names the embedding model to use.
    """
    single_line_text = " ".join(text.split('\n'))
    api_response = openai_client.embeddings.create(
        input=[single_line_text],
        model=model,
    )
    # Exactly one input was sent, so the vector is on the first result.
    first_result = api_response.data[0]
    return first_result.embedding


In [22]:
def store_embeddings(embeddings_folder: str, file_name: str, embedding: list) -> None:
    """
    Store one generated embedding in a folder so it can be
    fetched later without regenerating it.

    Args:
        embeddings_folder: Directory that receives the file. It is created,
            together with any missing parent directories, when absent.
        file_name: Name of the file to write inside ``embeddings_folder``.
            An existing file with the same name is overwritten.
        embedding: The vector to persist; it is written as ``str(embedding)``.
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the race between
    # checking for the folder and creating it.
    os.makedirs(embeddings_folder, exist_ok=True)
    # Write UTF-8 explicitly so the encoding does not depend on the platform default.
    with open(os.path.join(embeddings_folder, file_name), "w", encoding="utf-8") as file_object:
        file_object.write(str(embedding))


In [23]:
def fetch_embeddings(embeddings_folder: str) -> list[list]:
    """
    Fetch already generated embeddings from the embeddings folder.

    Args:
        embeddings_folder: Directory containing one ``.txt`` file per
            embedding, each holding the text form of a Python list.

    Returns:
        The parsed embeddings. Files whose name is ``<number>.txt`` come
        first, ordered by that number (so ``2.txt`` precedes ``10.txt``);
        any other ``.txt`` files follow in alphabetical order.

    Raises:
        FileNotFoundError: If ``embeddings_folder`` does not exist.
        ValueError, SyntaxError: If a file does not contain a plain literal.
    """
    def _cache_order(file_name: str) -> tuple:
        # Numeric stems sort by value; os.listdir order is arbitrary and a
        # plain string sort would put "10.txt" before "2.txt".
        stem = file_name[:-len('.txt')]
        if stem.isdecimal():
            return (0, int(stem), file_name)
        return (1, 0, file_name)

    cached_files = sorted(
        (name for name in os.listdir(embeddings_folder) if name.endswith('.txt')),
        key=_cache_order,
    )

    embeddings: list[list] = []
    for file_name in cached_files:
        file_path: str = os.path.join(embeddings_folder, file_name)
        with open(file_path, encoding='utf-8') as file_object:
            # literal_eval only accepts Python literals, so a tampered cache
            # file cannot execute code the way eval() would.
            embeddings.append(ast.literal_eval(file_object.read()))
    return embeddings


In [24]:
def generate_index(embeddings: list[list]):
    """
    Build an in-memory FAISS index over the given embedding vectors.

    Args:
        embeddings: Equally sized vectors, given either as a list of lists
            or as a 2-D numpy array.

    Returns:
        A ``faiss.IndexFlatL2`` containing every vector, in input order.

    Raises:
        ValueError: If ``embeddings`` is empty or is not a 2-D matrix.
    """
    # FAISS works on C-contiguous float32 matrices; converting here lets the
    # function accept the plain lists its signature advertises as well as
    # float64 arrays.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of equally sized vectors"
        )

    dimension: int = vectors.shape[1]
    # generate index
    index = faiss.IndexFlatL2(dimension)
    # add data embeddings to index
    index.add(vectors)
    return index

In [25]:
def retrieval_service(index, release_data: list[str], question: str, top_k: int = 5) -> list[str]:
    """
    Retrieve the documents that best match ``question`` via semantic search.

    Args:
        index: Search index exposing ``search(queries, k)``; positions in the
            index are expected to correspond to positions in ``release_data``.
        release_data: The documents, in the order they were added to ``index``.
        question: Free-text question to search for.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` documents, in the order the index returned them. Fewer
        are returned when the index holds fewer than ``top_k`` vectors.
    """
    # generate embeddings for question being asked
    question_embedding: list = generate_embedding(question)
    # FAISS expects a float32 matrix with one row per query.
    query = np.asarray([question_embedding], dtype=np.float32)
    # search index for matching indices
    _distances, indices = index.search(query, top_k)
    # FAISS pads the result with -1 when it has fewer than top_k vectors; a
    # negative id would silently select the last document, so skip those.
    return [release_data[idx] for idx in indices[0] if idx >= 0]


In [26]:
def generate_response(context: str, question: str, model: str = "gpt-3.5-turbo") -> str:
    """
    Send the context and the question to the OpenAI chat API and
    return the model's answer.

    Args:
        context: Text the answer should be based on.
        question: The question to answer.
        model: Chat model to query; defaults to ``"gpt-3.5-turbo"``.

    Returns:
        The text of the first completion choice, or an empty string when the
        API returns no text content for it.
    """
    prompt = f"Given the context: {context}, provide an answer to the question: `{question}`."
    # OPENAI API call to LLM
    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {'role': 'system', 'content': "You have to answer question based on context given"},
            {'role': 'user', 'content': prompt}
        ],
        temperature=0
    )
    # `content` is optional in the API response; normalise None to "" so the
    # declared `str` return type always holds.
    return response.choices[0].message.content or ""


In [27]:
# load all press releases
release_data: list[str] = load_press_release_data('data_limited')
if not release_data:
    raise ValueError("No .txt press releases found in 'data_limited'.")

# check if embeddings are already generated
# generate and store embeddings, if they do not exist
if not os.path.exists('data_limited_embeddings'):
    release_data_embeddings: list[list] = []
    for file_index, file_data in enumerate(release_data):
        embedding = generate_embedding(file_data)
        release_data_embeddings.append(embedding)
        store_embeddings('data_limited_embeddings', f"{file_index}.txt", embedding)
else:
    # fetch existing embeddings
    release_data_embeddings = fetch_embeddings('data_limited_embeddings')

# Search results are mapped back to release_data by position, so a stale or
# partially written cache would silently return the wrong documents.
if len(release_data_embeddings) != len(release_data):
    raise ValueError(
        f"Found {len(release_data_embeddings)} embeddings for {len(release_data)} "
        "press releases; delete 'data_limited_embeddings' and re-run to rebuild the cache."
    )

# convert list to numpy array (float32 is the dtype FAISS operates on)
release_data_embeddings = np.array(release_data_embeddings, dtype=np.float32)
# generate index from embeddings
index = generate_index(release_data_embeddings)


question = "Tell me something about German Basic Law"
# retrieve matching docs using index search
related_release_data = retrieval_service(index, release_data, question)

# join all matching press releases in a single string
context = " ".join(related_release_data)
# OPENAI API call
llm_response = generate_response(context, question)

print("++++++++++++++++++++LLM RESPONSE:++++++++++++++++++++++")
print(llm_response)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Use a dedicated loop variable: calling it `index` would overwrite the FAISS
# index above and break any later retrieval call.
for doc_number, retrieved_doc in enumerate(related_release_data, 1):
    print(f"++++++++++++++++++++Retrieved Doc {doc_number}:+++++++++++++++++++++")
    print(retrieved_doc)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

++++++++++++++++++++LLM RESPONSE:++++++++++++++++++++++
The German Basic Law, also known as the Grundgesetz, was proclaimed on May 23, 1949. It serves as the constitution of Germany and lays down the fundamental values of the society. The Basic Law is a symbol of democracy, freedom, and the rule of law. It guarantees various basic rights such as human dignity, freedom of speech, freedom of the press, freedom of faith, and equal rights. The Basic Law also includes principles regarding the structure of the state, regulations on the Federal and State Governments, provisions for constitutional bodies, legislature, state administration, jurisdiction, and finance.

The Basic Law applies to all of Germany since reunification in 1990. It is considered a cornerstone of the German legal system and has been praised for its emphasis on human rights and democratic principles. The Basic Law is a reflection of the lessons learned from the past and aims to protect the rights and freedoms of all indivi