In [8]:
# %pip install -q langchain-openai rank_bm25 tf-keras

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import dotenv

# Load environment variables via python-dotenv, then read the OpenAI key.
# The key is None when the variable is not set.
dotenv.load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [3]:
# TODO: Read the text files and then chunk them. Workflow: read -> chunk one doc -> embed -> store in Chroma. Do not chunk all the docs at once. See https://docs.trychroma.com/guides

### Loading Docs

In [4]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader
import os
# def fetch_info(path: str):
# 	loader = PyPDFLoader(
# 		path,
# 	)

# 	text = loader.load()
# 	return text

def fetch_info(path: str):
    """Load the text file at `path` with TextLoader.

    Returns whatever `TextLoader.load()` produces; callers in this notebook
    read `[0].page_content` from the result.
    """
    return TextLoader(path).load()
# directory = "Data"
# for filename in os.listdir(directory):
# 	loader = TextLoader(os.path.join(directory, filename))
# 	text = loader.load()
# 	print(text[0].page_content)

### Chunking

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import re

# Splitter configuration (200-character chunks, 50-character overlap).
# It is only referenced by the commented-out chunking code in this cell.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

# Folder holding the source text files. The original absolute path stays the
# default; set AI_ASSISTANT_DOCS_DIR to run the notebook on another machine.
directory = os.getenv(
    "AI_ASSISTANT_DOCS_DIR",
    "/Users/dexter/Vortex9/AI_Assistant_for_docs/Data",
)

# os.listdir returns entries in arbitrary order. Sorting keeps the position of
# each document (later used as its Chroma id) stable between runs, and the
# isfile check skips any sub-directories.
files = sorted(
    name
    for name in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, name))
)

# One string per file: the page_content of the first loaded document.
all_info = []
for file in files:
    doc = fetch_info(os.path.join(directory, file))[0].page_content
    all_info.append(doc)


# chunks = []
# for text in all_info:
# 	chunks.extend(text_splitter.split_text(text))


# # chunks = text_splitter.split_text(text)

# text_pattern = re.compile(r'[a-zA-Z0-9@.,:;!?$€£¥+\*/=#&%(){}\[\]<>\'"]+')

# filtered_chunks = []
# for chunk in chunks:
# 	extracted_text = ' '.join(text_pattern.findall(chunk))
# 	if extracted_text:
# 		filtered_chunks.append(extracted_text)

# # for i, chunk in enumerate(filtered_chunks):
# # 	print(f"Chunk {i+1}: {chunk}")

# print(len(filtered_chunks))

In [6]:
# Sanity check: number of documents loaded into all_info.
len(all_info)

40

### Contextual Embeddings

In [7]:
from langchain_openai import ChatOpenAI

# Prompt fragment that wraps the reference text in <document> tags.
DOCUMENT_CONTEXT_PROMPT = """
        <document>
        {doc_content}
        </document>
        """

# Prompt fragment that presents one chunk and asks for a short situating context.
CHUNK_CONTEXT_PROMPT = """
        Here is the chunk we want to situate within the whole document
        <chunk>
        {chunk_content}
        </chunk>

        Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.
        Answer only with the succinct context and nothing else.
        """

# Chat model shared by the contextualisation step and the final answer step.
llm = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model="gpt-3.5-turbo",
    temperature=0,
    max_retries=2,
    max_tokens=None,
    timeout=None,
)

def generate_chunk_context(doc: str, chunk: str) -> str:
    """Ask the chat model for a short context that situates `chunk` within `doc`.

    Args:
        doc: Reference text inserted into DOCUMENT_CONTEXT_PROMPT.
        chunk: Text inserted into CHUNK_CONTEXT_PROMPT.

    Returns:
        The `content` of the model response.
    """
    prompt = (
        DOCUMENT_CONTEXT_PROMPT.format(doc_content=doc)
        + CHUNK_CONTEXT_PROMPT.format(chunk_content=chunk)
    )
    # The former message-level "cache_control" entry was dropped: it is
    # Anthropic prompt-caching syntax and is not an OpenAI chat message field,
    # so it only suggested caching that was not being requested.
    messages = [{"role": "user", "content": prompt}]
    response = llm.invoke(messages)
    return response.content

# Reference text handed to the model: every loaded document joined by spaces.
# NOTE(review): presumably this has to fit the model's context window - confirm.
whole_doc = " ".join(all_info)

# Prefix each document with its generated context, separated by one space.
final_chunks = []
for chunk in all_info:
    chunk_context = generate_chunk_context(whole_doc, chunk)
    final_chunks.append(chunk_context + " " + chunk)

In [8]:
# Sanity check: one contextualised entry is appended per item of all_info.
len(final_chunks)

40

### BM25 Index

In [9]:
from rank_bm25 import BM25Okapi

# Lower-case each chunk and split it into word tokens; queries are tokenised
# the same way before scoring.
tokenized_chunks = [
    re.findall(r'\w+', lowered) for lowered in map(str.lower, final_chunks)
]

# Build the BM25 index over the tokenised chunks.
bm25 = BM25Okapi(tokenized_chunks)

### Vector Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_openai import OpenAIEmbeddings
import chromadb

# Alternative local embedding model, currently disabled:
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

collection_name = "doc_collection"

# Embedding model used for the stored chunks.
model = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# On-disk Chroma store; the collection is reused when it already exists.
client = chromadb.PersistentClient(path="chroma.db")
collection = client.get_or_create_collection(name=collection_name)

def generate_embeddings(chunks):
    """Embed `chunks` and store them in the Chroma collection.

    Each chunk is stored under the id `str(position)` with metadata
    `{"product_id": position}`, where position is its index in `chunks`.

    Args:
        chunks: Iterable of text chunks to embed and store.

    Returns:
        None. The function does nothing when `chunks` is empty.
    """
    texts = list(chunks)
    if not texts:
        return

    # embed_documents takes a list of texts and returns one vector per text.
    # Passing a bare string (as before) does not produce a single vector.
    vectors = model.embed_documents(texts)

    positions = range(len(texts))
    # upsert instead of add: ids are fixed, so re-running the cell against the
    # persistent collection refreshes the entries instead of clashing with them.
    collection.upsert(
        documents=texts,
        embeddings=vectors,
        metadatas=[{"product_id": i} for i in positions],
        ids=[str(i) for i in positions],
    )

generate_embeddings(final_chunks)

### Query

In [None]:
query = "Hello"

# Same tokenisation as the one used to build the BM25 index.
tokenized_query = re.findall(r'\w+', query.lower())

# Per-chunk BM25 scores, index-aligned with final_chunks. The array is kept
# unsorted on purpose: sorting it in place would lose which score belongs to
# which chunk.
bm25_scores = bm25.get_scores(tokenized_query)

# Number of results previewed below (scores and chunks use the same value).
top_k = 5

print(f"Top {top_k} BM25 scores are:")
for rank, score in enumerate(sorted(bm25_scores, reverse=True)[:top_k], start=1):
    print(f"{rank}. {score}")

print("\n")
print("Top chunks are:")
for rank, chunk in enumerate(bm25.get_top_n(tokenized_query, final_chunks, n=top_k), start=1):
    print(f"{rank}. {chunk}")

In [None]:
# Embed the query with the same model that embedded the stored chunks.
# OpenAIEmbeddings exposes embed_query (returning a plain list of floats);
# .encode()/.tolist() belong to the SentenceTransformer API.
query_embedding = model.embed_query(query)

# Query ChromaDB with the query embedding to find the most relevant chunks
chroma_results = collection.query(
    query_embeddings=[query_embedding],
    n_results=10  # Retrieve top 10 results based on embedding similarity
)

chroma_results

In [None]:
# Results are nested per query; only one query was sent, so take entry 0.
chroma_ids = chroma_results['ids'][0]
chroma_scores = chroma_results['distances'][0]

chroma_ids

### Normalizing Scores

In [34]:
import numpy as np

def min_max_normalize(values):
    """Scale `values` linearly to the range [0, 1].

    Args:
        values: Sequence or array of numeric scores.

    Returns:
        A float NumPy array of the same length. When every value is identical
        (or the input is empty) there is no range to scale by, so zeros (or
        the empty array) are returned instead of NaN from a 0/0 division.
    """
    arr = np.asarray(values, dtype=float)
    if arr.size == 0:
        return arr
    lowest = arr.min()
    span = arr.max() - lowest
    if span == 0:
        return np.zeros_like(arr)
    return (arr - lowest) / span

# Normalize BM25 scores
bm25_scores_normalized = min_max_normalize(bm25_scores)

# Normalize ChromaDB distances (lower distance means higher similarity, so invert it)
chroma_scores_inverted = [1 - score for score in chroma_scores]
chroma_scores_normalized = min_max_normalize(chroma_scores_inverted)

In [None]:
# Combine BM25 and Chroma scores for each chunk returned by the vector search.

# BM25 scores are recomputed here for ALL chunks so they can be looked up by
# chunk index; a rank-ordered list cannot be matched to Chroma's chunk ids.
bm25_all = np.asarray(bm25.get_scores(tokenized_query), dtype=float)
bm25_span = bm25_all.max() - bm25_all.min()
if bm25_span > 0:
    bm25_by_chunk = (bm25_all - bm25_all.min()) / bm25_span
else:
    # No BM25 signal (all scores equal): contribute 0 instead of NaN.
    bm25_by_chunk = np.zeros_like(bm25_all)

combined_scores = np.zeros(len(final_chunks))

# (chunk id, normalised similarity) pairs from the vector search.
all_scores = list(zip(chroma_ids, chroma_scores_normalized))
print(all_scores)

for chunk_id, chroma_score in all_scores:
    idx = int(chunk_id)
    if not 0 <= idx < len(final_chunks):
        # The collection is persistent, so it may still hold ids written by an
        # earlier run with more chunks; ignore anything that no longer exists.
        continue
    # Equal weighting of both signals; adjust the weights as needed.
    combined_scores[idx] = 0.5 * bm25_by_chunk[idx] + 0.5 * np.nan_to_num(chroma_score)

# Sort the combined scores to get the top N results
top_n_indices = np.argsort(combined_scores)[::-1][:10]

# Fetch the top N chunks
top_chunks = [final_chunks[i] for i in top_n_indices]

In [None]:
def llm_response(query, top_chunks):
    """Answer `query` with the chat model, using `top_chunks` as context.

    Args:
        query: The user's question.
        top_chunks: Retrieved text chunks; joined with blank lines to form
            the context section of the prompt.

    Returns:
        The `content` of the model response.
    """
    # Retrieved chunks become a single context section.
    combined_text = "\n\n".join(top_chunks)

    prompt = f"""
                    Context:\n{combined_text}\n\nQuery: {query}\n\nInstructions:  
                    If the query is directly related to the context provide, provide an accurate and relevant answer based on the context. 
                    If the query is a general or unrelated question (e.g., greetings, or questions unrelated to the document),make up a response based on your data and respond in a generalized manner without referring to the document context.
                    For answers to question you do not know refer to your data, or search the web for the answer.
                    Examples:

                    1. Query: "What is the final price for order number 24144346?"
                    - Response: "The final price for order number 24144346 is $500."

                    2. Query: "Hi, how are you?"
                    - Response: "Hello! I’m doing great, thank you. How can I assist you today?"

                    3. Query: "What is the capital of France?"
                    - Response: "The capital of France is Paris."

                    4. Query: "Can you explain the total due amount in invoice number 56789?"
                    - Response: "The total due amount in invoice number 56789 is $1,200."

                    For queries unrelated to the document, answer in a general manner without referencing any document content.
                    """

    # Single user turn carrying context, query and instructions.
    reply = llm.invoke([{"role": "user", "content": prompt}])
    return reply.content

response = llm_response(query, top_chunks)
print(response)

## Data Fetching

In [None]:
# import requests
# import json

# url = "https://eeg-backend-hfehdmd4hxfagsgu.canadacentral-01.azurewebsites.net/api/users/product"

# response = requests.get(url)

# print(response.json())

In [4]:
# with open ('data.json', 'w') as f:
# 	json.dump(response.json(), f)

In [11]:
# iterate over every item in the json and write it in a separate text file. each item will be a separate text file in a folder
# for item in response.json():
# 	with open(f"{item['product_id']}.txt", "w") as f:
# 		for i in item:
# 			f.write(f"{i}: {item[i]}\n")