In [2]:
import numpy as np
import os
import shutil
import time

from langchain.chains import ConversationalRetrievalChain, ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser
from langchain.memory import VectorStoreRetrieverMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores.tiledb import TileDB
import matplotlib.pyplot as plt
import time
# Never commit a real key in the notebook. Export OPENAI_API_KEY in the shell before
# starting Jupyter; setdefault leaves an already-exported key untouched instead of
# overwriting it with the empty placeholder.
os.environ.setdefault("OPENAI_API_KEY", "")

Initializing the LLM

In [3]:
# Baseline: a plain conversation chain with no retriever attached, so the model can only
# answer from what it already knows.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0
)
public_chatgpt = ConversationChain(llm=llm)
question = "What is langchain?"
print(f"User: {question}")
# Chain.run is deprecated in favour of invoke. invoke returns a dict, and
# ConversationChain stores the model's reply under the "response" key.
public_answer = public_chatgpt.invoke({"input": question})["response"]
print(f"AI: {public_answer}")

  llm = ChatOpenAI(
  public_chatgpt = ConversationChain(llm=llm)
  validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)
  print(f"AI: {public_chatgpt.run(question)}")


User: What is langchain?
AI: Langchain is a decentralized platform that uses blockchain technology to create a secure and transparent system for language learning. It allows users to connect with language tutors, access learning materials, and track their progress in real-time. The platform also uses smart contracts to ensure fair transactions between users and tutors.


Data preprocessing, embedding generation, and indexing

In [4]:
# Build the knowledge base: load the LangChain .mdx docs, split them into text chunks,
# embed every chunk with the OpenAI embedding model, and index the vectors in a local
# TileDB IVF_FLAT index.

# Location of the LangChain docs checkout. Set LANGCHAIN_DOCS_PATH to run the notebook on
# another machine; the fallback is the original author's local path.
documentation_path = os.environ.get(
    "LANGCHAIN_DOCS_PATH", "/Users/sihamargaw/Desktop/langchain/docs"
)
# Chunks with this many characters or fewer are dropped before embedding.
MIN_CHUNK_CHARS = 5

# Loader that picks up every .mdx file under documentation_path (recursively).
loader = GenericLoader.from_filesystem(
    documentation_path,
    glob="**/*",
    suffixes=[".mdx"],
    parser=TextParser()
)
# Markdown-aware splitter: chunk_size=1000 with chunk_overlap=100 between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=1000,
    chunk_overlap=100
)

# Load the raw documents and fail early with a clear message if the path is wrong;
# otherwise the embedding/indexing steps below fail on empty input with an obscure error.
raw_documents = loader.load()
print(f"Number of raw documents loaded: {len(raw_documents)}")
if not raw_documents:
    raise FileNotFoundError(
        f"No .mdx documents found under {documentation_path!r}; "
        "check the path or set LANGCHAIN_DOCS_PATH."
    )

# Split into chunks and drop the near-empty ones.
documents = [
    d for d in splitter.split_documents(raw_documents)
    if len(d.page_content) > MIN_CHUNK_CHARS
]
texts = [d.page_content for d in documents]
metadatas = [d.metadata for d in documents]
print(f"Number of document chunks: {len(texts)}")


# Generate embeddings for each document chunk using the OpenAI embedding model.
# perf_counter is a monotonic clock, which makes it the right tool for measuring intervals.
print("Generating embeddings...")
t1 = time.perf_counter()
embedding = OpenAIEmbeddings()
text_embeddings = embedding.embed_documents(texts)
# Pair each text chunk with its corresponding embedding, as from_embeddings expects.
text_embedding_pairs = list(zip(texts, text_embeddings))
t2 = time.perf_counter()
print(f"Embeddings generated. Total time: {(t2-t1)}s")


# Index the chunks in a TileDB IVF_FLAT index via LangChain's from_embeddings method.
print("Indexing...")
# The index is stored locally; any index left over from a previous run is removed first
# so the cell can be re-run from scratch.
tiledb_index_uri = "./tiledb_langchain_documentation_index"
if os.path.isdir(tiledb_index_uri):
    shutil.rmtree(tiledb_index_uri)
db = TileDB.from_embeddings(
    text_embedding_pairs,
    embedding,
    index_uri=tiledb_index_uri,
    index_type="IVF_FLAT",
    # Opts in to pickle-based (de)serialization. Acceptable here because the index is
    # created by this notebook; do not enable it for an index from an untrusted source.
    allow_dangerous_deserialization=True,
    metadatas=metadatas)
t3 = time.perf_counter()
print(f"Indexing completed. Total time: {(t3-t2)}s")
print(f"Number of vector embeddings stored in TileDB-Vector-Search: {len(text_embeddings)}")


Number of raw documents loaded: 319
Number of document chunks: 1358
Generating embeddings...


  embedding = OpenAIEmbeddings()


Embeddings generated. Total time: 8.324710130691528s
Indexing...




Indexing completed. Total time: 1.3249428272247314s
Number of vector embeddings stored in TileDB-Vector-Search: 1358


Retrieve data

In [5]:
# Re-open the TileDB index written by the indexing cell, using the same embedding model
# that produced the stored vectors.
db = TileDB.load(
    index_uri=tiledb_index_uri,
    embedding=embedding,
    allow_dangerous_deserialization=True  # opt in to pickle loading; only safe for an index we built ourselves
)
# Initialize the LLM that will answer from the retrieved context.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0
)
# Expose TileDB as a LangChain retriever: the retriever embeds the incoming question,
# runs a similarity search against the stored embeddings, and returns the 5 most
# similar text chunks.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)
# ConversationalRetrievalChain wires the retriever (TileDB here) in as the knowledge
# source behind the LLM.
private_chatgpt = ConversationalRetrievalChain.from_llm(llm, retriever=retriever)

question = "What is langchain?"

# Time the whole chain call. NOTE: this is end-to-end latency (question embedding,
# TileDB similarity search and the LLM completion), not the TileDB search alone.
# perf_counter is monotonic, so the interval cannot be skewed by wall-clock adjustments.
start_time = time.perf_counter()
# chat_history is a list of previous turns (empty for a fresh conversation); invoke
# replaces the deprecated run and returns a dict with the reply under "answer".
response = private_chatgpt.invoke({'question': question, 'chat_history': []})["answer"]
end_time = time.perf_counter()

total_latency = end_time - start_time

print(f"User: {question}")
print(f"AI: {response}\n")
print(f"Total Latency for Retrieval: {total_latency} seconds")


User: What is langchain?
AI: LangChain is a framework for developing applications powered by large language models (LLMs). It simplifies every stage of the LLM application lifecycle, including development, productionization, and deployment. LangChain provides building blocks, components, and third-party integrations to help developers build applications using LLMs effectively.

Total Latency for Retrieval: 1.8465557098388672 seconds


Record the time taken for retrieval across different formulations of the same question

In [6]:
# Different formulations of the same question, used to see how the chain's latency varies.
# (time is already imported in the notebook's import cell, so it is not re-imported here.)
questions = [
    "What is LangChain?",
    "Can you explain LangChain?",
    "Describe the functionality of LangChain.",
    "What are the main features of LangChain?",
    "How does LangChain work?",
    "Can you provide an overview of LangChain?",
]

# Maps each question to its measured latency in seconds.
latency_results_tileDB = {}

# Ask each question with an empty chat history and time the full chain call.
# NOTE: the measurement is end-to-end (question embedding, TileDB search and LLM
# completion), not the TileDB search in isolation.
for question in questions:
    print(f"Processing question: {question}")
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()
    # invoke replaces the deprecated run; the reply is stored under "answer".
    response = private_chatgpt.invoke({'question': question, 'chat_history': []})["answer"]
    end_time = time.perf_counter()
    latency = end_time - start_time
    latency_results_tileDB[question] = latency
    print(f"Latency for '{question}': {latency:.4f} seconds")


Processing question: What is LangChain?
Latency for 'What is LangChain?': 1.7185 seconds
Processing question: Can you explain LangChain?
Latency for 'Can you explain LangChain?': 2.6070 seconds
Processing question: Describe the functionality of LangChain.
Latency for 'Describe the functionality of LangChain.': 3.3793 seconds
Processing question: What are the main features of LangChain?
Latency for 'What are the main features of LangChain?': 3.5977 seconds
Processing question: How does LangChain work?
Latency for 'How does LangChain work?': 2.4425 seconds
Processing question: Can you provide an overview of LangChain?
Latency for 'Can you provide an overview of LangChain?': 2.4614 seconds
