In [5]:
import numpy as np
import os
import shutil
import time

from langchain.chains import ConversationalRetrievalChain, ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser
from langchain.memory import VectorStoreRetrieverMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores.tiledb import TileDB
import matplotlib.pyplot as plt
import time

In [6]:
os.environ["OPENAI_API_KEY"] = ''

In [15]:
# Path to the directory containing data with different sizes
data_source_path = "/Users/sihamargaw/Desktop/ReseachProject/Dataset/DD"

# LLM and question for retrieval
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


# Store timing results for plotting
resultsTileDB = {
    "file_name": [],
    "embedding_time": [],
    "indexing_time": [],
    "retrieval_time": []
}

In [8]:
# Data Preprocessing and Loading
documents_dict = {}
texts_dict = {}
metadata_dict = {}

for i, file_name in enumerate(os.listdir(data_source_path)):
    if file_name.endswith(".pdf"):
        file_path = os.path.join(data_source_path, file_name)

        # Parse documents and split them into text chunks
        loader = PyPDFLoader(file_path)
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        documents = loader.load()
        documents = splitter.split_documents(documents)
        documents = [d for d in documents if len(d.page_content) > 5]
        texts = [d.page_content for d in documents]
        metadata = [d.metadata for d in documents]

        # Store the processed documents and texts for later use
        documents_dict[file_name] = documents
        texts_dict[file_name] = texts
        metadata_dict[file_name] = metadata
        print(f"Preprocessed {file_name}: {len(texts)} chunks.")

Preprocessed copy50.pdf: 199 chunks.
Preprocessed copy1000.pdf: 4784 chunks.
Preprocessed copy500.pdf: 1915 chunks.
Preprocessed copy200.pdf: 762 chunks.


In [10]:
# Embedding Generation
embeddings_dict = {}

for file_name, texts in texts_dict.items():
    print(f"Generating embeddings for {file_name}...")
    t1 = time.time()
    embedding = OpenAIEmbeddings()
    text_embeddings = embedding.embed_documents(texts)
    t2 = time.time()
    embedding_time = t2 - t1
    text_embedding_pairs = list(zip(texts, text_embeddings))
    embeddings_dict[file_name] = text_embedding_pairs  # Store the embeddings
    print(f"Embeddings generated for {file_name} in {embedding_time} seconds")

    # Store the embedding generation time
    resultsTileDB["file_name"].append(file_name)
    resultsTileDB["embedding_time"].append(embedding_time)

Generating embeddings for copy50.pdf...
Embeddings generated for copy50.pdf in 6.087449073791504 seconds
Generating embeddings for copy1000.pdf...
Embeddings generated for copy1000.pdf in 130.7780442237854 seconds
Generating embeddings for copy500.pdf...
Embeddings generated for copy500.pdf in 49.90364384651184 seconds
Generating embeddings for copy200.pdf...
Embeddings generated for copy200.pdf in 21.484942197799683 seconds


In [12]:
# Index document chunks using a TileDB IVF_FLAT index, using langchain's from_embeddings method
import os
import shutil
import time

# Define the TileDB index URI
tiledb_index_uri_base = "./tiledb_example_index"

for file_name, text_embedding_pairs in embeddings_dict.items():
    print(f"Indexing embeddings for {file_name}...")
    t3 = time.time()

    # Prepare the index URI for the specific file
    tiledb_index_uri = f"{tiledb_index_uri_base}_{file_name}"
    
    # Remove existing index directory if it exists
    if os.path.isdir(tiledb_index_uri):
        shutil.rmtree(tiledb_index_uri)
    
    # Index the embeddings in TileDB
    db = TileDB.from_embeddings(
        text_embedding_pairs, 
        embedding,
        index_uri=tiledb_index_uri,
        index_type="IVF_FLAT",
        allow_dangerous_deserialization=True,  # Enabling the bypass for pickle deserialization
        metadatas=metadata_dict[file_name]
    )
    
    t4 = time.time()
    indexing_time = t4 - t3
    print(f"Embeddings indexed for {file_name} in {indexing_time} seconds")

    # Store the indexing time
    resultsTileDB["indexing_time"].append(indexing_time)


Indexing embeddings for copy50.pdf...




Indexing completed for copy50.pdf. Total time: 27.769508123397827s
Number of vector embeddings stored in TileDB-Vector-Search for copy50.pdf: 199
Indexing embeddings for copy1000.pdf...
Indexing completed for copy1000.pdf. Total time: 0.7640290260314941s
Number of vector embeddings stored in TileDB-Vector-Search for copy1000.pdf: 4784
Indexing embeddings for copy500.pdf...
Indexing completed for copy500.pdf. Total time: 0.5565292835235596s
Number of vector embeddings stored in TileDB-Vector-Search for copy500.pdf: 1915
Indexing embeddings for copy200.pdf...
Indexing completed for copy200.pdf. Total time: 0.3616969585418701s
Number of vector embeddings stored in TileDB-Vector-Search for copy200.pdf: 762


In [14]:
# Retrieval and Latency Measurement
embedding_model = OpenAIEmbeddings()  # Ensure this matches the model used for embedding
question = " How do AI researchers distinguish between human-like intelligence and rational intelligence? "


for i, file_name in enumerate(embeddings_dict):
    print(f"Retrieving data for {file_name}...")
    tiledb_index_uri = f"{tiledb_index_uri_base}_{file_name}"

    db = TileDB.load(
        index_uri=tiledb_index_uri, 
        embedding=embedding,
        allow_dangerous_deserialization=True  # Enable deserialization despite the warning
    )

    retriever = db.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    )
    
    # Use the retriever with LangChain's ConversationalRetrievalChain
    private_chatgpt = ConversationalRetrievalChain.from_llm(llm, retriever=retriever)

    start_time = time.time()
    response = private_chatgpt.run({'question': question, 'chat_history': ''})
    end_time = time.time()

    retrieval_time = end_time - start_time
    print(f"AI: {response}\n")
    print(f"Total retrieval time for {file_name}: {retrieval_time} seconds")

    # Store the retrieval time
    resultsTileDB["retrieval_time"].append(retrieval_time)

Retrieving data for copy50.pdf...
AI: AI researchers distinguish between human-like intelligence and rational intelligence by focusing on different aspects of intelligence. Human-like intelligence involves understanding how humans think and act, including emotions, creativity, and social interactions. On the other hand, rational intelligence focuses on making decisions based on logical reasoning, probability theory, and machine learning to achieve specific goals efficiently and effectively. While human-like intelligence may involve more complex and nuanced behaviors, rational intelligence aims at achieving optimal outcomes based on defined objectives.

Total retrieval time for copy50.pdf: 1.7141239643096924 seconds
Retrieving data for copy1000.pdf...
AI: AI researchers distinguish between human-like intelligence and rational intelligence by considering two dimensions: human vs. rational and thought vs. behavior. Some researchers define intelligence based on fidelity to human performanc