In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_core.callbacks.manager import CallbackManager
from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

import chromadb
from tqdm import tqdm
from loguru import logger
from pathlib import Path
from uuid import uuid4
from langchain_openai import AzureOpenAIEmbeddings

from langchain_chroma import Chroma
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)


### Config

In [2]:
config = {
    "data": {
        "data_dir": r"pdfs",
        "persist_directory": "vector_store_250chunk_openai",
    },
    "train": {
        "output_document_from_vector_store": 5,
        "chunk_size": 250,
        "chunk_overlap": 10,
    },
    "model": {
        "embedding_model": "mxbai-embed-large",
        "llm_model": "llama3.2:1b",
    },
}

### Modify the Path here

In [3]:
config["data"]["data_dir"]  = Path(".").resolve().parent / "data" / config["data"]["data_dir"]
config["data"]["persist_directory"] = (
    Path("__file__").resolve().parent.parent
    / config["data"]["persist_directory"]
)
config

{'data': {'data_dir': PosixPath('/home/sugam/work/Python/InsightAI/data/pdfs'),
  'persist_directory': PosixPath('/home/sugam/work/Python/InsightAI/vector_store_250chunk_openai')},
 'train': {'output_document_from_vector_store': 5,
  'chunk_size': 250,
  'chunk_overlap': 10},
 'model': {'embedding_model': 'mxbai-embed-large', 'llm_model': 'llama3.2:1b'}}

In [4]:
class_name_mapping_dict = {
    "c0": "Alienware alpha or Alienware steam machine",
    "c1": "XPS 27 7760",
    "c2": "Alienware 13 R3",
    "c3": "Dell Alienware m16 R1",
    "c4": "Alienware m17 R4",
    "c5": "Alienware x17 R2",
    "c6": "Chromebook 11 3180",
    "c7": "Dell G15 5510",
    "c8": "ASUS ROG Strix SCAR 17 (2023)",
    "c9": "ROG Zephyrus G16 (2024) GU605",
    "c10": "Dell XPS 13 9370",
    "c11": "Dell XPS 14 9440",
    "c12": "Dell XPS 15 9500",
    "c13": "Dell XPS 16 9640",
    "c14": "XPS 17 9730",
    "c15": "Dell Alienware m16 R2",
    "c16": "Alienware x14 R2",
}

In [5]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
# embeddings = OllamaEmbeddings(
#     model=config["model"]["embedding_model"],
# )
embeddings = AzureOpenAIEmbeddings(
            api_key=os.getenv("AZURE_OPENAI_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            model="text-embedding-3-large",
            api_version="2023-05-15",
            dimensions=1024,
        )

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# llm = ChatOllama(model=config['model']["llm_model"], callbacks=callback_manager)
llm = ChatOllama(model=config["model"]["llm_model"])

## DO NOT RUN THE CELL BELOW Twice!!

In [7]:
def get_data():
    """Iterate over the data directory. Splits the pdf's and returns list of documents.
    Args
    ----
    None

    Returns
    -------
    documents: list
        List of splitted documents.
    """

    documents: list = []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=config["train"]["chunk_size"],
        chunk_overlap=config["train"]["chunk_overlap"],
        length_function=len,
    )

    class_abbreviation: list[str] = os.listdir(config["data"]["data_dir"])

    for item in tqdm(class_abbreviation):

        path_till_individual_folder: str = config["data"]["data_dir"] / item

        for individual_pdf in os.listdir(path_till_individual_folder):

            actual_name_pdf: str = class_name_mapping_dict[item].strip()

            loader = PyPDFLoader(
                os.path.join(path_till_individual_folder, individual_pdf)
            )

            temp_docs = loader.load()

            splitted_docs = text_splitter.split_documents(temp_docs)

            for doc in splitted_docs:
                doc.metadata["category"] = actual_name_pdf
                # doc.metadata.pop("source")
                # doc.metadata.pop("page")

            documents.extend(splitted_docs)

    logger.info(f"The total length of the extracted pdf: {len(documents)}")
    return documents


documents = get_data()
uuids = [str(uuid4()) for _ in range(len(documents))]

100%|██████████| 17/17 [00:48<00:00,  2.82s/it]
[32m2024-11-06 18:13:44.655[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_data[0m:[36m46[0m - [1mThe total length of the extracted pdf: 11065[0m


In [8]:
documents[0]

Document(metadata={'source': '/home/sugam/work/Python/InsightAI/data/pdfs/c2/alienware-13-r3-servicemanual-en-us.pdf', 'page': 0, 'category': 'Alienware 13 R3'}, page_content='Alienware 13 R3\nService Manual\nRegulatory Model: P81G\nRegulatory Type: P81G001\nMay 2023\nRev. A03')

In [9]:
# Initiate Vector Store
persistent_client = chromadb.PersistentClient(path=str(config['data']["persist_directory"]))
# collection = persistent_client.get_or_create_collection("InsightAICollection")
vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="250_chunk_openai",
    embedding_function=embeddings,
)

In [10]:
persistent_client.heartbeat()

1730897025520870737

### Uncomment the below cell to add documents. 

In [11]:
for doc, uuid in tqdm(zip(documents, uuids), total=len(documents), desc="Adding documents"):
    vector_store_from_client.add_documents(documents=[doc], ids=[uuid])

Adding documents: 100%|██████████| 11065/11065 [1:16:42<00:00,  2.40it/s]


In [12]:
query = "What is the RAM of the model?"
image_class = "Alienware alpha or Alienware steam machine"

In [13]:
retrieved_docs = vector_store_from_client.similarity_search_with_score(
    query=query, k=5, filter={"category": image_class}
)
query_embeddings = embeddings.embed_query(query)
retrieved_docs_from_embeddings = vector_store_from_client.similarity_search_by_vector(
    query_embeddings, k=5, filter={"category": image_class}
)
print(retrieved_docs)
print(retrieved_docs_from_embeddings)

[(Document(metadata={'category': 'Alienware alpha or Alienware steam machine', 'page': 5, 'source': '/home/sugam/work/Python/InsightAI/data/pdfs/c0/all-products_esuprt_desktop_esuprt_alienware_dsk_alienware-alpha_reference guide_en-us.pdf'}, page_content='Configurations supported 2 GB, 4 GB, 8 GB, and 16 GB'), 0.9154481887817383), (Document(metadata={'category': 'Alienware alpha or Alienware steam machine', 'page': 5, 'source': '/home/sugam/work/Python/InsightAI/data/pdfs/c0/all-products_esuprt_desktop_esuprt_alienware_dsk_alienware-alpha_reference guide_en-us.pdf'}, page_content='Specifications Views\nSystem  \nInformationMemoryPorts and  \nConnectorsDimensions and \nWeightStorage Communications Video Audio\nPower AdapterComputer \nEnvironmentMemory\nConnector Two SODIMM slots\nType DDR3L\nSpeed 1600 MHz'), 1.034904956817627), (Document(metadata={'category': 'Alienware alpha or Alienware steam machine', 'page': 74, 'source': '/home/sugam/work/Python/InsightAI/data/pdfs/c0/owner_manual

embeddings = OllamaEmbeddings(
    model=config['model']["embedding_model"],
)

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = ChatOllama(model=config['model']["llm_model"], callbacks=callback_manager)

In [53]:
# Initiate Vector Store
# persistent_client = chromadb.PersistentClient(
#     path=str(config["data"]["persist_directory"])
# )
# collection = persistent_client.get_or_create_collection("InsightAICollection")
# vector_store_from_client = Chroma(
#     client=persistent_client,
#     collection_name="test_collection",
#     embedding_function=embeddings,
# )

In [54]:
query = "Show me the list of specification of integrated Graphics Processing Unit (GPU) supported by the machine ?"
image_class = class_name_mapping_dict["c13"]

retriever = vector_store_from_client.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": config["train"]["output_document_from_vector_store"],
        "filter": {"category": image_class},
    },
)
retrived_query = retriever.invoke(query)
formatated_docs = "\n\n".join(doc.page_content for doc in retrived_query)

In [12]:
print(retrived_query)

[Document(metadata={'category': 'Dell XPS 16 9640'}, page_content='Table 21. GPU—Integrated\xa0\nController Memory size Processor\nIntel Arc Graphics Shared system memory ●Intel Core Ultra 7 155H\n●Intel Core Ultra 7 165H\n●Intel Core Ultra 9 185H\nSpecifications of XPS 16 9640 25'), Document(metadata={'category': 'Dell XPS 16 9640'}, page_content='GPU—Discrete\nThe following table lists the specifications of the discrete Graphics Processing Unit (GPU) supported by your XPS 16 9640 .\nTable 22. GPU—Discrete\xa0\nController Memory size Memory type\nNVIDIA GeForce RTX 4050 6 GB GDDR6\nNVIDIA GeForce RTX 4060 8 GB GDDR6\nNVIDIA GeForce RTX 4070 8 GB GDDR6\nMultiple display support matrix\nThe following table lists the multiple display support matrix for your XPS 16 9640 .\nTable 23. Multiple display support matrix\xa0\nGraphics Card Discrete Graphics \nController Direct Output \nModeSupported external displays \nwith computer internal display \nonSupported external \ndisplays with compute

In [17]:
# print(f"""You are an assistant for question-answering tasks.
#             Use the following pieces context to answer the question.
#             If the context does not contain answer, just say that you don't know. Do not add anything on your own.\n\n
#             Question: {query}
#            Context:{formatated_docs}
#            """)

In [None]:
llm.invoke(
    f"""You are an expert assistant for question answering tasks. Use the following context information to answer the question.
            If the context does not contain answer, just say that you don't know.\n\n
            Question: {query}
            Context:{formatated_docs}
           """
)

DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:connect_tcp.started host='127.0.0.1' port=11434 local_address=None timeout=None socket_options=None
DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000002C83D2466C0>
DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.complete
DEBUG:httpcore.http11:send_request_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_body.complete
DEBUG:httpcore.http11:receive_response_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Type', b'application/x-ndjson'), (b'Date', b'Wed, 02 Oct 2024 09:10:32 GMT'), (b'Transfer-Encoding', b'chunked')])
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
DEBUG:httpcore.http11:rece

The specifications of integrated Graphics Processing Unit (GPU) supported by the machine are as follows:

1. Intel Arc Graphics:
   - Controller: Shared system memory
   - Memory size: Not specified (indicated as ●)

Note that the table does not provide detailed specifications for the integrated GPU, only stating that it is a shared system memory with no specific details provided.

DEBUG:httpcore.http11:receive_response_body.complete
DEBUG:httpcore.http11:response_closed.started
DEBUG:httpcore.http11:response_closed.complete


AIMessage(content='The specifications of integrated Graphics Processing Unit (GPU) supported by the machine are as follows:\n\n1. Intel Arc Graphics:\n   - Controller: Shared system memory\n   - Memory size: Not specified (indicated as ●)\n\nNote that the table does not provide detailed specifications for the integrated GPU, only stating that it is a shared system memory with no specific details provided.', response_metadata={'model': 'llama3.2:3b', 'created_at': '2024-10-02T09:10:39.8036998Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 40659520700, 'load_duration': 28128500, 'prompt_eval_count': 1031, 'prompt_eval_duration': 33658856000, 'eval_count': 75, 'eval_duration': 6970280000}, id='run-85d178fa-2958-4b2a-a698-06316857f5fd-0', usage_metadata={'input_tokens': 1031, 'output_tokens': 75, 'total_tokens': 1106})