# Import Packages



In [182]:

import os
import pandas as pd
import numpy as np
import tiktoken
from uuid import uuid4
from tqdm.autonotebook import tqdm
from dotenv import load_dotenv

# LangChain core imports
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferWindowMemory
from langchain.chat_models import ChatOpenAI

# LangChain community and custom imports
from langchain_community.document_loaders import CSVLoader
from langchain_community.tools import TavilySearchResults

# LangChain vector store import
from langchain.vectorstores import Pinecone

# Pinecone import
import pinecone

# For using agent functionalities
from langchain.agents import AgentExecutor, Tool, AgentType, create_react_agent
from langchain import hub


In [183]:

# Load environmental variables from a .env file
load_dotenv()

# Load OpenAI API key
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Initialize Pinecone with API key and environment
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)



# pc = Pinecone(api_key="48644f27-f376-45b0-abf6-438603fbf244")
# index = pc.Index("dao")


In [99]:
import pandas as pd
import numpy as np
from tqdm import tqdm  # For progress bar in notebook mode

# LangChain imports if needed
# from langchain.llms import OpenAI
# from langchain.chains import RetrievalQA
# from langchain.vectorstores import Pinecone

# Load the Excel file
df = pd.read_excel("Book1.xlsx")

# Handle NaN values in the DataFrame
df.fillna('Not Available', inplace=True)  # Example: filling NaNs with 'Not Available'

# Function to demonstrate some data manipulation based on your dataset structure
def summarize_data(df):
    print("Summary of DataFrame:")
    print(df.describe())  # Summarize numeric columns if any
    print("NaN values in each column:")
    print(df.isna().sum())

# Function usage
summarize_data(df)

# Further data processing and model interactions can be added here
# E.g., tokenization, data transformation, vector storage interactions

# Example function to process and output data
def process_data(df):
    # Example processing
    for index, row in df.iterrows():
        print(f"Processing row {index}: {row['Feature']}")
        # Add your processing logic here

# Invoke processing function
process_data(df)

# Save the processed DataFrame to a new Excel file
df.to_excel("Book2.xlsx", index=False)

# Ensure you adjust paths and functionality according to your specific environment and requirements


Summary of DataFrame:
           Feature Urban Dwellings Elements Residencia
count           29              29                  29
unique          29              19                  29
top     Total area   Not Available      215,000 sq. ft
freq             1              11                   1
NaN values in each column:
Feature                0
Urban Dwellings        0
Elements Residencia    0
dtype: int64
Processing row 0: Total area
Processing row 1: Commercial Land
Processing row 2: Reserved for Commercial
Processing row 3: Furnished Residential Apartments
Processing row 4: Building height
Processing row 5: Commercial Shop Lower Ground Down Payment 25%
Processing row 6: Commercial Shop Mezzanine  Down Payment 25% 
Processing row 7: 1 Bed Apartment Monthly Installment price
Processing row 8: 2 Bed Apartment Monthly Installment price
Processing row 9: 3 Bed Apartment Monthly Installment price
Processing row 10: Commercial Shop Ground Monthly Installment
Processing row 11: Commercial

PermissionError: [Errno 13] Permission denied: 'Book2.xlsx'

In [184]:

# Load Excel data
excel_file_path = 'Book2.xlsx'
excel_data = pd.read_excel(excel_file_path)

# Display the Excel data to verify loading
excel_data.head()


Unnamed: 0,Feature,Urban Dwellings,Elements Residencia
0,Total area,"625,000 SFT","215,000 sq. ft"
1,Commercial Land,22 Kanal,5.5 Kanal
2,Reserved for Commercial,"35,000 SFT","30,000 SFT"
3,Furnished Residential Apartments,230+,"42 (1, 2 & 3Bed)"
4,Building height,200 ft,Not Available


# Load Documents

There are several Document Loaders in the LangChain library depending on the type of file to be used. The most common ones include CSV, HTML, JSON, Markdown, File Directory or Microsoft Office formats. The complete list can be found [here](https://python.langchain.com/docs/modules/data_connection/document_loaders/office_file/).

However, there is a more extensive [list](https://python.langchain.com/docs/integrations/document_loaders/google_drive/), where you can load directly from Google Cloud, Notion, Youtube or many other services.

We will be using a csv file, so we will use the CSVLoader. Below you can find the code to load the file. As arguments we are using:

- **file path**
- **source column**: indicates the column in the CSV file that contains the primary data of interest, transcript
- **metadata columns**: list of column names that contain additional information (metadata) about each entry in the transcript

Loading the data in this way will benefit our RAG pipeline. The benefits of metadata are listed further below.

In [185]:
import pandas as pd
from langchain.schema import Document

# Load the Excel file using pandas (no encoding argument needed)
df = pd.read_excel("./Book2.xlsx", sheet_name='Sheet1')

# Convert the DataFrame rows to a list of Document objects
documents = [
    Document(
        page_content=row["Feature"],
        metadata={
            "Feature": row["Feature"],
            "Urban Dwellings": row["Urban Dwellings"],
            "Elements Residencia": row["Elements Residencia"]
        }
    )
    for _, row in df.iterrows()
]

# Now you can use `documents` with LangChain
len(documents)


29

In [4]:
# data[0]

The **CSVLoader** allow us to upload a csv file and there are some arguments that can further enhance the pipeline, mainly metadata.

**Benefits of Metadata:**

- *Enhanced Retrieval*: Metadata provides additional context about the primary data, which significantly improves the accuracy and relevance of the information retrieved. This is particularly important in RAG, where the goal is to retrieve data that is relevant to a user's query and provide it as context for the LLM. By incorporating metadata, the system can better understand the nuances of the query and retrieve data that is more likely to be relevant to the user's needs. For example, knowing the `main_speaker`, `title`, and `speaker_occupation` can offer insights into the content of the `transcript`.

- *Improved Response Generation*: The addition of metadata to the data chunks being processed by the LLM allows for a more nuanced and context-aware retrieval process. This leads to higher quality outcomes in response generation, as the LLM is able to draw upon a richer understanding of the context in which the data is being retrieved.

- *Reduced Hallucinations*: By grounding the LLM's output on relevant, external knowledge, RAG attempts to mitigate the risk of responding with incorrect or fabricated information (`hallucinations`). The use of metadata helps to ensure that the responses generated by the LLM are based on accurate and relevant information, rather than simply relying on static training data.

- *Domain-Specific Responses*: Metadata can be used to provide domain-specific, relevant responses tailored to an organization's proprietary or domain-specific data. This is particularly important in applications where the LLM needs to provide responses that are specific to a particular domain or industry.

- *Efficiency and Cost-Effectiveness*: RAG is a simple and cost-effective approach to customizing LLMs with domain-specific data. By leveraging metadata, organizations can deploy RAG without needing to customize the model, which is especially beneficial when models need to be updated frequently with new data.

# Indexing

The **Vector Store Index** is a tool that embeds your documents into vector representations. When you want to search through these embeddings, your query is also converted into a vector embedding. Then, the Vector Store Index performs a mathematical operation to rank all the document embeddings based on how semantically similar they are to your query embedding.

The key steps are:
- Embedding your documents into vectors
- Turning your search query into a vector
- Comparing the query vector to all the document vectors
- Ranking the document vectors by their similarity to the query vector
- Returning the most relevant documents based on this ranking

This allows you to search your document collection in a semantic, meaning-based way, rather than just looking for exact keyword matches.

To understand the process of vector search, we will analyze the concepts of tokenization, similarity, and embedding, which are implemented by embedding models.

## Tokenizer


The term **token** refers to the fundamental units of **semantic meaning** that are derived from breaking down a sentence or piece of text. These tokens can then be encoded into vector representations, which are numerical representations that can be processed by large language models (LLMs). Tokens can be words or special characters like punctuation, but also can be `sub-words`.

Below is an example of the tiktoken library, which is using the BPE (Byte Pair Encoding) algorithm to convert text into tokens. This library is the one used for models like gpt-3.5 and gpt-4. You can find a good explanation of the BPE algorithm in this link from [Hugging Face](https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt).

<img width="1000" alt="Screenshot 2024-05-02 112404" src="https://github.com/benitomartin/nlp-news-classification/assets/116911431/1bd65cda-519b-4c08-a0d2-0b12e4d2d553">

**Source:** https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken

The most common and efficient encoder is the `cl100k_base` with 100k tokens.

In [186]:
# Tokenization

# Count the number of tokens in a given string
def num_tokens_from_string(question, encoding_name):

    encoding = tiktoken.get_encoding(encoding_name)

    num_tokens = encoding.encode(question)

    return encoding, num_tokens


question = "What is the price of lements resendencia?"

encoding, num_tokens = num_tokens_from_string(question, "cl100k_base")

print(f'Number of Words: {len(question.split())}')
print(f'Number of Characters: {len(question)}')
print(f'List of Tokens: {num_tokens}')
print(f'Nr of Tokens: {len(num_tokens)}')


Number of Words: 7
Number of Characters: 41
List of Tokens: [3923, 374, 279, 3430, 315, 514, 1392, 94988, 8968, 30]
Nr of Tokens: 10


In [187]:
# Decoding tokenizer

encoding.decode([4438, 1690, 84296, 87, 13739, 527, 389, 279, 1160, 30])

'How many TEDx talks are on the list?'

<img width="1000" alt="Screenshot 2024-05-02 100606" src="https://github.com/benitomartin/nlp-news-classification/assets/116911431/830d95fa-776c-46f6-adb8-429836f4a887">

**Source:** https://tiktokenizer.vercel.app/?model=cl100k_base



According to OpenAI, as a rule of thumb 1 token corresponds to 4 characters of text for common English text. This means that 100 tokens correspond to 75 words.

## Embedding

Embeddings are a way to represent high-dimensional sparse data like words in a more compact, lower-dimensional form while preserving the meaningful similarities between the original data points.. The key ideas are:

- **Capturing Similarities:** Similar items, like synonymous words, will have embedding vectors that are close to each other.

- **Spatial Representation:** The embedding vectors are positioned in a multi-dimensional space such that the distance between them (e.g. cosine similarity) reflects how related the original data points are

<img width="1000" alt="Untitled (1)" src="https://github.com/benitomartin/nlp-news-classification/assets/116911431/a7e044ab-2c40-47a2-bb05-e86962790ce0">


**Source**: https://openai.com/index/new-embedding-models-and-api-updates

The most common metric used for similarity search is **cosine similarity**. It finds application in scenarios like semantic search and document classification, because it enables the comparison of vector directions, effectively assessing the overall content of documents. By comparing the vector representations of the query and the documents, cosine similarity can identify the most similar and relevant documents to return in the search results.

<img width="1000" alt="Screenshot 2024-05-02 123447" src="https://github.com/benitomartin/nlp-news-classification/assets/116911431/f5356422-29d6-4a4c-8e11-267ad9115b51">

**Source:** https://www.pinecone.io/learn/vector-similarity/

Cosine similarity is a measure of the similarity between two non-zero vectors. It calculates the cosine of the angle between the two vectors, which results in a value between 1 (identical) and -1 (opposite).

<img width="1000" alt="Screenshot 2024-05-02 122629" src="https://github.com/benitomartin/nlp-news-classification/assets/116911431/4215ba02-1fb9-4a72-ad9c-e88740b5a71a">

**Source:** https://medium.com/kx-systems/how-vector-databases-search-by-similarity-a-comprehensive-primer-c4b80d13ce63

Let's explore this using `OpenAIEmbedding`. OpenAI employs the `text-embedding-ada-002` (Ada v2) model as default, featuring 1536 dimensions. Essentially, this implies that queries and documents will be transformed into vectors with 1536 dimensions.


<img width="1000" alt="Untitled" src="https://github.com/benitomartin/birds_classification/assets/116911431/7a9a7a89-0454-4795-aab9-a78e7943b9e1">


**Source:** https://openai.com/index/new-embedding-models-and-api-updates

In [188]:
# Define cosine similarity function

def cosine_similarity(query_emb, document_emb):

    # Calculate the dot product of the query and document embeddings
    dot_product = np.dot(query_emb, document_emb)

    # Calculate the L2 norms (magnitudes) of the query and document embeddings
    query_norm = np.linalg.norm(query_emb)
    document_norm = np.linalg.norm(document_emb)

    # Calculate the cosine similarity
    cosine_sim = dot_product / (query_norm * document_norm)

    return cosine_sim

In [189]:
from langchain.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



# Define your question and document
question = "What is the price of elements resedencia?"
document = "Elemnts resedencia"

# Initialize the OpenAI embeddings with your API key
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Embed the query and the document
query_emb = embedding.embed_query(question)
document_emb = embedding.embed_query(document)

# Calculate cosine similarity between the query and document embeddings
cosine_sim = cosine_similarity([query_emb], [document_emb])

# Output the dimensions and cosine similarity
print(f'Query Dimensions: {len(query_emb)}')
print(f'Document Dimensions: {len(document_emb)}')
print("Cosine Similarity:", cosine_sim[0][0])


Query Dimensions: 1536
Document Dimensions: 1536
Cosine Similarity: 0.8908459873400729


In [190]:
from langchain.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Make sure your API key is set
OPENAI_API_KEY = "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"

# Define your question and document
question = "What is the price of elements resedencia?"
document = "Elements Resedencia"

# Initialize the OpenAI embeddings with a valid model for embeddings
embedding = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)

# Embed the query and the document
query_emb = embedding.embed_query(question)
document_emb = embedding.embed_query(document)

# Calculate cosine similarity between the query and document embeddings
cosine_sim = cosine_similarity([query_emb], [document_emb])

# Output the dimensions and cosine similarity
print(f'Query Dimensions: {len(query_emb)}')
print(f'Document Dimensions: {len(document_emb)}')
print("Cosine Similarity:", cosine_sim[0][0])


Query Dimensions: 1536
Document Dimensions: 1536
Cosine Similarity: 0.8962367300383302


We see that each model performs differently as each one has been trained with different data and depending in the use case, the results will differ. Now it is time to implement our dataset and create our index to see the performance for our specific data.

## Text Splitters

Unfortunatelly, LLM models have some limitations when it comes to the point of processig text. One of those is the **context window**. The context window represents the maximum amount of text/tokens that a model can process at one time as an input to generate a response. Therefore we need to split our documents into smaller chunks that can fit into the model's context window. A complete list of OpenAI models can be found [here](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4). It spans from 4'096 tokens for the `gpt-3.5-turbo-instruct` to the `gpt-4-turbo` with 128'000 tokens.



Like the data loaders, LangChain offers several text splitters. In the table below you can see the main splitting methods and when to use which one. The `Adds Metadata` does not mean that it will add (or not) the metadata from the previous loader. For example for HTML has a `HTMLHeaderTextSplitter` and it means it will splits text at the element level and adds metadata for each chunk based on header text.

In our case we already have the metadata available and we do not need to add them using and splitter.

<img width="863" alt="Screenshot 2024-04-30 132645" src="https://github.com/benitomartin/mlops-car-prices/assets/116911431/7b4dcefb-9320-4085-821f-1b21c81f4d28">


The `RecursiveCharacterTextSplitter` is the recommended tool for splitting general text. It segments the text based on a defined chunk size, using a list of characters as separators.

According to LangChain, the default separators include ["\n\n", "\n", " ", ""]. This means it aims to keep paragraphs together first, followed by sentences and words, as they typically exhibit the strongest semantic connections in text.

To leverage this feature, we can utilize the `RecursiveCharacterTextSplitter` along with the tiktoken library to ensure that splits do not exceed the maximum token chunk size allowed by the language model. Each split will be recursively divided if its size exceeds the limit.


The final design of our text splitter will be as follows:

- Model: `gpt-3.5-turbo-0125` with a context window of 16,385 tokens

- Chunk Size: number of tokens of one chunk

- Chunk Overlap: number of tokens that overlap between two consecutive chunks

- Separators: the order of separators

In [10]:
# texts[0].metadata

In [11]:
# # The resulting texts will have the metadata attached
# for text in texts[]:
#     print(text.metadata)

In [191]:
# Splitter

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4o-mini-2024-07-18",
    chunk_size=512,
    chunk_overlap=20,
    separators= ["\n\n", "\n", " ", ""])

In [31]:
# # Make splits
# splits = text_splitter.split_documents(data[:5])

# # Print the number of resulting text chunks

# print(f'Number of Chunks: {len(splits)}')

# # # Print the number of original pages/documents
# print(f'Number of Pages: {len(data)}')

NameError: name 'data' is not defined

## Vector Store

A Vector Store is a specialized database that is designed to store and manage high-dimensional vector data. Vector databases store data in the form of vector embedding, which can be retrieved by the LLMs and allow them to understand the context and meaning of the data, allowing better responses.

### Indexing

Pinecone is a serverless vector store, which shows a very good performance for a fast vector search and retrieval process.

The first step to use Pinecone is to create an Index where our embeddings will be stored. There are several parameters to be considered for this:

- Index name
- Dimension: must be equal to the embedding model dimensions
- Metric: must match with the used to tain the embedding model for better results
- Serverless specifications

In [108]:
# Pinecone Initialization

index_name = "langchain-pinecone-test"
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

pc = Pinecone(api_key = PINECONE_API_KEY)

In [192]:
index_name="dao"

In [193]:
from pinecone import Pinecone, ServerlessSpec
spec = ServerlessSpec(cloud="aws", region="us-east-1")
if index_name not in pc.list_indexes():
    pc.create_index(
        name=index_name,
        dimension=1536,  # Change this to the correct dimension of your embeddings
        metric="cosine",  # This can be "cosine", "euclidean", etc.
        spec=spec  # Pass the spec argument here
    )

# Access the index
index = pc.Index(index_name)

In [16]:
# # Delete Index

# pc.delete_index(index_name)

In [173]:
# List Indexes

pc.list_indexes()

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'dao-ar5s17q.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'dao',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [194]:
# Describe Index

index = pc.Index(index_name)

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Namespaces

Pinecone allows you to split the data into namespaces within an index. This allows to send queries to an specific namespace. You could for example split your data by content, language or any other index suitable for your use case.

For this specific example we will first upload 100 records of our data to a namespace and then we will create two splits each with 50 records. In total we will have 3 namspaces.

In [195]:
# Create Main Namespace
from langchain.vectorstores import Pinecone
splits = text_splitter.split_documents(documents[:100])

embed = embedding=OpenAIEmbeddings(model = "text-embedding-ada-002")

db = Pinecone.from_documents(documents=splits,
                             embedding=embed,
                             index_name=index_name,
                             namespace="main")

In [63]:
pip install -qU langchain-pinecone

Note: you may need to restart the kernel to use updated packages.




In [196]:
# Create Vectorstore of Main index
from langchain_pinecone import PineconeVectorStore

vectorstore = PineconeVectorStore(index_name=index_name,
                                  namespace="main",
                                  embedding=embed)

In [197]:
# Search for similarity

query = "price for 1 bed apartment of elemnts resedencia"

similarity = vectorstore.similarity_search(query, k=4)

for i in range(len(similarity)):
  print(f"-------Result Nr. {i}-------")
  print(f"Elements Residencia: {similarity[i].metadata['Elements Residencia']}")
  print(f" ")

-------Result Nr. 0-------
Elements Residencia: PKR 14,620,000
 
-------Result Nr. 1-------
Elements Residencia: PKR 243,500
 
-------Result Nr. 2-------
Elements Residencia: PKR 1,462,000
 
-------Result Nr. 3-------
Elements Residencia: PKR 914,000
 


In [198]:
# Search for similarity with score

query = "price for 1 bed apartment of elemnts resedencia"

similarity_with_score = vectorstore.similarity_search_with_score(query, k=4)

for i in range(len(similarity_with_score)):
  print(f"-------Result Nr. {i}-------")
  print(f"Elements Residencia: {similarity_with_score[i][0].metadata['Elements Residencia']}")
  print(f"Score: {similarity_with_score[i][1]}")
  print(f" ")


-------Result Nr. 0-------
Elements Residencia: PKR 14,620,000
Score: 0.855513036
 
-------Result Nr. 1-------
Elements Residencia: PKR 243,500
Score: 0.847032189
 
-------Result Nr. 2-------
Elements Residencia: PKR 1,462,000
Score: 0.844457626
 
-------Result Nr. 3-------
Elements Residencia: PKR 914,000
Score: 0.83532995
 


Now we will create 2 more namespaces each with 50 records. For that we will use the `upsert` function and the `metadata` to add data into our index but in a separate namespace.

First we will create the chunks

In [199]:
# Create Chunked Metadata

def chunked_metadata_embeddings(documents, embed):

    chunked_metadata = []

    chunked_text = text_splitter.split_documents(documents)

    for index, text in enumerate(tqdm(chunked_text)):


        payload = {
              "metadata": {
                  "chunk_num": index,
                  "Feature": text.metadata['Feature'],
                  "Elements Residencia": text.metadata['Elements Residencia'],
                  "Urban Dwellings": text.metadata['Urban Dwellings'],
                  
                
              },
              "id": str(uuid4()),
              "values": embed.embed_documents([text.page_content])[0]  # Assuming `embed` is defined elsewhere
          }

        chunked_metadata.append(payload)

    return chunked_metadata


In [200]:
# Create first split

split_one = chunked_metadata_embeddings(documents[:50], embed)

len(split_one)

  0%|          | 0/29 [00:00<?, ?it/s]

29

In [201]:
# Create second split

split_two = chunked_metadata_embeddings(documents[50:100], embed)
len(split_two)

0it [00:00, ?it/s]

0

In [202]:
# Upsert the document

def batch_upsert(split,
                 index ,
                 namespace,
                 batch_size):

    print(f"Split Length: {len(split)}")
    for i in range(0, len(split), batch_size):

      batch = split[i:i + batch_size]

      index.upsert(vectors=batch,
                   namespace=namespace)

In [203]:
batch_upsert(split_one, index, "first_split", 10)

Split Length: 29


The following function will allow to find an specific chunk base on the main speaker. It returns the title and the chunk ID, which can be used to find it in the pinecone cloud.

In [28]:
# Function to find item with main_speaker

def find_item_with_row(metadata_list, main_speaker):
    for item in metadata_list:
        if item['metadata']['main_speaker'] == main_speaker:
            return item

# Call the function to find item with main_speaker = Al Gore
result_item = find_item_with_row(split_one, "Al Gore")

# Print the result
print(f'Chunk Nr: {result_item["metadata"]["chunk_num"]}')
print(f'Chunk ID: {result_item["id"]}')
print(f'Chunk Title: {result_item["metadata"]["title"]}')

Chunk Nr: 9
Chunk ID: 3802bf35-9d69-424e-ae12-65e449d8547e
Chunk Title: Averting the climate crisis



<img width="695" alt="Screenshot 2024-05-10 140026" src="https://github.com/benitomartin/Agentic-RAG-Langchain-Pinecone/assets/116911431/6a311659-d8da-4679-a705-9c78fd07aece">

In [43]:
# index.delete(namespace="last_split", delete_all=True)

{}

Now we can see that our index has two namespaces with the below function

In [204]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'main': {'vector_count': 29}},
 'total_vector_count': 29}

We can now create the namespace of the second split and check that everything has been properly created.

In [205]:
batch_upsert(split_two, index, "last_split", 20)

Split Length: 0


In [206]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'main': {'vector_count': 29}},
 'total_vector_count': 29}

<img width="691" alt="Screenshot 2024-05-10 135858" src="https://github.com/benitomartin/Agentic-RAG-Langchain-Pinecone/assets/116911431/b6f12941-67c7-4420-afbe-b953f753a4b0">


Now we will test our namespaces by creating two users, each of one will send the query to a different namespace.

In [207]:
# Define Users

query_one = "Price for Elements Resedencia for 1 bed aprtment?"
query_two = "rice for Urban Dwelling for 1 bed aprtment??"

# Users dictionary
users = [{
            'name': 'John',
            'namespace': 'first_split',
            'query': query_one

            },
           {
             "name": "Jane",
             "namespace": 'last_split',
             "query": query_two
           }]

def vectorize_query(embed, query):

    return embed.embed_query(query)

In [208]:
# Create our vectors for each of our queries:

query_vector_one = vectorize_query(embed, query_one)
query_vector_two = vectorize_query(embed, query_two)

In [209]:
len(query_vector_one), len(query_vector_two)

(1536, 1536)

In [210]:
# Define a list of new key-value pairs

new_key_value_pairs = [
    {'vector_query': query_vector_one},
    {'vector_query': query_vector_two},

]

# Loop through the list of users and the list of new key-value pairs
for user, new_pair in zip(users, new_key_value_pairs):
    user.update(new_pair)

In [211]:
users[0]["name"], users[1]["name"]

('John', 'Jane')

In [212]:
users[0].keys()

dict_keys(['name', 'namespace', 'query', 'vector_query'])

In [213]:
print(f"Name: {users[0]['name']}")
print(f"Namespace: {users[0]['namespace']}")
print(f"Query: {users[0]['query']}")
print(f"Vector Query: {users[0]['vector_query'][:3]}")

Name: John
Namespace: first_split
Query: Price for Elements Resedencia for 1 bed aprtment?
Vector Query: [0.024277267064373317, 0.019070089609156265, -0.011953154114940329]


If we send the query to the namespace, we will get the `top_k` vectors related to that query

In [214]:
# Query the namespace

john = [t for t in users if t.get('name') == 'John'][0]

john_query_vector = john['vector_query']
john_namespace = john['namespace']

index.query(vector=john_query_vector, top_k=2, include_metadata=True, namespace=john_namespace)


{'matches': [], 'namespace': 'first_split', 'usage': {'read_units': 1}}

# RAG

Now that we have set up our namespaces, we can prepare our RAG pipeline. We will do so, using Agents

## Retrieval

In [215]:
# Create vectorstore
embed = embedding=OpenAIEmbeddings(model = "text-embedding-ada-002")

vectorstore = PineconeVectorStore(index_name=index_name,
                                  namespace="main",
                                  embedding=embed)

In this retrieval step, you can chose between Open AI or Groq. For Groq, create a `GROQ_API_KEY` which allow you to use some models like llama or mistral for free. We will also add some memory, which allow to keep track of the QA chain.

In [216]:
# Retrieval


# # Chat completion llm wit Groq
# GROQ_API_KEY = os.getenv('GROQ_API_KEY')
# llm = ChatGroq(groq_api_key=GROQ_API_KEY,
#                model_name="llama3-8b-8192",
#                temperature=0.0,
#                max_tokens=512)

llm = ChatOpenAI(temperature=0.0, model="gpt-4o-mini-2024-07-18", max_tokens=512)


# Conversational memory
conversational_memory = ConversationBufferWindowMemory(
                        memory_key='chat_history',
                        k=5,
                        return_messages=True)

# Retrieval qa chain
qa_db = RetrievalQA.from_chain_type(
                                    llm=llm,
                                    chain_type="stuff",
                                    retriever=vectorstore.as_retriever())

## Augmented

We are going to use a slightly modified prompt template. First we download the react template, which is a common template using toools and agents and then we will add the instruction of in which tool to look up first.

A collection of templates can be found in the [langchain hub](https://smith.langchain.com/hub)



In [None]:
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

In [217]:
from langsmith import Client

# Replace 'your_api_key_here' with your actual LangSmith API key
api_key = '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'
api_url = 'https://api.smith.langchain.com'

# Initialize the client with your API key
client = Client(api_url, api_key=api_key)

# Now you can pull data using the client
prompt = client.pull_prompt('hwchase17/react')
print(prompt.template)


Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}


Now we will replace this line:

`Action: the action to take, should be one of [{tool_names}]`

By this line:

`Action: the action to take, should be one of [{tool_names}]. Always look first in Pinecone Document Store`



In [218]:
# Set prompt template

template= '''
          Answer the following questions as best you can. You have access to the following tools:

          {tools}

          Use the following format:

          Question: the input question you must answer
          Thought: you should always think about what to do
          Action: the action to take, should be one of [{tool_names}]. Always look first in Pinecone Document Store
          Action Input: the input to the action
          Observation: the result of the action
          ... (this Thought/Action/Action Input/Observation can repeat 2 times)
          Thought: I now know the final answer
          Final Answer: the final answer to the original input question

          Begin!

          Question: {input}
          Thought:{agent_scratchpad}
          '''

prompt = PromptTemplate.from_template(template)

## Generation with Agent

We are going to set up 2 tools for our agent:

- Tavily Search API: Tavily search over several sources like Bing or Google and returns the most relevant content. It offers 1000 API calls per month for free.

- Vectorstore: Our vector store will be used to look for the information first.

In [219]:
import pandas as pd
from pathlib import Path

class ExcelDataReader:
    def __init__(self, file_path):
        self.file_path = file_path

    # Updated run method to accept any number of positional and keyword arguments
    def run(self, *args, **kwargs):
        if Path(self.file_path).exists():
            return pd.read_excel(self.file_path)
        else:
            return "File not found."

# Define the tool for reading Excel data
excel_data_reader = ExcelDataReader("Book2.xlsx")

tools = [
    Tool(
        name="Pinecone Document Store",
        func=qa_db.run,
        description="Use it to lookup information from the Pinecone Document Store"
    ),
    Tool(
        name="Excel Data Reader",
        func=excel_data_reader.run,
        description="Reads data from an Excel file specified in the tool setup"
    )
]

# Assuming 'create_react_agent' and other setup functions are defined elsewhere
agent = create_react_agent(llm, tools, prompt)

# Set up the agent executor
agent_executor = AgentExecutor(
    tools=tools,
    agent=agent,
    handle_parsing_errors=True,
    verbose=True,
    memory=conversational_memory
)

# Now the ExcelDataReader tool is ready to be called by the agent_executor without causing argument errors.


Once everything is set up, we can start making queries and check how the agents behave in terms priorization of agent, search quality and answers.

In [156]:
agent_executor.invoke({"input":" Feature of Elements Resedencia?. \
                                Please look in the excel document store in coloumns as it has the features \
                                with project names mentioned"})




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check the Excel document store for the features of "Elements Resedencia." I'll look for the relevant columns that contain project names and their associated features.

Action: Excel Data Reader  
Action Input: Read the Excel document to find the features of "Elements Resedencia."  
[0m[33;1m[1;3m                                             Feature Urban Dwellings  \
0                                         Total area     625,000 SFT   
1                                    Commercial Land        22 Kanal   
2                            Reserved for Commercial      35,000 SFT   
3                   Furnished Residential Apartments            230+   
4                                    Building height          200 ft   
5      Commercial Shop Lower Ground Down Payment 25%   Not Available   
6       Commercial Shop Mezzanine  Down Payment 25%    Not Available   
7          1 Bed Apartment Monthly Installment price 

{'input': ' Feature of Elements Resedencia?.                                 Please look in the excel document store in coloumns as it has the features                                 with project names mentioned',
 'chat_history': [HumanMessage(content=' Feature of Elements Resedencia?.                                 Please look in the pinecone document store metadata as it has the title                                 based on the transcripts'),
  AIMessage(content='I do not have information on the features of Elements Resedencia.'),
  HumanMessage(content='Did you find the previous answer in the Pinecone Document Store?'),
  AIMessage(content='I cannot determine if the previous answer was found in the Pinecone Document Store without more context or details.'),
  HumanMessage(content='Can you look for elements resedencia within the Pinecone Document Store?'),
  AIMessage(content='I was unable to find any information related to "residencia," "residential," or "housing" in the Pinecon

In [157]:
agent_executor.invoke({"input":"Did you find the previous answer in the Pinecone Document Store?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check if there is any relevant information in the Pinecone Document Store regarding the previous answer. 

Action: Pinecone Document Store  
Action Input: "previous answer"  
[0m[36;1m[1;3mI don't know.[0m[32;1m[1;3mI need to clarify what the "previous answer" refers to in order to search effectively in the Pinecone Document Store. 

Action: Pinecone Document Store  
Action Input: "previous answer clarification"  
[0m[36;1m[1;3mI'm sorry, but I don't have the previous answer or context to clarify. Could you please provide more details or specify your question?[0m[32;1m[1;3mI need more context or details about the "previous answer" to provide a meaningful response. Without that information, I cannot effectively search the Pinecone Document Store. 

Final Answer: I cannot determine if the previous answer was found in the Pinecone Document Store without additional context or details.[0m

[1m> Finished cha

{'input': 'Did you find the previous answer in the Pinecone Document Store?',
 'chat_history': [HumanMessage(content='Did you find the previous answer in the Pinecone Document Store?'),
  AIMessage(content='I cannot determine if the previous answer was found in the Pinecone Document Store without more context or details.'),
  HumanMessage(content='Can you look for elements resedencia within the Pinecone Document Store?'),
  AIMessage(content='I was unable to find any information related to "residencia," "residential," or "housing" in the Pinecone Document Store.'),
  HumanMessage(content=' Feature of Elements Resedencia?.                                 Please look in the pinecone document store metadata as it has the features                                 with project names mentioned'),
  AIMessage(content='I could not find any information regarding the features of "Elements Resedencia."'),
  HumanMessage(content=' Feature of Elements Resedencia?.                                 Ple

In [162]:
agent_executor.invoke({"input":" if i save 100k PKR. every month in Elements Resedencia so after 2 years how much area i will have bought.you can check the excel tool for data"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer the question, I need to find out the cost per unit area in Elements Residencia and then calculate how much area can be purchased with the total savings over 2 years. Since the question specifies checking the Excel tool for data, I will use the Excel Data Reader to find the relevant information.

Action: Excel Data Reader  
Action Input: Read the Excel file that contains the pricing information for Elements Residencia.  
[0m[33;1m[1;3m                                             Feature Urban Dwellings  \
0                                         Total area     625,000 SFT   
1                                    Commercial Land        22 Kanal   
2                            Reserved for Commercial      35,000 SFT   
3                   Furnished Residential Apartments            230+   
4                                    Building height          200 ft   
5      Commercial Shop Lower Ground Down Payment 25%   N

{'input': ' if i save 100k PKR. every month in Elements Resedencia so after 2 years how much area i will have bought.you can check the excel tool for data',
 'chat_history': [HumanMessage(content='Did you find the previous answer in the Pinecone Document Store?'),
  AIMessage(content='I cannot determine if the previous answer was found in the Pinecone Document Store without additional context or details.'),
  HumanMessage(content='Can you look for elements resedencia within the excel data reader?'),
  AIMessage(content='The "Elements Residencia" data includes various metrics such as total area (215,000 sq. ft), land area (5.5 Kanal), reserved area (30,000 SFT), number of apartments (42 units of 1, 2 & 3 Bed), and various pricing details for different types of apartments and commercial shops.'),
  HumanMessage(content=' I want to buy 2 Bed apartment in Urban Dwelling please give me some payments plans options.'),
  AIMessage(content='The payment plans for a 2-bedroom apartment in Urban 

In [223]:
agent_executor.invoke({"input":"how much i can save if i buy 1 bed apartment in Elements Resedencia instead of monthly rent and show the calculations as well like how much does elements resdencia 1 bed apartment monthly and unit price and how are you calculating.please rull all tools"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer the question, I need to gather information about the monthly rent for a 1-bedroom apartment in Elements Residencia, the unit price of the apartment, and then calculate the potential savings from buying instead of renting. I'll start by checking the Pinecone Document Store for relevant data.

Action: Pinecone Document Store  
Action Input: "1 bedroom apartment Elements Residencia monthly rent and unit price"
[0m[36;1m[1;3mI don't know.[0m[32;1m[1;3mI need to gather the necessary information regarding the monthly rent and unit price for a 1-bedroom apartment in Elements Residencia. Since I didn't find the information in the Pinecone Document Store, I will try to read from an Excel file that might contain this data.

Action: Excel Data Reader  
Action Input: "Elements Residencia 1 bedroom apartment data.xlsx"  # Assuming this is the name of the file containing the relevant data.
[0m[33;1m[1;3m                 

{'input': 'how much i can save if i buy 1 bed apartment in Elements Resedencia instead of monthly rent and show the calculations as well like how much does elements resdencia 1 bed apartment monthly and unit price and how are you calculating.please rull all tools',
 'chat_history': [HumanMessage(content='how much i can save if i buy 1 bed apartment in Elements Resedencia instead of monthly rent.'),
  AIMessage(content='I currently do not have the necessary information to determine how much you can save by buying a 1-bedroom apartment in Elements Residencia instead of paying monthly rent.'),
  HumanMessage(content='how much i can save if i buy 1 bed apartment in Elements Resedencia instead of monthly rent.please rull all tools'),
  AIMessage(content='You can save PKR 12,976,000 by buying a 1-bedroom apartment in Elements Residencia instead of paying monthly rent for a year.'),
  HumanMessage(content='how much i can save if i buy 1 bed apartment in Elements Resedencia instead of monthly 

In [72]:
agent_executor.invoke({"input":"What is the main topic of Dan Gilbert TEDx talks?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should look up information about Dan Gilbert's TEDx talks in the Pinecone Document Store first.
Action: Pinecone Document Store
Action Input: Dan Gilbert TEDx talks[0m[36;1m[1;3mI'm sorry, but based on the provided context, there is no mention of Dan Gilbert giving a TEDx talk.[0m[32;1m[1;3mAction: Tavily
Action Input: Dan Gilbert TEDx talks[0m[33;1m[1;3m[{'url': 'https://www.ted.com/speakers/dan_gilbert', 'content': 'Harvard psychologist Dan Gilbert says our beliefs about what will make us happy are often wrong — a premise he supports with intriguing research, and explains\xa0...'}, {'url': 'https://blog.ted.com/ten-years-later-dan-gilbert-on-life-after-the-surprising-science-of-happiness/', 'content': 'Dan Gilbert gave his first TED Talk in February 2004; The surprising science of happiness was one of the first we ever published,\xa0...'}][0m[32;1m[1;3mI now know the final answer
Final Answer: The main topic o

{'input': 'What is the main topic of Dan Gilbert TEDx talks?',
 'chat_history': [HumanMessage(content='Can you give me one title of a TED talk of Al Gore as main speaker?.                                 Please look in the pinecone document store metadata as it has the title                                 based on the transcripts'),
  AIMessage(content='The title of a TED talk by Al Gore as the main speaker is "The case for optimism on climate change". Al Gore is a former Vice President of the United States known for his work on environmental issues, particularly climate change.'),
  HumanMessage(content="Did you find the previous title 'The Case for Optimism on Climate Change' in the Pinecone Document Store?"),
  AIMessage(content="Yes, the previous title 'The Case for Optimism on Climate Change' was found in both the Pinecone Document Store and Tavily."),
  HumanMessage(content='Can you look for a title within the Pinecone Document Store?'),
  AIMessage(content='I need more specific

In [224]:
conversational_memory.load_memory_variables({})

{'chat_history': [HumanMessage(content='how much i can save if i buy 1 bed apartment in Elements Resedencia instead of monthly rent.'),
  AIMessage(content='I currently do not have the necessary information to determine how much you can save by buying a 1-bedroom apartment in Elements Residencia instead of paying monthly rent.'),
  HumanMessage(content='how much i can save if i buy 1 bed apartment in Elements Resedencia instead of monthly rent.please rull all tools'),
  AIMessage(content='You can save PKR 12,976,000 by buying a 1-bedroom apartment in Elements Residencia instead of paying monthly rent for a year.'),
  HumanMessage(content='how much i can save if i buy 1 bed apartment in Elements Resedencia instead of monthly rent and show the calculations as well.please rull all tools'),
  AIMessage(content='If you buy a 1-bedroom apartment in Elements Residencia instead of renting, you would save PKR 40,000 over a 10-year period, considering the total cost of the apartment and the tota

In [225]:
agent_executor.memory.clear()