## Indexing

In [1]:
import os
import time

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from langchain_core.documents import Document
from langchain_elasticsearch import ElasticsearchStore
from langchain_groq import ChatGroq
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cd /home/rakesh/Downloads/Evertz/WeatherRAG

/home/rakesh/Downloads/Evertz/WeatherRAG


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
# Load the per-city weather table from the workbook (path is relative to the
# current working directory) and show it.
data = pd.read_excel(io='data/data.xlsx')
data

Unnamed: 0,city,temperature,weather,climate
0,New York,22,sunny,temperate
1,Los Angeles,25,partly cloudy,mediterranean
2,Chicago,18,cloudy,continental
3,Houston,30,thunderstorms,humid subtropical
4,Phoenix,35,sunny,desert
5,Toronto,20,partly cloudy,continental
6,Warsaw,16,overcast,temperate
7,Bangalore,28,rainy,tropical savanna
8,London,18,drizzle,temperate maritime
9,Paris,21,partly sunny,temperate


In [4]:
# Build the free-text description that becomes each document's page_content.
# Vectorised string concatenation replaces the row-wise apply(axis=1), and
# .assign() returns a new frame instead of mutating the displayed one in place.
# astype(str) keeps the formatting working if a column is not already text.
data = data.assign(
    weather_climate_desc=(
        "Weather is " + data['weather'].astype(str)
        + " and Climate is " + data['climate'].astype(str)
    )
)
data

Unnamed: 0,city,temperature,weather,climate,weather_climate_desc
0,New York,22,sunny,temperate,Weather is sunny and Climate is temperate
1,Los Angeles,25,partly cloudy,mediterranean,Weather is partly cloudy and Climate is medite...
2,Chicago,18,cloudy,continental,Weather is cloudy and Climate is continental
3,Houston,30,thunderstorms,humid subtropical,Weather is thunderstorms and Climate is humid ...
4,Phoenix,35,sunny,desert,Weather is sunny and Climate is desert
5,Toronto,20,partly cloudy,continental,Weather is partly cloudy and Climate is contin...
6,Warsaw,16,overcast,temperate,Weather is overcast and Climate is temperate
7,Bangalore,28,rainy,tropical savanna,Weather is rainy and Climate is tropical savanna
8,London,18,drizzle,temperate maritime,Weather is drizzle and Climate is temperate ma...
9,Paris,21,partly sunny,temperate,Weather is partly sunny and Climate is temperate


In [5]:
# One Document per table row: the description is the searchable text and the
# four source columns are carried along as metadata.
docs = [
    Document(
        page_content=row['weather_climate_desc'],
        metadata={key: row[key] for key in ('city', 'temperature', 'weather', 'climate')},
    )
    for _, row in data.iterrows()
]

In [6]:
# Display the Document objects built in the previous cell.
docs

[Document(metadata={'city': 'New York', 'temperature': 22, 'weather': 'sunny', 'climate': 'temperate'}, page_content='Weather is sunny and Climate is temperate'),
 Document(metadata={'city': 'Los Angeles', 'temperature': 25, 'weather': 'partly cloudy', 'climate': 'mediterranean'}, page_content='Weather is partly cloudy and Climate is mediterranean'),
 Document(metadata={'city': 'Chicago', 'temperature': 18, 'weather': 'cloudy', 'climate': 'continental'}, page_content='Weather is cloudy and Climate is continental'),
 Document(metadata={'city': 'Houston', 'temperature': 30, 'weather': 'thunderstorms', 'climate': 'humid subtropical'}, page_content='Weather is thunderstorms and Climate is humid subtropical'),
 Document(metadata={'city': 'Phoenix', 'temperature': 35, 'weather': 'sunny', 'climate': 'desert'}, page_content='Weather is sunny and Climate is desert'),
 Document(metadata={'city': 'Toronto', 'temperature': 20, 'weather': 'partly cloudy', 'climate': 'continental'}, page_content='We

In [7]:
class CustomEmbeddingModel:
    """Sentence embeddings from a Hugging Face encoder using mean pooling.

    Provides ``embed_query`` and ``embed_documents``, which is the duck-typed
    interface this notebook hands to the vector store (presumably LangChain's
    embeddings protocol -- confirm against the library).
    """

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        """Load the tokenizer and encoder identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def _pool(self, text):
        """Encode ``text`` (a string or a list of strings) and mean-pool tokens.

        Returns a 2-D tensor with one row per input text. When the tokenizer
        supplies an attention mask, padding positions are excluded from the
        average so padding added for longer neighbours in the batch is not
        averaged into a shorter text's vector.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        hidden = outputs.last_hidden_state
        mask = inputs.get('attention_mask')
        if mask is None:
            # No mask available: fall back to a plain mean over the token axis.
            return hidden.mean(dim=1)
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-padding row.
        token_counts = weights.sum(dim=1).clamp(min=1)
        return (hidden * weights).sum(dim=1) / token_counts

    def embed_text(self, text):
        """Return the pooled embedding of ``text`` as nested Python lists.

        A single string yields a flat list of floats; a list of several
        strings yields one list per string (singleton dimensions are squeezed).
        """
        return self._pool(text).squeeze().tolist()

    def embed_query(self, query):
        """Embed a single query string; delegates to ``embed_text``."""
        return self.embed_text(query)

    def embed_documents(self, documents, batch_size=32):
        """Embed an iterable of strings, returning one vector per string.

        Texts are encoded ``batch_size`` at a time rather than one forward
        pass per document. An empty input returns an empty list.
        """
        documents = list(documents)
        vectors = []
        for start in range(0, len(documents), batch_size):
            vectors.extend(self._pool(documents[start:start + batch_size]).tolist())
        return vectors
    
# Shared embedding object used for both indexing and querying; the model name
# is spelled out here but equals the class default.
embedding_model = CustomEmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [8]:
%%time

ElasticsearchStore.from_documents(docs, embedding_model,
                                  index_name="weather_rag", es_url="http://localhost:9200")

CPU times: user 738 ms, sys: 28.6 ms, total: 767 ms
Wall time: 297 ms


<langchain_elasticsearch.vectorstores.ElasticsearchStore at 0x7c25c142c3b0>

## Retrieval

In [9]:
# Read the Groq API key from the environment so the secret never lives in the
# notebook. Raises KeyError if GROQ_API_KEY is not set.
# NOTE: a key that has been stored in a notebook should be treated as leaked and revoked.
llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0,
               api_key=os.environ["GROQ_API_KEY"])

In [10]:
# Short description of what each document's text contains, given to the
# self-query retriever alongside the metadata schema below.
doc_content_info = "Weather and climate of a city"

# Schema of the filterable metadata fields: (name, description, type).
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in (
        ("city", "The name of the city", "string"),
        ("temperature", "The temperature of the city", "integer"),
        ("weather", "The weather of the city", "string"),
        ("climate", "The climate of the city", "string"),
    )
]

In [11]:
# Open a handle on the existing "weather_rag" index for querying. The store is
# constructed directly instead of calling from_documents with an empty list,
# so nothing is embedded or written here.
vectorstore = ElasticsearchStore(index_name="weather_rag", embedding=embedding_model,
                                 es_url="http://localhost:9200")

In [12]:
# Self-query retriever: the LLM is given the document description and the
# metadata schema defined earlier in the notebook.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=doc_content_info,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [13]:
%%time
response = retriever.invoke("What's the temperature in London?")
response

CPU times: user 98.3 ms, sys: 2.43 ms, total: 101 ms
Wall time: 1.05 s


[Document(metadata={'city': 'London', 'temperature': 18, 'weather': 'drizzle', 'climate': 'temperate maritime'}, page_content='Weather is drizzle and Climate is temperate maritime')]

In [14]:
%%time

for city in data['city']:
    print()
    print(city)
    t1 = time.time()
    q = f"What's the temperature in {city}?"
    response = retriever.invoke(q)
    t2 = time.time()
    print(response)
    print('Time taken:', round(t2-t1, 2), 'sec')


New York
[Document(metadata={'city': 'New York', 'temperature': 22, 'weather': 'sunny', 'climate': 'temperate'}, page_content='Weather is sunny and Climate is temperate')]
Time taken: 0.98 sec

Los Angeles
[Document(metadata={'city': 'Los Angeles', 'temperature': 25, 'weather': 'partly cloudy', 'climate': 'mediterranean'}, page_content='Weather is partly cloudy and Climate is mediterranean')]
Time taken: 0.71 sec

Chicago
[Document(metadata={'city': 'Chicago', 'temperature': 18, 'weather': 'cloudy', 'climate': 'continental'}, page_content='Weather is cloudy and Climate is continental')]
Time taken: 0.83 sec

Houston
[Document(metadata={'city': 'Houston', 'temperature': 30, 'weather': 'thunderstorms', 'climate': 'humid subtropical'}, page_content='Weather is thunderstorms and Climate is humid subtropical')]
Time taken: 1.02 sec

Phoenix
[Document(metadata={'city': 'Phoenix', 'temperature': 35, 'weather': 'sunny', 'climate': 'desert'}, page_content='Weather is sunny and Climate is deser

In [15]:
%%time
response = retriever.invoke("What's the temperature in Johannesburg?")
response

CPU times: user 107 ms, sys: 598 μs, total: 107 ms
Wall time: 1.68 s


[Document(metadata={'city': 'Johannesburg', 'temperature': 20, 'weather': 'partly sunny', 'climate': 'subtropical highland'}, page_content='Weather is partly sunny and Climate is subtropical highland')]

In [16]:
%%time
response = retriever.invoke("What's the temperature in Fort Worth?")
response

CPU times: user 61 ms, sys: 2.25 ms, total: 63.3 ms
Wall time: 7.39 s


[]

## Generation

In [25]:
# Prompt for the generation step: the model must answer from the retrieved
# context only. (An earlier draft template that was assigned and then
# immediately overwritten has been removed as dead code.)
template = """Answer the question based only on the following context. Don't try to make up an answer.
{context}

Question: {question}
"""

# from_template builds a single human-message prompt from the string.
prompt = ChatPromptTemplate.from_template(template)
print(prompt.messages[0])

prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Answer the question based only on the following context. Don't try to make up an answer.\n{context}\n\nQuestion: {question}\n") additional_kwargs={}


In [26]:
def format_docs(docs):
    """Return the ``page_content`` of every document, separated by blank lines.

    NOTE(review): a later cell defines ``format_docs`` again; once that cell
    runs, this version is replaced.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [34]:
def format_docs(docs):
    """Render documents as numbered context blocks with content and metadata.

    Each block reads ``Context <n>:-``, then ``Content: <page_content>``, then
    ``Metadata: `` followed by ``key: value`` pairs joined by newlines (so the
    first pair sits on the ``Metadata:`` line). Blocks are separated by a
    blank line; an empty input gives an empty string.
    """
    def render(position, doc):
        meta_lines = "\n".join(f"{key}: {value}" for key, value in doc.metadata.items())
        return f"Context {position}:-\nContent: {doc.page_content}\nMetadata: {meta_lines}"

    return "\n\n".join(render(position, doc) for position, doc in enumerate(docs, start=1))

In [35]:
# Question -> {retrieved and formatted context, original question} -> prompt -> LLM -> plain string.
rag_chain = (
    RunnableParallel(
        context=retriever | format_docs,
        question=RunnablePassthrough(),
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [36]:
%%time
result = rag_chain.invoke("What's the temperature in London?")
result

CPU times: user 64.7 ms, sys: 1.57 ms, total: 66.2 ms
Wall time: 4.48 s


'The temperature in London is 18.'

In [37]:
def _context_as_text(inputs):
    """Format the retrieved documents stored under the "context" key."""
    return format_docs(inputs["context"])


# Answering sub-chain: expects a dict that already contains the retrieved
# documents under "context" and the user's question under "question".
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=_context_as_text)
    | prompt
    | llm
    | StrOutputParser()
)

# Outer chain: keeps the retrieved documents and the question in the result and
# adds the generated text under "answer".
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [38]:
%%time

result = rag_chain_with_source.invoke("What's the temperature in London?")
print(format_docs(result['context']))
print('-' * 100)
llm_prompt = prompt.invoke({'context': format_docs(result['context']), 'question': result['question']})
print(llm_prompt.messages[0].content)
print('-' * 100)
print(result['answer'])
result

Context 1:-
Content: Weather is drizzle and Climate is temperate maritime
Metadata: city: London
temperature: 18
weather: drizzle
climate: temperate maritime
----------------------------------------------------------------------------------------------------
Answer the question based only on the following context. Don't try to make up an answer.
Context 1:-
Content: Weather is drizzle and Climate is temperate maritime
Metadata: city: London
temperature: 18
weather: drizzle
climate: temperate maritime

Question: What's the temperature in London?

----------------------------------------------------------------------------------------------------
The temperature in London is 18.
CPU times: user 68 ms, sys: 1.53 ms, total: 69.5 ms
Wall time: 1.63 s


{'context': [Document(metadata={'city': 'London', 'temperature': 18, 'weather': 'drizzle', 'climate': 'temperate maritime'}, page_content='Weather is drizzle and Climate is temperate maritime')],
 'question': "What's the temperature in London?",
 'answer': 'The temperature in London is 18.'}

In [39]:
%%time

result = rag_chain_with_source.invoke("What's the temperature in Fort Worth?")
print(format_docs(result['context']))
print('-' * 100)
llm_prompt = prompt.invoke({'context': format_docs(result['context']), 'question': result['question']})
print(llm_prompt.messages[0].content)
print('-' * 100)
print(result['answer'])
result


----------------------------------------------------------------------------------------------------
Answer the question based only on the following context. Don't try to make up an answer.


Question: What's the temperature in Fort Worth?

----------------------------------------------------------------------------------------------------
There's no information provided about the temperature in Fort Worth.
CPU times: user 59.1 ms, sys: 2.36 ms, total: 61.5 ms
Wall time: 1.67 s


{'context': [],
 'question': "What's the temperature in Fort Worth?",
 'answer': "There's no information provided about the temperature in Fort Worth."}