In [2]:
# Prints a fixed greeting; presumably a quick check that the kernel executes cells.
print("hello world")

hello world


In [3]:
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import os

In [4]:
from langchain.document_loaders import CSVLoader

In [5]:
# Load the documents.
# The CSV location can be overridden with the REAL_ESTATE_CSV environment variable so the
# notebook is not tied to a single machine; the original absolute path stays the default,
# which keeps existing runs working unchanged.
csv_path = os.environ.get(
    "REAL_ESTATE_CSV",
    r'C:\Users\Administrator\Downloads\Exported-Data-for-Real-Estate.csv',
)
loader = CSVLoader(file_path=csv_path)

In [6]:
 # Read the CSV through the loader. The result displayed further down is a list of
 # Document objects whose metadata carries the source path and a row index.
 data = loader.load()

In [7]:
# Preview only the first two documents: echoing the whole list writes every CSV row
# into the notebook output, which bloats the file and buries the narrative.
data[:2]

[Document(page_content='Real Estate ID: RE1000\nReal Estate Address: Beirut RE1000\nReal Estate City: Beirut\nReal Estate Country: Lebanon\nReal Estate Owner: MoE\nReal Estate Type: Floor\nReal Estate Ownership Date: 3/26/2011\nReal Estate Ownership Expiry Date: 3/5/2037\nReal Estate Size (SQM): 487\nReal Estate Status: Not Available\nReal Estate Has Parking: No\nReal Estate Maximum Capacity: 63\nReal Estate Furnished: Yes\nReal Estate with Parking: No\nReal Estate Rent Price: 2882\nReal Estate Parking Space: 11\nReal Estate Water Services: Yes\nReal Estate Electricity Services: No\nReal Estate Number of Rooms: 16', metadata={'source': 'C:\\Users\\Administrator\\Downloads\\Exported-Data-for-Real-Estate.csv', 'row': 0}),
 Document(page_content='Real Estate ID: RE1001\nReal Estate Address: Tripoli RE1001\nReal Estate City: Tripoli\nReal Estate Country: Lebanon\nReal Estate Owner: SPG\nReal Estate Type: Floor\nReal Estate Ownership Date: 12/26/2013\nReal Estate Ownership Expiry Date: 11/3

## Split the data into chunks

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [9]:
# Break the loaded documents into chunks (chunk_size=1000, chunk_overlap=20)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
chunks = text_splitter.split_documents(data)

In [10]:
# Number of chunks produced by the splitter.
len(chunks)

100

## Convert all the chunks into embeddings

In [11]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [12]:
import chromadb



In [13]:
# Embed every chunk with the Ollama "nomic-embed-text" model and index the
# result in a Chroma collection named "local-rag".
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="local-rag",
)

## Retrieval

In [14]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [15]:
# Instruction text asking the LLM for five rephrasings of the user's question;
# QUERY_PROMPT is handed to MultiQueryRetriever further down.
MULTI_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}"""

QUERY_PROMPT = PromptTemplate(
    template=MULTI_QUERY_TEMPLATE,
    input_variables=["question"],
)


In [22]:
# LLM from Ollama
# `local_model` is the model name handed to ChatOllama; this presumably requires a
# local Ollama server with that model already pulled — TODO confirm setup.
local_model = "phi"
llm = ChatOllama(model=local_model)

In [23]:
# Wrap the vector store's retriever in a MultiQueryRetriever driven by `llm`
# and the custom QUERY_PROMPT.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [24]:
# RAG prompt: instructs the model to answer strictly from the supplied context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [25]:
# Assemble the RAG pipeline: the incoming question feeds both the retriever
# (as "context") and the prompt (as "question"); the LLM reply is parsed to a string.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()

In [26]:
# Pass the question straight to the chain. The earlier form wrapped it in the built-in
# input(), which only showed the text as a stdin prompt and sent whatever was typed
# (possibly nothing) to the chain instead of the question itself.
chain.invoke("Do you have column name id?")

' The given text is a list of documents related to real estate properties in different locations and with various features. A document is defined as having a unique ID, an address, a city, a country, an owner, a type, a date of ownership, an expiry date, a size, a status, parking availability, water services, electricity services, number of rooms, and rent price for each property.\n'

In [27]:
# Ask the Ollama-backed chain about the data's columns. The answer is built only from
# the retrieved context, so it may not reflect every field present in the CSV.
chain.invoke("how many columns you have?")

' There are 9 columns in the data. They are:\n\n- Real Estate ID\n- Real Estate Address\n- Real Estate City\n- Real Estate Country\n- Real Estate Owner\n- Real Estate Type\n- Real Estate Ownership Date\n- Real Estate Ownership Expiry Date\n- Real Estate Size (SQM)\n\n\nIn the dataset, there are 9 columns in total. Let\'s assign a unique identifier to each column for ease of reference. \n- Real Estate ID\n- Real Estate Address\n- Real Estate City\n- Real Estate Country\n- Real Estate Owner\n- Real Estate Type\n- Real Estate Ownership Date\n- Real Estate Ownership Expiry Date\n- Real Estate Size (SQM)\n\nNow, the question is to find out which column(s) have a null value. A null value in this context means that the information for that specific field is missing or unknown. To determine this, you can use the .isnull() function of pandas:\n```python\nimport pandas as pd\n# Assuming df_dataset is your DataFrame containing the data\nnull_values = df_dataset[df_dataset.isnull().any(axis=0)] \n

## Implementing Groq

In [34]:
from llama_index.llms.groq import Groq

In [42]:


from langchain_groq import ChatGroq

In [43]:
# Swap the LLM for Groq-hosted Mixtral.
# The API key is read from the GROQ_API_KEY environment variable rather than being
# embedded in the notebook. Any key that was previously committed here must be treated
# as compromised and revoked in the Groq console.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it before running this cell."
    )
llm = ChatGroq(groq_api_key=groq_api_key, model_name="mixtral-8x7b-32768")


In [44]:
# Rebuild the multi-query retriever so that question rewriting uses the
# currently bound `llm` (the Groq model at this point in the notebook).
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [45]:
# RAG prompt: instructs the model to answer strictly from the supplied context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [46]:
# Re-assemble the pipeline so it picks up the rebound `retriever` and `llm`:
# question -> retrieved context + question -> prompt -> LLM -> plain string.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()

In [47]:
# Same question as asked earlier, now answered by the chain rebuilt around the Groq model.
chain.invoke("how many columns you have?")

'Based on the provided context, which contains several real estate documents with key-value pairs, I can infer that each document has multiple fields or columns. However, the exact number of columns is not explicitly stated in the context.\n\nThe columns, or fields, that appear in the documents are:\n\n1. Real Estate ID\n2. Real Estate Address\n3. Real Estate City\n4. Real Estate Country\n5. Real Estate Owner\n6. Real Estate Type\n7. Real Estate Ownership Date\n8. Real Estate Ownership Expiry Date\n9. Real Estate Size (SQM)\n10. Real Estate Status\n11. Real Estate Has Parking\n12. Real Estate Maximum Capacity\n13. Real Estate Furnished\n14. Real Estate with Parking\n15. Real Estate Rent Price\n16. Real Estate Parking Space\n17. Real Estate Water Services\n18. Real Estate Electricity Services\n19. Real Estate Number of Rooms\n\nSo, there are 19 columns or fields in total, although not all columns are present in every document.'