In [9]:
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.vectorstores import FAISS
from langchain.document_loaders import DataFrameLoader
from langchain.chains import RetrievalQA
from transformers import pipeline

# STEP 1: Load NYC PLUTO data
DATA_PATH = "Downloads/nyc_pluto_20v1_csv/pluto_20v1.csv"
df = pd.read_csv(DATA_PATH)
df = df.fillna("")
df['combined'] = df[['address', 'borough', 'landuse', 'yearbuilt', 'zonedist1']].astype(str).agg(" | ".join, axis=1)
df = df.sample(n=500, random_state=42).reset_index(drop=True)


# STEP 2: Embed and store
loader = DataFrameLoader(df[['combined']], page_content_column="combined")
docs = loader.load()
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embeddings)

# STEP — define retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# STEP 3: Local LLM
local_pipe = pipeline("text2text-generation", model="google/flan-t5-base", max_length=512)
llm = HuggingFacePipeline(pipeline=local_pipe)

# STEP 4: Retrieval + QA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

# STEP 5: Query
query = "Which neighborhoods have the oldest residential buildings and what is their zoning?"
result = qa_chain(query)

# Print results
print("Answer:", result['result'])
print("\nTop source snippets:")
for i, doc in enumerate(result['source_documents']):
    print(f"\nSource {i+1}:\n{doc.page_content[:1000]}")


  df = pd.read_csv(DATA_PATH)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
  result = qa_chain(query)


Answer: EDGEGROVE AVENUE

Top source snippets:

Source 1:
312 ELDERT STREET | BK | 2.0 | 1910.0 | R6

Source 2:
 EDGEGROVE AVENUE | SI | 11.0 | 0.0 | R3X

Source 3:
262 LAMOKA AVENUE | SI | 1.0 | 1970.0 | R3-1

Source 4:
23 GRANTWOOD AVENUE | SI | 1.0 | 1930.0 | R3A

Source 5:
658 DELAFIELD AVENUE | SI | 1.0 | 1899.0 | R3A
