In [6]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma
import chromadb
from pathlib import Path
import os
import pandas as pd
from langchain.schema import Document

# Read the CSV data into a pandas DataFrame
df = pd.read_csv("Cleaned_LLM-KB.csv")

In [12]:
# Convert each row into a Document object
documents = df['0'].apply(lambda content: Document(page_content=content)).tolist()


In [13]:
# Print the first 200 characters of each document to verify
for i, doc in enumerate(documents):
    print(f"Document {i+1}: {doc.page_content[:200]}...")

Document 1: Title: How to cut your city’s consumption-based emissions
Content: To date, the focus and necessary foundation of climate plans has been the emissions produced by activities within city boundaries, as...
Document 2: Title: Mapped: Cities with a climate action plan
Content: Cities are playing a lead role in cutting greenhouse gas emissions to meet the Paris Agreement’s science-based, internationally accepted targe...
Document 3: Title: Greenhouse gas emissions interactive dashboard
Content: With this dashboard, you can explore historical greenhouse gas emissions (GHG) data for C40 cities, reported in line with the Global Prot...
Document 4: Title: How to shift your city’s investments from fossil fuels to climate solutions
Content: Cities around the world are building better financial strategies. At the heart of this transition lies a str...
Document 5: Title: 15-minute city initiatives explorer
Content: The 15-minute city urban planning concept strives for an urban model tha

In [14]:
# Print the number of unique documents
unique_docs = set(doc.page_content for doc in documents)
print("Number of unique documents:", len(unique_docs))

Number of unique documents: 788


In [15]:
# Set the absolute path for the Chroma database
ABS_PATH = Path().resolve().joinpath('Chroma')
DB_DIR = os.path.join(ABS_PATH, "env_policy")
print('DB_DIR:', DB_DIR)

DB_DIR: /Users/rahul/Projects/AI-Project/ClimateActionPolicy-RAG/Chroma/env_policy


In [16]:
# Define client settings for Chroma
client_settings = chromadb.config.Settings(
    is_persistent=True,
    persist_directory=DB_DIR,
    anonymized_telemetry=False,
)


In [17]:
# Initialize the HuggingFaceBgeEmbeddings with the correct model
embedder = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en")


  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Embed the documents
embedded_docs = embedder.embed_documents([doc.page_content for doc in documents])


In [20]:
# Create the Chroma vector store from the embedded documents
bge_vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedder,
    client_settings=client_settings,
    collection_name="env_policy_bge",
    collection_metadata={"hnsw": "cosine"}
)

In [21]:
# Initialize retriever
retriever = bge_vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "include_metadata": True}
)


In [22]:
# Test the retrieval with a query
query = "Explain climate policy"
retrieved_docs = retriever.invoke(query)
print(retrieved_docs)

[Document(page_content='Title: Climate budgeting: What it is, what it isn’t, and how it works\nContent: Climate budgeting puts climate at the heart of decision-making, alongside other priority areas, helping to drive the short-term action that is so desperately needed. This article defines exactly what climate budgeting is, what it isn’t, and introduces how it works. For more on the benefits of adopting one, read Climate budgets: Why your city needs one.\nClimate budgeting is a governance system for delivering climate targets\nA climate budget is a governance system that mainstreams climate commitments and considerations into decision-making on policies, actions and budget. This is done by integrating climate targets from the city’s Climate Action Plan (CAP) into the financial budgeting process and assigning responsibility for implementation, monitoring, evaluation and reporting across the city government. It creates transparency and accountability, and highlights deviations from targe