In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma
import chromadb
from pathlib import Path
import os
import nltk



In [2]:
# Load the knowledge-base CSV into LangChain documents (CSVLoader emits one Document per row).
file_path = "Cleaned_LLM-KB.csv"
# The data contains non-ASCII punctuation (e.g. curly apostrophes), so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
loader = CSVLoader(file_path=file_path, encoding="utf-8")
data = loader.load()


In [3]:
# Split every loaded document into overlapping chunks before embedding.
# RecursiveCharacterTextSplitter is already imported in the notebook's import cell,
# so it is not re-imported here.
CHUNK_SIZE = 3000    # upper bound on chunk length (characters with the default length function)
CHUNK_OVERLAP = 64   # amount of text shared between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunked_docs = text_splitter.split_documents(data)

In [4]:
# The previous cell already builds `text_splitter` (chunk_size=3000, chunk_overlap=64)
# and `chunked_docs`; repeating the split here only redid the same work.
# Check the result instead so an empty split fails here rather than during indexing.
assert chunked_docs, "Splitting produced no chunks - check that the CSV loaded any rows"

In [5]:
# Count how many distinct chunk texts the split produced (duplicates collapse in the set)
s = {chunk.page_content for chunk in chunked_docs}
print("Number of unique split documents:", len(s))

Number of unique split documents: 1877


In [6]:
# Resolve the Chroma persistence directory relative to the current working directory:
# <cwd>/Chroma1/env_policy
ABS_PATH = Path().resolve() / 'Chroma1'
DB_DIR = str(ABS_PATH / "env_policy")
print('DB_DIR : ', DB_DIR)

DB_DIR :  /Users/rahul/Projects/AI-Project/ClimateActionPolicy-RAG/Chroma1/env_policy


In [7]:
# Chroma client configuration: persist the store under DB_DIR and disable telemetry
client_settings = chromadb.config.Settings(
    anonymized_telemetry=False,
    persist_directory=DB_DIR,
    is_persistent=True,
)

In [8]:
# Embedding model used for both indexing and querying: the BGE small English checkpoint
embedder = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en",
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Compute an embedding vector for the text of every chunk.
# NOTE(review): the Chroma.from_documents call in the next cell is given `chunked_docs`
# and `embedder`, not these vectors - confirm `embedded_docs` is needed elsewhere.
embedded_docs = embedder.embed_documents(
    [chunk.page_content for chunk in chunked_docs]
)


In [10]:
# Build the persistent Chroma collection from the chunked documents.
# Chroma is handed the chunks plus `embedder` and computes the embeddings itself.
# NOTE(review): re-running this cell against an existing persist directory presumably
# adds the same chunks again - confirm, and clear the collection first if so.
bge_vectorstore = Chroma.from_documents(
    documents=chunked_docs,
    embedding=embedder,
    client_settings=client_settings,
    collection_name="env_policy_bge",
    # Chroma selects the distance metric through the "hnsw:space" metadata key.
    # The previous {"hnsw": "cosine"} was stored as plain metadata and left the
    # collection on the default L2 distance.
    # NOTE(review): the metric is fixed when a collection is created, so a collection
    # already persisted with the old metadata likely has to be deleted and rebuilt.
    collection_metadata={"hnsw:space": "cosine"},
)


In [11]:
# Expose the vector store as a retriever that uses maximal marginal relevance (MMR)
# and returns the top 5 chunks per query.
# NOTE(review): "include_metadata" does not look like a documented Chroma MMR search
# option - confirm it has any effect.
retriever = bge_vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "include_metadata": True},
)
                                  

In [12]:
# Smoke-test the retriever with a sample question and display the returned documents.
# (`retreived_docs` keeps its original spelling so any later references still resolve.)
query = "Explain climate policy"
retreived_docs = retriever.invoke(input=query)
retreived_docs

[Document(page_content='Timestamp: 14/06/2024 18:15:04\nTitle: Climate budgeting: What it is, what it isn’t, and how it works\nContent: Climate budgeting puts climate at the heart of decision-making, alongside other priority areas, helping to drive the short-term action that is so desperately needed. This article defines exactly what climate budgeting is, what it isn’t, and introduces how it works. For more on the benefits of adopting one, read Climate budgets: Why your city needs one.\nClimate budgeting is a governance system for delivering climate targets\nA climate budget is a governance system that mainstreams climate commitments and considerations into decision-making on policies, actions and budget. This is done by integrating climate targets from the city’s Climate Action Plan (CAP) into the financial budgeting process and assigning responsibility for implementation, monitoring, evaluation and reporting across the city government. It creates transparency and accountability, and 