In [9]:
from sympy.physics.units import years
! pip install langchain_community pandas huggingface_hub chromadb==0.4.14 langchain-huggingface




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
#load pdf documents from the specified path
from langchain.document_loaders import PyPDFLoader
import os


def load_pdfs_from_path(pdf_dir="data/pdf"):
    pdf_docs = []
    for filename in os.listdir(pdf_dir):
        if filename.endswith(".pdf"):
            file_path = os.path.join(pdf_dir, filename)
            loader = PyPDFLoader(file_path)
            pdf_docs.append(loader.load())
    return pdf_docs


pdf_documents = load_pdfs_from_path()
pdf_documents


[[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-01-04T20:51:43+10:00', 'author': 'Albert Van Dijk', 'moddate': '2025-01-04T20:51:43+10:00', 'source': 'data/pdf\\GlobalWaterMonitor-Report-2024.pdf', 'total_pages': 58, 'page': 0, 'page_label': '1'}, page_content='|  \n                     1'),
  Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-01-04T20:51:43+10:00', 'author': 'Albert Van Dijk', 'moddate': '2025-01-04T20:51:43+10:00', 'source': 'data/pdf\\GlobalWaterMonitor-Report-2024.pdf', 'total_pages': 58, 'page': 1, 'page_label': '2'}, page_content='2                 \n   |  \n  \n \n \n \n \n \nDisclaimer \nThe material in this report is of a general nature and should not be regarded as legal advice or relied on for assistance in any \nparticular circumstance or emergency situation. In any important matte

In [2]:
#split the pdf files into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter


def split_pdfs_into_chunks(documents, chunk_size=1000, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = []
    for doc in documents:
        chunks.extend(text_splitter.split_documents(doc))
    return chunks


pdf_chunks = split_pdfs_into_chunks(pdf_documents)
pdf_chunks


[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-01-04T20:51:43+10:00', 'author': 'Albert Van Dijk', 'moddate': '2025-01-04T20:51:43+10:00', 'source': 'data/pdf\\GlobalWaterMonitor-Report-2024.pdf', 'total_pages': 58, 'page': 0, 'page_label': '1'}, page_content='|  \n                     1'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-01-04T20:51:43+10:00', 'author': 'Albert Van Dijk', 'moddate': '2025-01-04T20:51:43+10:00', 'source': 'data/pdf\\GlobalWaterMonitor-Report-2024.pdf', 'total_pages': 58, 'page': 1, 'page_label': '2'}, page_content='2                 \n   |  \n  \n \n \n \n \n \nDisclaimer \nThe material in this report is of a general nature and should not be regarded as legal advice or relied on for assistance in any \nparticular circumstance or emergency situation. In any important matter,

In [3]:
import pandas as pd
from langchain.schema import Document

# Load the CSV file
csv_file_path = "data/csv/water_usage_data.csv"
water_data = pd.read_csv(csv_file_path)


def load_water_usage_data():
    # Process the data and fill in placeholders
    documents = []
    for index, row in water_data.iterrows():
        district = row['District']
        crop = row['Crop']
        irrigation_method = row['Irrigation Method']
        water_consumption = row['Water Consumption (liters/hectare)']
        water_availability = row['Water Availability (liters/hectare)']
        content = f"In the district {district}, the crop {crop} is cultivated using the {irrigation_method} method. The water consumption for this irrigation method is approximately {water_consumption}, while the water availability in the region is around {water_availability}. This indicates a relatively high water requirement for guava cultivation,which is supported by adequate water availability in the region through the canal irrigation system."
        documents.append(Document(page_content=content, metadata={"index": index}))
    return documents


water_usage_csv = load_water_usage_data()
water_usage_csv


[Document(metadata={'index': 0}, page_content='In the district Udaipur, the crop Garlic is cultivated using the Tube Well method. The water consumption for this irrigation method is approximately 11609.330885810616, while the water availability in the region is around 11335.919980150127. This indicates a relatively high water requirement for guava cultivation,which is supported by adequate water availability in the region through the canal irrigation system.'),
 Document(metadata={'index': 1}, page_content='In the district Bhilwara, the crop Gram is cultivated using the Drip Irrigation method. The water consumption for this irrigation method is approximately 12648.406038302792, while the water availability in the region is around 12091.900253943171. This indicates a relatively high water requirement for guava cultivation,which is supported by adequate water availability in the region through the canal irrigation system.'),
 Document(metadata={'index': 2}, page_content='In the district 

In [4]:
import pandas as pd
from langchain.schema import Document

# Load the CSV file
csv_file_path = "data/csv/scarcity.csv"
water_data = pd.read_csv(csv_file_path, delimiter=';')


def load_water_usage_data():
    # Process the data and fill in placeholders
    documents = []
    for index, row in water_data.iterrows():
        # Extract all the columns from the current row
        basin_id = row['basin_id']
        basin_name = row['basin_name']
        population = row['population']
        jan = row['jan']
        feb = row['feb']
        mar = row['mar']
        apr = row['apr']
        may = row['may']
        jun = row['jun']
        jul = row['jul']
        aug = row['aug']
        sep = row['sep']
        oct_ = row['oct']  # `oct` is Python reserved keyword, so use `oct_` to avoid conflicts
        nov = row['nov']
        dec = row['dec']
        average = row['average']
        low = row['low']
        moderate = row['moderate']
        significant = row['significant']
        severe = row['severe']
        content = f"The basin {basin_name} (ID: {basin_id}) supports a population of {population}. The water scarcity levels recorded for each month are: January ({jan}), February ({feb}), March ({mar}), April ({apr}), May ({may}), June ({jun}), July ({jul}), August ({aug}), September ({sep}), October ({oct}), November ({nov}), and December ({dec}). The average water scarcity level for this basin is {average}, with {low} months of low scarcity, {moderate} months of moderate scarcity, {significant} months of significant scarcity, and {severe} months of severe scarcity. This data provides a comprehensive understanding of water scarcity throughout the year for the region. "
        documents.append(Document(page_content=content, metadata={"index": index, "basin_name": basin_name}))
    return documents


water_scarcity_csv = load_water_usage_data()
water_scarcity_csv


[Document(metadata={'index': 0, 'basin_name': 'Khatanga'}, page_content='The basin Khatanga (ID: 1) supports a population of 4633. The water scarcity levels recorded for each month are: January (0,0028), February (0,0748), March (0,1238), April (0,2050), May (0,0198), June (0,0002), July (0,0003), August (0,0006), September (0,0010), October (<built-in function oct>), November (0,0030), and December (0,0050). The average water scarcity level for this basin is 0,0365, with 12 months of low scarcity, 0 months of moderate scarcity, 0 months of significant scarcity, and 0 months of severe scarcity. This data provides a comprehensive understanding of water scarcity throughout the year for the region. '),
 Document(metadata={'index': 1, 'basin_name': 'Olenek'}, page_content='The basin Olenek (ID: 2) supports a population of 5960. The water scarcity levels recorded for each month are: January (0,0046), February (0,0353), March (0,0585), April (0,0968), May (0,0086), June (0,0003), July (0,001

In [5]:
print(water_data.columns)


Index(['basin_id', 'basin_name', 'population', 'jan', 'feb', 'mar', 'apr',
       'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'average',
       'low', 'moderate', 'significant', 'severe'],
      dtype='object')


In [6]:
import pandas as pd
from langchain.schema import Document

# Load the CSV file
csv_file_path = "data/csv/global_water_consumption.csv"
water_data = pd.read_csv(csv_file_path)


def load_water_usage_data():
    # Process the data and fill in placeholders
    documents = []
    for index, row in water_data.iterrows():
        # Extract all the columns from the current row
        country = row['Country']
        year = row['Year']
        total_water_consumption = row['Total Water Consumption (Billion Cubic Meters)']
        per_capital_use = row['Per Capita Water Use (Liters per Day)']
        agricultural_use = row['Agricultural Water Use (%)']
        industrial_use = row['Industrial Water Use (%)']
        household_use = row['Household Water Use (%)']
        rainfall = row['Rainfall Impact (Annual Precipitation in mm)']
        groundwater_depletion = row['Groundwater Depletion Rate (%)']
        scarcity_level = row['Water Scarcity Level']
        content = f"The country {country} reported water consumption statistics for the year {year}. The total water consumption was {total_water_consumption} billion cubic meters, with a per capita water consumption of {per_capital_use} liters per day. Water distribution included {agricultural_use}% for agricultural use, {industrial_use}% for industrial use, and {household_use}% for household use. The region experienced an annual rainfall impact of {rainfall} mm, while the groundwater depletion rate was recorded at {groundwater_depletion}%. Based on these figures, the water scarcity level in {year} was categorized as {scarcity_level}. "
        documents.append(Document(page_content=content, metadata={"index": index, "country": country, "year": year}))
    return documents


global_water_consumption_csv = load_water_usage_data()
global_water_consumption_csv


[Document(metadata={'index': 0, 'country': 'Argentina', 'year': 2000}, page_content='The country Argentina reported water consumption statistics for the year 2000. The total water consumption was 481.49 billion cubic meters, with a per capita water consumption of 235.4314286 liters per day. Water distribution included 48.55% for agricultural use, 20.84428571% for industrial use, and 30.1% for household use. The region experienced an annual rainfall impact of 1288.698571 mm, while the groundwater depletion rate was recorded at 3.255714286%. Based on these figures, the water scarcity level in 2000 was categorized as Moderate. '),
 Document(metadata={'index': 1, 'country': 'Argentina', 'year': 2001}, page_content='The country Argentina reported water consumption statistics for the year 2001. The total water consumption was 455.063 billion cubic meters, with a per capita water consumption of 299.551 liters per day. Water distribution included 48.465% for agricultural use, 26.943% for indus

In [7]:
#create vectorstore with chroma to store the documents
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings


def create_vectorstore(docs, persist_dir="./chroma_db"):
    embeddings = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(docs, embeddings, persist_directory=persist_dir)
    return vectordb

pdf_csv_docs = water_usage_csv + water_scarcity_csv + global_water_consumption_csv + pdf_chunks
db = create_vectorstore(pdf_csv_docs)



  embeddings = OpenAIEmbeddings()


In [8]:
result = db.similarity_search(
    'availability of water in jaipur',
    k=6  #6 nearest neighbors
)
print(f"for query {len(result)} results found")
print(result[0].page_content)
print(result[0].metadata)

for query 6 results found
In the district Jaipur, the crop Barley is cultivated using the Tube Well method. The water consumption for this irrigation method is approximately 8087.602263601358, while the water availability in the region is around 8487.71266227881. This indicates a relatively high water requirement for guava cultivation,which is supported by adequate water availability in the region through the canal irrigation system.
{'index': 435}


In [41]:
#prompt engineering
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# Define the system message template (sets the AI's behavior and context)
system_message = SystemMessagePromptTemplate.from_template("""
You are a helpful assistant specialized in water usage, irrigation techniques, soil analysis, and water scarcity data.
Your job is to analyze relevant material provided in the context and answer user queries accurately.
Always use the context provided to create factual responses.
""")

# Define the human message template (includes user query and context dynamically)
human_message = HumanMessagePromptTemplate.from_template("""
Context:
{context}

Question:
{question}

Provide a detailed and accurate response based on the context above. If the context does not contain enough information, say so and avoid guessing.
""")

# Combine the templates into a ChatPromptTemplate
chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])
# Define inputs
retrieved_context = [doc.page_content for doc in db.similarity_search('availability of water in jaipur', k=6)]
user_query = "what are the major water-related disasters ?"

# Render the chat prompt with actual inputs
formatted_prompt = chat_prompt.format_prompt(
    context=" ".join(retrieved_context),
    question=user_query
)



In [51]:
from langchain.prompts import PromptTemplate, MessagesPlaceholder
from langchain.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate

qa_system_prompt_template = """
You are a chatbot specialized in water scarcity and related challenges around the globe. Your job is to assist researchers, NGOs, and policymakers in exploring data, solutions, and impacts regarding water scarcity. Use the provided information to answer questions accurately and reliably. If you don't know the answer based on the context, say you don't know. Follow these rules:

1. If the question is to request links, please only return the source links with no answer.
2. If you don't know the answer, don't fabricate a response. Just say **I can't find the final answer but you may want to check the following links** and add the source links as a list.
3. If you find the answer, write the answer in a concise way and add the list of sources that are **directly** used to derive the answer. Exclude the sources that are irrelevant to the final answer.

{context}"""

qa_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["context"], template=qa_system_prompt_template
    )
)

qa_human_prompt_template = """Question: {input}
Answer:"""

qa_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(input_variables=["question"], template=qa_human_prompt_template)
)

qa_prompt = ChatPromptTemplate.from_messages(
    [
        qa_system_prompt,
        MessagesPlaceholder("chat_history"),
        qa_human_prompt,
    ]
)


In [52]:
#dynamic prompt setup
from openai import ChatCompletion
import json

# formatted_prompt = [
#      {"role": "system", "content": "You are a helpful assistant specialized in water usage, irrigation techniques, soil analysis, and water scarcity data. Your job is to analyze relevant material provided in the context and answer user queries accurately. Always use the context provided to create factual responses."},
#      {"role": "user", "content": "What is the capital of France?"}
#  ]
prompt = formatted_prompt
model_gen = ChatCompletion.create


In [53]:
print(type(formatted_prompt))
print(formatted_prompt)

<class 'langchain_core.prompt_values.ChatPromptValue'>
messages=[SystemMessage(content='\nYou are a helpful assistant specialized in water usage, irrigation techniques, soil analysis, and water scarcity data.\nYour job is to analyze relevant material provided in the context and answer user queries accurately.\nAlways use the context provided to create factual responses.\n', additional_kwargs={}, response_metadata={}), HumanMessage(content='\nContext:\nIn the district Jaipur, the crop Barley is cultivated using the Tube Well method. The water consumption for this irrigation method is approximately 8087.602263601358, while the water availability in the region is around 8487.71266227881. This indicates a relatively high water requirement for guava cultivation,which is supported by adequate water availability in the region through the canal irrigation system. In the district Jaipur, the crop Barley is cultivated using the Tube Well method. The water consumption for this irrigation method i

In [58]:
#convert prompts to message format
def to_messages(prompt_list):
    return [{"role": "user", "content": message} for message in prompt_list]

# Pass the rendered prompt to your LLM
response = model_gen(
    model="gpt-4",
    messages= formatted_prompt
    # messages= formatted_prompt
)
print(response.choices[0].message["content"])


TypeError: Object of type ChatPromptValue is not JSON serializable