In [20]:
import getpass
import os

# Resolve the OpenAI API key. Prefer the value already present in the
# environment; if it is missing (or empty), prompt for it without echoing so
# the secret never ends up in the notebook source or its saved output.
# Assigning None to os.environ raises TypeError, hence the explicit fallback.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    api_key = getpass.getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = api_key

from langchain_openai import ChatOpenAI

# Chat model used as the generation step of the RAG chains in this notebook.
llm = ChatOpenAI(model="gpt-3.5-turbo")

In [None]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import faiss
import numpy as np

# Source pages that make up the knowledge base for the retriever.
websites = [
    "http://environnement.wallonie.be/de/eso/eau_distribution/",
    "https://environment.ec.europa.eu/topics/water/water-wise-eu/belgium_en",
    "https://environment.ec.europa.eu/topics/water/water-wise-eu/polluted-water_en",
    "https://www.brusselstimes.com/1009591/flemish-drinking-water-highly-polluted-with-pfas-but-purifying-costs-millions",
]

# Download every page, restricting the HTML parse to <p> elements.
paragraphs_only = bs4.SoupStrainer("p")
loader = WebBaseLoader(
    web_paths=websites,
    bs_kwargs={"parse_only": paragraphs_only},
)
docs = loader.load()

# Cut the loaded pages into chunks (chunk_size=1000, chunk_overlap=200) so
# each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed and store the document splits.
# Earlier variant kept for reference (Chroma instead of FAISS):
#vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Fail early with an actionable message: without chunks there is no first
# embedding to read the dimension from, and the index would be useless anyway.
if not splits:
    raise ValueError(
        "No text chunks were produced from `websites`; check that the pages "
        "loaded and contain <p> elements before building the index."
    )

# Embed documents and add to FAISS
embedding_model = OpenAIEmbeddings()
embeddings = embedding_model.embed_documents([split.page_content for split in splits])

# Initialize FAISS index. The vectors are converted to a float32 matrix once
# and that same matrix is used both to size the index and to fill it.
embedding_matrix = np.array(embeddings, dtype='float32')
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)  # Row i of the index corresponds to splits[i]

# Retrieve and generate using the relevant snippets of the blog.
#retriever = vectorstore.as_retriever()

#Create a custom retriever
def retrieve_similar(query, k=5):
    """Return the chunks from ``splits`` closest to ``query`` in the FAISS index.

    Args:
        query: Text to embed with ``embedding_model`` and search for.
        k: Maximum number of chunks to return. Values larger than the number
            of indexed vectors are clamped; values <= 0 yield an empty list.

    Returns:
        A list of at most ``k`` entries of ``splits``, in the order FAISS
        returns them.
    """
    # FAISS pads missing neighbours with id -1 when asked for more results
    # than it holds; indexing splits[-1] would then silently return the last
    # chunk as a bogus match. Clamp k and drop negative ids to avoid that.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    query_embedding = np.array([embedding_model.embed_query(query)], dtype='float32')
    _distances, indices = index.search(query_embedding, k)
    return [splits[i] for i in indices[0] if i >= 0]

# Pull the "rlm/rag-prompt" template via the LangChain hub helper; it is used
# as the prompt stage of the first RAG chain.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(texts)

# First stage: map the incoming question to the prompt's two inputs. The
# context is the formatted text of the retrieved chunks; the question itself
# is passed through unchanged.
retrieval_step = {
    "context": lambda question: format_docs(retrieve_similar(question)),
    "question": RunnablePassthrough(),
}

# Full pipeline: retrieval -> prompt -> chat model -> plain string.
rag_chain = retrieval_step | prompt | llm | StrOutputParser()

rag_chain.invoke("quelle est la qualité de l'eau?")



"La qualité de l'eau potable est réglementée par des directives européennes, notamment la directive 98/83/CE entrée en vigueur en 1998. En Belgique, la qualité de l'eau destinée à la consommation humaine est une compétence régionale. L'eau distribuée par les réseaux doit respecter des normes strictes en termes de qualité physico-chimique, chimique et microbiologique."

In [22]:
# Sanity check: show the first 500 characters of the first loaded page.
first_page_text = docs[0].page_content
print(first_page_text[:500])

 Janvier 2018
                    Cinquième révision
 Eau du robinetL’usage de l’eau à des fins alimentaires ou d’hygiène corporelle nécessite une excellente qualité physico-chimique, chimique et microbiologique.L’eau distribuée par réseaux constitue un des produits alimentaires les plus contrôlés en Région wallonne avec plus de 30.000 contrôles par an. Ces derniers sont réalisés depuis son origine jusqu'au robinet. L’eau de distribution doit répondre aux exigences de qualité imposées par la lé


In [25]:
# Define custom prompts.
# PromptTemplate is imported from langchain_core, like the other core
# primitives this notebook uses (StrOutputParser, RunnablePassthrough),
# rather than through the legacy `langchain.prompts` re-export.
from langchain_core.prompts import PromptTemplate

# Each template takes the same two variables, {context} and {question};
# from_template derives them from the text so the declared variables cannot
# drift out of sync with the placeholders.

# Neutral phrasing: answer from the supplied context.
general_prompt = PromptTemplate.from_template(
    "Using the information provided:\n{context}\nAnswer the question:\n{question}"
)

# Asks for a comprehensive answer.
detailed_analysis_prompt = PromptTemplate.from_template(
    "Based on the following detailed analysis:\n{context}\nProvide a comprehensive answer to the question:\n{question}"
)

# Asks for a brief answer.
summarized_response_prompt = PromptTemplate.from_template(
    "Summarize the following content:\n{context}\nAnswer briefly:\n{question}"
)

# Use a dictionary to manage different prompts
prompt_choices = {
    "general": general_prompt,
    "detailed": detailed_analysis_prompt,
    "summary": summarized_response_prompt,
}


def build_rag_chain(prompt_name="general"):
    """Assemble the RAG chain for one of the prompts in ``prompt_choices``.

    Args:
        prompt_name: Key of ``prompt_choices`` selecting the prompt template.

    Returns:
        A runnable that takes a question string, retrieves and formats the
        matching chunks as context, fills the chosen prompt, calls ``llm`` and
        parses the reply with ``StrOutputParser``.

    Raises:
        KeyError: If ``prompt_name`` is not a key of ``prompt_choices``; the
            message lists the valid names.
    """
    if prompt_name not in prompt_choices:
        raise KeyError(
            f"Unknown prompt {prompt_name!r}; choose one of {sorted(prompt_choices)}"
        )
    return (
        {"context": lambda q: format_docs(retrieve_similar(q)), "question": RunnablePassthrough()}
        | prompt_choices[prompt_name]
        | llm
        | StrOutputParser()
    )


# Example: Select a specific prompt. Change this one name to switch style;
# the template and the chain below are both derived from it.
selected_prompt_name = "general"
selected_prompt = prompt_choices[selected_prompt_name]

# Updated RAG chain
rag_chain = build_rag_chain(selected_prompt_name)

# Example question: run it through the chain and print the generated answer.
question = "how come is the drinking water of the flemish side is polluted? and in what percentage? how is it compared to the one in Wallonia?"
response = rag_chain.invoke(question)
print(response)




The drinking water on the Flemish side is polluted with PFAS, with 16.1% of tap measurements and analyses exceeding the strictest PFAS standards in 2022. This means that one in six samples did not meet the threshold of 4 nanograms per liter of drinking water. On the other hand, the article does not provide specific information on the percentage of contaminated drinking water in Wallonia. However, it does mention that concerns have been raised about the discovery of PFAS 'forever chemicals' in drinking water in northern Flanders and Wallonia, indicating that contamination is also a concern in Wallonia.


In [26]:
# Round-trip the FAISS index through disk and compare vector counts.
index_path = "vector_index.faiss"

# Vector count before saving
print(f"Number of vectors in FAISS index: {index.ntotal}")

# Write the index to disk
faiss.write_index(index, index_path)
print("FAISS index saved.")

# Read it back and report its vector count
reloaded_index = faiss.read_index(index_path)
print("FAISS index reloaded.")
print(f"Number of vectors in reloaded FAISS index: {reloaded_index.ntotal}")

Number of vectors in FAISS index: 70
FAISS index saved.
FAISS index reloaded.
Number of vectors in reloaded FAISS index: 70
