In [1]:
import getpass
import os
import bs4 
import faiss
import numpy as np
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

from langchain_openai import ChatOpenAI


# Read the OpenAI key from the environment. os.getenv returns None when the
# variable is unset, and assigning None into os.environ raises TypeError, so
# prompt for the key (without echoing it) in that case.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    api_key = getpass.getpass("Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = api_key

# Chat model shared by the RAG chains defined later in the notebook.
llm = ChatOpenAI(model="gpt-3.5-turbo")

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [None]:
# Load dataset
data1 = pd.read_csv("water_potability.csv")

# Split dataset into features (X) and target (y)
X = data1.drop(columns=["Potability"])
y = data1["Potability"]

# Split dataset into training and testing sets (80% train, 20% test).
# The split happens BEFORE imputation and scaling so that statistics learned by
# those steps come from the training rows only (no leakage from the test set).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values by replacing them with the training-set mean of each column
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Scale features (normalize the data) using training-set statistics
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Verify missing values are handled
assert not np.isnan(X_train).any(), "training features still contain missing values"
assert not np.isnan(X_test).any(), "test features still contain missing values"

print("Data loaded and preprocessed successfully!")


In [None]:
# Fit a seeded 100-tree random forest on the training split
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the fitted model against the held-out test split
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

# Persist the fitted model so later cells can reload it from disk
joblib.dump(model, "water_potability_model.pkl")


In [None]:
# Load the trained model when needed.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files that were produced by this notebook or another trusted source.
model = joblib.load("water_potability_model.pkl")

# Function to predict water potability based on input features
def predict_potability(features):
    """
    Predict water potability based on input features.

    The classifier was trained on features standardized by the notebook's
    fitted ``scaler``, so the raw input values are passed through the same
    scaler before prediction.

    Args:
    - features (list): A list of raw (unscaled) feature values in the same
      order as the dataset's feature columns.

    Returns:
    - str: Prediction result ("Potable" or "Not Potable").

    Raises:
    - ValueError: If the number of values does not match the number of
      features the scaler was fitted on.
    """
    expected = scaler.n_features_in_
    if len(features) != expected:
        raise ValueError(
            f"Expected {expected} feature values, got {len(features)}"
        )

    row = np.asarray(features, dtype=float).reshape(1, -1)
    # When the scaler was fitted on a DataFrame it remembers the column names;
    # supplying them again avoids scikit-learn's feature-name mismatch warning.
    names = getattr(scaler, "feature_names_in_", None)
    if names is not None:
        row = pd.DataFrame(row, columns=names)

    prediction = model.predict(scaler.transform(row))[0]
    return "Potable" if prediction == 1 else "Not Potable"


In [None]:
# Pages whose text will be scraped as source material
websites = [
    "http://environnement.wallonie.be/de/eso/eau_distribution/",
    "https://environment.ec.europa.eu/topics/water/water-wise-eu/belgium_en",
    "https://environment.ec.europa.eu/topics/water/water-wise-eu/polluted-water_en",
    "https://www.brusselstimes.com/1009591/flemish-drinking-water-highly-polluted-with-pfas-but-purifying-costs-millions",
]

# Fetch every page, parsing only the <p> elements of each one
loader = WebBaseLoader(
    web_paths=websites,
    bs_kwargs={"parse_only": bs4.SoupStrainer("p")},
)
docs = loader.load()

# Break the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
splits = text_splitter.split_documents(docs)

print("Documents loaded and split into chunks successfully!")


In [None]:
# Fail early with a clear message when scraping produced no chunks; otherwise
# the dimension lookup below would die with an opaque IndexError.
if not splits:
    raise ValueError("No document chunks to embed; check that the web pages loaded and contained text.")

# Embed the document chunks using OpenAI embeddings
embedding_model = OpenAIEmbeddings()
embeddings = embedding_model.embed_documents([split.page_content for split in splits])

# Initialize a FAISS index to store and search the embeddings.
# The embeddings are converted to a float32 matrix once and reused for both the
# dimension lookup and the insert.
embedding_matrix = np.asarray(embeddings, dtype='float32')
dimension = embedding_matrix.shape[1]  # Get the dimension of the embeddings
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)  # Add embeddings to the index

print("Documents embedded and indexed successfully!")


In [None]:
# Function to retrieve the most similar documents for a given query
def retrieve_similar(query, k=5):
    """
    Return the document chunks whose embeddings are closest to the query.

    Args:
    - query (str): Text to embed and search for.
    - k (int): Maximum number of chunks to return.

    Returns:
    - list: Up to ``k`` entries of ``splits``, nearest first. Fewer than ``k``
      (possibly none) are returned when the index holds fewer vectors.
    """
    # Never ask for more neighbours than the index contains: FAISS pads missing
    # results with the id -1, and splits[-1] would silently return the last chunk.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = np.array([embedding_model.embed_query(query)], dtype='float32')
    _distances, indices = index.search(query_embedding, k)
    # Defensive filter in case any padding ids remain.
    return [splits[i] for i in indices[0] if i >= 0]

print("Retrieval function defined successfully!")


In [None]:
from langchain.prompts import PromptTemplate

# Three interchangeable prompt styles; each one takes the retrieved
# "context" and the user's "question".

# Plain question answering over the retrieved context
general_prompt = PromptTemplate(
    template="Using the information provided:\n{context}\nAnswer the question:\n{question}",
    input_variables=["context", "question"],
)

# Asks for a more comprehensive answer
detailed_analysis_prompt = PromptTemplate(
    template="Based on the following detailed analysis:\n{context}\nProvide a comprehensive answer to the question:\n{question}",
    input_variables=["context", "question"],
)

# Asks for a brief answer
summarized_response_prompt = PromptTemplate(
    template="Summarize the following content:\n{context}\nAnswer briefly:\n{question}",
    input_variables=["context", "question"],
)

# Lookup table from a short style name to its prompt template
prompt_choices = dict(
    general=general_prompt,
    detailed=detailed_analysis_prompt,
    summary=summarized_response_prompt,
)

print("Prompt templates defined successfully!")


In [None]:
# Pick which prompt style the chain should use
selected_prompt = prompt_choices["general"]

# Inputs fed to the prompt: "context" is built from the retrieved chunks,
# "question" is the query passed through unchanged.
# format_docs is looked up when the chain is invoked, not when it is built.
rag_inputs = {
    "context": lambda q: format_docs(retrieve_similar(q)),
    "question": RunnablePassthrough(),
}

# RAG chain: retrieval -> prompt -> language model -> plain string output
rag_chain = rag_inputs | selected_prompt | llm | StrOutputParser()

# Function to format the retrieved documents into a string
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Example: run a sample question through the chain and show the answer
question = "What is the potability of water with these characteristics?"
response = rag_chain.invoke(question)
print(response)


In [None]:
# Function to format context with potability prediction
def format_with_predictions(docs, features=None):
    """
    Build the prompt context from retrieved documents, optionally with a prediction.

    Args:
    - docs (iterable): Objects exposing a ``page_content`` string.
    - features (sequence, optional): Feature values passed to
      ``predict_potability``. When None or empty, no prediction is appended.

    Returns:
    - str: The documents' contents joined by blank lines, followed by a
      "Water Potability Prediction: ..." line when features were supplied.
    """
    context = "\n\n".join(doc.page_content for doc in docs)
    # Explicit None/length check instead of truthiness: array-like inputs
    # (e.g. NumPy arrays) raise when evaluated in a boolean context.
    if features is not None and len(features) > 0:
        potability = predict_potability(features)
        context += f"\n\nWater Potability Prediction: {potability}"
    return context

# Rebuild the RAG chain so the context also carries a potability prediction
# for one fixed set of example feature values.
prediction_inputs = {
    "context": lambda q: format_with_predictions(
        retrieve_similar(q),
        features=[7.2, 204.5, 20791, 7.2, 333.3, 17.2, 6.2, 325.3, 0.5],
    ),
    "question": RunnablePassthrough(),
}
rag_chain = prediction_inputs | selected_prompt | llm | StrOutputParser()

# Example: ask the same sample question, now with the prediction in the context
response = rag_chain.invoke("What is the potability of water with these characteristics?")
print(response)
