Data Preprocessing

In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/water_potability.csv")  



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import os
import joblib

# Fill missing values (e.g., with mean)
df.fillna(df.mean(), inplace=True)

# Features and target
X = df.drop("Potability", axis=1)
y = df["Potability"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 9. Create models folder if it doesn't exist
os.makedirs("../models", exist_ok=True)

# 10. Save model and scaler
joblib.dump(model, "../models/water_quality_model.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

[[357  55]
 [156  88]]
              precision    recall  f1-score   support

           0       0.70      0.87      0.77       412
           1       0.62      0.36      0.45       244

    accuracy                           0.68       656
   macro avg       0.66      0.61      0.61       656
weighted avg       0.67      0.68      0.65       656



['../models/scaler.pkl']