In [3]:
! pip install chardet

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
Installing collected packages: chardet
Successfully installed chardet-5.2.0


In [4]:
import os
import json
import yaml
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import chardet

# ------------------------------------------------
# 📁 Paths
# ------------------------------------------------
DATA_PATH = r"C:\Users\NXTWAVE\Downloads\Water Quality & Supply Prediction System\archive\water_dataX.csv"
OUTPUT_DIR = r"C:\Users\NXTWAVE\Downloads\Water Quality & Supply Prediction System"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------------------------------------
# 🧩 Phase 1: Load and Preprocess Data (Encoding Safe)
# ------------------------------------------------
print("[INFO] Loading dataset...")

# Detect file encoding to prevent UnicodeDecodeError
with open(DATA_PATH, 'rb') as f:
    raw_data = f.read(10000)
    result = chardet.detect(raw_data)
    encoding_used = result['encoding']

print(f"[INFO] Detected Encoding: {encoding_used}")

try:
    df = pd.read_csv(DATA_PATH, encoding=encoding_used)
except UnicodeDecodeError:
    print("[WARN] UnicodeDecodeError with detected encoding — using latin1 fallback.")
    df = pd.read_csv(DATA_PATH, encoding='latin1')

print("[INFO] Shape:", df.shape)

# Clean column names and handle missing values
df.columns = [c.strip().replace(" ", "_") for c in df.columns]
df = df.fillna(df.median(numeric_only=True))

numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
print("[INFO] Numeric columns detected:", numeric_cols)

# Normalize numeric features
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[numeric_cols])
df_scaled = pd.DataFrame(scaled, columns=numeric_cols)

# ------------------------------------------------
# 🌊 Phase 2: Feature Engineering
# ------------------------------------------------
print("[INFO] Creating Contamination & Supply indices...")

# Create synthetic contamination & supply indices
df_scaled["Contamination_Index"] = (
    df_scaled.get("pH", 0.5) * 0.3
    + df_scaled.get("Turbidity", 0.5) * 0.3
    + df_scaled.get("Conductivity", 0.5) * 0.2
    + df_scaled.get("Dissolved_Oxygen", 0.5) * 0.2
)

df_scaled["Supply_Health_Index"] = (
    df_scaled.get("Pressure", 0.5) * 0.4
    + df_scaled.get("FlowRate", 0.5) * 0.4
    + df_scaled.get("Reservoir_Level", 0.5) * 0.2
)

target_col = "Contamination_Index"

# ------------------------------------------------
# 🧠 Phase 3: CNN-LSTM + Grey Wolf Optimizer
# ------------------------------------------------

def create_model(filters=32, lstm_units=64, dropout=0.2, lr=0.001, input_shape=None):
    model = Sequential([
        Conv1D(filters, kernel_size=2, activation='relu', input_shape=input_shape),
        Dropout(dropout),
        LSTM(lstm_units, activation='tanh', return_sequences=False),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    opt = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=opt, loss='mse', metrics=['mae'])
    return model


def create_sequences(X, y, time_steps=5):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:i + time_steps])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)


# Prepare sequences
values = df_scaled[numeric_cols + ["Contamination_Index"]].values
time_steps = 5
X, y = create_sequences(values, values[:, -1], time_steps)
input_shape = (X.shape[1], X.shape[2])

# Split data
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Grey Wolf Optimizer - Simplified
print("[INFO] Starting Grey Wolf Optimization...")

def gwo_optimize(iterations=5):
    best_rmse, best_params = float('inf'), None
    for i in range(iterations):
        params = {
            "filters": random.choice([16, 32, 64]),
            "lstm_units": random.choice([32, 64, 128]),
            "dropout": random.uniform(0.1, 0.4),
            "lr": random.choice([0.001, 0.0005]),
        }
        model = create_model(**params, input_shape=input_shape)
        history = model.fit(
            X_train, y_train,
            epochs=10, batch_size=16, verbose=0,
            validation_split=0.2,
            callbacks=[EarlyStopping(patience=3, restore_best_weights=True)]
        )
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        print(f"[GWO] Iter {i + 1} | RMSE={rmse:.4f} | Params={params}")
        if rmse < best_rmse:
            best_rmse, best_params = rmse, params
    return best_params

best_params = gwo_optimize()
print("[INFO] Best Params:", best_params)

# Final model
final_model = create_model(**best_params, input_shape=input_shape)
history = final_model.fit(
    X_train, y_train,
    epochs=25, batch_size=16, verbose=1,
    validation_split=0.2,
    callbacks=[EarlyStopping(patience=5, restore_best_weights=True)]
)

# ------------------------------------------------
# 📊 Phase 4: Evaluation & Visualization
# ------------------------------------------------
print("[INFO] Evaluating model...")

y_pred = final_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[RESULT] RMSE: {rmse:.4f}, R²: {r2:.4f}")

# --- Accuracy Graph ---
plt.figure(figsize=(8, 4))
plt.plot(history.history["loss"], label="Train Loss")
plt.plot(history.history["val_loss"], label="Val Loss")
plt.title("Training vs Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("MSE")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "AquaSentinel_accuracy_graph.png"))
plt.close()

# --- Prediction vs Actual ---
plt.figure(figsize=(6, 5))
sns.scatterplot(x=y_test.flatten(), y=y_pred.flatten())
plt.title("Prediction vs Actual Contamination Index")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "AquaSentinel_result_graph.png"))
plt.close()

# ------------------------------------------------
# 💾 Save Artifacts
# ------------------------------------------------
print("[INFO] Saving artifacts...")

# Model (.h5)
final_model.save(os.path.join(OUTPUT_DIR, "AquaSentinel_model.h5"))

# Scaler (.pkl)
with open(os.path.join(OUTPUT_DIR, "AquaSentinel_scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)

# Config (.yaml)
config = {
    "features": numeric_cols,
    "target": target_col,
    "model_params": best_params,
    "time_steps": time_steps,
}
with open(os.path.join(OUTPUT_DIR, "AquaSentinel_config.yaml"), "w") as f:
    yaml.dump(config, f)

# Results (.json)
results = {
    "RMSE": float(rmse),
    "R2_Score": float(r2),
    "best_params": best_params,
    "rows_used": len(df),
    "columns": numeric_cols,
}
with open(os.path.join(OUTPUT_DIR, "AquaSentinel_results.json"), "w") as f:
    json.dump(results, f, indent=4)

print("\n✅ [SUCCESS] All artifacts saved at:")
print(OUTPUT_DIR)
print("------------------------------------------------------------")
print("Generated files:")
print("  📁 AquaSentinel_model.h5")
print("  📁 AquaSentinel_scaler.pkl")
print("  📁 AquaSentinel_config.yaml")
print("  📁 AquaSentinel_results.json")
print("  📊 AquaSentinel_accuracy_graph.png")
print("  📊 AquaSentinel_result_graph.png")
print("------------------------------------------------------------")


[INFO] Loading dataset...
[INFO] Detected Encoding: ISO-8859-1
[INFO] Shape: (1991, 12)
[INFO] Numeric columns detected: ['year']
[INFO] Creating Contamination & Supply indices...
[INFO] Starting Grey Wolf Optimization...



[GWO] Iter 1 | RMSE=0.0338 | Params={'filters': 16, 'lstm_units': 128, 'dropout': 0.22764777194513638, 'lr': 0.0005}
[GWO] Iter 2 | RMSE=0.0274 | Params={'filters': 32, 'lstm_units': 32, 'dropout': 0.3040214066237258, 'lr': 0.0005}
[GWO] Iter 3 | RMSE=0.0327 | Params={'filters': 64, 'lstm_units': 64, 'dropout': 0.1719747909740827, 'lr': 0.001}
[GWO] Iter 4 | RMSE=0.0587 | Params={'filters': 16, 'lstm_units': 32, 'dropout': 0.11425201189137679, 'lr': 0.0005}
[GWO] Iter 5 | RMSE=0.0029 | Params={'filters': 16, 'lstm_units': 64, 'dropout': 0.3430717832162359, 'lr': 0.001}
[INFO] Best Params: {'filters': 16, 'lstm_units': 64, 'dropout': 0.3430717832162359, 'lr': 0.001}
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Ep

  saving_api.save_model(
