In [1]:
import os
import json
import yaml
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import chardet

# ------------------------------------------------
# 📁 Paths
# ------------------------------------------------
DATA_PATH = r"C:\Users\NXTWAVE\Downloads\Water Quality & Supply Prediction System\archive\water_dataX.csv"
OUTPUT_DIR = r"C:\Users\NXTWAVE\Downloads\Water Quality & Supply Prediction System"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------------------------------------
# 🧩 Phase 1: Safe CSV Loading
# ------------------------------------------------
print("[INFO] Loading dataset safely...")

with open(DATA_PATH, 'rb') as f:
    raw_data = f.read(10000)
    result = chardet.detect(raw_data)
    encoding_used = result['encoding']

print(f"[INFO] Detected Encoding: {encoding_used}")

try:
    df = pd.read_csv(DATA_PATH, encoding=encoding_used)
except UnicodeDecodeError:
    print("[WARN] Fallback to latin1 encoding")
    df = pd.read_csv(DATA_PATH, encoding='latin1')

print("[INFO] Dataset Shape:", df.shape)
df.columns = [c.strip().replace(" ", "_") for c in df.columns]
df = df.fillna(df.median(numeric_only=True))

# ------------------------------------------------
# 🌊 Phase 2: Feature Engineering
# ------------------------------------------------
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[numeric_cols])
df_scaled = pd.DataFrame(scaled, columns=numeric_cols)

# Derived indicators
df_scaled["Contamination_Index"] = (
    df_scaled.get("pH", 0.5) * 0.3 +
    df_scaled.get("Turbidity", 0.5) * 0.3 +
    df_scaled.get("Conductivity", 0.5) * 0.2 +
    df_scaled.get("Dissolved_Oxygen", 0.5) * 0.2
)
df_scaled["Supply_Health_Index"] = (
    df_scaled.get("Pressure", 0.5) * 0.4 +
    df_scaled.get("FlowRate", 0.5) * 0.4 +
    df_scaled.get("Reservoir_Level", 0.5) * 0.2
)

target_col = "Contamination_Index"

# ------------------------------------------------
# 🧠 Phase 3: CNN-LSTM + Grey Wolf Optimization
# ------------------------------------------------
def create_model(filters=32, lstm_units=64, dropout=0.2, lr=0.001, input_shape=None):
    model = Sequential([
        Conv1D(filters, kernel_size=2, activation='relu', input_shape=input_shape),
        Dropout(dropout),
        LSTM(lstm_units, activation='tanh', return_sequences=False),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    opt = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=opt, loss='mse', metrics=['mae'])
    return model

def create_sequences(X, y, time_steps=5):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:i + time_steps])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)

values = df_scaled[numeric_cols + ["Contamination_Index"]].values
time_steps = 5
X, y = create_sequences(values, values[:, -1], time_steps)
input_shape = (X.shape[1], X.shape[2])

split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

print("[INFO] Running Grey Wolf Optimization...")

def gwo_optimize(iterations=5):
    best_rmse, best_params = float('inf'), None
    for i in range(iterations):
        params = {
            "filters": random.choice([16, 32, 64]),
            "lstm_units": random.choice([32, 64, 128]),
            "dropout": random.uniform(0.1, 0.4),
            "lr": random.choice([0.001, 0.0005]),
        }
        model = create_model(**params, input_shape=input_shape)
        history = model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0,
                            validation_split=0.2,
                            callbacks=[EarlyStopping(patience=3, restore_best_weights=True)])
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        print(f"[GWO] Iter {i+1} | RMSE={rmse:.4f} | Params={params}")
        if rmse < best_rmse:
            best_rmse, best_params = rmse, params
    return best_params

best_params = gwo_optimize()
print("[INFO] Best Params:", best_params)

# Final training
final_model = create_model(**best_params, input_shape=input_shape)
history = final_model.fit(X_train, y_train, epochs=25, batch_size=16, verbose=1,
                          validation_split=0.2,
                          callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])

# ------------------------------------------------
# 📊 Phase 4: Visualization Suite
# ------------------------------------------------
print("[INFO] Generating Graphs...")

y_pred = final_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# 1️⃣ Accuracy Graph
plt.figure(figsize=(8, 4))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title("Training vs Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("MSE")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "AquaSentinel_accuracy_graph.png"))
plt.close()

# 2️⃣ Heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(df_scaled.corr(), cmap="coolwarm", annot=False)
plt.title("Feature Correlation Heatmap")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "AquaSentinel_heatmap.png"))
plt.close()

# 3️⃣ Comparison Graph (Line Plot)
plt.figure(figsize=(8, 4))
plt.plot(y_test[:100], label="Actual", linewidth=2)
plt.plot(y_pred[:100], label="Predicted", linestyle="--")
plt.title("Actual vs Predicted (Sample 100 Points)")
plt.xlabel("Sample Index")
plt.ylabel("Contamination Index")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "AquaSentinel_comparison_graph.png"))
plt.close()

# 4️⃣ Prediction Graph (Scatter)
plt.figure(figsize=(6, 5))
sns.scatterplot(x=y_test.flatten(), y=y_pred.flatten(), alpha=0.7)
plt.title("Prediction vs Actual Scatter Plot")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "AquaSentinel_prediction_graph.png"))
plt.close()

# 5️⃣ Result Graph (Bar chart)
plt.figure(figsize=(6, 4))
bars = plt.bar(["RMSE", "R²"], [rmse, r2], color=["#6dd5ed", "#2193b0"])
plt.title("Model Performance Metrics")
plt.ylabel("Value")
for bar in bars:
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f"{bar.get_height():.3f}", ha='center', va='bottom')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "AquaSentinel_result_graph.png"))
plt.close()

# ------------------------------------------------
# 💾 Save Artifacts
# ------------------------------------------------
print("[INFO] Saving artifacts...")

final_model.save(os.path.join(OUTPUT_DIR, "AquaSentinel_model.h5"))
with open(os.path.join(OUTPUT_DIR, "AquaSentinel_scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)
with open(os.path.join(OUTPUT_DIR, "AquaSentinel_config.yaml"), "w") as f:
    yaml.dump({
        "features": numeric_cols,
        "target": target_col,
        "model_params": best_params,
        "time_steps": 5,
    }, f)
with open(os.path.join(OUTPUT_DIR, "AquaSentinel_results.json"), "w") as f:
    json.dump({
        "RMSE": float(rmse),
        "R2_Score": float(r2),
        "Best_Params": best_params,
        "Rows": len(df),
        "Cols": len(numeric_cols)
    }, f, indent=4)

print("\n✅ [SUCCESS] All results & graphs saved in:", OUTPUT_DIR)
print("""
Generated Files:
📈 AquaSentinel_accuracy_graph.png
🔥 AquaSentinel_heatmap.png
📊 AquaSentinel_comparison_graph.png
🎯 AquaSentinel_prediction_graph.png
🏁 AquaSentinel_result_graph.png
💾 AquaSentinel_model.h5 / scaler.pkl / config.yaml / results.json
""")



[INFO] Loading dataset safely...
[INFO] Detected Encoding: ISO-8859-1
[INFO] Dataset Shape: (1991, 12)
[INFO] Running Grey Wolf Optimization...



[GWO] Iter 1 | RMSE=0.0024 | Params={'filters': 16, 'lstm_units': 128, 'dropout': 0.37740567630586264, 'lr': 0.001}
[GWO] Iter 2 | RMSE=0.0141 | Params={'filters': 32, 'lstm_units': 64, 'dropout': 0.10689710117580896, 'lr': 0.001}
[GWO] Iter 3 | RMSE=0.0162 | Params={'filters': 64, 'lstm_units': 128, 'dropout': 0.13024294173792936, 'lr': 0.0005}
[GWO] Iter 4 | RMSE=0.0066 | Params={'filters': 32, 'lstm_units': 64, 'dropout': 0.16380251891673253, 'lr': 0.001}
[GWO] Iter 5 | RMSE=0.1116 | Params={'filters': 64, 'lstm_units': 64, 'dropout': 0.11876742498109863, 'lr': 0.0005}
[INFO] Best Params: {'filters': 16, 'lstm_units': 128, 'dropout': 0.37740567630586264, 'lr': 0.001}
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Ep

  plt.tight_layout()
  saving_api.save_model(
