In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder


In [2]:
df = pd.read_csv("/kaggle/input/shuf-co-data-water/shu_cor_water_Q.csv")
df.head()


Unnamed: 0,timestamp,TDS,turbidity,temperature,pH,status,cause
0,2025-01-01 00:00:00,735.91,13.3,46.09,6.74,Warning,"TDS Warning, Turbidity Warning, Temp Warning"
1,2025-01-01 00:10:00,717.53,16.98,51.26,6.88,Warning,High Temp
2,2025-01-01 00:20:00,593.72,4.3,27.94,7.4,Safe,
3,2025-01-01 00:30:00,477.2,16.48,39.42,6.79,Warning,Turbidity Warning
4,2025-01-01 00:40:00,392.8,11.23,35.66,6.28,Safe,Turbidity Warning


In [3]:
label_encoder = LabelEncoder()
df["status_encoded"] = label_encoder.fit_transform(df["status"])
df[["status", "status_encoded"]].drop_duplicates()


Unnamed: 0,status,status_encoded
0,Warning,2
2,Safe,1
9,Danger,0


In [4]:
X = df[["TDS", "turbidity", "temperature", "pH"]]
y = df["status_encoded"]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


In [6]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [7]:
y_pred = clf.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

      Danger       0.98      0.98      0.98        59
        Safe       1.00      1.00      1.00       483

    accuracy                           0.99       660
   macro avg       0.99      0.99      0.99       660
weighted avg       0.99      0.99      0.99       660

Confusion Matrix:
[[ 58   1   0]
 [  0 482   1]
 [  1   1 116]]


In [8]:
import joblib

joblib.dump(clf, "water_quality_classifier.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")


['label_encoder.pkl']

In [9]:
import pandas as pd
import joblib

# 🔹 Step 1: Load your data
df = pd.read_csv("/kaggle/input/shuf-co-data-water/shu_cor_water_Q.csv")

# 🔹 Step 2: Load your trained model and encoder
model = joblib.load("water_quality_classifier.pkl")
encoder = joblib.load("label_encoder.pkl")

# 🔹 Step 3: Helper function to explain the cause of status
def find_causes(tds, turbidity, temp, ph):
    causes = []
    if tds > 1000:
        causes.append("High TDS")
    elif tds > 600:
        causes.append("TDS Warning")

    if turbidity > 30:
        causes.append("High Turbidity")
    elif turbidity > 10:
        causes.append("Turbidity Warning")

    if temp > 50:
        causes.append("High Temperature")
    elif temp > 40:
        causes.append("Temperature Warning")

    if ph < 5 or ph > 9:
        causes.append("Abnormal pH")
    elif ph < 6 or ph > 8:
        causes.append("pH Warning")

    return ", ".join(causes) if causes else "All parameters within safe range"

# 🔹 Step 4: Predict status for each row and show cause
def predict_on_dataset(df):
    X = df[["TDS", "turbidity", "temperature", "pH"]]
    y_true = df["status"]

    # Predict with model
    y_pred_encoded = model.predict(X)
    y_pred = encoder.inverse_transform(y_pred_encoded)

    # Add predictions to dataframe
    df["predicted_status"] = y_pred

    # Add predicted cause using the function
    df["predicted_cause"] = df.apply(
        lambda row: find_causes(row["TDS"], row["turbidity"], row["temperature"], row["pH"]),
        axis=1
    )

    return df

# 🔹 Step 5: Run the prediction
df_result = predict_on_dataset(df)

# 🔹 Step 6: Save or show top predictions
print(df_result[["timestamp", "status", "predicted_status", "predicted_cause"]].head())

# 🔹 Step 7: Save to file if needed
df_result.to_csv("predicted_water_quality_full.csv", index=False)
print("✅ Predictions saved to 'predicted_water_quality_full.csv'")


             timestamp   status predicted_status  \
2  2025-01-01 00:20:00     Safe             Safe   
4  2025-01-01 00:40:00     Safe             Safe   

                                     predicted_cause  
2                   All parameters within safe range  
✅ Predictions saved to 'predicted_water_quality_full.csv'


In [10]:
import pandas as pd
import joblib
from sklearn.metrics import classification_report, confusion_matrix

# Load your dataset
df = pd.read_csv("/kaggle/input/shuf-co-data-water/shu_cor_water_Q.csv")

# Load your trained model and label encoder
model = joblib.load("water_quality_classifier.pkl")
encoder = joblib.load("label_encoder.pkl")
# Extract only feature columns
X = df[["TDS", "turbidity", "temperature", "pH"]]
y_true = df["status"]
# Predict encoded labels
y_pred_encoded = model.predict(X)

# Decode predictions back to readable labels
y_pred = encoder.inverse_transform(y_pred_encoded)

# Add predictions to the dataframe
df["predicted_status"] = y_pred
# Calculate match percentage
accuracy = (df["status"] == df["predicted_status"]).mean() * 100
print(f"✅ Model Accuracy on Labeled Dataset: {accuracy:.2f}%")

# Optional: detailed report
print("\n📋 Classification Report:")
print(classification_report(y_true, y_pred, target_names=encoder.classes_))

print("\n📉 Confusion Matrix:")
print(confusion_matrix(y_true, y_pred, labels=encoder.classes_))
# Show rows where predicted ≠ actual
mismatches = df[df["status"] != df["predicted_status"]]
print(f"❌ Mismatches: {len(mismatches)} rows\n")
mismatches[["timestamp", "TDS", "turbidity", "temperature", "pH", "status", "predicted_status", "cause"]].head()


✅ Model Accuracy on Labeled Dataset: 99.88%

📋 Classification Report:
              precision    recall  f1-score   support

      Danger       1.00      1.00      1.00       301
        Safe       1.00      1.00      1.00      2397

    accuracy                           1.00      3298
   macro avg       1.00      1.00      1.00      3298
weighted avg       1.00      1.00      1.00      3298


📉 Confusion Matrix:
[[ 300    1    0]
 [   0 2396    1]
 [   1    1  598]]
❌ Mismatches: 4 rows



Unnamed: 0,timestamp,TDS,turbidity,temperature,pH,status,predicted_status,cause
102,2025-01-01 17:00:00,607.7,3.5,42.0,6.91,Safe,Warning,"TDS Warning, Temp Warning"
195,2025-01-02 08:30:00,621.7,18.98,51.53,6.53,Warning,Danger,High Temp
299,2025-01-03 01:50:00,571.62,12.02,27.03,4.7,Danger,Safe,Abnormal pH
612,2025-01-05 06:00:00,427.11,9.1,32.75,6.14,Warning,Safe,
