In [None]:
pip install pandas numpy matplotlib seaborn scikit-learn xgboost joblib

In [None]:
# =============================================
# WATER POTABILITY PREDICTION - COMPLETE SCRIPT
# =============================================

# --------------------------
# 1. Import Required Libraries
# --------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib

# --------------------------
# 2. Load and Explore Data
# --------------------------
# Read the water-quality measurements into a DataFrame
df = pd.read_csv("water_quality.csv")

# Structural summary: sample rows, dimensions, and per-column dtypes
print("\n=== Data Overview ===")
overview_items = (
    ("First 5 rows:\n", df.head()),
    ("\nData shape:", df.shape),
    ("\nData types:\n", df.dtypes),
)
for overview_label, overview_value in overview_items:
    print(overview_label, overview_value)

# Count of missing entries in every column
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Number of samples in each target class
# NOTE(review): the original author describes the classes as imbalanced
# (roughly 61% vs 39%) -- confirm against the counts printed below.
class_counts = df["Potability"].value_counts()
print("\nPotability class distribution:")
print(class_counts)

# --------------------------
# 3. Data Visualization (EDA)
# --------------------------
# Set style for plots
sns.set_style("whitegrid")

# Plot 1: Target class distribution
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="Potability", data=df, ax=ax)
ax.set_title("Potability Class Distribution (0=Not Potable, 1=Potable)")
plt.show()

# Plot 2: Boxplot for key numerical features
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="Potability", y="Solids", data=df, ax=ax)
ax.set_title("Solids Distribution by Potability Status")
plt.show()

# Plot 3: Correlation heatmap
# Correlation is only defined for numeric data. pandas >= 2.0 raises if a
# non-numeric column (e.g. a categorical "Solids_bins") is present, so restrict
# the frame to numeric/boolean columns before calling .corr().
fig, ax = plt.subplots(figsize=(12, 8))
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", center=0, ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()

# --------------------------
# 4. Feature Engineering
# --------------------------
# Drop the redundant log-transformed column if it exists (the original author
# reports it as ~99% correlated with Solids). errors="ignore" keeps this step
# safe to re-run and tolerant of datasets that do not contain the column.
df = df.drop(columns="log_Solids", errors="ignore")

# One-hot encode categorical features (example)
if "Solids_bins" in df.columns:
    df = pd.get_dummies(df, columns=["Solids_bins"], drop_first=True)

# Separate features (X) and target (y)
X = df.drop("Potability", axis=1)
y = df["Potability"]

# Split into training (80%) and testing (20%) sets.
# stratify=y keeps the class proportions identical in both splits, which
# matters because the target classes are not evenly represented.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fill missing numeric values using medians learned from the TRAINING split
# only (no test-set leakage). Without this, estimators that reject NaN
# (e.g. LogisticRegression) fail at fit time. This is a no-op on complete data.
# NOTE: if inference inputs may contain gaps, persist `train_medians` alongside
# the scaler so the same fill values can be applied at prediction time.
missing_total = int(X_train.isna().sum().sum() + X_test.isna().sum().sum())
if missing_total:
    numeric_cols = X_train.select_dtypes(include="number").columns
    train_medians = X_train[numeric_cols].median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)
    print(
        f"\nFound {missing_total} missing feature values; "
        "numeric gaps filled with training-set medians."
    )

# Standardize features (important for models like SVM, Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit on training data only
X_test_scaled = scaler.transform(X_test)        # Reuse training statistics

# --------------------------
# 5. Model Building
# --------------------------
# Negative/positive ratio used by XGBoost to re-weight the positive class.
# Computed from the TRAINING labels only so no information about the held-out
# test set influences the model, and with vectorised counts instead of the
# Python-level builtin sum() over a Series.
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()
pos_weight = float(negative_count / positive_count)

# Initialize models with class weighting for imbalance handling
models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(
        class_weight="balanced_subsample", random_state=42
    ),
    "XGBoost": XGBClassifier(
        scale_pos_weight=pos_weight,  # Handle imbalance
        eval_metric="logloss",
        random_state=42,
    ),
}

# Train each model on the scaled training data and report test-set metrics
for name, model in models.items():
    print(f"\n=== Training {name} ===")
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Evaluation metrics
    print(f"\n{name} Performance:")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

# --------------------------
# 6. Model Optimization (Hyperparameter Tuning)
# --------------------------
# Random Forest is the model selected for tuning by the original author
print("\n=== Hyperparameter Tuning for Random Forest ===")

# Candidate values searched exhaustively (2 x 3 x 2 = 12 combinations)
param_grid = dict(
    n_estimators=[100, 200],       # Number of trees
    max_depth=[None, 10, 20],      # Maximum tree depth
    min_samples_split=[2, 5],      # Minimum samples to split a node
)

# Base estimator plus a 5-fold grid search scored on F1, using all CPU cores
rf = RandomForestClassifier(random_state=42, class_weight="balanced_subsample")
grid_search = GridSearchCV(rf, param_grid, scoring="f1", cv=5, n_jobs=-1)

# Run the search on the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score
best_params = grid_search.best_params_
best_f1 = grid_search.best_score_
print("\nBest Parameters:", best_params)
print("Best F1-Score:", best_f1)

# Estimator refitted on the full training set with the best parameters
optimized_rf = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test set
y_pred_optimized = optimized_rf.predict(X_test_scaled)
optimized_report = classification_report(y_test, y_pred_optimized)
print("\nOptimized Model Performance:")
print(optimized_report)

# --------------------------
# 7. Feature Importance Analysis
# --------------------------
# Importance scores exposed by the tuned Random Forest, paired with the
# feature names and ordered from most to least important
importances = optimized_rf.feature_importances_
feature_importance = pd.DataFrame(
    {"Feature": X.columns, "Importance": importances}
)
feature_importance = feature_importance.sort_values("Importance", ascending=False)

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=feature_importance, ax=ax)
ax.set_title("Top Features Influencing Water Potability")
fig.tight_layout()
plt.show()

# --------------------------
# 8. Save the Model for Deployment
# --------------------------
# Persist the tuned model and the fitted scaler, each to its own file
artifacts = {
    "water_potability_model.pkl": optimized_rf,
    "scaler.pkl": scaler,
}
for artifact_path, fitted_object in artifacts.items():
    joblib.dump(fitted_object, artifact_path)

print("\nModel and scaler saved successfully!")

# =============================================
# HOW TO USE THE SAVED MODEL (EXAMPLE)
# =============================================
# The example is kept as comments rather than a bare string literal: a string
# expression at the end of a cell is echoed by the notebook as cell output.
# Uncomment the lines below to run it.
#
# # Load the model and scaler.
# # joblib uses pickle under the hood: only load files you created or trust.
# loaded_model = joblib.load("water_potability_model.pkl")
# loaded_scaler = joblib.load("scaler.pkl")
#
# # Prepare new sample data (must contain every feature used in training)
# new_sample = pd.DataFrame({
#     "Hardness": [204.89],
#     "Solids": [20791.31],
#     "Chloramines": [7.30],
#     # ... include all other features ...
# })
#
# # Put the columns in the exact order the scaler was fitted with; this also
# # raises a KeyError early if a required feature is missing.
# new_sample = new_sample[loaded_scaler.feature_names_in_]
#
# # Preprocess and predict
# new_sample_scaled = loaded_scaler.transform(new_sample)
# prediction = loaded_model.predict(new_sample_scaled)
# print("Potable" if prediction[0] == 1 else "Not Potable")