# Project

- **ppm:** parts per million
- **μg/L:** microgram per litre
- **mg/L:** milligram per litre

**Data Dictionary:**

1. **ph:** pH of water (0 to 14).
2. **Hardness:** Capacity of water to precipitate soap in mg/L.
3. **Solids:** Total dissolved solids in ppm.
4. **Chloramines:** Amount of Chloramines in ppm.
5. **Sulfate:** Amount of Sulfates dissolved in mg/L.
6. **Conductivity:** Electrical conductivity of water in μS/cm.
7. **Organic_carbon:** Amount of organic carbon in ppm.
8. **Trihalomethanes:** Amount of Trihalomethanes in μg/L.
9. **Turbidity:** Measure of the light-emitting property of water in NTU.
10. **Potability:** Indicates if water is safe for human consumption. Potable = 1 and Not potable = 0.

In [1]:
# !pip install xgboost
# !pip install streamlit

In [4]:
!pip install imbalanced-learn



In [5]:
# Sanity check: confirm imbalanced-learn can be imported from this kernel and
# print the installed version. A ModuleNotFoundError here means the package is
# not installed in the environment this kernel is running in.
import imblearn
print(imblearn.__version__)

ModuleNotFoundError: No module named 'imblearn'

# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib

# Set Seaborn Style
sns.set(style="whitegrid")

# 1. Load Data

In [None]:
# 1. Load Data
def load_data(filepath):
    """
    Read a CSV file from disk into a DataFrame.

    A status line is printed both on success and on failure; failures are
    reported but never propagate to the caller.

    Args:
        filepath (str): Location of the CSV file.

    Returns:
        pd.DataFrame: The parsed table, or None when the file is missing or
        could not be read.
    """
    try:
        frame = pd.read_csv(filepath)
        print(f"Data loaded successfully from {filepath}")
    except FileNotFoundError:
        # Reported separately so a wrong path is easy to tell apart from a bad file
        print(f"Error: File not found at {filepath}")
        frame = None
    except Exception as exc:
        print(f"An error occurred while loading the file: {exc}")
        frame = None
    return frame

# 2. Handle Missing Values

In [None]:
# 2. Handle Missing Values
def handle_missing_values(df):
    """
    Replace missing values with the mean of their column using SimpleImputer.

    Every column of ``df`` is passed to a mean-strategy imputer, and a
    per-column count of the missing values left afterwards is printed.

    Args:
        df (pd.DataFrame): Input dataset. Mean imputation applies to numeric
            data, so non-numeric columns should be encoded beforehand.

    Returns:
        pd.DataFrame: Imputed copy of ``df`` with the same column labels and
        the same row index as the input.
    """
    imputer = SimpleImputer(strategy='mean')  # Impute missing values with the column mean
    imputed_values = imputer.fit_transform(df)

    # fit_transform returns a bare array. Restore the caller's index as well as
    # the column labels; otherwise rows are renumbered 0..n-1 and can no longer
    # be matched against the frame they came from.
    df_imputed = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

    # Display summary of missing values after imputation
    print("Missing values after imputation:")
    print(df_imputed.isnull().sum())

    return df_imputed

# 3. Encode Categorical Variables

In [None]:
# 3. Encode Categorical Variables
def encode_categorical(df, columns):
    """
    Encode categorical columns into numerical values using Label Encoding.

    The input frame is left untouched; encoding is applied to a copy. Names
    in ``columns`` that are not present in the frame are reported with a
    warning and skipped.

    Args:
        df (pd.DataFrame): Input dataset.
        columns (list): List of column names to encode. ``None`` or an empty
            list means there is nothing to encode.

    Returns:
        pd.DataFrame: Copy of the dataset with the requested columns encoded.
    """
    # Work on a copy so the caller's DataFrame is not modified as a side effect
    encoded = df.copy()

    for col in columns or []:
        if col not in encoded.columns:
            print(f"Warning: Column '{col}' not found in dataset.")
            continue

        # Cast to str first so mixed-type values and NaN share one comparable type
        encoded[col] = LabelEncoder().fit_transform(encoded[col].astype(str))
        print(f"Encoded column: {col}")

    return encoded

# 4. Scale Features

In [None]:
# 4. Scale Features
def scale_features(df, target_column='Potability'):
    """
    Standardize the feature columns while leaving the target column as is.

    All columns except ``target_column`` are passed through a StandardScaler.
    The target column is then appended unchanged as the last column.

    Args:
        df (pd.DataFrame): Input dataset.
        target_column (str): Name of the target column to exclude from scaling (default is 'Potability').

    Returns:
        pd.DataFrame: Scaled dataset with the same row index as ``df``.
    """
    # Separate features from the target
    features = df.drop(columns=[target_column])

    # Fit the scaler on the features and transform them in one step
    scaled_values = StandardScaler().fit_transform(features)

    # The scaler returns a bare array. Restore the original index along with
    # the column labels so rows stay identifiable after scaling.
    scaled_df = pd.DataFrame(scaled_values, columns=features.columns, index=df.index)

    # Re-attach the target positionally (.values) so no index alignment is involved
    scaled_df[target_column] = df[target_column].values

    return scaled_df

# 5. Split Data

In [None]:
# 5. Split Data
def split_data(df, target_column='Potability', test_size=0.2, random_state=42):
    """
    Partition a dataset into train and test features and labels.

    Args:
        df (pd.DataFrame): Input dataset containing features and the target.
        target_column (str): Name of the label column (default is 'Potability').
        test_size (float): Fraction of rows placed in the test split (default is 0.2).
        random_state (int): Seed passed to the splitter for reproducibility (default is 42).

    Returns:
        tuple: X_train, X_test, y_train, y_test

    Raises:
        ValueError: If ``target_column`` is not a column of ``df``.
    """
    # Guard clause: fail early when the label column is absent
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in the dataset.")

    labels = df[target_column]
    predictors = df.drop(columns=[target_column])

    # train_test_split yields [X_train, X_test, y_train, y_test] in that order
    return tuple(
        train_test_split(
            predictors,
            labels,
            test_size=test_size,
            random_state=random_state,
        )
    )

# 6. Data Preprocessing Pipeline

In [None]:
# 6. Data Preprocessing Pipeline
def preprocess_pipeline(filepath, target_column, categorical_columns=None, test_size=0.2, random_state=42):
    """
    Complete preprocessing pipeline: load, encode, split, then impute and scale.

    The split happens before imputation and scaling. The imputer means and
    the scaler statistics are learned from the training rows only and then
    applied unchanged to the test rows, so the held-out data does not
    influence the preprocessing of the training features.

    Args:
        filepath (str): Path to the dataset.
        target_column (str): Name of the target column.
        categorical_columns (list): List of categorical columns (optional).
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int): Random state for reproducibility.

    Returns:
        tuple: X_train, X_test, y_train, y_test. All four are None when the
        dataset could not be loaded. Labels are returned as loaded; they are
        neither imputed nor scaled.
    """
    # Load data
    df = load_data(filepath)
    if df is None:
        return None, None, None, None

    # Rows without a label can be neither trained on nor scored. Drop them
    # rather than mean-imputing the target, which would invent a label value
    # that is not a real class.
    df = df.dropna(subset=[target_column])

    # Encode categorical columns (if any) before imputation, because mean
    # imputation only works on numeric columns
    if categorical_columns:
        df = encode_categorical(df, categorical_columns)

    # Split first so the test rows stay unseen by the imputer and the scaler
    X_train, X_test, y_train, y_test = split_data(
        df,
        target_column,
        test_size,
        random_state,
    )

    # Fit on the training features only, then reuse the fitted transformers on the test features
    imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()
    train_values = scaler.fit_transform(imputer.fit_transform(X_train))
    test_values = scaler.transform(imputer.transform(X_test))

    # The transformers return bare arrays; restore column labels and row index
    X_train = pd.DataFrame(train_values, columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(test_values, columns=X_test.columns, index=X_test.index)

    # Display summary of missing values after imputation
    print("Missing values after imputation:")
    print(pd.concat([X_train, X_test]).isnull().sum())

    return X_train, X_test, y_train, y_test

# 7. Model Training and Evaluation

In [None]:
# 7. Model Training and Evaluation
def train_evaluate_model(X_train, X_test, y_train, y_test):
    """
    Fit a Random Forest on the training split and report test-set quality.

    Prints the accuracy and the classification report, then displays the
    confusion matrix as a heatmap.

    Args:
        X_train (pd.DataFrame): Training features.
        X_test (pd.DataFrame): Testing features.
        y_train (pd.Series): Training labels.
        y_test (pd.Series): Testing labels.

    Returns:
        RandomForestClassifier: The fitted classifier.
    """
    # Build and fit the classifier (100 trees, fixed seed)
    forest = RandomForestClassifier(random_state=42, n_estimators=100)
    forest.fit(X_train, y_train)

    # Predict the held-out split
    predictions = forest.predict(X_test)

    # Text metrics
    print(f"Model Accuracy: {accuracy_score(y_test, predictions):.2f}\n")
    print("Classification Report:")
    print(classification_report(y_test, predictions))

    # Confusion matrix heatmap; the same tick labels are used on both axes
    class_names = ["Not Potable", "Potable"]
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    return forest

# Main Script

In [None]:
# Main Script
if __name__ == "__main__":
    # Run configuration
    target_column = "Potability"
    categorical_columns = []  # Add column names here if the dataset has categorical features
    filepath = r"C:\Pinky_Folder\github_folder\water-quality-prediction\data\water_potability.csv"

    # Load and preprocess; every element is None when loading failed
    X_train, X_test, y_train, y_test = preprocess_pipeline(filepath, target_column, categorical_columns)

    if X_train is None:
        print("Data preprocessing failed.")
    else:
        # Report the size of each split
        for caption, split in (
            ("Training Features Shape:", X_train),
            ("Testing Features Shape:", X_test),
            ("Training Labels Shape:", y_train),
            ("Testing Labels Shape:", y_test),
        ):
            print(caption, split.shape)

        # Show how the classes are distributed in the training labels
        sns.countplot(x=y_train)
        plt.title("Class Distribution in Training Set")
        plt.xlabel("Potability Labels")
        plt.ylabel("Count")
        plt.show()

        # Fit the baseline model and print its evaluation
        trained_model = train_evaluate_model(X_train, X_test, y_train, y_test)


# **Observations:**
- The model achieved an accuracy of 68%, which is decent but shows room for improvement, especially in predicting the minority class (label 1.0). Here's a breakdown of the classification metrics:

*Precision:*
- For 0.0: 0.70 (70% of predicted 0.0 are correct).
- For 1.0: 0.61 (61% of predicted 1.0 are correct).
- Indicates the model is better at identifying 0.0 correctly.

*Recall:*
- For 0.0: 0.86 (86% of actual 0.0 are identified correctly).
- For 1.0: 0.38 (only 38% of actual 1.0 are identified correctly).
- Indicates the model struggles to identify the minority class (1.0).

*F1-Score:*
- Combines precision and recall. For 1.0, it’s 0.47, which reflects poor balance for this class.

*Class Imbalance:*
- The support (number of samples) is imbalanced: 412 for 0.0 vs. 244 for 1.0. The imbalance skews the model towards predicting the majority class.

# Recommendations to Improve the Model:
Handle Class Imbalance:
- Use techniques like SMOTE (Synthetic Minority Over-sampling Technique) or ADASYN to oversample the minority class.
- Alternatively, try undersampling the majority class.

Hyperparameter Tuning:
- Adjust parameters like n_estimators, max_depth, and class_weight in the Random Forest model.
- Setting class_weight='balanced' can help address the class imbalance.

Feature Engineering:
- Explore feature correlations to identify redundant or uninformative features.
- Create new features that may improve class separation.

Use a Different Algorithm:
- Try gradient-boosted algorithms like XGBoost, LightGBM, or CatBoost, which often handle class imbalance better.

Cross-Validation:
- Perform k-fold cross-validation to ensure the model generalizes well across all data splits.

Evaluate Additional Metrics:
- Use ROC-AUC to evaluate the model’s performance, especially for imbalanced datasets.

Analyze Errors:
- Examine misclassified samples to understand where the model struggles and adjust preprocessing or features accordingly.

# 8. SMOTE and Hyperparameter tuning

In [None]:
# 8. SMOTE and Hyperparameter tuning
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """
    Train and evaluate a Random Forest classifier with SMOTE for class balancing.

    SMOTE and the classifier are combined in an imbalanced-learn Pipeline that
    is tuned with GridSearchCV. Inside the pipeline, oversampling is applied
    only to the training part of each cross-validation fold, so every
    validation fold consists of real samples and the cross-validated ROC-AUC
    is not inflated by synthetic points. The sampler is skipped at prediction
    time.

    Prints the class distribution after SMOTE, the best parameters, accuracy,
    ROC-AUC and the classification report, and displays the ROC curve.

    Args:
        X_train (pd.DataFrame): Training features.
        X_test (pd.DataFrame): Testing features.
        y_train (pd.Series): Training labels.
        y_test (pd.Series): Testing labels.
    """
    # Local import: imblearn's Pipeline accepts samplers such as SMOTE as steps
    from imblearn.pipeline import Pipeline

    # Informational only: show the class balance SMOTE produces on the full
    # training set. Model selection below resamples within each fold instead.
    _, y_train_balanced = SMOTE(random_state=42).fit_resample(X_train, y_train)
    print("Class distribution after SMOTE:")
    print(pd.Series(y_train_balanced).value_counts())

    # Oversampling followed by the Random Forest model
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42, class_weight='balanced')),
    ])

    # Hyperparameter tuning using GridSearchCV; the 'rf__' prefix routes each
    # parameter to the classifier step of the pipeline
    param_grid = {
        'rf__n_estimators': [100, 200],
        'rf__max_depth': [10, 20, None],
        'rf__min_samples_split': [2, 5],
    }
    grid_search = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Strip the step prefix so the report shows plain Random Forest parameter names
    best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
    print(f"Best Parameters: {best_params}")

    # Make predictions
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    # Evaluate the model
    print("\nModel Evaluation:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Plot ROC Curve
    RocCurveDisplay.from_estimator(best_model, X_test, y_test)
    plt.title("ROC Curve")
    plt.show()

In [None]:
# Main Script
if __name__ == "__main__":
    # Run configuration
    target_column = "Potability"
    categorical_columns = []  # Add column names here if the dataset has categorical features
    filepath = r"C:\Pinky_Folder\github_folder\water-quality-prediction\data\water_potability.csv"

    # Load and preprocess; every element is None when loading failed
    X_train, X_test, y_train, y_test = preprocess_pipeline(filepath, target_column, categorical_columns)

    if X_train is None:
        print("Data preprocessing failed.")
    else:
        # Balance, tune and evaluate the Random Forest model
        train_and_evaluate(X_train, X_test, y_train, y_test)

# 9. Improvement
- Applies SMOTE to handle class imbalance in the training set.
- Tunes Hyperparameters for XGBoost using RandomizedSearchCV and stratified cross-validation.
- Evaluates the Model using accuracy, ROC-AUC, and a detailed classification report.

In [None]:
# imblearn's Pipeline accepts samplers such as SMOTE as steps and applies them
# only while fitting, never while predicting
from imblearn.pipeline import Pipeline

# Initialize XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

# SMOTE oversampler for the minority class
smote = SMOTE(random_state=42)

# Chain oversampling and the classifier. During cross-validation SMOTE is then
# applied to the training part of each fold only, so validation folds contain
# real samples and the ROC-AUC used for model selection is not inflated by
# synthetic points.
xgb_pipeline = Pipeline([("smote", smote), ("xgb", xgb_model)])

# Define parameter grid for tuning; the 'xgb__' prefix routes each parameter
# to the classifier step of the pipeline
param_grid = {
    'xgb__n_estimators': [50, 100, 200, 300],
    'xgb__max_depth': [3, 5, 7, 9, None],
    'xgb__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'xgb__subsample': [0.6, 0.8, 1.0],
    'xgb__colsample_bytree': [0.6, 0.8, 1.0],
    'xgb__gamma': [0, 1, 5],
    'xgb__min_child_weight': [1, 5, 10],
}

# Perform Stratified Cross-Validation with RandomizedSearchCV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_grid,
    n_iter=30,
    scoring='roc_auc',
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit on the original training data; resampling happens inside the pipeline
random_search.fit(X_train, y_train)

# The refitted pipeline was trained on the SMOTE-balanced full training set.
# Keep only its classifier step so best_xgb stays a plain XGBClassifier.
best_xgb = random_search.best_estimator_.named_steps["xgb"]
print(
    "Best Parameters:",
    {name.split("__", 1)[1]: value for name, value in random_search.best_params_.items()},
)

# Predict and evaluate
y_pred = best_xgb.predict(X_test)
y_pred_prob = best_xgb.predict_proba(X_test)[:, 1]

print("\nModel Evaluation:")
print("Accuracy:", best_xgb.score(X_test, y_test))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Model Performance:
- Accuracy: 65.1% is moderate but not ideal for classification.
- ROC-AUC: 69.6% indicates the model's ability to distinguish between classes is fair but can be improved.
- Classification Report:
- The model performs better on class 0.0 than class 1.0. This could be due to an imbalance in the dataset even after using SMOTE or limitations in the model's ability to learn minority class patterns.

# 10. Deployment

## 10.1 Save the Trained Model
- Save the trained XGBoost model to disk using Python's joblib or pickle libraries.

In [None]:
# Persist the tuned XGBoost classifier to disk so it can be loaded without retraining
model_path = "xgboost_model.pkl"
joblib.dump(value=best_xgb, filename=model_path)
print(f"Model saved to {model_path}")

## 10.2 Web App

In [None]:
#!streamlit run app.py