In [None]:
# Importing all the necessary libraries for data processing, visualization, modeling, and evaluation

import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

# Modeling
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, plot_importance

# Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Utilities
import joblib
import time


In [None]:
# 1. Load Dataset from a Specified Path
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Read the raw water-quality measurements from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/water-quality-index-wqi/Results_MADE.csv')

print("Displaying first 5 rows of the dataset:")
display(data.head())

print("\nDataset Information:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
data.info()

print("\nDescriptive Statistics:")
display(data.describe())

print("\nMissing Values Count per Column:")
print(data.isnull().sum())

# ==============================================
# 2. Handling Missing Values
# ==============================================

print("\nFilling missing values with column mean...")

# Replace every missing entry with the mean of its column and rebuild a
# DataFrame carrying the original column labels (fit_transform returns a
# plain array).
# NOTE(review): assumes every column in the CSV is numeric - confirm.
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# ==============================================
# 3. Removing Outliers
# ==============================================

def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` where
    ``Q1``/``Q3`` are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive. Rows whose value is NaN
    fail both comparisons and are therefore dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the
        conventional 1.5 * IQR rule used previously.

    Returns
    -------
    pandas.DataFrame
        The retained rows, keeping their original index labels.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the fences would cross over).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR
    return df[(df[column] >= lower) & (df[column] <= upper)]

print("\nRemoving outliers from numerical columns...")

num_cols = data_imputed.select_dtypes(include=['float64', 'int64']).columns
before_len = len(data_imputed)

# The filter is applied column after column, so each pass sees only the rows
# that survived the previous ones.
for col in num_cols:
    data_imputed = remove_outliers(data_imputed, col)

after_len = len(data_imputed)
print(f"Records before: {before_len} / after: {after_len}")

# ==============================================
# 4. Splitting Features and Target (WQI)
# ==============================================

# The target stays in its original units so that MAE / RMSE / MAPE computed
# downstream are expressed in WQI points rather than in z-scores.
features_raw = data_imputed.drop('WQI', axis=1).reset_index(drop=True)
y = data_imputed['WQI'].reset_index(drop=True)

# ==============================================
# 5. Train-Test Split
# ==============================================

# Split BEFORE scaling so that no statistic of the test rows is used while
# fitting the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features_raw, y, test_size=0.2, random_state=42
)

# ==============================================
# 6. Normalizing Features
# ==============================================

print("\nNormalizing features using StandardScaler...")

# Fit on the training features only, then apply the same transform everywhere.
scaler = StandardScaler()
scaler.fit(X_train_raw)

X_train = pd.DataFrame(scaler.transform(X_train_raw),
                       columns=X_train_raw.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw),
                      columns=X_test_raw.columns, index=X_test_raw.index)

# Full feature matrix (same training-fitted transform) and the combined frame
# are kept because later cells reference `X`, `y` and `data_scaled`.
X = pd.DataFrame(scaler.transform(features_raw),
                 columns=features_raw.columns, index=features_raw.index)
data_scaled = X.assign(WQI=y)

print("\nData is ready for modeling:")
print(f"Train samples: {X_train.shape[0]}")
print(f"Test samples : {X_test.shape[0]}")


In [None]:
# ==============================
# 1. Load the dataset
# ==============================
# Re-read the raw CSV so this cell starts from the untouched measurements.
data = pd.read_csv(
    '/kaggle/input/water-quality-index-wqi/Results_MADE.csv'
)  # Change path accordingly

# ------------------------------
# 2. Mean-impute missing values
# ------------------------------
# Learn the per-column means, then substitute them for the missing entries;
# the transformed array is wrapped back into a frame with the original
# column labels.
imputer = SimpleImputer(strategy='mean')
imputer.fit(data)
data_imputed = pd.DataFrame(
    data=imputer.transform(data),
    columns=data.columns,
)

# ==============================
# 3. Remove outliers
# ==============================
def remove_outliers(df, column):
    """Keep only the rows of ``df`` whose ``column`` value is inside the 1.5*IQR fences.

    The fences are computed from the 25th and 75th percentiles of
    ``df[column]`` and are inclusive on both sides. The input frame is not
    modified; the surviving rows keep their original index labels.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    # Inclusive lower / upper fences.
    not_too_low = values.ge(first_quartile - 1.5 * spread)
    not_too_high = values.le(third_quartile + 1.5 * spread)
    return df[not_too_low & not_too_high]

# Apply the IQR filter column after column; each pass sees only the rows that
# survived the previous ones.
numeric_columns = data_imputed.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    data_imputed = remove_outliers(data_imputed, col)

# ==============================
# 4. Feature/target split
# ==============================
# The target stays in its original units so that the MAE / RMSE reported
# below are expressed in WQI points rather than in z-scores.
features_raw = data_imputed.drop('WQI', axis=1).reset_index(drop=True)
y = data_imputed['WQI'].reset_index(drop=True)

# Split into training and test sets BEFORE scaling so that no statistic of
# the test rows is used while fitting the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features_raw, y, test_size=0.2, random_state=42
)

# ==============================
# 5. Standardize the features
# ==============================
# Fit on the training features only, then apply the same transform everywhere.
scaler = StandardScaler()
scaler.fit(X_train_raw)

X_train = pd.DataFrame(scaler.transform(X_train_raw),
                       columns=X_train_raw.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw),
                      columns=X_test_raw.columns, index=X_test_raw.index)

# Full feature matrix (same training-fitted transform) and the combined frame
# are kept because later cells reference `X`, `y` and `data_scaled`.
X = pd.DataFrame(scaler.transform(features_raw),
                 columns=features_raw.columns, index=features_raw.index)
data_scaled = X.assign(WQI=y)

# ==============================
# 6. Train Linear Regression Model
# ==============================
# Fit an ordinary least-squares model on the training split.
model = LinearRegression().fit(X_train, y_train)

# ------------------------------
# 7. Score the held-out split
# ------------------------------
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(
    "=== Linear Regression Model Evaluation ===",
    f"MAE:  {mae:.4f}",
    f"RMSE: {rmse:.4f}",
    f"R²:   {r2:.4f}",
    sep="\n",
)

# ------------------------------
# 8. Actual vs predicted scatter
# ------------------------------
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, ax=ax)
# Dashed red diagonal marks where prediction equals the actual value.
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
ax.set_xlabel("Actual WQI")
ax.set_ylabel("Predicted WQI")
ax.set_title("Actual vs Predicted WQI")
ax.grid(True)
fig.tight_layout()
plt.show()

# ------------------------------
# 9. Fitted coefficients, largest first
# ------------------------------
coefficients = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': model.coef_})
    .sort_values(by='Coefficient', ascending=False)
)

print("\n=== Linear Regression Coefficients ===", coefficients, sep="\n")

# ------------------------------
# 10. Histogram of the residuals
# ------------------------------
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, kde=True, bins=20, ax=ax)
ax.set_xlabel("Residuals")
ax.set_title("Distribution of Residuals")
ax.grid(True)
fig.tight_layout()
plt.show()

# ------------------------------
# 11. 5-fold cross-validated R² on the full X / y
# ------------------------------
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"\nCross-Validated R² Score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")


In [None]:
# ==============================
# 1. Train Random Forest Regressor
# ==============================
# Fit a 100-tree random forest with a fixed seed on the training split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# ------------------------------
# 2. Score the held-out split
# ------------------------------
y_pred_rf = rf_model.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(
    "\n=== Random Forest Regression Performance ===",
    f"MAE :  {mae_rf:.4f}",
    f"RMSE:  {rmse_rf:.4f}",
    f"R²   :  {r2_rf:.4f}",
    sep="\n",
)

# ------------------------------
# 3. 5-fold cross-validated R² on the full X / y
# ------------------------------
cv_scores_rf = cross_val_score(rf_model, X, y, cv=5, scoring='r2')
print(f"\nCross-Validated R² Score: {cv_scores_rf.mean():.4f} (±{cv_scores_rf.std():.4f})")

# ------------------------------
# 4. Feature importances, largest first
# ------------------------------
importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

print("\n=== Feature Importances ===", importance_df, sep="\n")

# ------------------------------
# 5. Horizontal bar chart of the importances
# ------------------------------
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis', ax=ax)
ax.set_title('Feature Importance - Random Forest Regressor', fontsize=14)
ax.set_xlabel('Relative Importance', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()


In [None]:
# Using only Conductivity column as feature and WQI as target
# Single-predictor experiment: conductivity is the only input, WQI the target.
X_single = data_scaled[['Conductivity (mho/ Cm)']]
y_single = data_scaled['WQI']

# 80/20 split with the same seed used elsewhere in the notebook.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_single,
    y_single,
    test_size=0.2,
    random_state=42,
)

# Build and fit the forest in one step.
rf_single = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_s, y_train_s)

# Held-out metrics.
y_pred_s = rf_single.predict(X_test_s)
mae_s = mean_absolute_error(y_test_s, y_pred_s)
rmse_s = np.sqrt(mean_squared_error(y_test_s, y_pred_s))
r2_s = r2_score(y_test_s, y_pred_s)

# The header is wrapped in ANSI escape codes (bold yellow, then reset).
print(
    "\n\033[1;33m=== Random Forest Evaluation (Conductivity only) ===\033[0m",
    f"MAE:  {mae_s:.4f}",
    f"RMSE: {rmse_s:.4f}",
    f"R2:   {r2_s:.4f}",
    sep="\n",
)

# 5-fold cross-validated R² on the full single-feature data.
cv_scores_s = cross_val_score(rf_single, X_single, y_single, cv=5, scoring='r2')
print(f"\nR2-Score with Cross-Validation: {cv_scores_s.mean():.4f} (±{cv_scores_s.std():.4f})")

In [None]:
# 1. Defining the model with optimal parameters
xgb_model = XGBRegressor(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mae'
)

# 2. Training the model
# The test split is passed as eval_set so the MAE is logged after every
# boosting round (verbose=True). No early stopping is configured here.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=True
)

# 3. Predictions
y_pred_xgb = xgb_model.predict(X_test)

# 4. Calculating metrics
# For MAPE, zero targets are replaced by 1 in the denominator to avoid a
# division by zero.
metrics = {
    'MAE': mean_absolute_error(y_test, y_pred_xgb),
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_xgb)),
    'R2': r2_score(y_test, y_pred_xgb),
    'MAPE': np.mean(np.abs((y_test - y_pred_xgb) / np.where(y_test==0, 1, y_test))) * 100
}

# 5. Displaying the results
print("\n" + "="*40)
print(" Final Evaluation of XGBoost Model")
print("="*40)
for name, value in metrics.items():
    print(f"{name}: {value:.4f}")

# 6. Saving the model
joblib.dump(xgb_model, 'xgb_wqi_model.pkl')

# 7. Feature importance plot
# plot_importance() creates its own figure when no axes are supplied, which
# previously left the 10x6 figure empty and drew the chart at default size.
# Passing ax= draws the chart on the figure created here.
fig, ax = plt.subplots(figsize=(10, 6))
plot_importance(xgb_model, ax=ax, max_num_features=10, importance_type='weight')
ax.set_title('Feature Importance in XGBoost Model')
fig.tight_layout()
plt.show()


In [None]:
# 1. Defining the model
# Unfitted base estimator for the search. This rebinds `rf_model`, replacing
# whatever object the name referred to before this cell ran.
rf_model = RandomForestRegressor(random_state=42)

# 2. Hyperparameter grid to search (3 * 3 * 2 * 2 = 36 combinations)
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2']
}

# 3. Setting up GridSearchCV
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,  # Using all CPU cores
    verbose=1
)

# 4. Running the hyperparameter search
print("Starting hyperparameter tuning...")
start_time = time.time()
grid_search_rf.fit(X_train, y_train)
print(f"Execution time: {(time.time() - start_time)/60:.2f} minutes")

# 5. Results
print("\nBest parameters:")
print(grid_search_rf.best_params_)

# The scorer is 'neg_mean_squared_error', so best_score_ holds the NEGATED
# MSE; flip the sign so the value printed under the "MSE" label is an
# actual (non-negative) mean squared error.
print("\nBest score (MSE):")
print(f"{-grid_search_rf.best_score_:.4f}")

# 6. Keep a handle on the best estimator found by the search
#    (nothing is written to disk in this cell).
best_rf = grid_search_rf.best_estimator_


In [None]:
import time
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# 1. Define base XGBoost model
# Unfitted base estimator for the search. This rebinds `xgb_model`, replacing
# whatever object the name referred to before this cell ran.
xgb_model = XGBRegressor(
    random_state=42,
    objective='reg:squarederror',
    n_jobs=-1,
    verbosity=0  # Suppress internal XGBoost messages
)

# 2. Define parameter grid for hyperparameter tuning
#    (six parameters with three values each = 3**6 = 729 combinations).
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# 3. Setup advanced GridSearchCV with multiple scoring metrics
# NOTE(review): both the estimator and the search request all cores
# (n_jobs=-1); this may oversubscribe the CPU - confirm on the target machine.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring={
        'MAE': 'neg_mean_absolute_error',
        'MSE': 'neg_mean_squared_error',
        'R2': 'r2'
    },
    refit='MSE',  # Select best model based on MSE
    cv=5,
    n_jobs=-1,
    verbose=0  # Suppress cross-validation output
)

# 4. Run the search with time tracking
print(" Starting hyperparameter optimization...")
start_time = time.time()
grid_search_xgb.fit(X_train, y_train)
elapsed_time = (time.time() - start_time) / 60
print(f" Optimization completed in {elapsed_time:.2f} minutes.")

# 5. Show the final results
print("\n🎯 Best Hyperparameters:")
print(grid_search_xgb.best_params_)

# Extract best index and performance metrics
best_index = grid_search_xgb.best_index_
results = pd.DataFrame(grid_search_xgb.cv_results_)

# The MAE and MSE scorers are the 'neg_*' variants, so their cv_results_
# columns hold negated errors. Flip the sign before labelling them
# "MAE" / "MSE"; R² is reported as-is.
best_row = results.loc[best_index]
best_metrics = pd.Series({
    'MAE': -best_row['mean_test_MAE'],
    'MSE': -best_row['mean_test_MSE'],
    'R²': best_row['mean_test_R2'],
})

print("\n Best Model Evaluation Metrics:")
print(best_metrics.to_string())

# 6. Save the best model to file
best_xgb = grid_search_xgb.best_estimator_
best_xgb.save_model('best_xgb_model.json')

# 7. Analyze feature importance (gain-based), largest first.
#    This rebinds `importance_df`.
importance = best_xgb.get_booster().get_score(importance_type='gain')
importance_df = pd.DataFrame({
    'Feature': list(importance.keys()),
    'Importance': list(importance.values())
}).sort_values('Importance', ascending=False)

print("\n Top 10 Most Important Features:")
print(importance_df.head(10).to_string(index=False))
