In [None]:
# Importing all the necessary libraries for data processing, visualization, modeling, and evaluation

import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

# Modeling
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, plot_importance

# Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Utilities
import joblib
import time


In [None]:
# 1. Load Dataset from a Specified Path
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the raw water-quality table from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/water-quality-index-wqi/Results_MADE.csv')

print("Displaying first 5 rows of the dataset:")
display(data.head())

print("\nDataset Information:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
data.info()

print("\nDescriptive Statistics:")
display(data.describe())

print("\nMissing Values Count per Column:")
print(data.isnull().sum())

# ==============================================
# 2. Handling Missing Values
# ==============================================

print("\nFilling missing values with column mean...")

# Every column of `data` is passed to the mean imputer, and the result is
# wrapped back into a DataFrame to restore the original column labels.
# NOTE(review): the imputer is fitted on all rows of the file; if a held-out
# set is carved out of this frame later, its values contribute to the means.
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# ==============================================
# 3. Removing Outliers
# ==============================================

def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]``. Values exactly on a
    fence are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : label
        Column whose values are screened.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the previously hard-coded 1.5.

    Returns
    -------
    pandas.DataFrame
        Rows inside the fences, keeping their original index labels.

    Notes
    -----
    Rows where ``column`` is NaN are dropped too, because NaN fails the range
    check. When the IQR is 0 both fences collapse onto the quartile value, so
    only rows equal to that value survive.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    spread = q3 - q1
    lower = q1 - factor * spread
    upper = q3 + factor * spread
    # between() is inclusive on both ends by default, matching >= / <=.
    return df[df[column].between(lower, upper)]

print("\nRemoving outliers from numerical columns...")

num_cols = data_imputed.select_dtypes(include=['float64', 'int64']).columns
before_len = len(data_imputed)

# The frame is re-filtered once per column, so each column's quartiles are
# computed on the rows that survived the filters of the preceding columns.
for col in num_cols:
    data_imputed = remove_outliers(data_imputed, col)

after_len = len(data_imputed)
print(f"Records before: {before_len} / after: {after_len}")

# ==============================================
# 4. Train-Test Split (performed before scaling)
# ==============================================

# Splitting first keeps the held-out rows out of the scaler's mean/std.
# Fitting the scaler on the full frame and splitting afterwards would leak
# test-set statistics into the training data.
data_clean = data_imputed.reset_index(drop=True)
train_rows, test_rows = train_test_split(data_clean, test_size=0.2, random_state=42)

# ==============================================
# 5. Normalizing Features
# ==============================================

print("\nNormalizing features using StandardScaler...")

scaler = StandardScaler()
# Fit on the training rows only, then apply the same transform to the test rows.
train_scaled = pd.DataFrame(
    scaler.fit_transform(train_rows),
    columns=data_clean.columns,
    index=train_rows.index,
)
test_scaled = pd.DataFrame(
    scaler.transform(test_rows),
    columns=data_clean.columns,
    index=test_rows.index,
)

# Full scaled frame in original row order, kept for code that expects
# `data_scaled`, `X` and `y` to exist.
data_scaled = pd.concat([train_scaled, test_scaled]).sort_index()

# ==============================================
# 6. Splitting Features and Target (WQI)
# ==============================================

X = data_scaled.drop('WQI', axis=1)
y = data_scaled['WQI']

X_train, y_train = train_scaled.drop('WQI', axis=1), train_scaled['WQI']
X_test, y_test = test_scaled.drop('WQI', axis=1), test_scaled['WQI']

print("\nData is ready for modeling:")
print(f"Train samples: {X_train.shape[0]}")
print(f"Test samples : {X_test.shape[0]}")

In [None]:
# ==============================
# 1. Read the raw CSV into a DataFrame
# ==============================
data = pd.read_csv(
    '/kaggle/input/water-quality-index-wqi/Results_MADE.csv'  # Change path accordingly
)

# ==============================
# 2. Replace missing entries with the column mean
# ==============================
imputer = SimpleImputer(strategy='mean')
# Wrap the imputer output back into a DataFrame carrying the original
# column labels.
data_imputed = pd.DataFrame(
    data=imputer.fit_transform(data),
    columns=data.columns,
)

# ==============================
# 3. Remove outliers
# ==============================
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]``. Values exactly on a
    fence are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : label
        Column whose values are screened.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the previously hard-coded 1.5.

    Returns
    -------
    pandas.DataFrame
        Rows inside the fences, keeping their original index labels.

    Notes
    -----
    Rows where ``column`` is NaN are dropped too, because NaN fails the range
    check. When the IQR is 0 both fences collapse onto the quartile value, so
    only rows equal to that value survive.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    spread = q3 - q1
    lower_bound = q1 - factor * spread
    upper_bound = q3 + factor * spread
    # between() is inclusive on both ends by default, matching >= / <=.
    return df[df[column].between(lower_bound, upper_bound)]

# Collect the float64/int64 columns, then filter the frame one column at a
# time; each pass works on the rows left over from the previous pass.
numeric_columns = data_imputed.select_dtypes(
    include=['float64', 'int64']
).columns
for col in numeric_columns:
    data_imputed = remove_outliers(df=data_imputed, column=col)
