<a href="https://colab.research.google.com/github/sauravkrpal/IBM-water-quality-prediction-Data-Analytics-Project/blob/main/Water_Quality_Prediction_Data_Analytics_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the 'anbarivan/indian-water-quality-data' dataset through kagglehub and keep
# whatever dataset_download returns (presumably the local dataset directory — TODO confirm).
# NOTE(review): none of the cells visible in this notebook read this variable; the loading
# cells use the hard-coded "../input/indian-water-quality-data" path instead.
anbarivan_indian_water_quality_data_path = kagglehub.dataset_download('anbarivan/indian-water-quality-data')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
# Show the contents of the dataset directory that the loading cell reads its CSV from.
print(os.listdir("../input/indian-water-quality-data"))

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw CSV, keep its first 1900 rows, and give the measurement columns short names
df = (
    pd.read_csv("../input/indian-water-quality-data/water_dataX.csv", encoding='unicode_escape')
    .iloc[:1900, :]
    .rename(columns={
        "D.O. (mg/l)": "DO",
        "CONDUCTIVITY (µmhos/cm)": "Conductivity",
        "B.O.D. (mg/l)": "BOD",
        "NITRATENAN N+ NITRITENANN (mg/l)": "NI",
        "FECAL COLIFORM (MPN/100ml)": "Fec_col",
        "TOTAL COLIFORM (MPN/100ml)Mean": "Tot_col",
    })
)

In [None]:
# Build the working frame by removing the identifier columns and the unused measurements
df_num = df.drop(columns=["STATION CODE", "NI", "LOCATIONS", "STATE", "year", "BOD", "Fec_col", "Tot_col"])
df_num

In [None]:
# Coerce every column of df to a numeric dtype; entries that cannot be parsed become NaN
for col_name in df.columns:
    df[col_name] = pd.to_numeric(df[col_name], errors="coerce")

df.dtypes

In [None]:
# Convert all columns to numeric, coerce errors to NaN
# (pd.to_numeric is applied to each column; df_num is rebound to the converted frame).
df_num = df_num.apply(pd.to_numeric, errors='coerce')

In [None]:
def convert_to_nan(df):
    """Replace the literal string "NAN" with ``np.nan`` in every column of ``df``.

    Columns are visited by position and written back into ``df`` itself, so the
    caller's frame is modified; the same frame object is returned.
    """
    for position in range(len(df.columns)):
        column = df.iloc[:, position]
        df.iloc[:, position] = column.replace("NAN", np.nan)
    return df

# Clean the "NAN" markers in df, then list the number of missing values per column
# in ascending order.
df = convert_to_nan(df)
df.isnull().sum().sort_values()

In [None]:
# Convert all columns to numeric, coerce errors to NaN
df_num = df_num.apply(pd.to_numeric, errors='coerce')

# Replace any remaining "NAN" strings with np.nan. This reuses the convert_to_nan helper
# defined in the earlier cell; the identical second definition that used to live here
# only shadowed it.
df_num = convert_to_nan(df_num)

In [None]:
# Fill every missing value with the median of its column and rebuild the frame
# under the original column names
imputer = SimpleImputer(strategy="median")
df_num = pd.DataFrame(
    data=imputer.fit_transform(df_num),
    columns=df_num.columns,
)


In [None]:
# Detecting and removing outliers using Z-Score
def detect_outliers_zscore(df, threshold=3):
    """Return the rows of ``df`` in which no column is a z-score outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; z-scores are computed per column.
    threshold : float, default 3
        A row is dropped when any of its values has ``|z| >= threshold``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, keeping their original index labels.

    Notes
    -----
    A constant column has zero standard deviation, so its z-scores are NaN.
    NaN z-scores are treated as "not an outlier"; with a plain ``|z| < threshold``
    test a single constant column would discard every row of the frame.
    """
    # Dividing by a zero standard deviation is expected for constant columns.
    with np.errstate(divide="ignore", invalid="ignore"):
        abs_z_scores = np.abs(np.asarray(zscore(df), dtype=float))
    # NaN >= threshold evaluates to False, so undefined z-scores never flag a row.
    is_outlier_row = (abs_z_scores >= threshold).any(axis=1)
    return df[~is_outlier_row]

# Replace df_num with the rows kept by detect_outliers_zscore (default threshold of 3).
df_num = detect_outliers_zscore(df_num)


In [None]:
# Plot histograms
def plot_histograms(df):
    """Show one histogram with a KDE overlay, on its own figure, per column of ``df``."""
    for column in df.columns:
        _, axis = plt.subplots(figsize=(7, 5))
        sns.histplot(df[column], kde=True, ax=axis)
        axis.set_title(f'Distribution of {column}')
        plt.show()

plot_histograms(df_num)


In [None]:
# KDE plots
def plot_kde(df):
    """Show one kernel-density plot, on its own figure, per column of ``df``."""
    for column in df.columns:
        _, axis = plt.subplots(figsize=(7, 5))
        sns.kdeplot(data=df, x=column, ax=axis)
        axis.set_title(f'KDE Plot of {column}')
        plt.show()

plot_kde(df_num)

In [None]:
# Annotated heatmap of the pairwise correlations between the df_num columns
fig, ax = plt.subplots(figsize=(13, 9))
corr = df_num.corr()
sns.heatmap(corr, annot=True, cmap='magma', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Pairplot
# Pairwise relationships between all columns of df_num, drawn with seaborn.
sns.pairplot(df_num)
plt.show()

In [None]:
# Boxplot
def plot_boxplots(df):
    """Show one horizontal box plot, on its own figure, per column of ``df``."""
    for column in df.columns:
        _, axis = plt.subplots(figsize=(7, 5))
        sns.boxplot(data=df, x=column, ax=axis)
        axis.set_title(f'Boxplot of {column}')
        plt.show()

plot_boxplots(df_num)

In [None]:
# Scatter plot matrix
# Same pairwise view as above, this time drawn with pandas' own plotting helper.
pd.plotting.scatter_matrix(df_num, figsize=(12, 12))
plt.show()

In [None]:
# Rebuild df_num from df: drop the unused columns, coerce to numeric, clean "NAN" markers
df_num = convert_to_nan(
    df.drop(columns=["STATION CODE", "NI", "LOCATIONS", "STATE", "year", "BOD", "Fec_col", "Tot_col"])
    .apply(pd.to_numeric, errors='coerce')
)
df_num


In [None]:
# Fill missing values with each column's median, keeping the original column names
imputer = SimpleImputer(strategy="median")
df_num = pd.DataFrame(
    data=imputer.fit_transform(df_num),
    columns=df_num.columns,
)
df_num

In [None]:
# Compute Z-Score
# Keep only the rows that detect_outliers_zscore does not flag (default threshold).
df_num = detect_outliers_zscore(df_num)
df_num

In [None]:
# Temp has nothing to do with WQI, so dropping.
# Rebind instead of dropping in place, and tolerate a missing column, so re-running this
# cell does not raise a KeyError once "Temp" is already gone and the filtered frame
# produced by the outlier step is never mutated.
df_num = df_num.drop(columns=["Temp"], errors="ignore")

In [None]:
# Weight Vector (wi)
# One weight per parameter; calc_wqi multiplies the i-th quality rating by wi[i].
# NOTE(review): the order presumably follows the first three df_num columns
# (DO, PH, Conductivity) — confirm against the frame.
wi = np.array([0.2213, 0.2604, 0.0022])
wi

In [None]:
# Standard values of parameters (si)
# calc_wqi reads si[i] as the recommended standard for parameter i and divides by
# (si[i] - vIdeal[i]), so a standard must never equal the matching ideal value.
si = np.array([10, 8.5, 1000])
si

In [None]:
# Ideal values of parameters (vIdeal)
vIdeal = np.array([14.6, 7, 0])

def calc_wqi(sample, weights=None, standards=None, ideals=None):
    """Compute the weighted quality index of a single sample.

    For each parameter ``i`` the quality rating is
    ``q_i = 100 * (v_i - ideal_i) / (standard_i - ideal_i)`` and the function
    returns ``sum(q_i * w_i)``.

    Parameters
    ----------
    sample : sequence, numpy array or pandas Series
        Observed values. They are read strictly by position, so a Series with
        string labels (a DataFrame row) works without relying on the
        integer-key positional fallback that pandas has deprecated.
        Values beyond ``len(ideals)`` are ignored.
    weights, standards, ideals : array-like, optional
        Per-parameter weights, recommended standard values and ideal values.
        Each defaults to the notebook-level ``wi``, ``si`` and ``vIdeal``.

    Returns
    -------
    float
        The weighted sum of the quality ratings.

    NOTE(review): the sum is not divided by the total weight; confirm that
    un-normalised weights are intended.
    """
    weights = np.asarray(wi if weights is None else weights, dtype=float)
    standards = np.asarray(si if standards is None else standards, dtype=float)
    ideals = np.asarray(vIdeal if ideals is None else ideals, dtype=float)

    # Positional view of the sample, limited to the parameters that have constants.
    observed = np.asarray(sample, dtype=float)[:len(ideals)]
    quality_ratings = (observed - ideals) / (standards - ideals) * 100
    return float(np.sum(quality_ratings * weights))
def calc_wqi_for_df(df):
    """Return a list holding the calc_wqi value of every row of ``df``, in row order."""
    return [calc_wqi(df.iloc[row_pos, :]) for row_pos in range(df.shape[0])]

# Column vector of shape (n_rows, 1) holding the WQI of every row of df_num
wqi_arr = np.array(calc_wqi_for_df(df_num)).reshape(-1, 1)
wqi_arr

In [None]:
# Place the WQI column next to the attributes; the attribute index is reset first so
# both frames line up on a fresh 0..n-1 index
df_wqi = pd.concat(
    [
        df_num.reset_index(drop=True),
        pd.DataFrame(wqi_arr, columns=["WQI"]),
    ],
    axis=1,
)
df_wqi

In [None]:
# Removing the samples with negative WQI.
# .copy() makes df_wqi an independent frame, so adding columns to it later does not
# write into a slice of the unfiltered frame (the SettingWithCopy pattern).
df_wqi = df_wqi[df_wqi["WQI"] >= 0].copy()
df_wqi.head()

In [None]:
# WQI classification: 4 (<= 25), 3 (<= 50), 2 (<= 75), 1 (<= 100), 0 (above 100).
# The thresholds are chained upper bounds so the bands are contiguous; with separate
# "26 <= x <= 50"-style ranges a fractional WQI such as 25.5 matched no band and was
# mislabelled as class 0.
df_wqi["WQI clf"] = df_wqi["WQI"].apply(
    lambda x: 4 if x <= 25 else 3 if x <= 50 else 2 if x <= 75 else 1 if x <= 100 else 0
)

In [None]:
# Copy of all df_wqi column names (the [:] slice takes the whole list).
features = list(df_wqi.columns)[:]
# Summary statistics of the numeric class labels.
data_cluster = df_wqi['WQI clf']
data_cluster.describe()

 WQI Range                   Classification

     25 or less                    Excellent (4)
        26–50                         Good (3)
        51–75                         Poor (2)
        76–100                      Very Poor (1)
    Greater than 100               Unsuitable (0)

In [None]:
# WQI classification: 4 (<= 25), 3 (<= 50), 2 (<= 75), 1 (<= 100), 0 (above 100).
# Chained upper bounds keep the bands contiguous, so fractional values between two
# integer limits (e.g. 50.4) land in the next band instead of falling through to 0.
df_wqi["WQI clf"] = df_wqi["WQI"].apply(
    lambda x: 4 if x <= 25 else 3 if x <= 50 else 2 if x <= 75 else 1 if x <= 100 else 0
)
df_wqi.head(10)

In [None]:
# Lower-triangle correlation heatmap of the three parameters and the WQI
corr = df_wqi[["DO", "PH", "Conductivity", "WQI"]].corr()
# Hide the upper triangle (diagonal included) because it mirrors the lower one
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
f, ax = plt.subplots(figsize=(13, 9))
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .7},
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

In [None]:
# KMeans clustering and Elbow method
# The first three columns of df_wqi are used as the clustering features.
features = list(df_wqi.columns)[:3]
data_f = df_wqi[features]

# Ensure data_f is not empty before proceeding
if data_f.empty:
    raise ValueError("No data available in data_f. Please check the data preprocessing steps.")

sse = []
list_k = list(range(1, 10))

for k in list_k:
    # KMeans starts from random centroids; a fixed seed and an explicit n_init make the
    # inertia curve identical on every Restart & Run All.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(data_f)
    sse.append(km.inertia_)

# NOTE(review): data_f is clustered unscaled here, while the silhouette cell clusters
# X_scaled — confirm which scale the elbow is meant to be read on.
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance')
plt.title('Elbow Method for Optimal k')
plt.show()

In [None]:
# Logistic Regression
Y = df_wqi['WQI clf']
X = df_wqi[features]

# Split first, then fit the scaler on the training rows only: fitting it on the full
# data lets the test rows' mean/variance leak into training. The same random_state
# yields the same row split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=30)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole feature matrix on the training-set scale, kept for later cells that use X_scaled.
X_scaled = scaler.transform(X)

# NOTE(review): 'WQI clf' is computed from WQI, which is itself computed from these same
# feature columns, so the reported accuracy measures how well the rule is recovered.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# Clustering with Silhouette Score
# Fixed seed and explicit n_init: unseeded KMeans can return a different partition, and
# therefore a different silhouette score, on every run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_scaled)
labels = kmeans.labels_
silhouette_score(X_scaled, labels, metric='euclidean') # silhouette

In [None]:
# Distribution of the WQI values: 30-bin histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_wqi['WQI'], bins=30, kde=True, ax=ax)
ax.set_title('Histogram of Water Quality Index (WQI)')
ax.set_xlabel('WQI')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Number of samples in each WQI class
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=df_wqi['WQI clf'], ax=ax)
ax.set_title('Count Plot of WQI Classification')
ax.set_xlabel('WQI Classification')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Confusion matrix of the logistic-regression predictions, drawn as an annotated heatmap
# with the model's class labels on both axes
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=model.classes_,
    yticklabels=model.classes_,
    ax=ax,
)
ax.set_title('Confusion Matrix Heatmap')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()


In [None]:
# Samples per WQI class, ordered by class label
wqi_class_counts = df_wqi['WQI clf'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=wqi_class_counts.index, y=wqi_class_counts.values, palette='viridis', ax=ax)
ax.set_title('Bar Plot of WQI Classification Counts')
ax.set_xlabel('WQI Classification')
ax.set_ylabel('Count')
plt.show()


In [None]:
# One box per df_num column, all on a shared axis
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df_num, ax=ax)
ax.set_title('Box Plot for Various Features')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Annotated correlation heatmap of the df_num columns
corr_matrix = df_num.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='magma', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()
