<a href="https://colab.research.google.com/github/ranggaadinugraha/water-quality-classification-naive-bayes/blob/main/water_quality_naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Water Quality Classification using Naive Bayes & Cross Validation
## This project aims to classify water quality (Safe / Not Safe) based on chemical parameters using the Naive Bayes algorithm.

🔷 IMPORT LIBRARIES

In [None]:
# Import Libraries for Data Processing
import pandas as pd
import numpy as np

In [None]:
# Import Libraries for Data Visualization
import plotly.figure_factory as ff
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Import Libraries for Machine Learning & Evaluation
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

🔷 LOAD DATASET

In [None]:
# Load Water Quality Dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv('waterQuality1.csv')
# Show the loaded DataFrame as the cell output
df

🔷 EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
# Display Dataset Overview: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Check Missing Values: number of NaN entries in each column
df.isna().sum()

🔷 DATA SELECTION & PREPARATION

In [None]:
# Limit Dataset Size for Faster Processing (keep only the first 1000 rows)
df = df[:1000]

# Separate Features (X) and Target Variable (Y)
# Target : is_safe -- selected by column name instead of by position (20) so the
# target always matches the label column that is dropped from the features,
# even if the column order of the CSV changes.
Y = df['is_safe']

# Features : every column except the target and 'ammonia'
# NOTE(review): the reason 'ammonia' is excluded is not stated in this notebook -- confirm.
X = df.drop(columns=['ammonia', 'is_safe'])

In [None]:
# Preview the feature matrix
X

In [None]:
# Preview the target column
Y

🔷 TRAIN-TEST SPLIT

In [None]:
# Hold out 30% of the rows for testing ("latih" = training, "tes" = test);
# the fixed random_state keeps the split reproducible between runs
X_latih, X_tes, Y_latih, Y_tes = train_test_split(
    X, Y, test_size=0.3, random_state=1
)

In [None]:
# Display Dataset Split Information (row count of each partition)
print('Data X Latih:', len(X_latih))
print('Data Y Latih:', len(Y_latih))
print('Data X Testing:', len(X_tes))
# Report Y_tes (not X_tes) so a feature/label length mismatch would be visible
print('Data Y Testing:', len(Y_tes))

🔷 MODEL DEVELOPMENT – NAIVE BAYES

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training partition
# (fit() returns the estimator itself, so construction and training are chained)
gnb = GaussianNB().fit(X_latih, Y_latih)

# Predict the class label of every row in the held-out test partition
Y_pred = gnb.predict(X_tes)

In [None]:
# Share of test rows whose predicted label equals the true label
print("Model Accuracy : ", accuracy_score(y_true=Y_tes, y_pred=Y_pred))

🔷 CONFUSION MATRIX

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns: predicted class)
gnb_cf = confusion_matrix(y_true=Y_tes, y_pred=Y_pred)

In [None]:
# Show the raw confusion-matrix array
gnb_cf

In [None]:
# Function to Visualize Confusion Matrix
def plot_cm(matrix, title, labels=('Not', 'Safe')):
  """Render a confusion matrix as an annotated Plotly heatmap.

  Args:
    matrix: Square confusion matrix laid out like the result of
      ``sklearn.metrics.confusion_matrix`` (rows are true classes, columns
      are predicted classes, both in sorted label order).
    title: Text appended to the figure title.
    labels: Display names of the classes, in the same order as the rows and
      columns of ``matrix``. The default lists 'Not' first because
      ``confusion_matrix`` sorts the labels, so ``is_safe == 0`` precedes
      ``is_safe == 1``.
      NOTE(review): assumes is_safe is encoded 0 = not safe, 1 = safe -- confirm.

  Returns:
    None. The figure is displayed with ``fig.show()``.
  """
  z = matrix
  x = list(labels)
  y = list(labels)

  # One annotation string per cell, kept in the same row order as ``z`` so
  # every number is drawn on the cell it belongs to.
  z_text = [[str(value) for value in row] for row in z]

  fig = ff.create_annotated_heatmap(z, x=x, y=y,
                                    annotation_text=z_text,
                                    colorscale='blugrn')

  fig.update_layout(
      title_text='<i><b>Confusion matrix {}</b></i>'.format(title))

  # Plotly draws the first row at the bottom; flip the y axis so the figure
  # reads like the printed matrix (first class on top).
  fig.update_yaxes(autorange='reversed')

  # Columns of a confusion matrix are the predicted classes.
  fig.add_annotation({'font':{'color':"black",'size':14},
                            'x':0.5,
                            'y':-0.1,
                            'showarrow':False,
                            'text':"Predicted label",
                            'xref':"paper",
                            'yref':"paper"})

  # Rows of a confusion matrix are the true classes.
  fig.add_annotation({'font':{'color':"black",'size':14},
                            'x':-0.20,
                            'y':0.5,
                            'showarrow':False,
                            'text':"True label",
                            'textangle':-90,
                            'xref':"paper",
                            'yref':"paper"})

  # The left margin leaves room for the rotated "True label" annotation, which
  # sits outside the plotting area (negative paper x coordinate).
  fig.update_layout(margin={'t':50, 'l':100},width=500,height=500)

  # Show the colour bar next to the heatmap.
  fig['data'][0]['showscale'] = True
  fig.show()

In [None]:
# Render the Naive Bayes confusion matrix as a heatmap
plot_cm(matrix=gnb_cf, title="model")

🔷 CLASSIFICATION REPORT

In [None]:
# Generate Classification Report
# classification_report assigns target_names to the classes in sorted label
# order, so the first name belongs to is_safe == 0 and the second to
# is_safe == 1. 'Not' therefore has to come before 'Safe'.
# NOTE(review): assumes is_safe is encoded 0 = not safe, 1 = safe -- confirm.
nb_report = classification_report(Y_tes, Y_pred,
                                  output_dict=True,
                                  target_names=['Not', 'Safe'])
# One row per class plus the accuracy / macro avg / weighted avg summary rows
pd.DataFrame(nb_report).transpose()

🔷 MODEL PERFORMANCE SUMMARY

In [None]:
# Create Performance Metrics Table
# A single row for the model; the columns are the overall accuracy followed by
# (class, metric) pairs read from the classification report dictionary.
report_classes = ('Safe', 'Not')
report_stats = ('precision', 'recall', 'f1-score')

# Two-level column header: ('', 'accuracy') first, then every class/metric pair
multiheader = [('', 'accuracy')] + [
    (cls, stat) for cls in report_classes for stat in report_stats
]

# Values in the same order as the header entries
row_values = [nb_report['accuracy']] + [
    nb_report[cls][stat] for cls, stat in multiheader[1:]
]

metrics = pd.DataFrame([row_values],
                       index=['Naive Bayes Classifier'],
                       columns=pd.MultiIndex.from_tuples(multiheader))
metrics

🔷 CROSS-VALIDATION

In [None]:
# Chain feature standardization with Gaussian Naive Bayes so the scaler is
# re-fitted on whatever data the pipeline is trained on
# (GaussianNB is left with its default priors=None)
pipeline = make_pipeline(StandardScaler(), GaussianNB())

In [None]:
# Stratified K-Fold Cross Validation
# The training partition is split into 10 stratified folds; the pipeline is
# re-fitted on the training part of every fold and scored on the held-out part.
strtfdkFold = StratifiedKFold(n_splits=10)
kfold = strtfdkFold.split(X_latih, Y_latih)

# Perform Cross Validation
scores = []
for k, (train, test) in enumerate(kfold):
  pipeline.fit(X_latih.iloc[train, :], Y_latih.iloc[train])
  score = pipeline.score(X_latih.iloc[test, :],
                         Y_latih.iloc[test])

  scores.append(score)
  # np.bincount shows how many training rows of each class this fold contains
  print('Fold: %2d, Training/Test Split Distribution: %s, Accuracy: %.3f' % (
      k+1, np.bincount(Y_latih.iloc[train]), score))

# Display Cross Validation Result once, after every fold has been scored
print('\n\nCross-Validation accuracy: %.3f +/- %.3f' % (
    np.mean(scores), np.std(scores)))


🔷 CROSS-VALIDATION VISUALIZATION

In [None]:
# Visualize Cross Validation Accuracy
sns.set_theme(style="darkgrid")

y_axis = scores
# Plot against the 1-based fold number so the x axis matches the fold numbers
# printed during cross validation
fold_numbers = list(range(1, len(y_axis) + 1))

fig, ax = plt.subplots()
sns.lineplot(x=fold_numbers, y=y_axis, marker='o', ax=ax)
ax.set(title='Cross-validation accuracy per fold',
       xlabel='Fold',
       ylabel='Accuracy')
# One tick per fold instead of the automatically chosen tick positions
ax.set_xticks(fold_numbers)
plt.show()