<a href="https://colab.research.google.com/github/rohan3433/Ground_water_quality_detection-ML/blob/main/Ground_Water_Quality_Detection_Project_ML_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive
# NOTE(review): the CSV files read in this cell are loaded from /content/,
# not from a path under /content/drive - confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

# Read the 2018, 2019 and 2020 CSV files into df_1, df_2 and df_3 respectively
csv_paths = (
    '/content/ground_water_quality_2018_post.csv',
    '/content/ground_water_quality_2019_post.csv',
    '/content/ground_water_quality_2020_post.csv',
)
df_1, df_2, df_3 = map(pd.read_csv, csv_paths)

In [None]:
# Clean each yearly frame, then merge them into the single frame `df`.

# Fill missing numeric values with the column mean.  numeric_only=True is
# needed on pandas >= 2.0, where DataFrame.mean() raises TypeError for frames
# holding text columns (e.g. 'Classification') instead of skipping them.
df_1 = df_1.fillna(df_1.mean(numeric_only=True))

# Rename the df_2 headers so they match the column names listed in `numcol`
replaced_names = {'CO_-2 ': 'CO3', 'HCO_ - ': 'HCO3', 'Cl -': 'Cl', 'F -': 'F',
                  'NO3- ': 'NO3 ', 'SO4-2': 'SO4', 'Na+': 'Na', 'K+': 'K',
                  'Ca+2': 'Ca', 'Mg+2': 'Mg', 'EC': 'E.C'}
df_2 = df_2.rename(columns=replaced_names)
df_2 = df_2.fillna(df_2.mean(numeric_only=True))

# Drop the unwanted column in df_3; errors='ignore' keeps the cell re-runnable
# once the column is already gone.
df_3 = df_3.drop(columns='Unnamed: 8', errors='ignore')
df_3 = df_3.fillna(df_3.mean(numeric_only=True))

# Combine the datasets (each frame still carries its own 0..n-1 row labels here)
df = pd.concat([df_1, df_2, df_3])

# Normalise the spelling of the 'O.G' class label
df['Classification'] = df['Classification'].replace({'O.G': 'OG'})

# List of numerical columns
numcol = ['lat_gis', 'long_gis', 'gwl', 'pH', 'E.C', 'TDS', 'CO3', 'HCO3',
          'Cl', 'F', 'NO3 ', 'SO4', 'Na', 'K', 'Ca', 'Mg', 'T.H', 'SAR',
          'RSC  meq  / L']

# Correct the bad pH entry at row label 261 and convert the column to float.
# After the concat, label 261 occurs once per source file, so a plain
# df.loc[261, 'pH'] = 8.05 would overwrite every one of those rows.  Only the
# entry that cannot be parsed as a number is replaced.
# NOTE(review): assumes the value being corrected is the unparsable one -
# confirm against the raw CSVs.
ph_as_number = pd.to_numeric(df['pH'], errors='coerce')
bad_ph = (df.index == 261) & df['pH'].notna().to_numpy() & ph_as_number.isna().to_numpy()
df.loc[bad_ph, 'pH'] = 8.05
df['pH'] = df['pH'].astype(float)

# Give every row a unique label now that the label-based correction is done
df = df.reset_index(drop=True)

In [None]:
# Scatter grids: every column in `numcol` plotted first against SAR and then
# against E.C, with points coloured by Classification.
for x_feature in ('SAR', 'E.C'):
    fig = plt.figure(figsize=(25, 30))
    for position, feature in enumerate(numcol, start=1):
        plt.subplot(7, 3, position)
        plt.title(feature)
        sns.scatterplot(data=df, x=df[x_feature], y=df[feature],
                        hue=df['Classification'])
    plt.tight_layout()
    plt.show()



In [None]:
# One histogram (with KDE overlay) per column in `numcol`, on a 7x3 grid
fig = plt.figure(figsize=(25, 25))
for slot, column in enumerate(numcol, start=1):
    plt.subplot(7, 3, slot)
    sns.histplot(data=df, x=df[column], kde=True)
plt.tight_layout()
plt.show()



In [None]:
# Columns that get a log(1 + x) transform
var_features = [
    'gwl', 'E.C', 'TDS', 'CO3', 'HCO3',
    'Cl', 'F', 'NO3 ', 'SO4', 'Na',
    'K', 'Ca', 'Mg', 'T.H', 'SAR',
]
# NOTE: this overwrites the columns in `df`, so running the cell a second time
# applies the transform again.
log_scaled = np.log1p(df[var_features])
df[var_features] = log_scaled

# Histograms (with KDE overlay) of the transformed columns on a 5x3 grid
fig = plt.figure(figsize=(25, 25))
for slot, feature in enumerate(var_features, start=1):
    plt.subplot(5, 3, slot)
    sns.histplot(data=df, x=feature, kde=True)
plt.tight_layout()
plt.show()



In [None]:
# Remove the 'OG' rows.  .copy() gives an independent frame, so the column
# assignment below does not write into a slice of the previous `df`
# (which triggers SettingWithCopyWarning).
df = df[df['Classification'] != 'OG'].copy()

# Class label -> integer code used as the model target
class_codes = {
    'C2S1': 2, 'C3S1': 1, 'C4S2': 0, 'C4S1': 0, 'C3S2': 1, 'C4S4': 0,
    'C4S3': 0, 'C1S1': 3, 'C3S4': 0, 'C3S3': 1, 'C2S2': 2,
}
# Values missing from the table are passed through unchanged, so re-running
# the cell on already-encoded data is harmless.  The explicit astype(int)
# guarantees an integer target instead of relying on replace() silently
# downcasting an object column, and fails loudly if an unknown label remains.
df['Classification'] = (
    df['Classification']
    .map(lambda label: class_codes.get(label, label))
    .astype(int)
)

# Defining features and target: identifier/location columns are dropped, as
# are E.C, SAR, TDS and the duplicate 'Classification.1' column.
X = df.drop(['sno', 'Classification', 'district', 'mandal', 'village',
             'E.C', 'SAR', 'Classification.1', 'TDS'], axis=1)
y = df['Classification']

# One-hot encode any remaining non-numeric columns
X = pd.get_dummies(X)

# Splitting data into training and testing sets (70/30, fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)



In [None]:
# SVM classifiers with different kernels
from sklearn.svm import SVC
svm_linear = SVC(random_state=1234, kernel='linear')
svm_poly = SVC(random_state=1234, kernel='poly', degree=3)
svm_rbf = SVC(random_state=1234, kernel='rbf')
svm_sigmoid = SVC(random_state=1234, kernel='sigmoid')
baseline_models = (svm_linear, svm_poly, svm_rbf, svm_sigmoid)

# Fit every kernel on the training split
for model in baseline_models:
    model.fit(X_train, y_train)

# Accuracy of every kernel on the held-out split
linear_accuracy, poly_accuracy, rbf_accuracy, sigmoid_accuracy = (
    model.score(X_test, y_test) for model in baseline_models
)

accuracy_lines = (
    ("Linear Kernel Accuracy:", linear_accuracy),
    ("Polynomial Kernel Accuracy:", poly_accuracy),
    ("RBF Kernel Accuracy:", rbf_accuracy),
    ("Sigmoid Kernel Accuracy:", sigmoid_accuracy),
)
for caption, accuracy in accuracy_lines:
    print(caption, accuracy)



In [None]:
# Precision, recall, F1-score, and classification report
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

def _weighted_metrics(y_true, y_pred):
    """Return weighted precision, recall, F1 and the text classification report.

    zero_division=0 is passed to every metric so that classes which are never
    predicted score 0 without raising UndefinedMetricWarning, matching the
    setting used for the classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by a model.

    Returns
    -------
    tuple
        (precision, recall, f1, report) where the first three are floats and
        ``report`` is the string produced by ``classification_report``.
    """
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    report = classification_report(y_true, y_pred, zero_division=0)
    return precision, recall, f1, report


linear_predictions = svm_linear.predict(X_test)
poly_predictions = svm_poly.predict(X_test)
rbf_predictions = svm_rbf.predict(X_test)
sigmoid_predictions = svm_sigmoid.predict(X_test)

(linear_precision, linear_recall, linear_f1,
 linear_classification_report) = _weighted_metrics(y_test, linear_predictions)
(poly_precision, poly_recall, poly_f1,
 poly_classification_report) = _weighted_metrics(y_test, poly_predictions)
(rbf_precision, rbf_recall, rbf_f1,
 rbf_classification_report) = _weighted_metrics(y_test, rbf_predictions)
(sigmoid_precision, sigmoid_recall, sigmoid_f1,
 sigmoid_classification_report) = _weighted_metrics(y_test, sigmoid_predictions)

# Report every kernel, not only the linear one, so the models can be compared
kernel_summaries = (
    ("Linear", linear_precision, linear_recall, linear_f1,
     linear_classification_report),
    ("Polynomial", poly_precision, poly_recall, poly_f1,
     poly_classification_report),
    ("RBF", rbf_precision, rbf_recall, rbf_f1,
     rbf_classification_report),
    ("Sigmoid", sigmoid_precision, sigmoid_recall, sigmoid_f1,
     sigmoid_classification_report),
)
for kernel_name, precision, recall, f1, report in kernel_summaries:
    print(f"{kernel_name} Kernel Metrics:")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")
    print("Classification Report:")
    print(report)



In [None]:
# Grid Search for Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values, one grid per kernel
param_grid_linear = {
    'C': [0.1, 1, 10, 100],
}
param_grid_poly = {
    'C': [0.1, 1, 10, 100],
    'degree': [2, 3, 4],
}
param_grid_rbf = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.1, 0.01, 0.001],
}
param_grid_sigmoid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.1, 0.01, 0.001],
}

# Build one 5-fold grid search per baseline SVM
grid_linear, grid_poly, grid_rbf, grid_sigmoid = (
    GridSearchCV(estimator, grid, cv=5)
    for estimator, grid in (
        (svm_linear, param_grid_linear),
        (svm_poly, param_grid_poly),
        (svm_rbf, param_grid_rbf),
        (svm_sigmoid, param_grid_sigmoid),
    )
)

# Run the searches on the training split
for search in (grid_linear, grid_poly, grid_rbf, grid_sigmoid):
    search.fit(X_train, y_train)

# Show the winning parameter combination for each kernel
best_param_lines = (
    ("Best parameters for Linear Kernel:", grid_linear),
    ("Best parameters for Polynomial Kernel:", grid_poly),
    ("Best parameters for RBF Kernel:", grid_rbf),
    ("Best parameters for Sigmoid Kernel:", grid_sigmoid),
)
for caption, search in best_param_lines:
    print(caption, search.best_params_)


In [None]:
# Pull the refitted best estimator out of each grid search
best_svm_linear, best_svm_poly, best_svm_rbf, best_svm_sigmoid = (
    search.best_estimator_
    for search in (grid_linear, grid_poly, grid_rbf, grid_sigmoid)
)

# Accuracy of each tuned model on the held-out split
best_linear_accuracy, best_poly_accuracy, best_rbf_accuracy, best_sigmoid_accuracy = (
    tuned.score(X_test, y_test)
    for tuned in (best_svm_linear, best_svm_poly, best_svm_rbf, best_svm_sigmoid)
)

tuned_accuracy_lines = (
    ("Best Linear Kernel Accuracy:", best_linear_accuracy),
    ("Best Polynomial Kernel Accuracy:", best_poly_accuracy),
    ("Best RBF Kernel Accuracy:", best_rbf_accuracy),
    ("Best Sigmoid Kernel Accuracy:", best_sigmoid_accuracy),
)
for caption, accuracy in tuned_accuracy_lines:
    print(caption, accuracy)



In [None]:
# Test-set predictions from each tuned model
best_linear_predictions, best_poly_predictions, best_rbf_predictions, best_sigmoid_predictions = (
    tuned.predict(X_test)
    for tuned in (best_svm_linear, best_svm_poly, best_svm_rbf, best_svm_sigmoid)
)

# Text classification report per tuned model (undefined metrics shown as 0)
(best_linear_classification_report,
 best_poly_classification_report,
 best_rbf_classification_report,
 best_sigmoid_classification_report) = (
    classification_report(y_test, predicted, zero_division=0)
    for predicted in (best_linear_predictions, best_poly_predictions,
                      best_rbf_predictions, best_sigmoid_predictions)
)

report_sections = (
    ("Best Linear Kernel Classification Report:", best_linear_classification_report),
    ("Best Polynomial Kernel Classification Report:", best_poly_classification_report),
    ("Best RBF Kernel Classification Report:", best_rbf_classification_report),
    ("Best Sigmoid Kernel Classification Report:", best_sigmoid_classification_report),
)
for heading, report_text in report_sections:
    print(heading)
    print(report_text)

In [None]:
# Visualize confusion matrices for the best models
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrices for the tuned models.
# A fixed label set is passed explicitly so that all four matrices have the
# same shape and the same row/column meaning; without it the matrix layout
# depends on which classes happen to occur in y_test and in each model's
# predictions.
class_labels = np.unique(y)

linear_cm = confusion_matrix(y_test, best_linear_predictions, labels=class_labels)
poly_cm = confusion_matrix(y_test, best_poly_predictions, labels=class_labels)
rbf_cm = confusion_matrix(y_test, best_rbf_predictions, labels=class_labels)
sigmoid_cm = confusion_matrix(y_test, best_sigmoid_predictions, labels=class_labels)

# Plot the four matrices on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

panels = (
    ('Linear Kernel Confusion Matrix', linear_cm, axes[0, 0]),
    ('Polynomial Kernel Confusion Matrix', poly_cm, axes[0, 1]),
    ('RBF Kernel Confusion Matrix', rbf_cm, axes[1, 0]),
    ('Sigmoid Kernel Confusion Matrix', sigmoid_cm, axes[1, 1]),
)
for title, matrix, ax in panels:
    # Tick labels show the actual class codes rather than positional indices
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()