# Water Quality Assessment using SVM

### Import Libraries

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [34]:
# Load the water-quality dataset; the path is relative to the notebook's
# working directory.
data = pd.read_csv('water_data.csv')

In [35]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0.1,Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target_Class
0,0,8.332988,8.3e-05,8.605777,122.799772,3.713298e-52,3.434827,Colorless,0.22683,0.607283,...,3.708178,2.269945e-15,332.118789,,,43.493324,January,29.0,4.0,1
1,1,6.917863,8.1e-05,3.734167,227.029851,7.849261999999999e-94,1.245317,Faint Yellow,0.19007,0.622874,...,3.292038,8.024076e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,1
2,2,5.443762,0.020106,3.816994,230.99563,5.2866160000000004e-76,0.52828,Light Yellow,3.19956,0.423423,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0
3,3,7.955339,0.143988,8.224944,178.12994,3.997118e-176,4.027879,Near Colorless,1.66319,0.208454,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,1
4,4,8.091909,0.002167,9.925788,186.540872,4.171069e-132,3.807511,Light Yellow,0.04867,0.222912,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,1


In [36]:
# Preview the last rows of the loaded table.
data.tail()

Unnamed: 0.1,Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target_Class
99996,99996,8.818809,7.5e-05,5.491966,,1.018468e-16,0.054606,Light Yellow,6.18404,1.517286,...,2.479101,0.006608,441.819671,River,20.007815,69.299508,August,8.0,22.0,0
99997,99997,7.277367,2.2e-05,2.878326,126.88037,1.965436e-22,3.632629,Colorless,2.91402,0.228949,...,3.315341,0.001116,308.963907,Aquifer,6.7872,70.198798,January,27.0,14.0,1
99998,99998,7.000954,0.012017,4.157934,165.559327,5.501361e-126,2.162606,Faint Yellow,0.55013,0.275803,...,3.457473,0.010697,236.96024,Lake,12.707579,46.856938,December,17.0,1.0,1
99999,99999,8.374296,0.134012,2.631553,148.166082,3.845744e-40,0.251668,Colorless,4.06297,0.458973,...,2.603017,0.000242,405.005236,Lake,20.841677,55.109951,,11.0,3.0,1
100000,100000,7.154356,0.092041,5.656341,212.623606,3.102165e-72,2.462944,Colorless,0.00355,0.962474,...,2.907079,0.023063,409.911033,Well,4.789358,33.62195,January,16.0,22.0,1


In [37]:
# Keep only the three predictors used by the model plus the label; every
# other column of the raw table is discarded.
keep_columns = ['pH', 'Turbidity', 'Total Dissolved Solids', 'Target_Class']
unused_columns = [name for name in data.columns if name not in keep_columns]
data = data.drop(columns=unused_columns)

In [38]:
# Display the reduced frame (pandas truncates the middle rows in the output).
data

Unnamed: 0,pH,Turbidity,Total Dissolved Solids,Target_Class
0,8.332988,0.22683,332.118789,1
1,6.917863,0.19007,284.641984,1
2,5.443762,3.19956,570.054094,0
3,7.955339,1.66319,100.043838,1
4,8.091909,0.04867,168.075545,1
...,...,...,...,...
99996,8.818809,6.18404,441.819671,0
99997,7.277367,2.91402,308.963907,1
99998,7.000954,0.55013,236.960240,1
99999,8.374296,4.06297,405.005236,1


In [39]:
# Inspect dtypes and non-null counts to see which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100001 entries, 0 to 100000
Data columns (total 4 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   pH                      98096 non-null   float64
 1   Turbidity               99186 non-null   float64
 2   Total Dissolved Solids  99976 non-null   float64
 3   Target_Class            100001 non-null  int64  
dtypes: float64(3), int64(1)
memory usage: 3.1 MB


### Handling Missing Values

In [40]:
from sklearn.impute import SimpleImputer

# Fill every NaN in the three feature columns (positions 0-2) with that
# column's mean; the label column is left untouched.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split further down, so test rows contribute to the imputed means.
imputer = SimpleImputer(strategy='mean')
feature_block = data.iloc[:, :3]
data.iloc[:, :3] = imputer.fit_transform(feature_block)

In [41]:
# Re-check the non-null counts to confirm that imputation filled every gap.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100001 entries, 0 to 100000
Data columns (total 4 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   pH                      100001 non-null  float64
 1   Turbidity               100001 non-null  float64
 2   Total Dissolved Solids  100001 non-null  float64
 3   Target_Class            100001 non-null  int64  
dtypes: float64(3), int64(1)
memory usage: 3.1 MB


In [43]:
# Feature matrix: the three measured water properties.
X = data[['pH', 'Turbidity', 'Total Dissolved Solids']]
# Target vector: the class label stored in the last column.
y = data['Target_Class']

In [44]:
# Show the feature matrix followed by the target vector, one after the other.
print(X, y, sep='\n')

              pH  Turbidity  Total Dissolved Solids
0       8.332988    0.22683              332.118789
1       6.917863    0.19007              284.641984
2       5.443762    3.19956              570.054094
3       7.955339    1.66319              100.043838
4       8.091909    0.04867              168.075545
...          ...        ...                     ...
99996   8.818809    6.18404              441.819671
99997   7.277367    2.91402              308.963907
99998   7.000954    0.55013              236.960240
99999   8.374296    4.06297              405.005236
100000  7.154356    0.00355              409.911033

[100001 rows x 3 columns]
0         1
1         1
2         0
3         1
4         1
         ..
99996     0
99997     1
99998     1
99999     1
100000    1
Name: Target_Class, Length: 100001, dtype: int64


### Splitting Data into Training and Test Sets

In [45]:
# `train_test_split` was called without ever being imported, which raises
# NameError on a fresh kernel. Import it in-cell, like the other sklearn tools
# in this notebook.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

### Feature Scaling

In [46]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and variance from the training rows only, then
# apply that same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### SMOTE to handle class imbalance in the training set

In [49]:
from imblearn.over_sampling import SMOTE

# Rebalance the classes by oversampling. Only the scaled training partition
# is resampled; the test partition is not passed to SMOTE.
smote = SMOTE(random_state=42)
oversampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = oversampled



### Train the model

In [53]:
from sklearn.svm import SVC

# SVC with default kernel settings and a fixed seed. probability=True
# additionally enables predict_proba on the fitted model.
svm = SVC(random_state=42, probability=True)
svm.fit(X=X_train_resampled, y=y_train_resampled)

### Predict Test Results

In [55]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the held-out partition with the trained classifier.
y_pred = svm.predict(X_test)

# Compute every metric up front, then emit them in one labelled pass.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)  # percentage, 2 decimals

for heading, value in (
    ("Classification Report:", report),
    ("Confusion Matrix:", matrix),
    ("Accuracy:", accuracy_pct),
):
    print(heading)
    print(value)

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      6404
           1       0.96      0.97      0.97     13597

    accuracy                           0.96     20001
   macro avg       0.95      0.95      0.95     20001
weighted avg       0.96      0.96      0.96     20001

Confusion Matrix:
[[ 5916   488]
 [  380 13217]]
Accuracy:
95.66


In [62]:
import pickle

# Persist the trained classifier (file name and contents unchanged).
with open('svm_model.pkl', 'wb') as file:
    pickle.dump(svm, file)

# The SVM was trained on imputed, standardized features, so the fitted
# imputer and scaler must be applied to any new sample before calling
# svm.predict(). Save them next to the model so inference can reproduce the
# same preprocessing. Only unpickle these files from a trusted source.
with open('svm_preprocessing.pkl', 'wb') as file:
    pickle.dump({'imputer': imputer, 'scaler': scaler}, file)
