In [3]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [4]:
# Show every column when a DataFrame is displayed, then load the raw data set.
pd.set_option('display.max_columns', None)
df = pd.read_csv('Water Quality Prediction.csv')
# pd.set_option('display.max_rows', None)

In [5]:
# Work on a reproducible random sample of at most 200,000 rows.
# The cap at len(df) avoids the ValueError that DataFrame.sample raises when
# more rows are requested than the frame contains (sampling without replacement).
df = df.sample(n=min(200000, len(df)), random_state=42)

In [6]:
# Number of missing values in each column.
df.isnull().sum()

Index                        0
pH                        3982
Iron                      1328
Nitrate                   3543
Chloride                  5890
Lead                       880
Zinc                      5296
Color                      202
Turbidity                 1653
Fluoride                  6324
Copper                    6756
Odor                      5970
Sulfate                   6477
Conductivity              5569
Chlorine                  1915
Manganese                 3622
Total Dissolved Solids      56
Source                    2925
Water Temperature         5536
Air Temperature            956
Month                     3211
Day                       3330
Time of Day               3864
Potability                   0
dtype: int64

In [7]:
# df.dropna(inplace=True)

In [8]:
# Check for fully duplicated rows.
# duplicate_rows holds every row that repeats an earlier one; testing `.empty`
# counts rows rather than non-null cells, so a duplicated row consisting only
# of NaN values is still detected.
duplicate_rows = df[df.duplicated()]
if duplicate_rows.empty:
    print("No duplicate rows")
else:
    print("Duplicate rows are present")

No duplicate rows


In [9]:
# Drop the columns that are not used as model features:
# Month, Index, Source, Zinc, Lead and Time of Day.
df = df.drop(columns=['Month', 'Index', 'Source', 'Zinc', 'Lead', 'Time of Day'])


In [10]:
# Keep only the rows whose Color value is present; Color is later label- and
# one-hot-encoded rather than imputed.
df = df[df["Color"].notna()]

In [11]:
# Preview the first five rows of the cleaned frame.
df.head(5)

Unnamed: 0,pH,Iron,Nitrate,Chloride,Color,Turbidity,Fluoride,Copper,Odor,Sulfate,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Water Temperature,Air Temperature,Month,Day,Potability
781974,7.004799,6e-06,7.114755,120.527769,Near Colorless,0.613998,1.758451,0.255472,2.09209,120.745502,241.446886,3.099394,0.04469775,257.717511,22.900917,54.310518,January,7.0,0
937737,8.299823,0.001846,4.71349,220.284903,Faint Yellow,1.543039,4.341496,0.316341,3.077392,228.707208,282.409585,3.749201,8.41e-05,92.378364,64.103574,72.016863,March,6.0,0
907828,8.077128,0.001998,6.999236,157.332074,Faint Yellow,0.363389,0.537449,0.032343,1.306127,136.219129,214.876158,2.215031,0.000195404,187.093504,26.600483,74.400507,October,6.0,0
784628,7.813995,0.001145,6.168141,200.820979,Near Colorless,0.731114,0.07373,0.700787,0.440061,72.935525,342.590598,3.030572,0.04894026,334.951667,16.434954,98.879709,February,11.0,0
662460,6.691067,0.506861,8.280426,143.161413,Colorless,0.026614,1.532804,0.161636,2.780277,300.992636,291.962088,3.105734,3.49e-18,211.253831,12.467716,46.854295,February,20.0,0


In [12]:
# Identify the numeric feature columns.
# The target column ('Potability') is numeric too, but it is excluded here:
# labels must never be imputed, and running it through the imputer would also
# cast the class labels to float.
numeric_columns = (
    df.select_dtypes(include=['number'])
    .columns
    .drop('Potability', errors='ignore')
)

# Replace missing numeric values with the column median.
# NOTE(review): the imputer is fitted on the full data set before the
# train/test split, so test-set statistics leak into the medians. Fitting on
# the training split only would remove that leakage.
imputer = SimpleImputer(strategy='median')
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])


In [13]:
# Split the frame into NumPy arrays: every column except the last is a
# feature (X); the last column is the label (Y).
X = df.iloc[:, :-1].to_numpy()
Y = df.iloc[:, -1].to_numpy()

# X = df.iloc[:, 1:-4].values
# Y = df.iloc[:, -1].values

In [15]:
# Inspect the first feature row.
X[0, :]

array([7.004799273, 6.13e-06, 7.114755278, 120.5277688, 'Near Colorless',
       0.613997908, 1.758450685, 0.255472008, 2.092090468, 120.745502,
       241.4468855, 3.099393646, 0.044697746, 257.7175114, 22.90091727,
       54.31051792, 'January', 7.0], dtype=object)

In [12]:
# Map the categorical values in feature column 4 (Color) to integer codes.
le = LabelEncoder()
le.fit(X[:, 4])
X[:, 4] = le.transform(X[:, 4])


In [13]:
# One-hot encode feature column 4 and pass every other column through
# unchanged. The encoded indicator columns come first in the output, followed
# by the passthrough columns.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), [4])],
    remainder='passthrough',
)
X = ct.fit_transform(X)

X

array([[0.0, 0.0, 0.0, ..., 22.90091727, 54.31051792, 7.0],
       [0.0, 1.0, 0.0, ..., 64.10357372, 72.01686324, 6.0],
       [0.0, 1.0, 0.0, ..., 26.60048284, 74.40050714, 6.0],
       ...,
       [0.0, 0.0, 0.0, ..., 26.49301707, 34.48383196, 14.0],
       [1.0, 0.0, 0.0, ..., 22.62244894, 71.3791377, 9.0],
       [0.0, 0.0, 0.0, ..., 19.78753869, 43.39457387, 1.0]], dtype=object)

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)
x_train[0]

array([0.0, 1.0, 0.0, 0.0, 0.0, 7.567215991, 0.019643421, 5.414722979,
       172.5103624, 0.028778173, 0.206780098, 0.424872971, 2.442587157,
       149.2967975, 495.5623242, 3.882148975, 3.37e-05, 69.24749741,
       10.14057688, 65.33090843, 16.0], dtype=object)

In [15]:
# Standardize the numeric features (columns 5 onward) to zero mean and unit
# variance. The scaled values are not bounded to a fixed range. The leading
# one-hot indicator columns are left as they are.
# The scaler is fitted on the training split only and then reused on the test
# split, so no test-set statistics enter the scaling.
sc = StandardScaler()
sc.fit(x_train[:, 5:])
x_train[:, 5:] = sc.transform(x_train[:, 5:])
x_test[:, 5:] = sc.transform(x_test[:, 5:])

print("X TRAIN", x_train[0])
print("Y TRAIN", y_train)

X TRAIN [0.0 1.0 0.0 0.0 0.0 0.1408904573263271 -0.2103070960014174
 -0.2150394345896286 -0.13995091267580903 -0.5275610428430832
 -0.9348446020430538 -0.11482005730145783 0.6500644675829823
 0.07498904697418717 0.38011355492738697 0.9021391562886674
 -0.20926001552954823 -1.2651858332440258 -0.7900219418667164
 0.29651709560112804 0.031392933494053296]
Y TRAIN [0. 1. 0. ... 0. 0. 1.]


### Calculate recall, accuracy, precision and F1 of trained_RDF_model_3

In [16]:
# from sklearn.metrics import precision_score, recall_score
# from sklearn.metrics import f1_score

# classifier = joblib.load('trained_RDF_model_3.joblib')

# y_pred_rdf = classifier.predict(x_test)
# print(y_pred_rdf)

# # Assuming y_true contains the true labels and y_pred contains the predicted labels
# precision = precision_score(y_test, y_pred_rdf)
# recall = recall_score(y_test, y_pred_rdf)
# f1 = f1_score(y_test, y_pred_rdf)

# print("F1 Score:", f1)

# print("Precision:", precision)
# print("Recall:", recall)

### Create the Logistic Regression classification model

In [17]:
# lr_classifier = LogisticRegression(random_state=0)
# lr_classifier.fit(x_train, y_train)

In [18]:
# save the trained model
# joblib.dump(lr_classifier, 'trained_LR_model.joblib')

In [19]:
# y_pred_lr = lr_classifier.predict(x_test)

In [20]:
# np.set_printoptions(precision=2)
# print(np.concatenate([y_pred_lr.reshape(len(y_pred_lr), 1), y_test.reshape(len(y_test), 1)], axis=1))

In [21]:
# confMatrix = confusion_matrix(y_test, y_pred_lr)
# print(confMatrix)

In [22]:
# lr_model_accuracy = accuracy_score(y_test, y_pred_lr)
# print(lr_model_accuracy)

### Create the Support Vector Machine classification model

In [23]:
# support vector classifier
# svm_classifier = SVC(kernel='linear', random_state=0)  # default is rbf
# svm_classifier.fit(x_train, y_train)

In [24]:
# save the trained model
# joblib.dump(svm_classifier, 'trained_SVM_model.joblib')

In [25]:
# y_pred_svm = svm_classifier.predict(x_test)

In [26]:
# confMatrix = confusion_matrix(y_test, y_pred_svm)
# print(confMatrix)

In [27]:
# svm_model_accuracy = accuracy_score(y_test, y_pred_svm)
# print(svm_model_accuracy)

### Create the K-Nearest Neighbour classification model

In [28]:
# knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)  # classic euclidean distance
# knn_classifier.fit(x_train, y_train)

In [29]:
# save the trained model
# joblib.dump(knn_classifier, 'trained_KNN_model.joblib')

In [30]:
# y_pred_knn = knn_classifier.predict(x_test)

In [31]:
# knn_model_accuracy = accuracy_score(y_test, y_pred_knn)
# print(knn_model_accuracy)

### Create the decision tree classification model

In [32]:
# from sklearn.tree import DecisionTreeClassifier

# dtree_classifier = DecisionTreeClassifier(criterion='gini', random_state=0)
# dtree_classifier.fit(x_train, y_train)

In [33]:
# save the trained model
# joblib.dump(dtree_classifier, 'trained_DTR_model.joblib')

In [34]:
# y_pred_dtree = dtree_classifier.predict(x_test)

In [35]:
# dtc_model_accuracy = accuracy_score(y_test, y_pred_dtree)
# print(dtc_model_accuracy)

### Create the random forest classification model

In [36]:
# Train a random forest of 100 entropy-criterion trees on the training split.
# RandomForestClassifier is imported in the notebook's top import cell so that
# all dependencies are declared in one place.
# random_state is fixed so that repeated runs build the same forest.
rfc_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
)
rfc_classifier.fit(x_train, y_train)

In [37]:
# Persist the fitted random forest to disk so it can be reloaded later.
joblib.dump(value=rfc_classifier, filename='trained_RDF_model_5.joblib')

['trained_RDF_model_5.joblib']

In [38]:
# Predict the label of every held-out test row with the random forest.
y_pred_rdf = rfc_classifier.predict(X=x_test)

In [39]:
# Fraction of test rows whose predicted label matches the true label.
rfc_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rdf)
print(rfc_model_accuracy)

# Accuracy recorded for two missing-value strategies:
#   drop every row that has a missing value          - 0.8642686209991026
#   drop rows with a missing categorical value and
#   replace the remaining missing values with median - 0.87615241140421

0.8693443443443444


### Preprocess a single input sample and predict its potability

In [None]:
# Predict the potability of one hand-written sample, using the preprocessing
# objects fitted earlier in this notebook (le, ct, sc).

# Load the random-forest model that this notebook saves after training, so the
# feature layout the model expects matches the preprocessing applied below.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
classifier = joblib.load('trained_RDF_model_5.joblib')

# One raw sample in the feature order that remains after the column drops:
# pH, Iron, Nitrate, Chloride, Color, Turbidity, Fluoride, Copper, Odor,
# Sulfate, Conductivity, Chlorine, Manganese, Total Dissolved Solids,
# Water Temperature, Air Temperature, Day.
# NOTE(review): the numbers are carried over from an earlier 21-value sample
# with the entries for the dropped columns removed. Confirm the mapping.
values = [1, 7.06e-08, 7.541255359, 198.1312335, 'Colorless',
          0.137766996, 1.008886456, 2.391833449, 0.750761234,
          148.9474344, 242.7039915, 3.709734571, 2.301398715,
          100.9851033, 9.674425593, 35.25315137, 12.0]

# A single-row, object-dtype array, matching the layout of X before encoding.
input_data = np.array([values], dtype=object)

# Label encode the categorical value (column 4, as in training).
input_data[:, 4] = le.transform(input_data[:, 4])

# One-hot encode column 4; the indicator columns come first in the output.
input_data = ct.transform(input_data)

# Scale the numeric columns with the scaler fitted on the training split
# (columns 5 onward, the same slice used for x_train and x_test).
input_data[:, 5:] = sc.transform(input_data[:, 5:])

y_single = classifier.predict(input_data)

print(y_single[0])

In [None]:
# y_single = rfc_classifier.predict(input_data)

# print(y_single[0])

### Check for overfitting

In [None]:
# Learning curve: refit the classifier on growing subsets of the training data
# with 5-fold cross-validation and compare the mean accuracy on the fitted
# folds with the mean accuracy on the held-out folds.
train_sizes, train_scores, test_scores = learning_curve(
    classifier,
    x_train,
    y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Average the per-fold scores for each training-set size.
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Learning Curve")
ax.set_xlabel("Training Examples")
ax.set_ylabel("Accuracy")
ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training Accuracy")
ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Test Accuracy")
ax.legend(loc="best")
ax.grid()
plt.show()

## CONCLUSIONS

In [None]:
# print("ACCURACY SCORES OF EACH MODEL\n")

# print("Logistic Regression Classifier\t", round(lr_model_accuracy * 100, 2))
# print("SVM Classifier\t\t\t", round(svm_model_accuracy * 100, 2))
# print("K-NN Classifier\t\t\t", round(knn_model_accuracy * 100, 2))
# print("Decision Tree Classifier\t", round(dtc_model_accuracy * 100, 2))
# print("Random Forest Classifier\t", round(rfc_model_accuracy * 100, 2))