In [124]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

In [125]:
# Read the water-potability CSV into a DataFrame, then display it
df = pd.read_csv(filepath_or_buffer='water_potability.csv')
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.89,20791.32,7.30,368.52,564.31,10.38,86.99,2.96,0
1,3.72,129.42,18630.06,6.64,,592.89,15.18,56.33,4.50,0
2,8.10,224.24,19909.54,9.28,,418.61,16.87,66.42,3.06,0
3,8.32,214.37,22018.42,8.06,356.89,363.27,18.44,100.34,4.63,0
4,9.09,181.10,17978.99,6.55,310.14,398.41,11.56,32.00,4.08,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.67,193.68,47580.99,7.17,359.95,526.42,13.89,66.69,4.44,1
3272,7.81,193.55,17329.80,8.06,,392.45,19.90,,2.80,1
3273,9.42,175.76,33155.58,7.35,,432.04,11.04,69.85,3.30,1
3274,5.13,230.60,11983.87,6.30,,402.88,11.17,77.49,4.71,1


In [126]:
# Count the rows that exactly repeat an earlier row
duplicate_rows = df.duplicated()
duplicate_rows.sum()

0

In [127]:
# Number of missing (NaN) entries in each column
df.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [128]:
# Share of missing entries per column, expressed as a percentage
df.isna().mean().mul(100)

ph                 14.987790
Hardness            0.000000
Solids              0.000000
Chloramines         0.000000
Sulfate            23.840049
Conductivity        0.000000
Organic_carbon      0.000000
Trihalomethanes     4.945055
Turbidity           0.000000
Potability          0.000000
dtype: float64

As the output above shows, three features contain missing values:

ph : 14.99 %
Sulfate : 23.84 %
Trihalomethanes : 4.95 %

In [129]:
# Fill the NaN values with the mean of their respective columns.
#
# The result is assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that form is a chained in-place
# assignment, which emits a FutureWarning in pandas 2.x and no longer
# modifies `df` once Copy-on-Write is the default (pandas 3.0).
#
# NOTE(review): the means are computed on the full dataset, before the
# train/test split further down, so test-set information leaks into the
# imputed values. Consider computing the means on the training split only.
cols_with_missing = ['ph', 'Sulfate', 'Trihalomethanes']
df[cols_with_missing] = df[cols_with_missing].fillna(df[cols_with_missing].mean())
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.080804,204.89,20791.32,7.30,368.520000,564.31,10.38,86.990000,2.96,0
1,3.720000,129.42,18630.06,6.64,333.775784,592.89,15.18,56.330000,4.50,0
2,8.100000,224.24,19909.54,9.28,333.775784,418.61,16.87,66.420000,3.06,0
3,8.320000,214.37,22018.42,8.06,356.890000,363.27,18.44,100.340000,4.63,0
4,9.090000,181.10,17978.99,6.55,310.140000,398.41,11.56,32.000000,4.08,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.670000,193.68,47580.99,7.17,359.950000,526.42,13.89,66.690000,4.44,1
3272,7.810000,193.55,17329.80,8.06,333.775784,392.45,19.90,66.396281,2.80,1
3273,9.420000,175.76,33155.58,7.35,333.775784,432.04,11.04,69.850000,3.30,1
3274,5.130000,230.60,11983.87,6.30,333.775784,402.88,11.17,77.490000,4.71,1


In [130]:
# Confirm the imputation: every column should now report zero missing values
df.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [131]:
# TODO: Perform EDA (placeholder cell; no analysis code has been added yet)
# ...

In [132]:
# Separate the target from the features, then hold out 25% of the rows
# for testing (fixed seed so the split is reproducible)
y = df['Potability']
X = df.drop(columns='Potability')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [133]:
# Fit a 100-tree random forest on the training split (seeded for reproducibility)
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [134]:
# Score the fitted model on the held-out test split and report the result
score = model.score(X=X_test, y=y_test)
print('Accuracy:', score)

Accuracy: 0.6935286935286935


In [135]:
# Persist the fitted model to disk so it can be loaded outside this notebook
joblib.dump(value=model, filename='water_potability_model.joblib')

['water_potability_model.joblib']

In [136]:
# TODO: Create a Streamlit web application (placeholder cell; not implemented yet)
# ...