In [1]:
!pip install xgboost -q


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv("water_potability.csv")
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [4]:
df = df[['ph', 'Conductivity', 'Turbidity', 'Potability']]
df.head()

Unnamed: 0,ph,Conductivity,Turbidity,Potability
0,,564.308654,2.963135,0
1,3.71608,592.885359,4.500656,0
2,8.099124,418.606213,3.055934,0
3,8.316766,363.266516,4.628771,0
4,9.092223,398.410813,4.075075,0


In [5]:
# check for nan
for col in df:
    print(col, df[col].isna().sum() / len(df) * 100)

ph 14.987789987789988
Conductivity 0.0
Turbidity 0.0
Potability 0.0


In [6]:
# drop the nan
df.dropna(inplace=True)
for col in df:
    print(col, df[col].isna().sum() / len(df) * 100)

ph 0.0
Conductivity 0.0
Turbidity 0.0
Potability 0.0


In [7]:
df.shape

(2785, 4)

In [8]:
X = df.drop("Potability", axis="columns")
y = df['Potability']

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [10]:
X_scaled = scaler.fit_transform(X)
X_scaled[:5]

array([[-2.11081821,  2.0483277 ,  0.67585382],
       [ 0.63883842, -0.10465695, -1.16497112],
       [ 0.77537365, -0.78830454,  0.83909342],
       [ 1.2618488 , -0.35414397,  0.13359063],
       [-0.93894394, -1.81116997, -1.79724776]])

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2088, 3), (697, 3), (2088,), (697,))

In [12]:
model = LogisticRegression().fit(X_train, y_train)

In [13]:
model.score(X_train, y_train)

0.6091954022988506

In [14]:
from sklearn.metrics import classification_report

In [15]:
print(classification_report(model.predict(X_test), y_test))

              precision    recall  f1-score   support

           0       1.00      0.59      0.74       697
           1       0.00      0.00      0.00         0

    accuracy                           0.59       697
   macro avg       0.50      0.30      0.37       697
weighted avg       1.00      0.59      0.74       697



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)

In [18]:
print(classification_report(forest.predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.72      0.58      0.64       517
           1       0.23      0.37      0.28       180

    accuracy                           0.52       697
   macro avg       0.48      0.47      0.46       697
weighted avg       0.60      0.52      0.55       697



In [19]:
from xgboost import XGBClassifier

In [20]:
model = XGBClassifier().fit(X_scaled, y)

In [21]:
print(classification_report(model.predict(X_train), y_train))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94      1378
           1       0.84      0.97      0.90       710

    accuracy                           0.93      2088
   macro avg       0.91      0.94      0.92      2088
weighted avg       0.94      0.93      0.93      2088



In [22]:
print(classification_report(model.predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.98      0.89      0.93       453
           1       0.82      0.96      0.88       244

    accuracy                           0.91       697
   macro avg       0.90      0.92      0.91       697
weighted avg       0.92      0.91      0.91       697



In [24]:
import joblib

joblib.dump(model, "potability model.model")
joblib.dump(scaler, "feature_scaler.pkl")

['feature_scaler.pkl']