In [5]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import MinMaxScaler

data_raw = pd.read_csv('water_potability.csv')

# Input NaN or Null value
knn = KNNImputer(n_neighbors = 12, weights = "uniform").fit_transform(data_raw)
filled_data = pd.DataFrame(knn, columns = data_raw.columns)

# Remove Outlier (If Inlier is not 1 == Outlier)
predictionY = LocalOutlierFactor(n_neighbors = 5, contamination = 'auto').fit_predict(filled_data) 
filled_data['Inlier'] = predictionY
data = filled_data[filled_data['Inlier'] == 1]
data = data.drop('Inlier',axis = 1)

# Handling Inbalance Class
features, Y = SMOTE().fit_resample(data.drop(["Potability"],axis=1),data["Potability"])
column_names = features.columns
scaled_data = MinMaxScaler().fit_transform(features)

X = pd.DataFrame(scaled_data, columns = column_names)


X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

model = CatBoostClassifier(iterations=3000, learning_rate=0.05, depth=8, loss_function='MultiClass')
model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=100)
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

0:	learn: 0.6894144	test: 0.6914668	best: 0.6914668 (0)	total: 7.31ms	remaining: 21.9s
100:	learn: 0.5163043	test: 0.6184126	best: 0.6184126 (100)	total: 580ms	remaining: 16.6s
200:	learn: 0.4322126	test: 0.6000375	best: 0.6000375 (200)	total: 1.14s	remaining: 15.8s
300:	learn: 0.3619177	test: 0.5895120	best: 0.5895120 (300)	total: 1.69s	remaining: 15.2s
400:	learn: 0.3069281	test: 0.5825414	best: 0.5824840 (399)	total: 2.23s	remaining: 14.4s
500:	learn: 0.2654362	test: 0.5792343	best: 0.5791201 (497)	total: 2.76s	remaining: 13.8s
600:	learn: 0.2298287	test: 0.5771643	best: 0.5764711 (577)	total: 3.34s	remaining: 13.3s
700:	learn: 0.2011041	test: 0.5782382	best: 0.5764711 (577)	total: 3.88s	remaining: 12.7s
800:	learn: 0.1771004	test: 0.5783857	best: 0.5764711 (577)	total: 4.43s	remaining: 12.2s
900:	learn: 0.1576353	test: 0.5807080	best: 0.5764711 (577)	total: 4.97s	remaining: 11.6s
1000:	learn: 0.1398918	test: 0.5845904	best: 0.5764711 (577)	total: 5.55s	remaining: 11.1s
1100:	learn: