In [11]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import MinMaxScaler

data_raw = pd.read_csv('water_potability.csv')

# Input NaN or Null value
knn = KNNImputer(n_neighbors = 12, weights = "uniform").fit_transform(data_raw)
filled_data = pd.DataFrame(knn, columns = data_raw.columns)

# Remove Outlier (If Inlier is not 1 == Outlier)
predictionY = LocalOutlierFactor(n_neighbors = 5, contamination = 'auto').fit_predict(filled_data) 
filled_data['Inlier'] = predictionY
data = filled_data[filled_data['Inlier'] == 1]
data = data.drop('Inlier',axis = 1)

# Handling Inbalance Class
features, Y = SMOTE().fit_resample(data.drop(["Potability"],axis=1),data["Potability"])
column_names = features.columns
scaled_data = MinMaxScaler().fit_transform(features)

X = pd.DataFrame(scaled_data, columns = column_names)


X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

model = CatBoostClassifier(iterations=3000, learning_rate=0.05, depth=8, loss_function='Logloss')
model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=100)
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

0:	learn: 0.6893526	test: 0.6917127	best: 0.6917127 (0)	total: 105ms	remaining: 5m 14s
100:	learn: 0.4382535	test: 0.5923550	best: 0.5923550 (100)	total: 490ms	remaining: 14.1s
200:	learn: 0.3247748	test: 0.5758430	best: 0.5749828 (199)	total: 857ms	remaining: 11.9s
300:	learn: 0.2457024	test: 0.5720260	best: 0.5711966 (263)	total: 1.22s	remaining: 11s
400:	learn: 0.1878979	test: 0.5711591	best: 0.5686978 (321)	total: 1.61s	remaining: 10.4s
500:	learn: 0.1462203	test: 0.5756962	best: 0.5686978 (321)	total: 2.02s	remaining: 10.1s
600:	learn: 0.1157365	test: 0.5784966	best: 0.5686978 (321)	total: 2.42s	remaining: 9.64s
700:	learn: 0.0927144	test: 0.5873121	best: 0.5686978 (321)	total: 2.78s	remaining: 9.13s
800:	learn: 0.0760159	test: 0.5924117	best: 0.5686978 (321)	total: 3.16s	remaining: 8.67s
900:	learn: 0.0623390	test: 0.5998055	best: 0.5686978 (321)	total: 3.54s	remaining: 8.24s
1000:	learn: 0.0525958	test: 0.6064672	best: 0.5686978 (321)	total: 3.91s	remaining: 7.81s
1100:	learn: 0