In [14]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import MinMaxScaler

data_raw = pd.read_csv('water_potability.csv')

# Input NaN or Null value
imputer = KNNImputer(n_neighbors=12, weights="uniform")
input = imputer.fit_transform(data_raw)
filled_data = pd.DataFrame(input,columns=data_raw.columns)

# Remove Outlier (If Inlier is not 1 == Outlier)
clf = LocalOutlierFactor(n_neighbors=5, contamination='auto')
y_pred = clf.fit_predict(filled_data) 

filled_data['Inlier'] = y_pred
data = filled_data[filled_data['Inlier'] == 1]
data = data.drop('Inlier',axis = 1)

# Handling Inbalance Class
oversample = SMOTE()
features, Y = oversample.fit_resample(data.drop(["Potability"],axis=1),data["Potability"])
scaler = MinMaxScaler()
names = features.columns
data_scaler = scaler.fit_transform(features)

X = pd.DataFrame(data_scaler, columns=names)


X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

model = CatBoostClassifier(iterations=3000, learning_rate=0.05, depth=8, loss_function='MultiClass')
model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=100)
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


0:	learn: 0.6909528	test: 0.6925982	best: 0.6925982 (0)	total: 6.92ms	remaining: 20.8s
100:	learn: 0.5195907	test: 0.6167766	best: 0.6167766 (100)	total: 596ms	remaining: 17.1s
200:	learn: 0.4303574	test: 0.5953765	best: 0.5953765 (200)	total: 1.16s	remaining: 16.1s
300:	learn: 0.3629274	test: 0.5871772	best: 0.5871772 (300)	total: 1.71s	remaining: 15.4s
400:	learn: 0.3065619	test: 0.5852453	best: 0.5842794 (337)	total: 2.27s	remaining: 14.7s
500:	learn: 0.2644329	test: 0.5861012	best: 0.5842794 (337)	total: 2.84s	remaining: 14.2s
600:	learn: 0.2294820	test: 0.5873922	best: 0.5842794 (337)	total: 3.4s	remaining: 13.6s
700:	learn: 0.2000324	test: 0.5900287	best: 0.5842794 (337)	total: 3.96s	remaining: 13s
800:	learn: 0.1760425	test: 0.5905153	best: 0.5842794 (337)	total: 4.54s	remaining: 12.5s
900:	learn: 0.1555614	test: 0.5921253	best: 0.5842794 (337)	total: 5.12s	remaining: 11.9s
1000:	learn: 0.1378916	test: 0.5948614	best: 0.5842794 (337)	total: 5.7s	remaining: 11.4s
1100:	learn: 0.1