In [10]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import MinMaxScaler

data_raw = pd.read_csv('water_potability.csv')

# Input NaN or Null value
imputer = KNNImputer(n_neighbors=12, weights="uniform")
input = imputer.fit_transform(data_raw)
filled_data = pd.DataFrame(input,columns=data_raw.columns)

# Remove Outlier (If Inlier is not 1 == Outlier)
clf = LocalOutlierFactor(n_neighbors=5, contamination='auto')
y_pred = clf.fit_predict(filled_data) 

filled_data['Inlier'] = y_pred
data = filled_data[filled_data['Inlier'] == 1]
data = data.drop('Inlier',axis = 1)

# Handling Inbalance Class
oversample = SMOTE()
features, Y = oversample.fit_resample(data.drop(["Potability"],axis=1),data["Potability"])
scaler = MinMaxScaler()
names = features.columns
data_scaler = scaler.fit_transform(features)

X = pd.DataFrame(data_scaler, columns=names)


X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.33,random_state=42)

model = CatBoostClassifier(iterations=3000, learning_rate=0.05, depth=8, loss_function='MultiClass')
model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=100)
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


0:	learn: 0.6898967	test: 0.6913077	best: 0.6913077 (0)	total: 6.45ms	remaining: 19.4s
100:	learn: 0.5161234	test: 0.6219365	best: 0.6219365 (100)	total: 573ms	remaining: 16.4s
200:	learn: 0.4245690	test: 0.6033590	best: 0.6033590 (200)	total: 1.12s	remaining: 15.6s
300:	learn: 0.3515346	test: 0.5962245	best: 0.5962156 (283)	total: 1.66s	remaining: 14.9s
400:	learn: 0.2964899	test: 0.5947371	best: 0.5937456 (337)	total: 2.21s	remaining: 14.3s
500:	learn: 0.2520761	test: 0.5950014	best: 0.5937456 (337)	total: 2.77s	remaining: 13.8s
600:	learn: 0.2173403	test: 0.5982872	best: 0.5937456 (337)	total: 3.31s	remaining: 13.2s
700:	learn: 0.1890389	test: 0.6008504	best: 0.5937456 (337)	total: 3.89s	remaining: 12.8s
800:	learn: 0.1666378	test: 0.6060513	best: 0.5937456 (337)	total: 4.45s	remaining: 12.2s
900:	learn: 0.1476119	test: 0.6091099	best: 0.5937456 (337)	total: 5.02s	remaining: 11.7s
1000:	learn: 0.1309455	test: 0.6130271	best: 0.5937456 (337)	total: 5.59s	remaining: 11.2s
1100:	learn: