## **CatBoost**

`!pip install catboost`

In [1]:
# The main difference between CatBoost and other gradient-boosting libraries is that CatBoost handles categorical features natively, without manual label encoding.

from catboost import CatBoostClassifier, Pool
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Generate a synthetic binary-classification dataset (nothing is loaded from disk):
# 10,000 samples with 20 features, of which 10 are informative and 5 are redundant.
# The fixed seed keeps the generated data reproducible across runs.
X, y = make_classification(
    n_samples=10_000,
    n_classes=2,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)

In [5]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Configure the gradient-boosted classifier; nothing is trained until fit() is called.
model = CatBoostClassifier(
    loss_function="Logloss",  # objective optimised during training
    eval_metric="Accuracy",   # metric shown in the training log
    iterations=200,           # number of boosting rounds
    depth=6,                  # depth of each tree
    learning_rate=0.1,
    random_state=42,          # fixed seed for reproducible training
    verbose=20,               # write a log line every 20 rounds
)

In [7]:
# Train on the training split and evaluate on the held-out split after each round.
# NOTE(review): the eval_set is the same split that is scored afterwards, and the
# reported accuracy matches the best iteration found on it, so the test data appears
# to take part in model selection — consider a separate validation split carved out
# of the training data.
model.fit(
    x_train,
    y_train,
    eval_set=(x_test, y_test),
)

0:	learn: 0.7987500	test: 0.7835000	best: 0.7835000 (0)	total: 153ms	remaining: 30.4s
20:	learn: 0.9096250	test: 0.9085000	best: 0.9085000 (20)	total: 389ms	remaining: 3.32s
40:	learn: 0.9332500	test: 0.9245000	best: 0.9245000 (40)	total: 527ms	remaining: 2.04s
60:	learn: 0.9447500	test: 0.9285000	best: 0.9285000 (60)	total: 673ms	remaining: 1.53s
80:	learn: 0.9530000	test: 0.9315000	best: 0.9325000 (77)	total: 813ms	remaining: 1.19s
100:	learn: 0.9587500	test: 0.9385000	best: 0.9395000 (98)	total: 958ms	remaining: 939ms
120:	learn: 0.9661250	test: 0.9425000	best: 0.9430000 (117)	total: 1.1s	remaining: 722ms
140:	learn: 0.9700000	test: 0.9420000	best: 0.9435000 (124)	total: 1.25s	remaining: 524ms
160:	learn: 0.9742500	test: 0.9430000	best: 0.9435000 (124)	total: 1.4s	remaining: 338ms
180:	learn: 0.9777500	test: 0.9450000	best: 0.9450000 (178)	total: 1.55s	remaining: 163ms
199:	learn: 0.9798750	test: 0.9455000	best: 0.9460000 (195)	total: 1.68s	remaining: 0us

bestTest = 0.946
bestIteration = 195

<catboost.core.CatBoostClassifier at 0x258b9a6fc20>

In [8]:
# Predict hard class labels (the default prediction type) for the held-out samples.
y_pred = model.predict(x_test, prediction_type="Class")

In [9]:
# Score the held-out predictions: overall accuracy, then the per-class report.
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
print(f"Accuracy score: {test_accuracy}")
print(f"Classification report:\n{per_class_report}")

Accuracy score: 0.946
Classification report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1030
           1       0.94      0.94      0.94       970

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000



In [10]:
"""
cat_features = [0, 3, 5]  # indices of categorical columns
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

model.fit(train_pool, eval_set=test_pool)
"""

# it performs ordered target encoding

'\ncat_features = [0, 3, 5]  # indices of categorical columns\ntrain_pool = Pool(X_train, y_train, cat_features=cat_features)\ntest_pool = Pool(X_test, y_test, cat_features=cat_features)\n\nmodel.fit(train_pool, eval_set=test_pool)\n'