# `CatBoost` vs `LightGBM` vs `XGBoost`

In [None]:
# Quietly install hvplot, which this notebook imports as `hvplot.pandas`
# and uses for the final bar charts.
!pip install -q hvplot

In [None]:
import pandas as pd
import numpy as np
from time import time
import hvplot.pandas

import matplotlib.pyplot as plt

In [None]:
# Read the college dataset from the Kaggle input directory, then show the
# first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/college-data/data.csv')
data.head(n=5)

In [None]:
# Any `grad_rate` above 100 is overwritten with 100; other rows are untouched.
over_cap = data['grad_rate'] > 100
data.loc[over_cap, 'grad_rate'] = 100

In [None]:
# Result stores keyed by model name; each model cell below adds one entry
# to both dictionaries.
accuracy, speed = {}, {}

# The `private` column is the target; every remaining column is a feature.
y = data['private']
X = data.drop(columns='private')

# 1. Scikit-Learn (GradientBoosting)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# Baseline: scikit-learn gradient boosting with default hyper-parameters.
model = GradientBoostingClassifier()

# The timed region covers building the splitter and all 25 fits
# (5 stratified folds x 5 repeats), run in parallel on every available core.
start = time()
cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=42)
score = cross_val_score(estimator=model, X=X, y=y, cv=cv, scoring='accuracy', n_jobs=-1)
elapsed = time() - start

# Store wall-clock seconds and mean accuracy (as a percentage), 3 decimals each.
speed['GradientBoosting'] = np.round(elapsed, 3)
accuracy['GradientBoosting'] = np.round(score.mean() * 100, 3)

print(f"Mean Accuracy: {accuracy['GradientBoosting']}\nSTD: {np.std(score):.3f}\nRun Time: {speed['GradientBoosting']}s")

# 2. XGBoost

In [None]:
from xgboost import XGBClassifier

# XGBoost with default hyper-parameters.
model = XGBClassifier()

# Recent XGBoost releases reject class labels that are not the integers
# 0..n_classes-1. Encode the target as sorted integer codes for this model
# only; accuracy and the stratified splits do not depend on the label names.
# NOTE(review): the dtype of `private` is not visible from this cell -- if it
# is already 0/1 the encoding leaves the values unchanged.
y_encoded = pd.factorize(y, sort=True)[0]

# Same protocol as the other models: 5 stratified folds x 5 repeats, timed
# end to end. The encoding above is kept outside the timed region.
start = time()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)
score = cross_val_score(model, X, y_encoded, scoring='accuracy', cv=cv, n_jobs=-1)

# Store wall-clock seconds and mean accuracy (as a percentage), 3 decimals each.
speed['XGBoost'] = np.round(time() - start, 3)
accuracy['XGBoost'] = (np.mean(score) * 100).round(3)

print(f"Mean Accuracy: {accuracy['XGBoost']}\nSTD: {np.std(score):.3f}\nRun Time: {speed['XGBoost']}s")

# 3. LightGBM

In [None]:
from lightgbm import LGBMClassifier

# LightGBM with default hyper-parameters.
model = LGBMClassifier()

# The timed region covers building the splitter and all 25 fits
# (5 stratified folds x 5 repeats), run in parallel on every available core.
start = time()
cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=42)
score = cross_val_score(estimator=model, X=X, y=y, cv=cv, scoring='accuracy', n_jobs=-1)
elapsed = time() - start

# Store wall-clock seconds and mean accuracy (as a percentage), 3 decimals each.
speed['LGBM'] = np.round(elapsed, 3)
accuracy['LGBM'] = np.round(score.mean() * 100, 3)

print(f"Mean Accuracy: {accuracy['LGBM']}\nSTD: {np.std(score):.3f}\nRun Time: {speed['LGBM']}s")

# 4. CatBoost

In [None]:
from catboost import CatBoostClassifier

# CatBoost with default hyper-parameters, except for two I/O switches:
#   verbose=0                 -> no per-iteration training log, so logging
#                                overhead is not counted in the benchmark;
#   allow_writing_files=False -> parallel CV workers do not all write into
#                                the same `catboost_info/` directory.
# Neither option changes the fitted model.
model = CatBoostClassifier(verbose=0, allow_writing_files=False)

# Same protocol as the other models: 5 stratified folds x 5 repeats, timed
# end to end.
start = time()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)
score = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Store wall-clock seconds and mean accuracy (as a percentage), 3 decimals each.
speed['CatBoost'] = np.round(time() - start, 3)
accuracy['CatBoost'] = (np.mean(score) * 100).round(3)

print(f"Mean Accuracy: {accuracy['CatBoost']}\nSTD: {np.std(score):.3f}\nRun Time: {speed['CatBoost']}s")

# 5. AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# scikit-learn AdaBoost with default hyper-parameters.
model = AdaBoostClassifier()

# The timed region covers building the splitter and all 25 fits
# (5 stratified folds x 5 repeats), run in parallel on every available core.
start = time()
cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=42)
score = cross_val_score(estimator=model, X=X, y=y, cv=cv, scoring='accuracy', n_jobs=-1)
elapsed = time() - start

# Store wall-clock seconds and mean accuracy (as a percentage), 3 decimals each.
speed['AdaBoost'] = np.round(elapsed, 3)
accuracy['AdaBoost'] = np.round(score.mean() * 100, 3)

print(f"Mean Accuracy: {accuracy['AdaBoost']}\nSTD: {np.std(score):.3f}\nRun Time: {speed['AdaBoost']}s")

# 6. Scikit-Learn vs XGBoost vs LightGBM vs CatBoost vs AdaBoost

In [None]:
# One summary line per model: name padded to 20 characters, then its mean
# accuracy and its run time.
for algo in accuracy:
    result = accuracy[algo]
    print(f"{algo:{20}}: Score: {result}, Speed: {speed[algo]}")

In [None]:
# Turn each result dictionary into a two-column frame (model name + metric)
# so it can be plotted.
accuracy_df = pd.DataFrame({
    'Algorithm': list(accuracy.keys()),
    'Accuracy': list(accuracy.values()),
})

speed_df = pd.DataFrame({
    'Algorithm': list(speed.keys()),
    'Time': list(speed.values()),
})

In [None]:
# Horizontal bar chart of accuracy per algorithm; the bare name on the last
# line makes the plot the cell output.
accuracy_plot = accuracy_df.hvplot.barh(x='Algorithm', y='Accuracy')
accuracy_plot

In [None]:
# Horizontal bar chart of run time per algorithm; the bare name on the last
# line makes the plot the cell output.
speed_plot = speed_df.hvplot.barh(x='Algorithm', y='Time')
speed_plot