<a href="https://colab.research.google.com/github/sambhandavale/catboost-vs-lr-performance/blob/main/catboost_vs_lr_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [19]:
url = "https://raw.githubusercontent.com/sambhandavale/catboost-vs-lr-performance/main/melb_data.csv"
df = pd.read_csv(url)

df = df.dropna(subset=['Price'])

features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
            'YearBuilt', 'Lattitude', 'Longtitude', 'Type', 'Regionname', 'Method']
X = df[features]
y = df['Price']

X = X.dropna()
y = y.loc[X.index]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(exclude='object').columns.tolist()

In [22]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = pd.DataFrame(
    ohe.fit_transform(X_train[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X_train.index
)
X_valid_cat = pd.DataFrame(
    ohe.transform(X_valid[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X_valid.index
)

X_train_final = pd.concat([X_train[numerical_cols], X_train_cat], axis=1)
X_valid_final = pd.concat([X_valid[numerical_cols], X_valid_cat], axis=1)

lr_model = LinearRegression()
lr_model.fit(X_train_final, y_train)
lr_preds = lr_model.predict(X_valid_final)
lr_rmse = np.sqrt(mean_squared_error(y_valid, lr_preds))

print(f"Linear Regression RMSE: {lr_rmse:,.0f}")

Linear Regression RMSE: 395,632


In [23]:
cat_features = categorical_cols

cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    verbose=0
)

cat_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=50
)

cat_preds = cat_model.predict(X_valid)
cat_rmse = np.sqrt(mean_squared_error(y_valid, cat_preds))

print(f"CatBoost RMSE: {cat_rmse:,.0f}")

CatBoost RMSE: 231,931
