<a href="https://colab.research.google.com/github/sainath5466/asdf/blob/main/notebooks/catboost%20regression%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qq yellowbrick category_encoders catboost

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# some builtin imports
import re
import warnings
from collections import Counter
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# Some usual imports here
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl


# sklearn models
from sklearn import metrics, model_selection

import catboost as cb

# visualizations
import shap
from yellowbrick.regressor import residuals_plot, prediction_error

In [3]:
## Customize Matplotlib Parameters
%matplotlib inline
mpl.rcParams['figure.dpi']= 120
mpl.rcParams['figure.edgecolor']= 'black'
mpl.rcParams['axes.linewidth']= .5
# Customize Seaborn Parameters
sns.set()
rc = {
      'font.family': ['serif'],
      'font.serif':'Times New Roman',
      'grid.color': 'gainsboro',
      'grid.linestyle': '-',
}
sns.set_style(rc=rc)
sns.set_context("notebook", font_scale=0.8)

# Load dataset

In [5]:
df = pd.read_csv('house_rentals.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17890 entries, 0 to 17889
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   url            17890 non-null  object 
 1   listing_type   17890 non-null  object 
 2   name           17890 non-null  object 
 3   price          17890 non-null  int64  
 4   category       17890 non-null  object 
 5   bedrooms       17890 non-null  float64
 6   bathrooms      17890 non-null  float64
 7   floor_area     17890 non-null  float64
 8   location       17890 non-null  object 
 9   condition      17890 non-null  object 
 10  amenities      17890 non-null  object 
 11  region         17890 non-null  object 
 12  locality       17890 non-null  object 
 13  parking_space  17890 non-null  bool   
 14  is_furnished   17890 non-null  object 
 15  lat            17890 non-null  float64
 16  lng            17890 non-null  float64
dtypes: bool(1), float64(5), int64(1), object(10)
memor

# CatBoost Regressor

In [7]:
def compute_metrics(model, x, y, cv=5):
    preds = model.predict(x)
    score = model.score(x, y)

    scores_cvs = model_selection.cross_val_score(model, x, y, scoring='r2', cv=cv)

    return pd.DataFrame([{
        "R2": round(score, 3),
        "mse": round(metrics.mean_squared_error(y, preds), 3),
        "rmse": round(np.sqrt(metrics.mean_squared_error(y, preds)), 3),
        "mae": round(metrics.mean_absolute_error(y, preds), 3),
        "adjusted_r2": round(1 - (1 - score) * (len(y) - 1) / (len(y) - x.shape[1] - 1), 3),
        "cv_score": round(scores_cvs.mean()*100, 2)
    }])

In [20]:
# train-test split for model evaluation
X = df.drop(['price'], axis=1).values # Remove 'log1p_price' from the list of columns to drop
y = df['price'].values # Change the target variable from 'log1p_price' to 'price'

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=42
)

In [19]:
X_train.shape, X_test.shape

((14312, 16), (3578, 16))

In [35]:
# train-test split for model evaluation
X = df.drop(['price'], axis=1) # Keep DataFrame structure
y = df['price'].values # Change the target variable from 'log1p_price' to 'price'

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=42
)

# Replace 'url' and 'other_categorical_columns' with your actual categorical column names
# Assuming 'url', 'category', 'condition' are your categorical columns, remove 'furnishing' if it's not present
categorical_features_indices = [X_train.columns.get_loc(col) for col in ['url', 'category', 'condition']]

# Before fitting the model, encode the categorical features using OrdinalEncoder:
# Make sure 'furnishing' is removed from the list of columns if it's not in your DataFrame
# Assuming 'url', 'category', 'condition' are your categorical features
!pip install -qq category_encoders
import category_encoders as ce
encoder = ce.OrdinalEncoder(cols=['url', 'category', 'condition'])
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)

# Now use the encoded data for training:
model = cb.CatBoostRegressor(random_seed=42, logging_level='Silent')

# Get the indices of the encoded categorical features in X_train_encoded
encoded_categorical_features_indices = [X_train_encoded.columns.get_loc(col) for col in ['url', 'category', 'condition']]

#Train with encoded data and encoded indices
model = model.fit(X_train_encoded, y_train, cat_features=encoded_categorical_features_indices) # Pass encoded data and correct indices

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=1]="rentals": Cannot convert 'b'rentals'' to float

### Training metrics


In [34]:
train_metrics = compute_metrics(model, X_train, y_train)
train_metrics

CatBoostError: There is no trained model to use predict(). Use fit() to train model. Then use this method.

### Validation metrics


In [None]:
val_metrics = compute_metrics(model, X_test, y_test)
val_metrics

# GridSearch

In [None]:
model.get_all_params()

In [None]:
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

grid = {
    'iterations': [100, 200, 500, 1000],
    'learning_rate': [0.03, 0.1],
    'depth': [2, 4, 6, 8],
    'l2_leaf_reg': [0.2, 0.5, 1, 3]
}

In [None]:
grid_search = RandomizedSearchCV(
    estimator=cb.CatBoostRegressor(random_seed=42, logging_level='Silent'), param_distributions=grid,
    refit=True, verbose=3, cv=5, scoring='r2', n_jobs=-1, n_iter=50
)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best R2 score:", grid_search.best_score_)

In [None]:
best_model = grid_search.best_estimator_

train_metrics = compute_metrics(best_model, X_train, y_train)
train_metrics

In [None]:
val_metrics = compute_metrics(best_model, X_test, y_test)
val_metrics

### Residual plots

In [None]:
from yellowbrick.contrib.wrapper import wrap

In [None]:
visualizer = residuals_plot(
    wrap(best_model), X_train, y_train, X_test, y_test,
)

In [None]:
visualizer = residuals_plot(
    wrap(best_model), X_train, y_train, X_test, y_test, hist=False, qqplot=True
)

The QQ-Plot shows the residuals are normally distributed, because their quantiles when plotted against quantiles of normal distribution forms a straight line.

### Prediction Error

In [None]:
visualizer = prediction_error(wrap(best_model), X_train, y_train, X_test, y_test)

## Feature importance

In [None]:
variables = abs(best_model.feature_importances_)
coef_df = pd.DataFrame(
    {
        "Variable": df.drop(['price', 'log1p_price'], axis=1).columns,
        "Value": variables,
    }
)
n = 10
sorted_df = (
    coef_df.sort_values(by="Value", ascending=False)
    .head(n)
    .sort_values(by="Value")
)
sorted_df

In [None]:
my_range = range(1, len(sorted_df.index) + 1)
plt.figure(figsize=(8, 5 * (n // 10)))
plt.hlines(
    y=my_range,
    xmin=0,
    xmax=sorted_df["Value"],
    color="skyblue",
)
plt.plot(sorted_df["Value"], my_range, "o")
plt.yticks(my_range, sorted_df["Variable"])
plt.title("CatBoostRegressor Feature Importance Plot")
plt.xlabel("Variable Importance")
plt.ylabel("Features")
plt.show()

## SHAP Values

In [None]:
explainer = shap.TreeExplainer(best_model)

shap_values = explainer.shap_values(X_test)

In [None]:
feature_names=coef_df.sort_values(by="Value", ascending=False)['Variable'].values
feature_names

In [None]:
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

## Visualize decision tree

In [None]:
best_model.plot_tree(
    tree_idx=0,
)

## Actual Predictions

In [None]:
preds = best_model.predict(X_test)

In [None]:
# Reverse the log transformation
actual_preds = np.expm1(preds)
actual_y_test = np.expm1(y_test)

# Compute metrics on the original scale
def compute_metrics_original_scale(y_true, y_pred):
    return pd.DataFrame([{
        "R2": round(metrics.r2_score(y_true, y_pred), 3),
        "mse": round(metrics.mean_squared_error(y_true, y_pred), 3),
        "rmse": round(np.sqrt(metrics.mean_squared_error(y_true, y_pred)), 3),
        "mae": round(metrics.mean_absolute_error(y_true, y_pred), 3),
    }])

val_metrics_original_scale = compute_metrics_original_scale(actual_y_test, actual_preds)
val_metrics_original_scale

In [None]:
pred_df = pd.DataFrame({"actual": actual_y_test, "pred": actual_preds})
pred_df.head(10)

In [None]:
plt.scatter(pred_df.actual, pred_df.pred, c='crimson')
p1 = max(max(pred_df.actual), max(pred_df.pred))
p2 = min(min(pred_df.actual), min(pred_df.pred))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('Actual Values', fontsize=15)
plt.ylabel('Predicted Values', fontsize=15)
plt.axis('equal')
plt.show()

In [None]:
sns.distplot((pred_df.actual-pred_df.pred))
plt.title('Distribution of residuals')
plt.xlabel("Residuals")
plt.show()

# Model Pipeline

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/house_rentals_cleaned.csv')

In [None]:
df.info()

In [None]:
df['amenities'].head()

In [None]:
category_cols = ["category", "condition", "furnishing"]
text_cols = ["amenities"]
bool_cols = ['parking_space']
target_columns = ['log1p_price']
numeric_cols = list(set(df.columns) - set(target_columns) - set(category_cols) - set(text_cols) - set(bool_cols))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer


In [None]:
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)
categorical_preprocessor = Pipeline(
    steps=[
        (
            "imputation_most_frequent",
            SimpleImputer(strategy="most_frequent"),
        ),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

def text_tokenizer(x):
    return x.split(',')

text_preprocessor = Pipeline(
    steps=[
        ("count_vectorizer", CountVectorizer(max_features=20,tokenizer=text_tokenizer, binary=True))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_preprocessor, numeric_cols),
        ("categorical", categorical_preprocessor, category_cols),
        ("text", text_preprocessor, text_cols[0]),
    ]
)

In [None]:
best_params = {'learning_rate': 0.03, 'l2_leaf_reg': 1, 'iterations': 1000, 'depth': 8}

pipe = make_pipeline(preprocessor, cb.CatBoostRegressor(random_seed=42, logging_level='Silent', **best_params))
pipe

In [None]:
# train-test split for model evaluation
X = df.drop(columns=['log1p_price'], axis=1)
y = df['log1p_price'].values

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=42
)

In [None]:
X_train.info()

In [None]:
# Fit the pipeline to your training data
pipe.fit(X_train, y_train)

In [None]:
pipe.score(X_train, y_train)

In [None]:
pipe.score(X_test, y_test)

In [None]:
# save the pipeline to disk

import joblib

# Assuming 'pipe' is your fitted pipeline
joblib.dump(pipe, 'house_rental_pipeline.joblib')

In [None]:
!ls -lha house_rental_pipeline.joblib

In [None]:
!cp house_rental_pipeline.joblib /content/drive/MyDrive/Models/

In [None]:
pipe = joblib.load('house_rental_pipeline.joblib')
pipe

In [None]:
for col in category_cols:
    print(col, df[col].unique())

In [None]:
# create a counter to count amentities in the dataset
results = Counter()
df.amenities.apply(lambda x: results.update(x.split(",")))
# create a new sub dataframe with 'amenity' and 'count'
amenity_df = pd.DataFrame(results.most_common(), columns=['amenity', 'count'])
amenity_df

In [None]:
amenity_df['amenity'].values.tolist()