# CAT Boost Model

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

sns.set_style("whitegrid")

The cleaned and prepared scraped dataset is stored in apartments_final.csv

In [7]:
df = pd.read_csv("apartments_final.csv")

df.head()


Unnamed: 0,url,listing_id,zipcode,monthly_rent,bedrooms,bathrooms,square_feet,walk_score,transit_score,deposit,latitude,longitude,neighborhood,nbhd
0,https://www.apartments.com/2372-beckwith-dr-in...,ze0jqwy,46218,751,2,1.0,679.0,25,33,705,39.80125,-86.12643,Martindale-Brightwood,Martindale-Brightwood
1,https://www.apartments.com/7491-n-shadeland-av...,s34dq64,46250,1795,3,2.0,2500.0,54,33,1795,39.89034,-86.0454,I-69 Fall Creek,I-69 Fall Creek
2,https://www.apartments.com/nice-3-bedroom-ranc...,n669z9m,46237,1095,3,1.0,1439.0,29,32,1095,39.72238,-86.12113,University Heights,University Heights
3,https://www.apartments.com/1102-n-oakland-ave-...,c4kr5zf,46201,1300,3,1.5,1500.0,67,38,800,39.78265,-86.11278,Near Eastside,Near Eastside
4,https://www.apartments.com/634-e-10th-st-india...,rl2dfp0,46202,1600,2,2.0,1400.0,74,53,1600,39.78089,-86.1461,Chatham Arch,Chatham Arch


In [13]:
df.describe()

Unnamed: 0,zipcode,monthly_rent,bedrooms,bathrooms,square_feet,walk_score,transit_score,deposit,latitude,longitude
count,871.0,871.0,871.0,871.0,871.0,871.0,871.0,871.0,871.0,871.0
mean,46222.282434,1283.833525,2.045924,1.564868,1130.67853,37.180253,25.227325,1207.065442,39.791344,-86.148214
std,31.355223,454.606361,1.121454,0.590232,627.812978,23.78391,17.084812,513.401254,0.079023,0.092329
min,46038.0,450.0,0.0,1.0,50.0,0.0,0.0,100.0,39.62688,-86.34127
25%,46211.0,909.0,1.0,1.0,700.0,16.0,14.0,850.0,39.7484,-86.214015
50%,46224.0,1202.0,2.0,1.5,1025.0,37.0,29.0,1150.0,39.79534,-86.14399
75%,46237.0,1634.5,3.0,2.0,1390.0,54.0,35.0,1597.0,39.84942,-86.084705
max,46298.0,2500.0,5.0,4.0,9991.0,99.0,68.0,2500.0,39.94625,-85.94992


In [14]:
df.dtypes

url               object
listing_id        object
zipcode            int64
monthly_rent       int64
bedrooms           int64
bathrooms        float64
square_feet      float64
walk_score         int64
transit_score      int64
deposit            int64
latitude         float64
longitude        float64
neighborhood      object
nbhd              object
dtype: object

In [94]:
df["bedXbath"] = df.bedrooms * df.bathrooms
df["room_per_feet"] = df.bedrooms/df.square_feet

X = df.copy()
y = df['monthly_rent']

drop_cols = [
             "monthly_rent",
             "deposit",
             "listing_id",
             #"latitude",
             #"longitude",
             "url",
             "neighborhood",
             #"zipcode",
              ]
for col in drop_cols:
    X.drop(col, axis=1, inplace=True)

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=3)

In [96]:
model = CatBoostRegressor(iterations=100, depth=4, learning_rate=0.3, loss_function='RMSE')
model.fit(X_train, y_train,
          cat_features=["nbhd"],
          eval_set=(X_test, y_test),plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 396.1600793	test: 358.9746364	best: 358.9746364 (0)	total: 16.3ms	remaining: 1.61s
1:	learn: 352.8954334	test: 320.4837965	best: 320.4837965 (1)	total: 27.5ms	remaining: 1.35s
2:	learn: 325.5687049	test: 294.1710431	best: 294.1710431 (2)	total: 35.8ms	remaining: 1.16s
3:	learn: 310.2762052	test: 281.9287861	best: 281.9287861 (3)	total: 44.8ms	remaining: 1.07s
4:	learn: 300.4030218	test: 273.9979581	best: 273.9979581 (4)	total: 54.4ms	remaining: 1.03s
5:	learn: 288.3192232	test: 266.4982155	best: 266.4982155 (5)	total: 64.8ms	remaining: 1.01s
6:	learn: 281.8915384	test: 265.6150889	best: 265.6150889 (6)	total: 75.2ms	remaining: 999ms
7:	learn: 276.5431502	test: 258.2135826	best: 258.2135826 (7)	total: 84.9ms	remaining: 977ms
8:	learn: 272.9232265	test: 255.5841276	best: 255.5841276 (8)	total: 95.2ms	remaining: 963ms
9:	learn: 269.8913054	test: 253.9499232	best: 253.9499232 (9)	total: 106ms	remaining: 958ms
10:	learn: 266.3626085	test: 250.6495908	best: 250.6495908 (10)	total: 

<catboost.core.CatBoostRegressor at 0x1734bfa2610>

In [97]:
model.score(X_test, y_test)

0.7013654550458175

In [98]:
predicted = model.predict(X_test)

In [99]:
baseline = y.mean()

In [100]:
baseline_var = ((y_test-baseline)**2).sum()/len(y_test)
print(f"Baseline RMSE: {baseline_var}")
print(f"Baseline rent error: {baseline_var**0.5}")

Baseline RMSE: 179309.48955428004
Baseline rent error: 423.449512402931


In [101]:
model_var = ((y_test-predicted)**2).sum()/len(y_test)
print(f"Baseline RMSE: {model_var}")
print(f"Baseline rent error: {model_var**0.5}")

Baseline RMSE: 53329.66999788583
Baseline rent error: 230.93217618574903
