In [25]:
import torch
import pandas as pd
import numpy as np

In [26]:
df = pd.read_csv("listings.csv")

df['room_type'] = df['room_type'].replace('Private room', 0)
df['room_type'] = df['room_type'].replace('Entire home/apt', 1)

In [27]:
df = df[['room_type', 'minimum_nights', 'availability_365', 'latitude', 'longitude', 'number_of_reviews', 'reviews_per_month', 'number_of_reviews_ltm', 'price', 'calculated_host_listings_count', 'last_review']].dropna()

df = df[df['price'] < 1000]

df['price'] = np.log1p(df['price'])
df['minimum_nights'] = np.log1p(df['minimum_nights'])
df['number_of_reviews'] = np.log1p(df['number_of_reviews'])
df['number_of_reviews_ltm'] = np.log1p(df['number_of_reviews_ltm'])
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)
df['reviews_per_month'] = np.log1p(df['reviews_per_month'])
df['calculated_host_listings_count'] = np.log1p(df['calculated_host_listings_count'])

df['last_review'] = pd.to_datetime(df['last_review'])
df['days_since_last_review'] = (pd.Timestamp('today') - df['last_review']).dt.days
df['days_since_last_review'] = df['days_since_last_review'].fillna(df['days_since_last_review'].max())
df['days_since_last_review'] = np.log1p(df['days_since_last_review'])

df = df.drop(columns=['last_review'])

df

Unnamed: 0,room_type,minimum_nights,availability_365,latitude,longitude,number_of_reviews,reviews_per_month,number_of_reviews_ltm,price,calculated_host_listings_count,days_since_last_review
0,1,1.098612,276,32.755220,-117.128730,5.147494,0.609766,2.944439,5.493061,0.693147,4.644391
1,0,1.945910,327,32.805330,-117.234000,1.609438,0.019803,1.386294,4.465908,1.098612,5.476464
2,1,1.609438,346,32.807510,-117.257600,4.770685,0.500775,2.564949,5.384495,1.791759,4.912655
3,0,0.693147,171,32.805830,-117.242440,6.855409,1.864080,4.356709,4.382027,1.098612,4.709530
4,1,0.693147,83,32.806553,-117.234808,7.045777,2.140066,4.624973,4.595120,1.609438,4.574711
...,...,...,...,...,...,...,...,...,...,...,...
12842,1,0.693147,361,32.790072,-117.093397,0.693147,0.693147,0.693147,5.356586,1.791759,4.644391
12843,1,1.098612,323,32.779696,-117.204063,0.693147,0.693147,0.693147,6.056784,3.044522,4.663439
12844,1,1.098612,355,32.830981,-117.279346,0.693147,0.693147,0.693147,5.613128,4.672829,4.584967
12848,1,1.098612,361,32.830945,-117.279261,0.693147,0.693147,0.693147,5.384495,4.672829,4.574711


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

In [29]:
X = pd.get_dummies(df.drop('price', axis=1), drop_first=True)
y = df['price']

df['availability_365'] = MinMaxScaler().fit_transform(df[['availability_365']])

coords = df[['latitude', 'longitude']]
df['location_cluster'] = KMeans(n_clusters=10, random_state=42).fit_predict(coords)
df = pd.get_dummies(df, columns=['location_cluster'], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2); 

df

Unnamed: 0,room_type,minimum_nights,availability_365,latitude,longitude,number_of_reviews,reviews_per_month,number_of_reviews_ltm,price,calculated_host_listings_count,days_since_last_review,location_cluster_1,location_cluster_2,location_cluster_3,location_cluster_4,location_cluster_5,location_cluster_6,location_cluster_7,location_cluster_8,location_cluster_9
0,1,1.098612,0.756164,32.755220,-117.128730,5.147494,0.609766,2.944439,5.493061,0.693147,4.644391,False,True,False,False,False,False,False,False,False
1,0,1.945910,0.895890,32.805330,-117.234000,1.609438,0.019803,1.386294,4.465908,1.098612,5.476464,False,False,False,False,False,False,True,False,False
2,1,1.609438,0.947945,32.807510,-117.257600,4.770685,0.500775,2.564949,5.384495,1.791759,4.912655,False,False,False,False,False,False,True,False,False
3,0,0.693147,0.468493,32.805830,-117.242440,6.855409,1.864080,4.356709,4.382027,1.098612,4.709530,False,False,False,False,False,False,True,False,False
4,1,0.693147,0.227397,32.806553,-117.234808,7.045777,2.140066,4.624973,4.595120,1.609438,4.574711,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12842,1,0.693147,0.989041,32.790072,-117.093397,0.693147,0.693147,0.693147,5.356586,1.791759,4.644391,False,False,False,False,False,True,False,False,False
12843,1,1.098612,0.884932,32.779696,-117.204063,0.693147,0.693147,0.693147,6.056784,3.044522,4.663439,False,False,False,False,False,False,False,False,True
12844,1,1.098612,0.972603,32.830981,-117.279346,0.693147,0.693147,0.693147,5.613128,4.672829,4.584967,True,False,False,False,False,False,False,False,False
12848,1,1.098612,0.989041,32.830945,-117.279261,0.693147,0.693147,0.693147,5.384495,4.672829,4.574711,True,False,False,False,False,False,False,False,False


In [82]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.025,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=43
)

model.fit(X_train, y_train); 

preds = model.predict(X_test); 

print("MAE (in dollars):", mean_absolute_error(np.expm1(y_test), np.expm1(preds)))
print("Average Listing Price (in dollars):", np.expm1(df['price'].mean()))
print("R² (percentage of data predicted):", r2_score(y_test, preds))

MAE (in dollars): 80.63142696581389
Average Listing Price (in dollars): 183.86998535713894
R² (percentage of data predicted): 0.5706299305246845
