In [4]:
import pandas as pd
import numpy as np

# Load the combined listings export. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv("../data/listings_combined.csv", low_memory=False)

# Clean price column.
# Keep only rows that have a price, and take an explicit copy so the column
# assignment below writes to an independent frame rather than to a filtered
# slice of the loaded one (avoids pandas' SettingWithCopyWarning).
df = df[df['price'].notnull()].copy()

# Strip '$' and thousands separators (e.g. "$1,250.00" -> "1250.00"), then
# convert to float so price can be used as a numeric target.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Drop every listing that is missing any of the columns the model relies on.
important_features = [
    'accommodates', 'bedrooms', 'bathrooms', 'beds',
    'room_type', 'property_type', 'latitude', 'longitude',
    'review_scores_rating', 'availability_365',
]
df = df.dropna(subset=important_features)

# Columns fed to the model, and the column being predicted.
# NOTE(review): 'city' is encoded below but is not part of the dropna subset
# above; a row with a missing city would get all-zero city dummies. Confirm
# that 'city' is always populated in the combined file.
features = [
    'accommodates', 'bedrooms', 'bathrooms', 'beds', 'latitude', 'longitude',
    'room_type', 'property_type', 'city', 'review_scores_rating', 'availability_365',
]
target = 'price'

# One-hot encode the categorical columns. drop_first=True omits the first
# level of each categorical so the dummy columns are not perfectly collinear.
model_columns = features + [target]
df_model = pd.get_dummies(df[model_columns], drop_first=True)

# Report the size of the cleaned frame and persist it for the training cell.
print(f"Cleaned dataset shape: {df_model.shape}")
df_model.to_csv("../data/price_model_data.csv", index=False)

Cleaned dataset shape: (74484, 279)


In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Load cleaned data
df = pd.read_csv("../data/price_model_data.csv")

# Remove price outliers: keep only listings priced below 1000.
# .copy() gives an independent frame, so adding 'log_price' below does not
# write into a filtered slice (avoids pandas' SettingWithCopyWarning).
df = df[df['price'] < 1000].copy()

# Model the target on a log scale; log1p(x) = log(1 + x), inverted later
# with expm1.
df['log_price'] = np.log1p(df['price'])

# Separate the log-scale target from the predictors. Both price columns are
# removed from X so the target cannot leak into the features.
y = df['log_price']
X = df.drop(columns=['price', 'log_price'])

# Hold out 20% of the rows for the final evaluation (fixed seed for a
# repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameters of the final Random Forest, gathered in one place.
# NOTE(review): presumably the result of an earlier tuning run that is not
# shown in this notebook section.
rf_params = {
    'n_estimators': 200,
    'max_depth': 30,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'max_features': None,   # consider every feature at each split
    'n_jobs': -1,           # use all available cores
    'random_state': 42,
}

# Train final Random Forest with best hyperparameters
final_model = RandomForestRegressor(**rf_params)
final_model.fit(X_train, y_train)

# Predict on test set (the model was fit on log1p(price), so predictions are
# on that log scale)
y_pred_log = final_model.predict(X_test)

# Invert the log1p transform so the metrics are reported in price units
y_pred_actual = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

# Evaluate metrics.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated and later removed from mean_squared_error in newer
# scikit-learn releases, while sqrt(MSE) gives the same value on any version.
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
mae = mean_absolute_error(y_test_actual, y_pred_actual)
r2 = r2_score(y_test_actual, y_pred_actual)

print("Final Random Forest Model Evaluation:")
print(f"RMSE (Root Mean Squared Error): {rmse:.2f}")
print(f"MAE  (Mean Absolute Error): {mae:.2f}")
print(f"R² Score (Variance Explained): {r2:.4f}")

# Save the trained model and the feature list.
# The model is serialized with joblib; the column names of X are written as a
# single-column CSV in the same order the model was trained on.
# NOTE(review): presumably the app in ../airbnb_app uses this list to align its
# input columns before calling predict() -- confirm against the app code.
joblib.dump(final_model, '../airbnb_app/airbnb_price_model_rf.pkl')
pd.Series(X.columns).to_csv("../airbnb_app/airbnb_price_model_features.csv", index=False)


Final Random Forest Model Evaluation:
RMSE (Root Mean Squared Error): 102.11
MAE  (Mean Absolute Error): 60.74
R² Score (Variance Explained): 0.6692
