In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

In [3]:
df = pd.read_csv("apartments_for_rent_classified_10K.csv", delimiter=';', encoding="Windows-1252")

In [4]:
print(df.columns)

Index(['id', 'category', 'title', 'body', 'amenities', 'bathrooms', 'bedrooms',
       'currency', 'fee', 'has_photo', 'pets_allowed', 'price',
       'price_display', 'price_type', 'square_feet', 'address', 'cityname',
       'state', 'latitude', 'longitude', 'source', 'time'],
      dtype='object')


In [5]:
# only these features seem to affect the price of a listing.
df = df[['amenities', 'bathrooms', 'bedrooms', 'has_photo', 'pets_allowed', 'price', 'square_feet', 'cityname', 'state']]
drop_null = ['bathrooms', 'bedrooms', 'cityname', 'state']

# drop the null values in the bathrooms, bedrooms, cityname, and state columns
df = df.dropna(subset=drop_null)

In [6]:
# if there are amenities, pets, or photos, mark as 1, else mark as 0.
df['amenities'] = df['amenities'].apply(lambda x: 0 if pd.isna(x) or str(x).strip() == '' else 1)
df['pets_allowed'] = df['pets_allowed'].apply(lambda x: 0 if pd.isna(x) or str(x).strip() == '' else 1)
df['has_photo'] = df['has_photo'].apply(lambda x: 1 if isinstance(x, str) and x.strip().lower() in ['yes', 'thumbnail'] else 0)

In [9]:
# Combine cityname and state into 'location'
df['location'] = df['cityname'].str.strip() + ", " + df['state'].str.strip()

# Calculate average price by location
avg_price = df.groupby('location')['price'].mean().reset_index(name='avg_price_location')

# Merge average price back to the original df
df = df.merge(avg_price, on='location', how='left')

df = df.drop(columns=['cityname', 'state'])

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9883 entries, 0 to 9882
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   amenities           9883 non-null   int64  
 1   bathrooms           9883 non-null   float64
 2   bedrooms            9883 non-null   float64
 3   has_photo           9883 non-null   int64  
 4   pets_allowed        9883 non-null   int64  
 5   price               9883 non-null   int64  
 6   square_feet         9883 non-null   int64  
 7   location            9883 non-null   object 
 8   avg_price_location  9883 non-null   float64
dtypes: float64(3), int64(5), object(1)
memory usage: 695.0+ KB


In [11]:
features = ['amenities', 'bathrooms', 'bedrooms', 'has_photo', 'pets_allowed', 'square_feet', 'avg_price_location']
X = df[features]
y = df['price']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# use random forest to calculate RSME and correlation
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_train_pred_rf = rf.predict(X_train)
y_test_pred_rf = rf.predict(X_test)

print("Random Forest Results:")
print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, y_train_pred_rf)):.2f}")
print(f"Train R^2: {r2_score(y_train, y_train_pred_rf):.2f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred_rf)):.2f}")
print(f"Test R^2: {r2_score(y_test, y_test_pred_rf):.2f}")
print()

Random Forest Results:
Train RMSE: 289.79
Train R^2: 0.93
Test RMSE: 415.40
Test R^2: 0.77



In [17]:
# use XGBoost to calculate RSME and correlation
xgbr = xgb.XGBRegressor(n_estimators=200, random_state=42, learning_rate=0.1, max_depth=5)
xgbr.fit(X_train, y_train)
y_train_pred_xgb = xgbr.predict(X_train)
y_test_pred_xgb = xgbr.predict(X_test)

print("XGBoost Results:")
print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, y_train_pred_xgb)):.2f}")
print(f"Train R^2: {r2_score(y_train, y_train_pred_xgb):.2f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred_xgb)):.2f}")
print(f"Test R^2: {r2_score(y_test, y_test_pred_xgb):.2f}")
print()

XGBoost Results:
Train RMSE: 326.09
Train R^2: 0.92
Test RMSE: 531.52
Test R^2: 0.62

