In [21]:
import pandas as pd

# Read the numeric apartments-for-rent extract (comma-separated, first row is
# the header) and preview the first five rows.
CSV_PATH = 'apartments_for_rent_10k_numeric.csv'
df = pd.read_csv(CSV_PATH, sep=',', header=0)
df.head()

Unnamed: 0,bathrooms,bedrooms,price,square_feet,latitude,longitude,time,number_of_amenities
0,1.0,0.0,1390,107,38.891,-77.0816,1577359410,0
1,1.0,0.0,925,116,47.616,-122.3275,1576667743,0
2,1.0,0.0,2475,130,40.7629,-73.9885,1577289784,5
3,1.0,0.0,1495,138,37.7599,-122.4379,1577358313,1
4,1.0,0.0,1695,190,37.7599,-122.4379,1577015121,1


In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) across all columns.
summary_stats = df.describe(include='all')
summary_stats

Unnamed: 0,bathrooms,bedrooms,price,square_feet,latitude,longitude,time,number_of_amenities
count,9950.0,9950.0,9950.0,9950.0,9950.0,9950.0,9950.0,9950.0
mean,1.380553,1.747538,1487.58593,943.271055,37.696104,-94.663883,1574881000.0,3.125025
std,0.615313,0.941543,1077.350394,526.416278,5.501219,15.760136,3767608.0,3.430139
min,1.0,0.0,200.0,107.0,21.3155,-158.0221,1568744000.0,0.0
25%,1.0,1.0,950.0,650.0,33.6795,-101.3017,1568781000.0,0.0
50%,1.0,2.0,1275.0,803.0,38.8096,-93.6516,1577358000.0,2.0
75%,2.0,2.0,1695.0,1100.0,41.3498,-82.302,1577359000.0,5.0
max,8.5,9.0,52500.0,11318.0,61.594,-70.1916,1577362000.0,18.0


In [23]:
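# DISABLED: IQR-based removal of extreme price outliers that skew linear regression.
# The filter below is commented out, so every model in this notebook is trained on
# the full price range (max 52,500 vs. a 75th percentile of 1,695).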
# Q1 = df['price'].quantile(0.25)
# Q3 = df['price'].quantile(0.75)
# IQR = Q3 - Q1
# df = df[~((df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR)))]

Linear Regression

In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


In [25]:

# Baseline: ordinary least squares on standardized features, raw price as target.
X = df.drop(columns=['price'])

# Split BEFORE scaling. Fitting the scaler on the full frame would let the
# test rows influence the mean/std used to transform the training rows
# (preprocessing leakage). Same seed and row count as before, so the
# train/test partition itself is unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X, df['price'], test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics learned from training rows only
X_test = scaler.transform(X_test)        # test rows reuse the training statistics
X_scaled = scaler.transform(X)           # full matrix, kept for code that still reads X_scaled

model1 = LinearRegression()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

# RMSE is reported in the target's own units (dollars).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: ${rmse:.2f}")
print(f"Scaled X, Original Y - R²: {r2_score(y_test, y_pred):.3f}")


RMSE: $750.53
Scaled X, Original Y - R²: 0.322


In [26]:
# Engineered feature: room density, i.e. (bedrooms + bathrooms) per square foot.
# price_per_sqft (price / square_feet) stays out of the feature set: it is
# computed from `price`, the regression target, so using it would leak the label.
df['rooms_per_sqft'] = (df['bedrooms'] + df['bathrooms']) / df['square_feet']

X = df.drop(columns=['price'])
y_log = np.log(df['price'])  # log-transformed target; not used by the model fitted in this cell

# Split BEFORE scaling. Fitting the scaler on the full frame would let the
# test rows influence the mean/std used to transform the training rows
# (preprocessing leakage). Same seed and row count as before, so the
# train/test partition itself is unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X, df['price'], test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics learned from training rows only
X_test = scaler.transform(X_test)        # test rows reuse the training statistics
X_scaled = scaler.transform(X)           # full matrix, kept for code that still reads X_scaled

model2 = LinearRegression()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)

# RMSE is reported in the target's own units (dollars).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: ${rmse:.2f}")
print(f"Scaled X, Original Y - R²: {r2_score(y_test, y_pred):.3f}")


RMSE: $751.92
Scaled X, Original Y - R²: 0.320


MLP (Neural Network)

In [27]:
from tensorflow import keras
# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# masks and batch shuffling repeat on a fresh Restart & Run All.
# (Bit-for-bit determinism can still vary with hardware/backend.)
keras.utils.set_random_seed(42)

# Small fully connected regressor: 64 -> dropout -> 32 -> 1.
# The input width is declared with an explicit keras.Input layer rather than
# `input_shape=` on the first Dense layer, which Keras warns against for
# Sequential models.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1),  # Output layer (no activation for regression)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [28]:
# Optimise mean squared error with Adam; also track mean absolute error,
# which reads directly in the target's units.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [29]:
# Train for 100 epochs in mini-batches of 32, holding out 20% of the training
# rows for per-epoch validation; `history` records the per-epoch metrics.
history = model.fit(x=X_train, y=y_train, batch_size=32, epochs=100, validation_split=0.2, verbose=1)

Epoch 1/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3474591.0000 - mae: 1489.2770 - val_loss: 3174976.7500 - val_mae: 1444.1996
Epoch 2/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 3095986.2500 - mae: 1376.8364 - val_loss: 2092044.2500 - val_mae: 1089.7764
Epoch 3/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2614365.0000 - mae: 935.2024 - val_loss: 947539.1875 - val_mae: 585.0547
Epoch 4/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1795685.5000 - mae: 560.5655 - val_loss: 717016.3125 - val_mae: 510.0309
Epoch 5/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 803869.0000 - mae: 522.6229 - val_loss: 677182.8750 - val_mae: 501.0307
Epoch 6/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1128850.6250 - mae: 519.0213 - val_loss: 670626.8750 - val_m

In [30]:
# Score the network on the held-out test rows. predict() returns one column
# per output unit, so collapse it to 1-D before computing the metrics.
y_pred = model.predict(X_test).flatten()

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Neural Network RMSE: ${rmse:.2f}")
print(f"Neural Network R²: {r2:.3f}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Neural Network RMSE: $648.19
Neural Network R²: 0.494
