In [1]:
# Real Estate Valuation: Model Selection and Training
# ===================================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
df = pd.read_csv('../data/processed/realtor-data-imputed.csv')

In [16]:
X=df.drop(columns=['price_sqrt'])
y=df['price_sqrt']

In [17]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Train a Random Forest model
rf_model = RandomForestRegressor(n_estimators=50, n_jobs=-1,max_depth=20,random_state=42)
rf_model.fit(X, y)

# Get feature importances
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
print("Feature Importances:")
print(feature_importances.sort_values(ascending=False))

# Set a threshold and drop features below it
importance_threshold = 0.01  # Adjust as needed
selected_features = feature_importances[feature_importances > importance_threshold].index
X_selected = X[selected_features]

Feature Importances:
price_per_bedroom     1.000000e+00
price_per_sqft        2.762410e-09
acre_lot_cbrt         5.733998e-10
brokered_by           5.235995e-10
bed_bath_ratio        4.090479e-10
zip_code              4.080366e-10
city                  4.057192e-10
house_size_log        3.735438e-10
efficient_land_use    3.414599e-10
street                3.355078e-10
bath_log              3.279675e-10
average_size_room     2.981321e-10
state                 2.691550e-10
bed_bath_balance      2.334581e-10
status                3.392708e-11
bed_log               0.000000e+00
dtype: float64


In [18]:
# Final feature set
X_boosting = X[['city','state','bed_log', 'price_per_bedroom', 'bed_bath_ratio', 'bed_bath_balance']]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_boosting, y, test_size=0.2, random_state=42)

In [20]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize XGBoost model
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

# Train
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

# Evaluate
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

MSE: 8.162035748394773e-05
R2 Score: 0.9999135487037017


In [56]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
}

grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    verbose=1
)

grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
Best R2 Score: 0.9999214448537493


In [21]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define the best hyperparameters based on previous tuning
best_params = {
    'learning_rate': 0.05,
    'max_depth': 7,
    'n_estimators': 300,
    'subsample': 0.8
}

# Retrain the model using the best hyperparameters
final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

# Save the model using Booster's save_model method
final_model.get_booster().save_model('../models/xgb_final_model.json')
print("Model saved to ../models/xgb_final_model.json")

# Predict on the test set using the trained model
y_pred = final_model.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test Mean Squared Error: {mse}")
print(f"Test R2 Score: {r2}")

# Load the model using Booster's load_model method
loaded_model = XGBRegressor()
loaded_model.load_model('../models/xgb_final_model.json')

# Predict again to verify consistency
y_pred_loaded = loaded_model.predict(X_test)

# Check if predictions are identical
print(f"Predictions from loaded model are equal to previous predictions: {all(y_pred == y_pred_loaded)}")

Model saved to ../models/xgb_final_model.json
Test Mean Squared Error: 7.764113407914655e-05
Test R2 Score: 0.9999177634490448
Predictions from loaded model are equal to previous predictions: True


In [60]:
import joblib
from sklearn.preprocessing import LabelEncoder

# Load the incorrect encoders (which are dictionaries)
city_dict = joblib.load("../encoders/city_encoder.pkl")
state_dict = joblib.load("../encoders/state_encoder.pkl")

# Convert dictionary to LabelEncoder
city_encoder = LabelEncoder()
state_encoder = LabelEncoder()

# Fit the encoders with the original mapping
city_encoder.classes_ = list(city_dict.keys())  # Restore labels
state_encoder.classes_ = list(state_dict.keys())

# Save the fixed encoders
joblib.dump(city_encoder, "../encoders/city_encoder_fixed.pkl")
joblib.dump(state_encoder, "../encoders/state_encoder_fixed.pkl")

print("Fixed encoders saved successfully!")

Fixed encoders saved successfully!
