## Model Selection

This notebook should include preliminary and baseline modeling.
- Try as many different models as possible.
- Don't worry about hyperparameter tuning or cross validation here.
- Ideas include:
    - linear regression
    - support vector machines
    - random forest
    - xgboost

In [149]:
# import models and fit
import numpy as np
import pandas as pd
import xgboost as xgb
from functions_variables import evaluate_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import RFE

In [73]:
#Setting up data for Train/Test Split

data = pd.read_csv('processed/chosen_features.csv')

X = data.drop(columns = ['sold_price'], axis=1)
y = data['sold_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [75]:
print(X_train.shape)
print(X_test.shape)

(953, 10)
(239, 10)


In [127]:
# Run Regression models on entire feature set from chosen_features
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Lasso Regression
lasso_model = Lasso(alpha=1000)
lasso_model.fit(X_train, y_train)

# Ridge Regression
ridge_model = Ridge(alpha=100)
ridge_model.fit(X_train, y_train)

# Support Vector Regression (SVR)
svr_model_rbf = SVR(kernel='rbf')
svr_model_rbf.fit(X_train, y_train)

# Support Vector Regression (SVR)
svr_model_linear = SVR(kernel='linear')
svr_model_linear.fit(X_train, y_train)

# Random Forest Regression
rf_model = RandomForestRegressor(n_estimators=100)
rf_model.fit(X_train, y_train)

# XGBoost Regression
xgb_model = xgb.XGBRegressor(objective='reg:absoluteerror')
xgb_model.fit(X_train, y_train)

In [129]:
# Confirm XGBoost is working for me (Colter)
y_pred = xgb_model.predict(X_test)
print(y_pred[:5])  # Print first 5 predictions

[2.6324244 2.5663652 2.572972  2.6406112 2.52705  ]


In [131]:
def evaluate_model(model, X_test, y_test):
    """Prints MAE, RMSE, and R² for a given model"""
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"{model.__class__.__name__}:")
    print(f"  MAE:  {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R²:   {r2:.4f}\n")

models = [lr_model, lasso_model, ridge_model, svr_model_rbf, svr_model_linear, rf_model, xgb_model]

for model in models:
    evaluate_model(model, X_test, y_test)

LinearRegression:
  MAE:  0.0269
  RMSE: 0.0434
  R²:   0.4712

Lasso:
  MAE:  0.0411
  RMSE: 0.0601
  R²:   -0.0130

Ridge:
  MAE:  0.0280
  RMSE: 0.0462
  R²:   0.4019

SVR:
  MAE:  0.0404
  RMSE: 0.0576
  R²:   0.0695

SVR:
  MAE:  2167.9196
  RMSE: 3113.3436
  R²:   -2715200478.6259

RandomForestRegressor:
  MAE:  0.0240
  RMSE: 0.0385
  R²:   0.5849

XGBRegressor:
  MAE:  0.0227
  RMSE: 0.0371
  R²:   0.6139



In [83]:
print(X_train.describe())  # Check feature magnitudes

             sqft       baths  price_reduced_amount         Stdev  \
count  953.000000  953.000000            953.000000    953.000000   
mean     2.136004    2.341087              0.738266  46184.962547   
std      0.055354    1.056389              1.086200  13376.262577   
min      1.757132    0.000000              0.000000      0.000000   
25%      2.100484    2.000000              0.000000  38611.333333   
50%      2.132860    2.000000              0.000000  44154.000000   
75%      2.169222    3.000000              2.253121  52978.000000   
max      2.431579    9.000000              2.646038  86147.000000   

                Mean  waterfront      garage  cost_of_living_housing  \
count     953.000000  953.000000  953.000000              953.000000   
mean    66167.511885    0.023085    1.154250                1.858342   
std     23645.397128    0.150252    1.217142                1.085560   
min         0.000000    0.000000    0.000000                1.000000   
25%     49943.0000

In [85]:
# Rescale data to see if SVR Linear performs better/differently
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Run SVR Linear again to see if we get better evaluation metrics
svr_model_linear.fit(X_train, y_train)

# Evaluate VR Linear model after rescaling
evaluate_model(svr_model_linear, X_test, y_test)

SVR:
  MAE:  2167.9196
  RMSE: 3113.3436
  R²:   -2715200478.6259



In [87]:
# Polynomial Regression (using PolynomialFeatures)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False) #Unable to run higher than 4 due to number of columns slowing down system
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# Predictions
y_train_pred = poly_model.predict(X_train_poly)
y_test_pred = poly_model.predict(X_test_poly)

# Evaluate model performance
def evaluate_poly_model(y_true, y_pred, dataset_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mse)

    print(f"\nPolynomial Regression Model - {dataset_name} Metrics:")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

# Print feature expansion
print(f"Original features: {X_train.shape[1]}")
print(f"Expanded polynomial features: {X_train_poly.shape[1]}")

# Print train & test metrics
evaluate_poly_model(y_train, y_train_pred, "Training")
evaluate_poly_model(y_test, y_test_pred, "Testing")

Original features: 10
Expanded polynomial features: 55

Polynomial Regression Model - Training Metrics:
MAE: 0.0236
MSE: 0.0012
R²: 0.6307
RMSE: 0.0350

Polynomial Regression Model - Testing Metrics:
MAE: 0.0261
MSE: 0.0019
R²: 0.4692
RMSE: 0.0435


In [89]:
# Polynomial Regression (using Ridge and PolynomialFeatures)
ridge_poly_model = Ridge(alpha=1.0)
ridge_poly_model.fit(X_train_poly, y_train)
y_train_pred_ridge = ridge_poly_model.predict(X_train_poly)
y_test_pred_ridge = ridge_poly_model.predict(X_test_poly)

# Evaluate model performance
def evaluate_poly_model(y_true, y_pred, dataset_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mse)

    print(f"\nPolynomial Regression Model - {dataset_name} Metrics:")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

# Print feature expansion
print(f"Original features: {X_train.shape[1]}")
print(f"Expanded polynomial features: {X_train_poly.shape[1]}")

# Print train & test metrics
evaluate_poly_model(y_train, y_train_pred_ridge, "Training")
evaluate_poly_model(y_test, y_test_pred_ridge, "Testing")

Original features: 10
Expanded polynomial features: 55

Polynomial Regression Model - Training Metrics:
MAE: 0.0237
MSE: 0.0012
R²: 0.6283
RMSE: 0.0351

Polynomial Regression Model - Testing Metrics:
MAE: 0.0257
MSE: 0.0018
R²: 0.4822
RMSE: 0.0430


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [91]:
# Polynomial Regression (using Ridge and PolynomialFeatures)
ridge_poly_model = Ridge(alpha=10.0)  # Try higher alpha to see if that corrects the correlation error
ridge_poly_model.fit(X_train_poly, y_train)
y_train_pred_ridge = ridge_poly_model.predict(X_train_poly)
y_test_pred_ridge = ridge_poly_model.predict(X_test_poly)

# Evaluate model performance
def evaluate_poly_model(y_true, y_pred, dataset_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mse)

    print(f"\nPolynomial Regression Model - {dataset_name} Metrics:")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

# Print feature expansion
print(f"Original features: {X_train.shape[1]}")
print(f"Expanded polynomial features: {X_train_poly.shape[1]}")

# Print train & test metrics
evaluate_poly_model(y_train, y_train_pred_ridge, "Training")
evaluate_poly_model(y_test, y_test_pred_ridge, "Testing")

Original features: 10
Expanded polynomial features: 55

Polynomial Regression Model - Training Metrics:
MAE: 0.0238
MSE: 0.0012
R²: 0.6256
RMSE: 0.0352

Polynomial Regression Model - Testing Metrics:
MAE: 0.0255
MSE: 0.0018
R²: 0.4918
RMSE: 0.0426


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [93]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error

# Hyperparameter grid 
param_dist = {
    'n_estimators': [50, 100, 200, 500],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_tuned_model = RandomForestRegressor(random_state=42)

random_search = RandomizedSearchCV(
    rf_tuned_model, 
    param_distributions=param_dist, 
    n_iter=20, 
    cv=5, 
    scoring='neg_root_mean_squared_error', 
    random_state=42, 
    error_score=np.nan
)
random_search.fit(X_train, y_train)

# Best parameters and score
print("Best Random Forest Model:", random_search.best_params_)
print("Best CV score (RMSE):", -random_search.best_score_)

# Perform cross-validation on best model
best_rf = random_search.best_estimator_
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
print(f"Cross-validation RMSE: {cv_scores.mean():.4f}")

# Print feature importances
feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': best_rf.feature_importances_}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importances from Best Random Forest Model:")
print(feature_importance.head(10))  # Show features in order of importance

Best Random Forest Model: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 10}
Best CV score (RMSE): 0.03507018089726462
Cross-validation RMSE: -0.0351

Feature Importances from Best Random Forest Model:
                     Feature  Importance
0                       sqft    0.251431
1                      baths    0.191614
3                      Stdev    0.141637
4                       Mean    0.119302
7     cost_of_living_housing    0.099107
6                     garage    0.080099
8  total_population_category    0.040792
9     cost_of_living_grocery    0.036833
2       price_reduced_amount    0.034628
5                 waterfront    0.004556


In [95]:
# Same as above but with n_iter = 50 instead
# Hyperparameter grid 
param_dist = {
    'n_estimators': [50, 100, 200, 500],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_tuned_model = RandomForestRegressor(random_state=42)

random_search = RandomizedSearchCV(
    rf_tuned_model, 
    param_distributions=param_dist, 
    n_iter=50, 
    cv=5, 
    scoring='neg_root_mean_squared_error', 
    random_state=42, 
    error_score=np.nan
)
random_search.fit(X_train, y_train)

# Best parameters and score
print("Best Random Forest Model:", random_search.best_params_)
print("Best CV score (RMSE):", -random_search.best_score_)

# Perform cross-validation on best model
best_rf = random_search.best_estimator_
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
print(f"Cross-validation RMSE: {cv_scores.mean():.4f}")

# Print feature importances
feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': best_rf.feature_importances_}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importances from Best Random Forest Model:")
print(feature_importance.head(10))  # Show features in order of importance

Best Random Forest Model: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}
Best CV score (RMSE): 0.03499392832061003
Cross-validation RMSE: -0.0350

Feature Importances from Best Random Forest Model:
                     Feature  Importance
0                       sqft    0.259722
1                      baths    0.171954
3                      Stdev    0.145723
4                       Mean    0.130743
7     cost_of_living_housing    0.091603
6                     garage    0.077256
8  total_population_category    0.041568
2       price_reduced_amount    0.039253
9     cost_of_living_grocery    0.032161
5                 waterfront    0.010019


In [97]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge

# Define parameter grid
param_grid = {'alpha': [0.1, 1, 10, 100, 1000]}  # values to test for Alpha

# For Lasso regression
lasso = Lasso(max_iter=10000)
lasso_grid_search = GridSearchCV(
    lasso, param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose=1
)
lasso_grid_search.fit(X_train, y_train)
print(f"Best alpha for Lasso: {lasso_grid_search.best_params_}, Best RMSE: {-lasso_grid_search.best_score_:.4f}")

# For Ridge regression
ridge = Ridge(max_iter=10000)
ridge_grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose=1)
ridge_grid_search.fit(X_train, y_train)
print(f"Best alpha for Ridge: {ridge_grid_search.best_params_}, Best RMSE: {-ridge_grid_search.best_score_:.4f}")

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best alpha for Lasso: {'alpha': 0.1}, Best RMSE: 0.0538
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best alpha for Ridge: {'alpha': 0.1}, Best RMSE: 0.0378


In [99]:
from sklearn.tree import DecisionTreeRegressor

# Initialize model
dt_model = DecisionTreeRegressor(random_state=10)
dt_model.fit(X_train, y_train)

# Make predictions and calculate metrics
y_pred = dt_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Print results
print("Decision Tree Regression:")
print(f"MAE: {mae}, MSE: {mse}, R2: {r2}, RMSE: {rmse}")

Decision Tree Regression:
MAE: 0.03246852538901924, MSE: 0.0023448147307753952, R2: 0.34316493458291075, RMSE: 0.04842328707115405


In [101]:
from sklearn.neighbors import KNeighborsRegressor

# Initialize model
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)

# Calculate initial Score
y_pred = knn_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Print results
print("K-Nearest Neighbors Regression:")
print(f"MAE: {mae}, MSE: {mse}, R2: {r2}, RMSE: {rmse}")

K-Nearest Neighbors Regression:
MAE: 0.030211400238612074, MSE: 0.002094152081288033, R2: 0.4133811506500997, RMSE: 0.04576190644289235


Consider what metrics you want to use to evaluate success.
- If you think about mean squared error, can we actually relate to the amount of error?
- Try root mean squared error so that error is closer to the original units (dollars)
- What does RMSE do to outliers?
- Is mean absolute error a good metric for this problem?
- What about R^2? Adjusted R^2?
- Briefly describe your reasons for picking the metrics you use

In [133]:
# gather evaluation metrics and compare results
# Combine model information
models = [lr_model, lasso_model, ridge_model, svr_model_rbf, svr_model_linear, rf_model, xgb_model, knn_model, dt_model]
model_names = ['Linear Regression', 'Lasso Regression', 'Ridge Regression','Support Vector Regression (RBF)','Support Vector Regression (Linear)', 'Random Forest Regression', 'XGBoost Regression', 'Nearest Neighbour', 'Decision Tree']
model_scores = []

# model scores
for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)            
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mse)  # Calculate RMSE
    model_scores.append((name, mae, mse, r2, rmse))

# Print results for each model
for name, mae, mse, r2, rmse in model_scores:
    print(f"Model: {name}")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    print(f"RMSE: {rmse}")
    print() #Empty line between

Model: Linear Regression
Mean Absolute Error: 0.02691310537665663
Mean Squared Error: 0.001887620875494322
R-squared: 0.47123516200873816
RMSE: 0.04344675909080356

Model: Lasso Regression
Mean Absolute Error: 0.04113705463015752
Mean Squared Error: 0.0036161343869070984
R-squared: -0.012960143676538971
RMSE: 0.06013430291362076

Model: Ridge Regression
Mean Absolute Error: 0.027960560005742426
Mean Squared Error: 0.002135299614376401
R-squared: 0.40185480605957735
RMSE: 0.04620930224939997

Model: Support Vector Regression (RBF)
Mean Absolute Error: 0.04039576579973451
Mean Squared Error: 0.003321770827890505
R-squared: 0.06949767484762792
RMSE: 0.057634805698384244

Model: Support Vector Regression (Linear)
Mean Absolute Error: 2167.9195644615397
Mean Squared Error: 9692908.337030414
R-squared: -2715200478.625897
RMSE: 3113.3435944383677

Model: Random Forest Regression
Mean Absolute Error: 0.024046381151803
Mean Squared Error: 0.001481881285774493
R-squared: 0.584891898491197
RMSE: 

We wanted to compare against each evaluation metric. 

Mean Squared Error is probably the most volatile metric for this model because of the large differences in sold prices. 

The preference would go to R2 and RMSE. 

Because of that, Random Forest and XGBoost appear to be the most reliable models at this point with their nearly identical scores for RMSE and R2.

## Feature Selection - STRETCH

> **This step doesn't need to be part of your Minimum Viable Product (MVP), but its recommended you complete it if you have time!**

Even with all the preprocessing we did in Notebook 1, you probably still have a lot of features. Are they all important for prediction?

Investigate some feature selection algorithms (Lasso, RFE, Forward/Backward Selection)
- Perform feature selection to get a reduced subset of your original features
- Refit your models with this reduced dimensionality - how does performance change on your chosen metrics?
- Based on this, should you include feature selection in your final pipeline? Explain

Remember, feature selection often doesn't directly improve performance, but if performance remains the same, a simpler model is often preferrable. 



In [None]:
# perform feature selection 
# refit models
# gather evaluation metrics and compare to the previous step (full feature set)

In [111]:
# Moving forward to test Linear Regression and Random Forest further
# Determine features to drop, if any from RandomForest

# Get feature importances from Random Forest
rf_importance = rf_model.feature_importances_

# Convert to DataFrame
rf_feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_importance
}).sort_values(by='Importance', ascending=True)  # Sort least important first

# Print least important features
print("Least Important Features (Random Forest):")
print(rf_feature_importance.head(5))  # Change to see more/less

🔹 Least Important Features (Random Forest):
                     Feature  Importance
9     cost_of_living_grocery    0.008297
5                 waterfront    0.008726
8  total_population_category    0.036647
2       price_reduced_amount    0.038106
6                     garage    0.046652


In [114]:
# Determine features to drop, if any from LinearRegression

# Get absolute values of coefficients
lr_coefficients = np.abs(lr_model.coef_)

# Convert to DataFrame
lr_feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': lr_coefficients
}).sort_values(by='Coefficient', ascending=True)  # Sort least important first

# Print least important features
print("Least Important Features (Linear Regression):")
print(lr_feature_importance.head(5))

🔹 Least Important Features (Linear Regression):
                  Feature   Coefficient
4                    Mean  2.933398e-07
3                   Stdev  3.250289e-07
2    price_reduced_amount  1.437399e-03
6                  garage  6.018279e-03
9  cost_of_living_grocery  1.109896e-02


In [143]:
# drop 2 features
drop_features_v1 = ['price_reduced_amount', 'garage']

# Update train, test
X_train_reduced_v1 = X_train.drop(columns=drop_features_v1)
X_test_reduced_v1 = X_test.drop(columns=drop_features_v1)

In [164]:
# Train new models for reduced_v1
lr_model_reduced_v1 = LinearRegression()
rf_model_reduced_v1 = RandomForestRegressor()

lr_model_reduced_v1.fit(X_train_reduced_v1, y_train)
rf_model_reduced_v1.fit(X_train_reduced_v1, y_train)

# Generate predictions for the reduced_v1 feature set
y_pred_lr_reduced_v1 = lr_model_reduced_v1.predict(X_test_reduced_v1)
y_pred_rf_reduced_v1 = rf_model_reduced_v1.predict(X_test_reduced_v1)

# Compute metrics for reduced_v1 models
r2_lr_reduced_v1 = r2_score(y_test, y_pred_lr_reduced_v1)
rmse_lr_reduced_v1 = np.sqrt(mean_squared_error(y_test, y_pred_lr_reduced_v1))

r2_rf_reduced_v1 = r2_score(y_test, y_pred_rf_reduced_v1)
rmse_rf_reduced_v1 = np.sqrt(mean_squared_error(y_test, y_pred_rf_reduced_v1))

# Extract previous R² and RMSE for Linear Regression and Random Forest
r2_lr_original = next(r2 for name, _, _, r2, _ in model_scores if name == "Linear Regression")
r2_rf_original = next(r2 for name, _, _, r2, _ in model_scores if name == "Random Forest Regression")

rmse_lr_original = next(rmse for name, _, _, _, rmse in model_scores if name == "Linear Regression")
rmse_rf_original = next(rmse for name, _, _, _, rmse in model_scores if name == "Random Forest Regression")

# Print side-by-side comparison
print("\nModel Performance Comparison\n")

print("Linear Regression:")
print(f"R²: {r2_lr_original:.4f}  |  R² Reduced_v1: {r2_lr_reduced_v1:.4f}")
print(f"RMSE: {rmse_lr_original:.4f}  |  RMSE Reduced_v1: {rmse_lr_reduced_v1:.4f}\n")

print("Random Forest:")
print(f"R²: {r2_rf_original:.4f}  |  R² Reduced_v1: {r2_rf_reduced_v1:.4f}")
print(f"RMSE: {rmse_rf_original:.4f}  |  RMSE Reduced_v1: {rmse_rf_reduced_v1:.4f}")


Model Performance Comparison

Linear Regression:
R²: 0.4712  |  R² Reduced_v1: 0.4346
RMSE: 0.0434  |  RMSE Reduced_v1: 0.0449

Random Forest:
R²: 0.5849  |  R² Reduced_v1: 0.5531
RMSE: 0.0385  |  RMSE Reduced_v1: 0.0399


In [141]:
# drop 1 feature
drop_features_v2 = ['price_reduced_amount']

# Update train, test
X_train_reduced_v2 = X_train.drop(columns=drop_features_v2)
X_test_reduced_v2 = X_test.drop(columns=drop_features_v2)

In [162]:
# Train new models for reduced_v2
lr_model_reduced_v2 = LinearRegression()
rf_model_reduced_v2 = RandomForestRegressor()

lr_model_reduced_v2.fit(X_train_reduced_v2, y_train)
rf_model_reduced_v2.fit(X_train_reduced_v2, y_train)

# Generate predictions for the reduced_v2 feature set
y_pred_lr_reduced_v2 = lr_model_reduced_v2.predict(X_test_reduced_v2)
y_pred_rf_reduced_v2 = rf_model_reduced_v2.predict(X_test_reduced_v2)

# Compute metrics for reduced_v2 models
r2_lr_reduced_v2 = r2_score(y_test, y_pred_lr_reduced_v2)
rmse_lr_reduced_v2 = np.sqrt(mean_squared_error(y_test, y_pred_lr_reduced_v2))

r2_rf_reduced_v2 = r2_score(y_test, y_pred_rf_reduced_v2)
rmse_rf_reduced_v2 = np.sqrt(mean_squared_error(y_test, y_pred_rf_reduced_v2))

# Extract previous R² and RMSE for Linear Regression and Random Forest
r2_lr_original = next(r2 for name, _, _, r2, _ in model_scores if name == "Linear Regression")
r2_rf_original = next(r2 for name, _, _, r2, _ in model_scores if name == "Random Forest Regression")

rmse_lr_original = next(rmse for name, _, _, _, rmse in model_scores if name == "Linear Regression")
rmse_rf_original = next(rmse for name, _, _, _, rmse in model_scores if name == "Random Forest Regression")

# Print side-by-side comparison
print("\nModel Performance Comparison\n")

print("Linear Regression:")
print(f"R²: {r2_lr_original:.4f}  |  R² Reduced_v1: {r2_lr_reduced_v1:.4f}  |  R² Reduced_v2: {r2_lr_reduced_v2:.4f}")
print(f"RMSE: {rmse_lr_original:.4f}  |  RMSE Reduced_v1: {rmse_lr_reduced_v1:.4f}  |  RMSE Reduced_v2: {rmse_lr_reduced_v2:.4f}\n")

print("Random Forest:")
print(f"R²: {r2_rf_original:.4f}  |  R² Reduced_v1: {r2_rf_reduced_v1:.4f}  |  R² Reduced_v2: {r2_rf_reduced_v2:.4f}")
print(f"RMSE: {rmse_rf_original:.4f}  |  RMSE Reduced_v1: {rmse_rf_reduced_v1:.4f}  |  RMSE Reduced_v2: {rmse_rf_reduced_v2:.4f}")


Model Performance Comparison

Linear Regression:
R²: 0.4712  |  R² Reduced_v1: 0.4346  |  R² Reduced_v2: 0.4686
RMSE: 0.0434  |  RMSE Reduced_v1: 0.0449  |  RMSE Reduced_v2: 0.0436

Random Forest:
R²: 0.5849  |  R² Reduced_v1: 0.5437  |  R² Reduced_v2: 0.5756
RMSE: 0.0385  |  RMSE Reduced_v1: 0.0404  |  RMSE Reduced_v2: 0.0389


In [160]:
# Make a copy of the original feature set to avoid overwriting
X_train_copy_rfe = X_train.copy()
X_test_copy_rfe = X_test.copy()

# Apply RFE to select top 5 features using Random Forest
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=5)
X_train_rfe = rfe.fit_transform(X_train_copy_rfe, y_train)
X_test_rfe = rfe.transform(X_test_copy_rfe)

# Get selected feature names
selected_features = X_train.columns[rfe.support_]
print("Selected Features After RFE:", list(selected_features))
print(f"Number of Features Selected: {X_train_rfe.shape[1]}")

# Train new models on RFE-selected features
lr_model_rfe = LinearRegression()
rf_model_rfe = RandomForestRegressor()

lr_model_rfe.fit(X_train_rfe, y_train)
rf_model_rfe.fit(X_train_rfe, y_train)

# Generate predictions for RFE-selected features
y_pred_lr_rfe = lr_model_rfe.predict(X_test_rfe)
y_pred_rf_rfe = rf_model_rfe.predict(X_test_rfe)

# Compute performance metrics
r2_lr_rfe = r2_score(y_test, y_pred_lr_rfe)
rmse_lr_rfe = np.sqrt(mean_squared_error(y_test, y_pred_lr_rfe))

r2_rf_rfe = r2_score(y_test, y_pred_rf_rfe)
rmse_rf_rfe = np.sqrt(mean_squared_error(y_test, y_pred_rf_rfe))

# Print model performance after RFE
print("\nModel Performance with RFE-Selected Features\n")

print("Linear Regression:")
print(f"R²: {r2_lr_rfe:.4f}")
print(f"RMSE: {rmse_lr_rfe:.4f}\n")

print("Random Forest:")
print(f"R²: {r2_rf_rfe:.4f}")
print(f"RMSE: {rmse_rf_rfe:.4f}")

Selected Features After RFE: ['sqft', 'baths', 'Stdev', 'Mean', 'cost_of_living_housing']
Number of Features Selected: 5

Model Performance with RFE-Selected Features

Linear Regression:
R²: 0.4045
RMSE: 0.0461

Random Forest:
R²: 0.5216
RMSE: 0.0413


In [158]:
# Make a copy of the original feature set to avoid overwriting
X_train_copy_rfe_lr = X_train.copy()
X_test_copy_rfe_lr = X_test.copy()

# Apply RFE to select top 5 features using Linear Regression
rfe = RFE(estimator=LinearRegression(), n_features_to_select=5)
X_train_rfe_lr = rfe.fit_transform(X_train_copy_rfe_lr, y_train)
X_test_rfe_lr = rfe.transform(X_test_copy_rfe_lr)

# Get selected feature names
selected_features = X_train.columns[rfe.support_]
print("Selected Features After RFE:", list(selected_features))
print(f"Number of Features Selected: {X_train_rfe_lr.shape[1]}")

# Train new models on RFE-selected features
lr_model_rfe_lr = LinearRegression()
rf_model_rfe_lr = RandomForestRegressor()

lr_model_rfe_lr.fit(X_train_rfe_lr, y_train)
rf_model_rfe_lr.fit(X_train_rfe_lr, y_train)

# Generate predictions for RFE-selected features
y_pred_lr_rfe_lr = lr_model_rfe_lr.predict(X_test_rfe_lr)
y_pred_rf_rfe_lr = rf_model_rfe_lr.predict(X_test_rfe_lr)

# Compute performance metrics
r2_lr_rfe_lr = r2_score(y_test, y_pred_lr_rfe_lr)
rmse_lr_rfe_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr_rfe_lr))

r2_rf_rfe_lr = r2_score(y_test, y_pred_rf_rfe_lr)
rmse_rf_rfe_lr = np.sqrt(mean_squared_error(y_test, y_pred_rf_rfe_lr))

# Print model performance after RFE
print("\nModel Performance with RFE_LR-Selected Features\n")

print("Linear Regression:")
print(f"R²: {r2_lr_rfe_lr:.4f}")
print(f"RMSE: {rmse_lr_rfe_lr:.4f}\n")

print("Random Forest:")
print(f"R²: {r2_rf_rfe_lr:.4f}")
print(f"RMSE: {rmse_rf_rfe_lr:.4f}")

Selected Features After RFE: ['sqft', 'baths', 'waterfront', 'cost_of_living_housing', 'total_population_category']
Number of Features Selected: 5

Model Performance with RFE_LR-Selected Features

Linear Regression:
R²: 0.3878
RMSE: 0.0467

Random Forest:
R²: 0.3005
RMSE: 0.0500


The Random Forest using sqft, vaths, Stdev, Mean, cost_of_living_housing as our features has produced the most efficient and effective model according to RFE. Removing features made very little impact on the performance of the model, it simply improved simplicity.