In [11]:
import numpy as np
import pandas as pd
import joblib

# Loading preprocessed data
X_scaled = np.load("../../outputs/X_processed.npy")
y = pd.read_csv("../../outputs/y.csv")["SalePrice"].values

In [13]:
# --- Split once and reuse for all models ---
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Model Dictionary ---
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

# --- Train & Evaluate (Non-SVR models) ---
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)    

    rmse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({"Model": name, "MAE": mae, "RMSE": rmse, "R² Score": r2})

# --- SVR with y-scaling ---
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
y_test_scaled = y_scaler.transform(y_test.reshape(-1, 1)).ravel()

svr = SVR(kernel='rbf')
svr.fit(X_train, y_train_scaled)

y_pred_scaled = svr.predict(X_test)
y_pred_svr = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

mae = mean_absolute_error(y_test, y_pred_svr)
rmse = mean_squared_error(y_test, y_pred_svr)
r2 = r2_score(y_test, y_pred_svr)

results.append({"Model": "SVR (scaled y)", "MAE": mae, "RMSE": rmse, "R² Score": r2})

# Display results
results_df = pd.DataFrame(results, columns=['Model', 'MAE', 'RMSE', 'R² Score'])
print(results_df.sort_values(by='R² Score', ascending=False))

               Model           MAE          RMSE      R² Score
3  Gradient Boosting  7.146089e+02  1.459640e+06  9.997358e-01
2      Random Forest  4.146981e+02  1.746532e+06  9.996838e-01
1      Decision Tree  5.785822e+02  3.411796e+06  9.993823e-01
4     SVR (scaled y)  1.024332e+04  2.447455e+08  9.556920e-01
0  Linear Regression  1.194127e+13  1.640156e+28 -2.969291e+18
