In [2]:
# 1. Setup and Load Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read the dataset CSV from its local path into a DataFrame
df = pd.read_csv(
    r"C:\Users\hp\Desktop\ml learning\ds-ml-bootcamp\dataset\clean_house_l5_dataset.csv"
)

# 2. Prepare Features & Target
# Inputs are every column except the target 'Price' and 'LogPrice'
# NOTE(review): 'LogPrice' is presumably derived from 'Price' and excluded to
# avoid leaking the target into the features — confirm against the dataset.
X = df.drop(['Price', 'LogPrice'], axis='columns')
y = df['Price']

# 3. Split Data: hold out 20% of the rows for testing; the fixed seed keeps
# the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Helper Function to Evaluate Performance
def show_scores(name, model, X=None, y=None):
    """Print R², MAE and RMSE for a fitted regressor on an evaluation set.

    Parameters
    ----------
    name : str
        Label printed as the heading of the report.
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : optional
        Feature rows to evaluate on. Defaults to the module-level ``X_test``.
    y : optional
        True target values aligned with ``X``. Defaults to the module-level
        ``y_test``.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    ValueError
        If only one of ``X`` / ``y`` is supplied, since pairing caller data
        with the default hold-out split would score mismatched rows.
    """
    if (X is None) != (y is None):
        raise ValueError("Pass both X and y, or neither to use the default test split.")

    # Fall back to the module-level hold-out split so the original
    # two-argument calls keep producing the same report.
    features = X_test if X is None else X
    targets = y_test if y is None else y

    predictions = model.predict(features)
    mae = mean_absolute_error(targets, predictions)
    mse = mean_squared_error(targets, predictions)
    rmse = np.sqrt(mse)  # RMSE is in the same units as the target, unlike MSE
    r2 = r2_score(targets, predictions)

    print(f"{name} Performance:")
    print(f"  R²   : {r2:.2f}")
    print(f"  MAE  : {mae:,.0f}")
    print(f"  RMSE : {rmse:,.0f}")
    print("-" * 30)

# 5. Train Models
# Both estimators are trained on the same training split. fit() returns the
# estimator itself, so construction and training are chained.
# Model 1: Linear Regression
lr_model = LinearRegression().fit(X_train, y_train)

# Model 2: Random Forest (100 trees, fixed seed)
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# 6. Print Results
show_scores("Linear Regression", lr_model)
show_scores("Random Forest", rf_model)

# 7. Single-row Sanity Check
# The double brackets keep the first test row as a one-row DataFrame rather
# than a Series before it is handed to predict()
row_to_check = X_test.iloc[[0]]
actual_price = y_test.iloc[0]

# predict() returns an array; index 0 is the single prediction for that row
rf_guess = rf_model.predict(row_to_check)[0]
lr_guess = lr_model.predict(row_to_check)[0]

print(f"Actual Price: ${actual_price:,.0f}")
print(f"Linear Regression Predicted: ${lr_guess:,.0f}")
print(f"Random Forest Predicted: ${rf_guess:,.0f}")

Linear Regression Performance:
  R²   : 0.85
  MAE  : 63,086
  RMSE : 75,624
------------------------------
Random Forest Performance:
  R²   : 0.86
  MAE  : 52,524
  RMSE : 72,686
------------------------------
Actual Price: $642,500
Linear Regression Predicted: $656,755
Random Forest Predicted: $789,031
