# Load packages

In [2]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Read data

In [3]:
# Relative path of the CSV file that holds the dataset.
CSV_PATH = 'clean_house_l5_dataset.csv'

# Load the file into a DataFrame, then preview its first ten rows.
df = pd.read_csv(CSV_PATH)
print(df.head(n=10))

   Size_sqft  Bedrooms  Bathrooms  YearBuilt     Price  Location_City  \
0   1.030281 -1.463643   0.088986  -1.279342  812100.0              1   
1  -0.482463 -1.463643   1.347506   1.326476  547000.0              1   
2   0.468877  0.007430  -1.169534  -1.339942  693700.0              1   
3   1.079817  0.742966   1.347506  -0.915740  848300.0              1   
4   0.788954  1.478502  -1.169534   0.962873  806000.0              0   
5   0.881674  0.007430  -1.169534  -1.339942  724400.0              1   
6   1.280500  1.478502   0.088986  -0.612737  715100.0              0   
7  -0.266538  0.742966  -1.169534  -0.733938  547900.0              0   
8  -0.802540 -0.728107   0.088986  -1.339942  516700.0              0   
9  -1.263604  0.742966   1.347506  -1.036940  418600.0              1   

   Location_Rural  Location_Suburb  HouseAge  Rooms_per_1000sqft  \
0               0                0  1.279342           -1.061465   
1               0                0 -1.326476           -0.26

## Features & Target

In [4]:
# Target: the 'Price' column.
y = df['Price']

# Features: every remaining column except 'Price' and 'LogPrice'.
# NOTE(review): 'LogPrice' is presumably a log transform of 'Price', so keeping
# it would leak the target into the features — confirm against the dataset.
x = df.drop(['Price', 'LogPrice'], axis=1)

## Split Data

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Train Models

### LinearRegression

In [12]:
# Fit a linear regression on the training split (fit() returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows and preview the first ten predictions.
lr_pred = lr.predict(X_test)
print(lr_pred[:10])

[656754.66720779 822634.7173445  188637.49473079 594040.9378655
 609615.22485083 411139.21882379 444365.96225017 727107.31998415
 718486.32165982 825315.58832644]


### RandomForestRegressor

In [13]:
# 100-tree random forest; the fixed seed makes the fitted model reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out test rows.
rf_pred = rf.predict(X_test)

## Evaluate Performance

In [14]:
def print_metrics(model_name, y, y_predict):
    """Print a short regression report for one model.

    Computes R2, MAE, MSE and RMSE (the square root of MSE) from the true
    and predicted targets and prints each with three decimals, followed by
    a dashed separator line.

    Args:
        model_name: Label shown on the first line of the report.
        y: True target values.
        y_predict: Predicted target values, aligned with ``y``.

    Returns:
        None. The report is written to standard output.
    """
    r_squared = r2_score(y, y_predict)
    abs_err = mean_absolute_error(y, y_predict)
    sq_err = mean_squared_error(y, y_predict)
    root_sq_err = np.sqrt(sq_err)

    # Assemble the whole report first, then emit it in a single write.
    report = [
        f"Model: {model_name}",
        f"R2: {r_squared:.3f}",
        f"MAE: {abs_err:.3f}",
        f"MSE: {sq_err:.3f}",
        f"RMSE: {root_sq_err:.3f}",
        "----------------------------",
    ]
    print("\n".join(report))


In [15]:
# Report both models against the same held-out targets, in a fixed order.
for model_name, predictions in (
    ("Linear Regression", lr_pred),
    ("Random Forest", rf_pred),
):
    print_metrics(model_name, y_test, predictions)

Model: Linear Regression
R2: 0.848
MAE: 63085.838
MSE: 5718940940.597
RMSE: 75623.680
----------------------------
Model: Random Forest
R2: 0.859
MAE: 52523.850
MSE: 5283317454.950
RMSE: 72686.432
----------------------------


## Single-row Sanity Check

In [18]:
# First test row, kept as a one-row DataFrame (double brackets) so that
# predict() receives 2-D input.
row = X_test.iloc[[0]]

# True price for that same row.
actual_price = y_test.iloc[0]

# Each model predicts the single row; [0] unwraps the one-element result.
lr_price, rf_price = (model.predict(row)[0] for model in (lr, rf))

# Show the true value next to both predictions.
print(f"Actual Price: {actual_price}")
print(f"Linear Regression Prediction: {lr_price:.2f}")
print(f"Random Forest Prediction: {rf_price:.2f}")


Actual Price: 642500.0
Linear Regression Prediction: 656754.67
Random Forest Prediction: 789031.00
