In [1]:
import pandas as pd

# Read the real-estate spreadsheet from ../data, explicitly selecting the
# openpyxl reader for the .xlsx file.
df = pd.read_excel(
    io="../data/real_estate_taiwan.xlsx",
    engine="openpyxl",
)

# Preview the first five rows as a quick sanity check of the load.
df.head(5)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [2]:
# List the exact column labels; they are long and are needed verbatim for selection.
print(list(df.columns))

['No', 'X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'X5 latitude', 'X6 longitude', 'Y house price of unit area']


In [3]:
# Keep just the two predictors used by this model, plus the price target.
X = df.loc[:, ["X2 house age", "X3 distance to the nearest MRT station"]]
y = df.loc[:, "Y house price of unit area"]

# Preview the first rows of the features and of the target together.
(X.head(), y.head())

(   X2 house age  X3 distance to the nearest MRT station
 0          32.0                                84.87882
 1          19.5                               306.59470
 2          13.3                               561.98450
 3          13.3                               561.98450
 4           5.0                               390.56840,
 0    37.9
 1    42.2
 2    47.3
 3    54.8
 4    43.1
 Name: Y house price of unit area, dtype: float64)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, so nothing about
# the test split leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares regression on the standardized training features.
# fit() returns the estimator itself, so the chained call leaves `model` fitted.
model = LinearRegression().fit(X_train_scaled, y_train)

# Display the fitted estimator as the cell output.
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [7]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Predict on both splits using the standardized features the model was fitted on.
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

# Coefficient of determination for each split.
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

# Root-mean-squared error for each split (square root of the MSE).
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("Training R2:", train_r2)
print("Testing  R2:", test_r2)

print("Training RMSE:", train_rmse)
print("Testing  RMSE:", test_rmse)

Training R2: 0.4900402076228402
Testing  R2: 0.4850034956707202
Training RMSE: 9.88312816771585
Testing  RMSE: 9.280105077985796


In [3]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import numpy as np

# NOTE: this cell depends on names created by earlier cells (pd, X, X_train,
# X_test, y_train, y_test, scaler, model). Run the notebook top to bottom on a
# fresh kernel; executing this cell on its own raises NameError.

# Column labels used repeatedly below.
age_col = "X2 house age"
mrt_col = "X3 distance to the nearest MRT station"

# Create grid for the surface plot: 40 evenly spaced values per feature,
# spanning the observed range of each one.
ha_range = np.linspace(X[age_col].min(), X[age_col].max(), 40)
mrt_range = np.linspace(X[mrt_col].min(), X[mrt_col].max(), 40)

ha_grid, mrt_grid = np.meshgrid(ha_range, mrt_range)

# Combine points for prediction: one row per grid node, columns in feature order.
grid_points = np.column_stack([ha_grid.ravel(), mrt_grid.ravel()])

# The scaler was fitted on a DataFrame, so hand it a DataFrame carrying the same
# column names; a bare ndarray triggers scikit-learn's feature-name warning.
grid_frame = pd.DataFrame(grid_points, columns=[age_col, mrt_col])
grid_points_scaled = scaler.transform(grid_frame)

# Predict a price for every grid node and fold the result back into grid shape.
price_pred_grid = model.predict(grid_points_scaled).reshape(ha_grid.shape)

# Plotting
fig = plt.figure(figsize=(11, 8))
ax = fig.add_subplot(111, projection='3d')

# Surface (semi-transparent so the scatter points stay visible through it)
ax.plot_surface(ha_grid, mrt_grid, price_pred_grid, cmap=cm.viridis, alpha=0.6)

# Training scatter
ax.scatter(X_train[age_col], X_train[mrt_col], y_train, c='blue', label='Training')

# Test scatter
ax.scatter(X_test[age_col], X_test[mrt_col], y_test, c='red', label='Test')

ax.set_xlabel("House Age (years)")
ax.set_ylabel("Distance to MRT (meters)")
# NOTE(review): the UCI dataset description gives the target in units of
# 10,000 NT$ per ping -- confirm this file has not been rescaled.
ax.set_zlabel("Price (10,000 NT$ per ping)")
ax.set_title("House Price Prediction Surface")

# Attach the legend to this axes explicitly rather than the implicit current axes.
ax.legend()
plt.show()

NameError: name 'X' is not defined