In [12]:
import pandas as pd
# Read the real-estate transactions CSV (path is relative to the working
# directory) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='Real_Estate.csv')
data.head(n=5)

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725
2,2012-09-05 01:10:52.349449,1.1,1978.671,10,25.00385,121.528336,26.694267
3,2012-09-05 13:26:01.189083,22.2,1055.067,5,24.962887,121.482178,38.091638
4,2012-09-06 08:29:47.910523,8.5,967.4,6,25.011037,121.479946,21.65471


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import datetime


In [13]:
# Parse the "Transaction date" column into pandas datetime values so the
# `.dt` accessor can be used on it (year/month are extracted in a later cell).
data['Transaction date'] = pd.to_datetime(arg=data['Transaction date'])

In [6]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725
2,2012-09-05 01:10:52.349449,1.1,1978.671,10,25.00385,121.528336,26.694267
3,2012-09-05 13:26:01.189083,22.2,1055.067,5,24.962887,121.482178,38.091638
4,2012-09-06 08:29:47.910523,8.5,967.4,6,25.011037,121.479946,21.65471


In [14]:
# Derive calendar features from the transaction timestamp.
# Relies on "Transaction date" already being a datetime column (the `.dt`
# accessor is not available otherwise).
data = data.assign(**{
    'Transaction Year': data['Transaction date'].dt.year,
    'Transaction month': data['Transaction date'].dt.month,
})
data.head()

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area,Transaction Year,Transaction month
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673,2012,9
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725,2012,9
2,2012-09-05 01:10:52.349449,1.1,1978.671,10,25.00385,121.528336,26.694267,2012,9
3,2012-09-05 13:26:01.189083,22.2,1055.067,5,24.962887,121.482178,38.091638,2012,9
4,2012-09-06 08:29:47.910523,8.5,967.4,6,25.011037,121.479946,21.65471,2012,9


In [18]:
# Drop the raw timestamp column; its year and month were extracted as features.
# errors='ignore' keeps this cell re-runnable: `data` is rebound here, so without
# it a second execution would raise KeyError because the column is already gone.
data = data.drop(columns=['Transaction date'], errors='ignore')

# Features are every remaining column except the target; the target is the
# unit-area house price.
X = data.drop(columns=['House price of unit area'])
y = data['House price of unit area']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Standardise the features: the scaler is fitted on the training split only,
# and that same fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

X_train_scaled.shape

(331, 7)

Model Training and Comparison
Now, we’ll proceed with training multiple models and comparing their performance. We’ll start with a few commonly used models for regression tasks:

Linear Regression: A good baseline model for regression tasks.

Decision Tree Regressor: To see how a simple tree-based model performs.

Random Forest Regressor: An ensemble method to improve upon the decision tree’s performance.

Gradient Boosting Regressor: Another powerful ensemble method for regression.

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [21]:
# Candidate regressors keyed by display name; insertion order is the order in
# which they are later trained and reported. The estimators that accept a
# random_state are seeded with 42.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Decision Tree", DecisionTreeRegressor(random_state=42)),
        ("Random Forest", RandomForestRegressor(random_state=42)),
        ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ]
)

In [22]:
# Per-model evaluation metrics, keyed by model name.
results = {}

for name, model in models.items():
    # Fit on the scaled training split, then predict the held-out split.
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)

    # Score the held-out predictions: mean absolute error and R².
    mae, r2 = (
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    )
    results[name] = {"MAE": mae, "R²": r2}

# One row per model, one column per metric.
results_df = pd.DataFrame.from_dict(results, orient="index")
print(results_df)

                         MAE        R²
Linear Regression   9.748246  0.529615
Decision Tree      11.760342  0.204962
Random Forest       9.887601  0.509547
Gradient Boosting  10.000117  0.476071


In [23]:
# Show each display name next to the estimator it maps to.
for name in models:
  model = models[name]
  print(name, model)

Linear Regression LinearRegression()
Decision Tree DecisionTreeRegressor(random_state=42)
Random Forest RandomForestRegressor(random_state=42)
Gradient Boosting GradientBoostingRegressor(random_state=42)


In [25]:
# Iterating a dict yields its keys, so enumerate(models) produces
# (position, model name) pairs, not (name, estimator) pairs.
# The loop variables are named for what they hold, which also avoids
# overwriting the kernel-level `name`/`model` variables with an int and a str.
for position, model_name in enumerate(models):
  print(position, model_name)

0 Linear Regression
1 Decision Tree
2 Random Forest
3 Gradient Boosting
