In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

import pandas as pd
from sklearn.neighbors import BallTree

from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [2]:
# Folder that holds every input CSV, relative to the notebook's working directory.
base = "Data"

# Resale transactions: labelled training rows and the rows to predict.
df_train = pd.read_csv(f"{base}/train.csv")
df_test = pd.read_csv(f"{base}/test.csv")

# Auxiliary tables: HDB block details plus one table per amenity type.
df_hdb = pd.read_csv(f"{base}/auxiliary-data/sg-hdb-block-details.csv")
df_mrt = pd.read_csv(f"{base}/auxiliary-data/sg-mrt-stations.csv")
df_shopping = pd.read_csv(f"{base}/auxiliary-data/sg-shopping-malls.csv")
df_hawker = pd.read_csv(f"{base}/auxiliary-data/sg-gov-hawkers.csv")
df_primary_schools = pd.read_csv(f"{base}/auxiliary-data/sg-primary-schools.csv")
df_secondary_schools = pd.read_csv(f"{base}/auxiliary-data/sg-secondary-schools.csv")

# Data Cleaning

In [3]:
def clean_train_test_dataframe(df, is_test=False):
    """Normalise a raw train/test transaction frame and derive basic features.

    The input frame is left untouched; all work happens on a copy so the cell
    that calls this function can be re-run against the same raw frame.

    Steps performed:
      * lower-case/strip ``BLOCK`` and ``STREET`` and join them into ``ADDRESS``
      * normalise ``FLAT_TYPE`` (hyphens become spaces, stripped, lower-cased)
      * split the ``"YYYY-MM"`` string in ``MONTH`` into integer ``YEAR`` and
        integer ``MONTH``
      * ``FLOOR``: midpoint of the ``"<a> to <b>"`` string in ``FLOOR_RANGE``
      * ``FLAT_AGE``: ``YEAR - LEASE_COMMENCE_DATA``
      * drop ``ECO_CATEGORY``, ``FLOOR_RANGE``, ``BLOCK`` and ``STREET``

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns named above.
    is_test : bool, default False
        When False, fully duplicated rows are removed after cleaning. Test
        frames keep every row.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The index is not reset after de-duplication.
    """
    def floor_range_avg(rng):
        # "07 to 09" -> 8.0; the first token is the lower bound of the range.
        lower, upper = rng.split(" to ")
        return (int(lower) + int(upper)) / 2

    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    df['BLOCK'] = df['BLOCK'].str.strip().str.lower()
    df['STREET'] = df['STREET'].str.strip().str.lower()

    df["ADDRESS"] = df['BLOCK'] + " " + df['STREET']

    df['FLAT_TYPE'] = df['FLAT_TYPE'].str.replace('-', ' ').str.strip().str.lower()

    # Split once, then reuse: MONTH is overwritten, so YEAR must be read first.
    year_month = df['MONTH'].str.split('-')
    df['YEAR'] = year_month.str[0].astype(int)
    df['MONTH'] = year_month.str[1].astype(int)

    df["FLOOR"] = df["FLOOR_RANGE"].apply(floor_range_avg)
    df["FLAT_AGE"] = df["YEAR"] - df["LEASE_COMMENCE_DATA"]

    df = df.drop(columns=['ECO_CATEGORY', 'FLOOR_RANGE', 'BLOCK', 'STREET'])

    if not is_test:
        # Only training rows are de-duplicated; every test row needs a prediction.
        df = df.drop_duplicates()

    return df

In [4]:
def clean_df_hdb(df):
    """Build a normalised ``ADDRESS`` key for the HDB block table.

    ``BLOCK`` and ``ADDRESS`` are stripped and lower-cased, then joined as
    ``"<block> <address>"`` into ``ADDRESS``. The ``BLOCK`` column is dropped.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``BLOCK`` and ``ADDRESS``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the rewritten ``ADDRESS`` and without ``BLOCK``.
    """
    block = df["BLOCK"].str.strip().str.lower()
    street = df["ADDRESS"].str.strip().str.lower()

    # assign() returns a new frame, so the caller's columns stay as loaded.
    return df.assign(ADDRESS=block + " " + street).drop(columns="BLOCK")

def remove_near_duplicate_schools(df):
    """Return ``df`` with one row per exact (LATITUDE, LONGITUDE) pair.

    When several rows share identical coordinates, the last one is kept.
    The original index labels of the surviving rows are preserved.
    """
    coordinate_cols = ['LATITUDE', 'LONGITUDE']
    # True for every row that has a later row with the same coordinates.
    superseded = df.duplicated(subset=coordinate_cols, keep='last')
    return df[~superseded]

In [5]:
# Clean both transaction tables; only the training table is de-duplicated.
df_train = clean_train_test_dataframe(df_train, is_test=False)
df_test = clean_train_test_dataframe(df_test, is_test=True)

# Give the HDB block table the same lower-cased ADDRESS key as the transactions.
df_hdb = clean_df_hdb(df_hdb)

# Collapse school rows that share identical coordinates.
df_primary_schools, df_secondary_schools = (
    remove_near_duplicate_schools(df_primary_schools),
    remove_near_duplicate_schools(df_secondary_schools),
)

# Data Augmentation

In [6]:
def convert_to_coordinates(df, df_coords):
    """Attach ``LATITUDE``, ``LONGITUDE`` and ``MAX_FLOOR`` to ``df`` by address.

    Performs a left join on ``ADDRESS``, so every row of ``df`` is kept exactly
    once. Rows whose address is absent from ``df_coords`` get NaN in the three
    added columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ADDRESS`` column.
    df_coords : pandas.DataFrame
        Lookup frame with ``ADDRESS``, ``LATITUDE``, ``LONGITUDE`` and
        ``MAX_FLOOR`` columns. If an address appears more than once, only its
        first row is used.

    Returns
    -------
    pandas.DataFrame
        New frame with the same number of rows as ``df`` and a fresh
        ``RangeIndex`` (as produced by ``DataFrame.merge``).
    """
    # A repeated key on the right side of a left merge multiplies the matching
    # left rows; de-duplicate first so the row count of `df` is preserved.
    lookup = (
        df_coords[['ADDRESS', 'LATITUDE', 'LONGITUDE', 'MAX_FLOOR']]
        .drop_duplicates(subset='ADDRESS', keep='first')
    )
    return df.merge(lookup, on='ADDRESS', how='left')

def calculate_distance_and_metadata(feature, df_features, df_main, meta_cols=None, k=10):
    """Add nearest-amenity distance features to ``df_main`` in place.

    Builds a haversine ``BallTree`` over the amenity coordinates and, for each
    row of ``df_main``, queries its ``k`` nearest amenities.

    Columns written to ``df_main``:
      * ``DIST_AVG_<feature>``: mean great-circle distance in kilometres to
        the ``k`` nearest amenities.
      * ``<feature>_<col>`` for each ``col`` in ``meta_cols``: the value of
        ``col`` taken from the single nearest amenity.

    Parameters
    ----------
    feature : str
        Label used to build the output column names.
    df_features : pandas.DataFrame
        Amenity table with ``LATITUDE`` and ``LONGITUDE`` in degrees, plus any
        columns named in ``meta_cols``.
    df_main : pandas.DataFrame
        Frame to augment; needs ``LATITUDE`` and ``LONGITUDE`` in degrees.
        NOTE(review): rows with missing coordinates (e.g. unmatched addresses
        from a left join) are not handled here and presumably make the query
        fail -- confirm upstream that none exist.
    meta_cols : list[str] or None, default None
        Columns of ``df_features`` to copy from the nearest amenity.
    k : int, default 10
        Number of neighbours to average over. Clamped to the number of rows
        in ``df_features`` so a small amenity table does not make the query
        fail.

    Returns
    -------
    None
        ``df_main`` is modified in place.
    """
    earth_radius_km = 6371  # converts unit-sphere haversine distance to km
    coord_cols = ['LATITUDE', 'LONGITUDE']

    # The tree cannot return more neighbours than the number of indexed points.
    n_neighbours = min(k, len(df_features))

    # The haversine metric expects (lat, lon) pairs in radians.
    tree = BallTree(np.radians(df_features[coord_cols].to_numpy()), metric='haversine')
    distances, indices = tree.query(
        np.radians(df_main[coord_cols].to_numpy()), k=n_neighbours
    )

    df_main[f"DIST_AVG_{feature}"] = np.mean(distances * earth_radius_km, axis=1)

    if meta_cols:
        # Query results are sorted by distance, so column 0 is the closest amenity.
        nearest_idx = indices[:, 0]
        for col in meta_cols:
            # Select the column first to avoid copying every column per lookup.
            df_main[f"{feature}_{col}"] = df_features[col].iloc[nearest_idx].values



def augment_auxiliary_data(df, mrt, shopping, hawker, primary_schools, secondary_schools):
    """Add proximity features for every amenity table to ``df`` and return it.

    ``df`` is augmented in place by ``calculate_distance_and_metadata``; the
    same object is returned for convenience. Each entry below yields a
    ``DIST_AVG_<label>`` column, and the listed metadata columns are copied
    from the nearest amenity as ``<label>_<col>``.
    """
    # (label, amenity table, metadata columns to copy from the nearest amenity)
    amenity_specs = [
        ("MRT", mrt, ["STATUS"]),
        ("OPEN_MRT", mrt[mrt["STATUS"] == "open"], None),
        ("SHOPPING_MALL", shopping, None),
        ("HAWKER_CENTRE", hawker, ["TYPE", "OWNER", "NUMBER_OF_STALLS"]),
        ("PRIMARY_SCHOOL", primary_schools, None),
        ("SECONDARY_SCHOOL", secondary_schools, None),
    ]

    for label, amenities, extra_cols in amenity_specs:
        calculate_distance_and_metadata(label, amenities, df, meta_cols=extra_cols)

    return df


In [7]:
# Amenity tables, in the positional order augment_auxiliary_data expects.
aux_tables = (df_mrt, df_shopping, df_hawker, df_primary_schools, df_secondary_schools)

# Look up block coordinates by address, then add the amenity distance features.
df_train = augment_auxiliary_data(convert_to_coordinates(df_train, df_hdb), *aux_tables)
df_test = augment_auxiliary_data(convert_to_coordinates(df_test, df_hdb), *aux_tables)

In [8]:
# Display the augmented training frame (the rendering below is truncated by pandas).
df_train

Unnamed: 0,MONTH,TOWN,FLAT_TYPE,FLOOR_AREA_SQM,FLAT_MODEL,LEASE_COMMENCE_DATA,RESALE_PRICE,ADDRESS,YEAR,FLOOR,...,DIST_AVG_MRT,MRT_STATUS,DIST_AVG_OPEN_MRT,DIST_AVG_SHOPPING_MALL,DIST_AVG_HAWKER_CENTRE,HAWKER_CENTRE_TYPE,HAWKER_CENTRE_OWNER,HAWKER_CENTRE_NUMBER_OF_STALLS,DIST_AVG_PRIMARY_SCHOOL,DIST_AVG_SECONDARY_SCHOOL
0,10,woodlands,4 room,102.0,premium apartment,2000,420000.0,681b woodlands drive 62,2020,8.0,...,2.247771,open,2.431632,6.006075,6.870498,HC,HDB,78,1.186822,1.566378
1,7,bishan,4 room,104.0,model a,1992,585000.0,264 bishan street 24,2021,8.0,...,1.180095,planned,1.528859,3.441297,1.479107,MHC,HDB,118,1.493487,1.305563
2,5,bukit panjang,4 room,102.0,model a,1998,450000.0,520 jelapang road,2021,20.0,...,2.144211,open,2.549748,5.770026,6.471479,MHC,Government,179,1.243942,1.915541
3,8,punggol,4 room,93.0,model a,2017,465000.0,121b edgedale plains,2021,17.0,...,1.714221,open,2.079248,3.305183,6.110540,MHC,HDB,186,0.870008,1.400426
4,5,hougang,5 room,113.0,improved,2018,710000.0,997b buangkok crescent,2023,11.0,...,2.249401,open,2.709037,3.187883,3.646112,HC,Government,36,1.171904,1.590458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162459,7,hougang,4 room,91.0,new generation,1985,335000.0,708 hougang avenue 2,2017,2.0,...,1.782063,open,2.374770,3.321377,3.212259,MHC,HDB,186,1.224402,1.422697
162460,9,pasir ris,4 room,104.0,model a,1995,388000.0,634 pasir ris drive 1,2020,5.0,...,1.888235,planned,3.436926,3.279076,5.411295,MHC,HDB,144,1.687278,2.563210
162461,10,geylang,4 room,84.0,simplified,1985,373000.0,319 ubi avenue 1,2017,11.0,...,1.383391,open,1.383391,3.802703,1.780405,MHC,HDB,110,1.929194,2.239811
162462,8,sengkang,5 room,110.0,improved,2001,420000.0,290b compassvale crescent,2020,2.0,...,1.611449,open,1.820023,3.390379,5.565424,MHC,HDB,186,0.931490,1.247597


In [9]:
# Row count of the test frame after the coordinate merge and augmentation.
len(df_test)

50000

# Model Training

In [10]:
# Column the model learns to predict.
target = 'RESALE_PRICE'

# Numeric inputs. The order is kept as-is because it fixes the column order of X.
# NOTE(review): DIST_AVG_OPEN_MRT is created during augmentation but is not
# listed here -- confirm whether leaving it out is intentional.
num_features = [
    'DIST_AVG_MRT',
    'DIST_AVG_SHOPPING_MALL',
    'DIST_AVG_PRIMARY_SCHOOL',
    'DIST_AVG_HAWKER_CENTRE',
    'DIST_AVG_SECONDARY_SCHOOL',
    'HAWKER_CENTRE_NUMBER_OF_STALLS',
    'LATITUDE',
    'LONGITUDE',
    'MAX_FLOOR',
    'FLOOR',
    'FLAT_AGE',
    'YEAR',
    'FLOOR_AREA_SQM',
    'MONTH',
]

# String-valued inputs, converted to pandas categoricals below.
cat_features = [
    'HAWKER_CENTRE_OWNER',
    'FLAT_TYPE',
    'HAWKER_CENTRE_TYPE',
    'FLAT_MODEL',
    'MRT_STATUS',
    'TOWN',
]

features = cat_features + num_features

X = df_train[features].copy()
y = df_train[target].copy()

# One astype call converts every categorical column.
X = X.astype({name: "category" for name in cat_features})

# Hold out 10% of the rows as a validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [11]:
# Discrete candidate values sampled by the randomized search.
param_dist = {
    'max_depth': [ 3, 4, 5, 6, 8, 10, 12, 15],
    'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
    'colsample_bytree': [ 0.3, 0.4, 0.5, 0.7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': [1, 3, 5, 7, 9, 11],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'reg_lambda': [1, 3, 5, 7, 10],
    'reg_alpha': [0, 0.1, 0.5, 1]
}

# Base estimator: up to 2000 trees, stopped early once the eval-set RMSE has
# not improved for 100 rounds.
model = XGBRegressor(
    n_estimators=2000,
    objective='reg:squarederror',
    eval_metric='rmse',
    early_stopping_rounds=100,
    random_state=42,
    tree_method='hist',
    verbosity=0,
    enable_categorical=True
)

# 30 sampled candidates x 3 folds, ranked by negative RMSE (higher is better).
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_iter=30,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# eval_set is forwarded to every XGBRegressor.fit call and drives early
# stopping; X_val is disjoint from X_train, so it is never part of a CV fold.
search.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

print("Best parameters found:")
print(search.best_params_)
# best_score_ is already a (negated) RMSE, so only the sign is flipped here.
# Taking a square root as well would report sqrt(RMSE), not RMSE.
print("Best RMSE: ", -search.best_score_)


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.15, max_depth=4, min_child_weight=11, reg_alpha=1, reg_lambda=3, subsample=1.0; total time=  11.9s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.3, max_depth=5, min_child_weight=1, reg_alpha=1, reg_lambda=10, subsample=0.9; total time=  12.0s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.15, max_depth=4, min_child_weight=11, reg_alpha=1, reg_lambda=3, subsample=1.0; total time=  12.2s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.15, max_depth=4, min_child_weight=11, reg_alpha=1, reg_lambda=3, subsample=1.0; total time=  12.5s
[CV] END colsample_bytree=0.4, gamma=0, learning_rate=0.25, max_depth=5, min_child_weight=11, reg_alpha=0, reg_lambda=7, subsample=0.7; total time=  13.7s
[CV] END colsample_bytree=0.4, gamma=0, learning_rate=0.25, max_depth=5, min_child_weight=11, reg_alpha=0, reg_lambda=7, subsample=0.7; total time=  14.9s
[CV]



[CV] END colsample_bytree=0.4, gamma=0, learning_rate=0.3, max_depth=12, min_child_weight=1, reg_alpha=0.5, reg_lambda=10, subsample=0.8; total time=   8.1s
[CV] END colsample_bytree=0.4, gamma=0.4, learning_rate=0.05, max_depth=4, min_child_weight=1, reg_alpha=0.5, reg_lambda=7, subsample=0.7; total time=  17.6s
[CV] END colsample_bytree=0.4, gamma=0.4, learning_rate=0.05, max_depth=4, min_child_weight=1, reg_alpha=0.5, reg_lambda=7, subsample=0.7; total time=  15.3s
[CV] END colsample_bytree=0.4, gamma=0, learning_rate=0.3, max_depth=12, min_child_weight=1, reg_alpha=0.5, reg_lambda=10, subsample=0.8; total time=   9.4s
[CV] END colsample_bytree=0.4, gamma=0.4, learning_rate=0.05, max_depth=4, min_child_weight=1, reg_alpha=0.5, reg_lambda=7, subsample=0.7; total time=  15.2s
[CV] END colsample_bytree=0.3, gamma=0.2, learning_rate=0.15, max_depth=5, min_child_weight=5, reg_alpha=0, reg_lambda=3, subsample=0.9; total time=  16.8s
[CV] END colsample_bytree=0.7, gamma=0.3, learning_rate=

In [12]:
best_params = search.best_params_

# Tuned hyper-parameters on top of the fixed settings. No early stopping is
# configured, so each fold trains all 2000 trees.
model = XGBRegressor(
    n_estimators=2000,
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',
    enable_categorical=True,
    random_state=42,
    verbosity=0,
    **best_params,
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)


def _rmse(y_true, y_pred):
    """Root mean squared error of the predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# greater_is_better=False makes the scorer return the negated RMSE.
rmse_scorer = make_scorer(_rmse, greater_is_better=False)

cv_scores = cross_val_score(
    model, X, y,
    scoring=rmse_scorer,
    cv=kf,
    n_jobs=-1,
    verbose=1,
)

# Flip the sign back so the reported numbers are ordinary RMSE values.
fold_rmse = -cv_scores
mean_rmse = fold_rmse.mean()
std_rmse = cv_scores.std()

print("5-Fold Cross-Validation Results:")
print(f"RMSE scores (per fold): {fold_rmse}")
print(f"Mean RMSE: {mean_rmse:.4f}")
print(f"Std RMSE: {std_rmse:.4f}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


5-Fold Cross-Validation Results:
RMSE scores (per fold): [25213.37308793 25342.93331906 25130.31048404 25362.90972979
 25301.45129615]
Mean RMSE: 25270.1956
Std RMSE: 86.7874


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.6s finished


In [13]:
# Final estimator: tuned hyper-parameters, with early stopping enabled again.
final_model = XGBRegressor(
    n_estimators=2000,
    early_stopping_rounds=100,
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',
    enable_categorical=True,
    random_state=42,
    verbosity=1,
    **best_params,
)

# Both splits are tracked and logged every 50 rounds. validation_0 is the
# training split and validation_1 is the hold-out split.
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=50,
)

# NOTE(review): X_val is also in the eval_set used for early stopping, so the
# metrics below may be optimistic -- confirm whether a separate hold-out is wanted.
y_pred = final_model.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("Final Model Evaluation (Validation Set):")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

[0]	validation_0-rmse:176581.64048	validation_1-rmse:174864.47113
[50]	validation_0-rmse:56831.85766	validation_1-rmse:56502.00567
[100]	validation_0-rmse:40124.85892	validation_1-rmse:40139.40997
[150]	validation_0-rmse:35555.65697	validation_1-rmse:35716.55128
[200]	validation_0-rmse:32957.62539	validation_1-rmse:33254.91535
[250]	validation_0-rmse:31228.34140	validation_1-rmse:31649.56698
[300]	validation_0-rmse:29996.58736	validation_1-rmse:30514.67934
[350]	validation_0-rmse:28859.39465	validation_1-rmse:29484.97200
[400]	validation_0-rmse:27976.54807	validation_1-rmse:28689.37087
[450]	validation_0-rmse:27269.90204	validation_1-rmse:28083.32225
[500]	validation_0-rmse:26725.34311	validation_1-rmse:27629.86648
[550]	validation_0-rmse:26284.15509	validation_1-rmse:27276.14443
[600]	validation_0-rmse:25867.21608	validation_1-rmse:26939.15403
[650]	validation_0-rmse:25514.89105	validation_1-rmse:26671.79972
[700]	validation_0-rmse:25190.45534	validation_1-rmse:26425.76949
[750]	valid

# Prediction

In [14]:
X_test = df_test[features].copy()

# Reuse the categorical dtypes of the training frame instead of re-inferring
# them from the test values. Inferring categories separately can assign
# different integer codes to the same label when the sets of observed levels
# differ between train and test. Labels never seen in training become NaN.
for col in cat_features:
    X_test[col] = X_test[col].astype(X[col].dtype)

y_test_pred = final_model.predict(X_test)

# One prediction per test row, identified by the row's index label.
submission = pd.DataFrame({
    "Id": df_test.index,
    "Predicted": y_test_pred
})

submission.to_csv("submission_xgb.csv", index=False)
print("Saved predictions to submission_xgb.csv")

Saved predictions to submission_xgb.csv
