# Gradient Boosting

## XGBoost Model

In [9]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from utils.transform_scale import transform_scale_df, transform_v2_scale_df, TARGET_VARIABLE_COLUMN

import torch
import torch.nn as nn
import torch.optim as optim

# Directory (relative path) that the CSV inputs are read from
DATA_PATH = Path("data")

In [7]:
# Read the augmented train and test CSVs; both share the same reader options,
# with the "month" column parsed as dates.
_read_options = {"parse_dates": ["month"]}
train_augmented = pd.read_csv(DATA_PATH / "train-augmented.csv", **_read_options)
test_augmented = pd.read_csv(DATA_PATH / "test-augmented.csv", **_read_options)

# Preview the first rows of the training frame
train_augmented.head()

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,...,mean_age_m,std_age_f,std_age_m,pri_sch_dist,pri_sch,sec_sch_dist,sec_sch,mall_dist,mrt_name,mrt_dist
0,2001-08-01,pasir ris,4 room,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,...,36.16763,20.331631,19.999478,0.344087,Loyang Primary School,0.428301,Pasir Ris Crest Secondary School,1.033216,Pasir Ris,1.137522
1,2014-10-01,punggol,5 room,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,...,31.967676,20.103889,19.793305,0.160852,Edgefield Primary School,0.312383,Meridian Secondary School,0.80604,Cove,0.118373
2,2020-09-01,sengkang,5 room,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,...,34.164736,20.311337,19.94782,0.184906,Fernvale Primary School,0.55838,Pei Hwa Secondary School,0.452556,Fernvale,0.481153
3,2000-10-01,clementi,3 room,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,...,40.577282,21.625967,21.440329,0.304561,Pei Tong Primary School,0.619132,Clementi Town Secondary School,0.456499,Clementi,0.42332
4,2013-01-01,bukit batok,3 room,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,...,38.318241,20.497124,20.287059,0.233809,Princess Elizabeth Primary School,0.217911,Bukit Batok Secondary School,0.764172,Bukit Batok,0.77422


In [10]:
# Same preparation as in linear.ipynb (the code was copied from there).
# Separate the features from the target column.
X = train_augmented.drop(columns=TARGET_VARIABLE_COLUMN)
y = train_augmented[TARGET_VARIABLE_COLUMN]

# Hold out 20% of the labelled rows as a local test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Transform and scale each split; see utils/transform_scale.py for details.
# NOTE(review): the helper is called on the train and test splits separately.
# If it fits its scaling statistics on whatever frame it receives, the two
# splits end up scaled differently -- confirm against the helper.
X_train, X_test = (transform_v2_scale_df(split) for split in (X_train, X_test))
X_train.head()

Unnamed: 0,month,flat_type,floor_area_sqm,flat_model,lease_commence_date,elevation,region,median_storey,distance_to_BN,distance_to_CR,distance_to_IEBP,distance_to_IHL,distance_to_market_hawker,mean_age_f,mean_age_m,pri_sch_dist,sec_sch_dist,mall_dist,mrt_dist
424394,1.247215,-0.072072,-0.594695,-1.018272,-1.407101,2.295101,-1.468284,0.669352,-1.011927,0.121163,0.820218,-0.715163,-0.737495,1.492759,1.477374,-0.077129,-0.962841,1.33266,0.5162
120565,-1.442412,1.000376,1.113372,-1.018272,-0.334238,0.07368,0.585179,-1.182206,-0.655657,0.781332,-1.665585,0.259262,0.410626,0.390655,0.458419,-0.592998,-0.980574,-1.261748,7.025002
145559,1.192325,-1.14452,-1.468591,-1.018272,-1.894766,0.496808,-1.468284,-1.182206,-1.520166,1.135453,-0.973454,-0.994291,-0.785118,1.419774,1.15466,1.656842,2.073933,-0.924956,-0.41802
132809,-0.042708,-1.14452,-0.952198,-0.231618,-0.236705,0.60259,1.269667,0.052166,0.497555,-0.070879,0.758776,0.087701,1.643412,-0.081596,0.088005,-1.044695,-0.990678,1.714609,-0.391958
32978,-0.015263,-0.072072,-0.51525,1.866128,0.641092,1.025718,-0.099309,-1.182206,1.602925,-1.183845,-0.121179,1.390263,-0.081635,-1.181189,-1.145237,0.774164,0.00773,0.202829,-0.283933


In [49]:
# Convert the data to tensors
# The float32 tensor copies are kept under their original names so that any
# other cell referring to them keeps working.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Define the model parameters
params = {'objective': 'reg:squarederror', 'learning_rate': 0.1, 'max_depth': 19, 'n_estimators': 100}

# Train the model
# XGBoost is handed NumPy arrays (an input type its API documents) instead of
# torch tensors; the arrays hold exactly the same float32 values.
model = xgb.XGBRegressor(**params)
model.fit(X_train_tensor.numpy(), y_train_tensor.numpy())

# Evaluate the model on the held-out split.
# No torch.no_grad() here: nothing in this cell goes through torch autograd,
# so the context manager had no effect.
y_pred = model.predict(X_test_tensor.numpy())

# Score against float64 targets so the error metrics are not computed in
# float32 precision.
y_true = np.asarray(y_test, dtype=np.float64)
print(f"Mean squared error: {mean_squared_error(y_true, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_true, y_pred)}")
print(f"R2 score: {r2_score(y_true, y_pred)}")

Mean squared error: 289524608.0
Mean absolute error: 11881.869140625
R2 score: 0.982715521381119


## K-Fold Cross-Validation

In [50]:
from sklearn.model_selection import KFold

# Define the k-fold cross-validation parameters
k = 5  # number of folds
params = {'objective': 'reg:squarederror', 'learning_rate': 0.1, 'max_depth': 19, 'n_estimators': 100}
mse_scores = []
mae_scores = []
r2_scores = []

# NumPy views of the training tensors: XGBoost and scikit-learn are given
# NumPy arrays (a documented input type) holding the same float32 values.
X_cv = X_train_tensor.numpy()
y_cv = y_train_tensor.numpy()

# Perform k-fold cross-validation (shuffled, fixed seed)
kf = KFold(n_splits=k, shuffle=True, random_state=42)
for train_idx, val_idx in kf.split(X_cv):
    # Split the data into training and validation sets for this fold
    X_train_fold = X_cv[train_idx]
    y_train_fold = y_cv[train_idx]
    X_val_fold = X_cv[val_idx]
    y_val_fold = y_cv[val_idx]

    # Train a fresh XGBoost model on this fold's training part.
    # It is bound to `fold_model` so the loop does not overwrite a global
    # `model` fitted elsewhere in the notebook with a model that has only
    # seen part of the training data.
    fold_model = xgb.XGBRegressor(**params)
    fold_model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the validation set; targets are upcast to float64
    # so the metrics are not computed in float32 precision.
    y_pred_fold = fold_model.predict(X_val_fold)
    y_val_true = y_val_fold.astype(np.float64)
    mse = mean_squared_error(y_val_true, y_pred_fold)
    mae = mean_absolute_error(y_val_true, y_pred_fold)
    r2 = r2_score(y_val_true, y_pred_fold)

    # Store the evaluation metrics for this fold
    mse_scores.append(mse)
    mae_scores.append(mae)
    r2_scores.append(r2)

# Compute the mean and standard deviation of the evaluation metrics across all folds
print(f"Mean MSE: {np.mean(mse_scores):.4f} +/- {np.std(mse_scores):.4f}")
print(f"Mean MAE: {np.mean(mae_scores):.4f} +/- {np.std(mae_scores):.4f}")
print(f"Mean R-squared: {np.mean(r2_scores):.4f} +/- {np.std(r2_scores):.4f}")



Mean MSE: 303504960.0000 +/- 2418427.7500
Mean MAE: 12106.0840 +/- 20.1078
Mean R-squared: 0.9820 +/- 0.0001
