# <center> Use GA-sklearn On Your Own Dataset: A DEMO </center>

* Paper: Deng, R., Li, Z., & Wang, M. (2025). GeoAggregator: An Efficient Transformer Model for Geo-Spatial Tabular Data. Proceedings of the AAAI Conference on Artificial Intelligence, 39(11), 11572-11580. https://doi.org/10.1609/aaai.v39i11.33259


* GitHub: https://github.com/ruid7181/GA-sklearn


If you have any questions, feel free to contact Rui Deng (rui.deng@glasgow.ac.uk).

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

from model.estimator import GARegressor

# Basic Usage

![](https://drive.google.com/uc?export=view&id=1i2tL84D36czYSycC9PfZsfDfYu7PEqFn)

## Step 1. Load the dataset

The data used in this demo is a classic Seattle house price dataset. It is widely used to demonstrate methodological developments in spatial modeling.

Key Variables:
* sqft_living: interior living space of the house in square feet.
* sqft_lot: total area of the land/plot the house sits on, measured in square feet.
* bathrooms: number of bathrooms in the house.
* grade: an overall grade given to the house by the King County grading system.
* condition: a rating of the house's overall condition. This is typically on a scale from 1 (poor) to 5 (very good).
* waterfront: A binary variable (0 or 1) indicating whether the house has a view of a waterfront. 1 means it is on the waterfront, 0 means it is not.
* view: A rating (often 0-4) of the quality of the view from the property. A higher number indicates a better view.
* age: The age of the house in years, calculated from the year it was built.
* UTM_X: The geographic east-west coordinate (easting) of the house, based on the Universal Transverse Mercator (UTM) projection system.
* UTM_Y: The geographic north-south coordinate (northing) of the house, based on the UTM system.

\\

We are using a smaller subset of 1000 samples for demonstration purposes due to time constraints.

For the full version of the data, the results, and a comparison with other models, please refer to the GeoAggregator paper.

In [None]:
# ### Column roles for the Housing dataset.
# Co-variates (non-spatial features).
tab_x = [
    'bathrooms', 'sqft_living', 'sqft_lot',
    'grade', 'condition', 'waterfront',
    'view', 'age',
]
# Spatial coordinates.
tab_l = ['UTM_X', 'UTM_Y']
# Target variable.
tab_y = ['log_price']

# ### Read the tabular dataset from disk.
df = pd.read_csv(r'./data/tabular_datasets/cupum-demo.csv')

# ### Min-max scale every coordinate column.
# The 1e-8 term keeps the denominator non-zero when a column is constant.
coord_min = df[tab_l].min()
coord_span = df[tab_l].max() - coord_min + 1e-8
df[tab_l] = (df[tab_l] - coord_min) / coord_span

# Optional transform of the target, left disabled in this demo:
# df[tab_y] = df[tab_y].apply(
#     lambda x: 10 ** x / 1e5
# )   # Only the housing dataset needs this step.

# ### Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X = df[tab_x + tab_l]
y = df[tab_y]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

## Step 2. sklearn-style training

In [None]:
# ### Hyperparameters of the GA model.
# See the docstring of the `GeoAggregator` class for what each one means.
params = dict(
    x_cols=tab_x,
    spa_cols=tab_l,
    y_cols=tab_y,
    attn_variant='MCPA',
    # model_variant='mini',
    d_model=32,
    n_attn_layer=1,
    idu_points=1,
    seq_len=144,
    attn_dropout=0.05,
    attn_bias_factor=None,
    reg_lin_dims=[16, 1],
    epochs=27,
    lr=5e-3,
    batch_size=8,
    verbose=True,   # show model summary
)

# ### Build the estimator from the settings above.
model = GARegressor(**params)

# ### Fit on the training split.
# Co-variates, spatial coordinates and the target variable are passed separately.
model.fit(
    X=X_train[tab_x],
    l=X_train[tab_l],
    y=y_train,
)

## Step 3. sklearn-style testing

In [None]:
# ### Predict on the held-out test split.
# `get_std=True` makes `predict` return a second value alongside the predictions.
y_pred, y_pred_std = model.predict(
    X=X_test[tab_x],
    l=X_test[tab_l],
    n_estimate=8,
    get_std=True,
)

# ### Score the predictions against the true targets.
y_true = y_test[tab_y]

r2 = r2_score(y_true=y_true, y_pred=y_pred)
print(f'R-sq = {r2}')

mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print(f'MAE = {mae}')

# Hyperparameter Tuning [optional]

In [None]:
import time
import numpy as np
import optuna

from optuna.samplers import TPESampler
from sklearn.model_selection import KFold

In [None]:
def objective(trial, n_split=4, random_state=1):
    """Optuna objective: mean cross-validated MAE of a GARegressor.

    Samples `d_model`, `seq_len`, `dropout` and `epochs` from `trial`, then
    fits one fresh GARegressor per fold of the module-level training split
    (`X_train`, `y_train`) and scores it on the held-out fold.

    Parameters
    ----------
    trial
        The Optuna trial used to sample hyperparameters.
    n_split : int, default 4
        Number of K-fold splits.
    random_state : int or None, default 1
        Seed for the shuffled K-fold split. A fixed seed scores every trial
        on the same folds, so trial values are directly comparable and the
        search is repeatable. Pass None for a fresh random split per call.

    Returns
    -------
    float
        Mean absolute error averaged over the `n_split` validation folds.
    """
    params = {
        'x_cols': tab_x,
        'spa_cols': tab_l,
        'y_cols': tab_y,
        'attn_variant': 'MCPA',
        'd_model': trial.suggest_categorical('d_model', [32, 64]),
        'n_attn_layer': 1,
        'idu_points': 1,
        'seq_len': trial.suggest_categorical('seq_len', [100, 128, 144]),
        # Sampled under the Optuna name 'dropout', passed on as 'attn_dropout'.
        'attn_dropout': trial.suggest_categorical('dropout', [0.05, 0.1, 0.2]),
        'attn_bias_factor': None,
        'reg_lin_dims': [16, 1],
        'epochs': trial.suggest_int('epochs', 15, 21),
        'lr': 5e-3,
        'batch_size': 8,
        'verbose': False,
    }
    loss = np.empty(n_split)
    # Seeded shuffle: without it each trial would see different random folds.
    kf = KFold(n_splits=n_split, shuffle=True, random_state=random_state)

    for idx, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        trn_X, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_X, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # A new model per fold so no fitted state carries over between folds.
        model = GARegressor(**params)
        model.fit(
            X=trn_X[tab_x],
            l=trn_X[tab_l],
            y=trn_y
        )
        y_pred = model.predict(X=val_X[tab_x], l=val_X[tab_l])
        loss[idx] = mean_absolute_error(y_true=val_y, y_pred=y_pred)

    return np.mean(loss)

In [None]:
# ### Create a TPE-driven study that minimises the value returned by `objective`.
sampler = TPESampler()
start_time = time.time()
study = optuna.create_study(
    sampler=sampler,
    study_name='ga-hp!',
    direction='minimize',
)

# ### Keep running trials until the 7200-second budget is used up.
study.optimize(objective, timeout=7200)
end_time = time.time()

# ### Collect the best outcome found by the study.
best_trial = study.best_trial
best_value = study.best_value
best_params = study.best_params

# ### Report the elapsed time and the best outcome.
elapsed = end_time - start_time
print('Elapsed time = {:.4f}s'.format(elapsed))
report = (
    ('Best hyperparameters: ', best_params),
    ('Best results: ', best_value),
    ('Best trial: ', best_trial),
)
for label, value in report:
    print(label, value)

# Model Explanation (GeoShapley) [Optional]

In [None]:
from geoshapley import GeoShapleyExplainer

In [None]:
# ### Pick the rows to explain, then draw the background set from those rows.
# NOTE: this rebinds `X` to the 100-row sample.
X = X.sample(100)
background = X.sample(30).values

# ### Ask the fitted GA model for a prediction function GeoShapley can call.
covariates = X[tab_x]
coords = X[tab_l]
predictor = model.get_shap_predictor(X=covariates, l=coords, n_background=30)

# ### Set up the explainer with that predictor and the background data.
explainer = GeoShapleyExplainer(predict_f=predictor, background=background)

# ### Run the explanation on the sampled rows (single job).
result = explainer.explain(X_geo=X, n_jobs=1)