In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

from model.estimator import GARegressor

# Basic Usage

## Step 1. Load the dataset

In [19]:
# ### Column names of the dataset (here: the housing dataset).
tab_x = [
    'bathrooms', 'sqft_living', 'sqft_lot', 'grade',
    'condition', 'waterfront', 'view', 'age',
]                              # co-variates
tab_l = ['UTM_X', 'UTM_Y']     # spatial coordinates
tab_y = ['y']                  # target variable

# ### Load the tabular dataset.
df = pd.read_csv(r'./data/tabular_datasets/seattle_house_price_ds.csv')

# Min-max scale every coordinate column; the 1e-8 term keeps the
# denominator non-zero when a column is constant.
coords = df[tab_l]
coord_min = coords.min()
coord_span = coords.max() - coord_min + 1e-8
df[tab_l] = (coords - coord_min) / coord_span

# Map the target through 10 ** y / 1e5 (presumably the stored target is
# log10-scaled -- confirm). Only the housing dataset needs this step.
df[tab_y] = 10 ** df[tab_y] / 1e5

# ### Train-Test split (70 % / 30 %, fixed seed).
X = df[tab_x + tab_l]
y = df[tab_y]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

## Step 2. sklearn-style training

In [None]:
# ### Hyperparameters for the GA model.
# Check the docstring of the `GeoAggregator` class for details.
params = dict(
    x_cols=tab_x,              # co-variate columns
    spa_cols=tab_l,            # spatial coordinate columns
    y_cols=tab_y,              # target column(s)
    attn_variant='MCPA',
    # model_variant='mini',
    d_model=32,
    n_attn_layer=1,
    idu_points=1,
    seq_len=144,
    attn_dropout=0.05,
    attn_bias_factor=None,
    reg_lin_dims=[16, 1],
    epochs=27,
    lr=5e-3,
    batch_size=8,
    verbose=True,              # show model summary
)

# ### Build the GA model from the settings above.
model = GARegressor(**params)

# ### Train the GA model.
# `fit` takes the co-variates, the spatial coordinates and the target
# variable as three separate arguments.
model.fit(
    X=X_train[tab_x],
    l=X_train[tab_l],
    y=y_train,
)


            __________ GeoAggregator Model Summary ___________
            attention mechanism type                    MCPA
            d_model                                       32
            # attention layer                              1
            # inducing point                               1
            # sequence length                            144
            regressor neurons                        [16, 1]
            
            ________________ training details ________________
            Training on device                           cpu
            attention dropout rate                      0.05
            maximum learning rate                      0.005
            batch_size                                     8
            # epoch                                       27
            


[INFO] Radius estimation ends after 30 iterations. Estimated radius: 0.04500 (seq_len extended by 1.25).
[INFO] Epoch:  1/27  |  Step:   0/1451  |  loss_step_avg: 3.5629  |  lr: 0.0002  |  abfs: [-0.000, 0.000, 0.000, -0.000]
[INFO] Epoch:  1/27  |  Step: 300/1451  |  loss_step_avg: 3.0025  |  lr: 0.0017  |  abfs: [0.105, 0.099, 0.135, 0.138]
[INFO] Epoch:  1/27  |  Step: 600/1451  |  loss_step_avg: 0.9269  |  lr: 0.0044  |  abfs: [0.373, 0.385, 0.442, 0.443]
[INFO] Epoch:  1/27  |  Step: 900/1451  |  loss_step_avg: 0.8034  |  lr: 0.0050  |  abfs: [0.593, 0.682, 0.676, 0.673]
[INFO] Epoch:  1/27  |  Step: 1200/1451  |  loss_step_avg: 0.8371  |  lr: 0.0050  |  abfs: [0.978, 1.019, 1.105, 0.995]
[INFO] Epoch:  2/27  |  Step:   0/1451  |  loss_step_avg: 0.8380  |  lr: 0.0050  |  abfs: [1.230, 1.236, 1.411, 1.264]
[INFO] Epoch:  2/27  |  Step: 300/1451  |  loss_step_avg: 0.8083  |  lr: 0.0050  |  abfs: [1.485, 1.441, 1.726, 1.495]
[INFO] Epoch:  2/27  |  Step: 600/1451  |  loss_step_avg: 0

## Step 3. sklearn-style testing

In [None]:
# ### Predict on the test dataset.
# `get_std=True` makes `predict` return a second array alongside the
# predictions (bound to `y_pred_std`); `n_estimate=4` is forwarded as-is.
y_pred, y_pred_std = model.predict(
    X=X_test[tab_x],
    l=X_test[tab_l],
    n_estimate=4,
    get_std=True,
)

# ### Score the predictions against the held-out targets.
test_r2 = r2_score(y_true=y_test[tab_y], y_pred=y_pred)
test_mae = mean_absolute_error(y_true=y_test[tab_y], y_pred=y_pred)

print(f'R-sq = {test_r2}')
print(f'MAE = {test_mae}')

# Hyperparameter Tuning [Optional]

In [None]:
import time
import numpy as np
import optuna

from optuna.samplers import TPESampler
from sklearn.model_selection import KFold

In [None]:
def objective(trial, n_split=4, random_state=42):
    """Optuna objective: mean cross-validated MAE of a GARegressor.

    For the hyperparameters sampled from ``trial``, a fresh ``GARegressor``
    is trained on each K-fold training part of ``X_train`` / ``y_train`` and
    scored on the matching validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``d_model``, ``seq_len``, ``dropout`` and
        ``epochs``.
    n_split : int, default 4
        Number of cross-validation folds.
    random_state : int or None, default 42
        Seed for the shuffled K-fold split. A fixed seed scores every trial
        on the same folds, so trial values are comparable and the search is
        repeatable. Pass ``None`` to draw new random folds on every call.

    Returns
    -------
    float
        Mean absolute error averaged over the ``n_split`` validation folds
        (lower is better).
    """
    params = {
        'x_cols': tab_x,
        'spa_cols': tab_l,
        'y_cols': tab_y,
        'attn_variant': 'MCPA',
        'd_model': trial.suggest_categorical('d_model', [32, 64]),
        'n_attn_layer': 1,
        'idu_points': 1,
        'seq_len': trial.suggest_categorical('seq_len', [100, 128, 144]),
        # NOTE: the Optuna name is 'dropout' (not 'attn_dropout'); it is kept
        # as-is because it is the key that appears in `study.best_params`.
        'attn_dropout': trial.suggest_categorical('dropout', [0.05, 0.1, 0.2]),
        'attn_bias_factor': None,
        'reg_lin_dims': [16, 1],
        'epochs': trial.suggest_int('epochs', 15, 21),
        'lr': 5e-3,
        'batch_size': 8,
        'verbose': False,
    }
    loss = np.empty(n_split)
    # Seed the shuffle so all trials are evaluated on identical folds.
    kf = KFold(n_splits=n_split, shuffle=True, random_state=random_state)

    for idx, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        trn_X, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_X, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # A new model per fold so no fitted state leaks between folds.
        model = GARegressor(**params)
        model.fit(
            X=trn_X[tab_x],
            l=trn_X[tab_l],
            y=trn_y
        )
        y_pred = model.predict(X=val_X[tab_x], l=val_X[tab_l])
        loss[idx] = mean_absolute_error(y_true=val_y, y_pred=y_pred)

    return np.mean(loss)

In [None]:
# ### Run a TPE-driven search that minimises `objective`.
# The search stops once the 7200 s time budget is exhausted.
sampler = TPESampler()
start_time = time.time()
study = optuna.create_study(
    study_name='ga-hp!',
    direction='minimize',
    sampler=sampler,
)
study.optimize(objective, timeout=7200)
end_time = time.time()

# ### Collect and report the best trial.
best_params, best_value, best_trial = (
    study.best_params,
    study.best_value,
    study.best_trial,
)

elapsed = end_time - start_time
print(f'Elapsed time = {elapsed:.4f}s')
print('Best hyperparameters: ', best_params)
print('Best results: ', best_value)
print('Best trial: ', best_trial)

# Model Explanation (GeoShapley) [Optional]

In [None]:
from geoshapley import GeoShapleyExplainer

In [None]:
# ### Prepare the data to be explained and the background data
# The rows are drawn from the full feature table with fixed seeds, so this
# cell is idempotent: re-running it (or Restart & Run All) selects the same
# rows instead of re-sampling from an already down-sampled `X`.
N_EXPLAIN = 1000     # number of rows to explain
N_BACKGROUND = 30    # number of background rows; also passed as `n_background`
SHAP_SEED = 42

X = df[tab_x + tab_l].sample(N_EXPLAIN, random_state=SHAP_SEED)
background = X.sample(N_BACKGROUND, random_state=SHAP_SEED).values

# ### Get the predictor function for the GeoShapley Explainer
# Uses the `model` fitted in Step 2.
predictor = model.get_shap_predictor(
    X=X[tab_x],
    l=X[tab_l],
    n_background=N_BACKGROUND
)

# ### Initiate the Explainer
explainer = GeoShapleyExplainer(
    predict_f=predictor,
    background=background
)

# ### Explain
result = explainer.explain(X_geo=X, n_jobs=1)