In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

from model.estimator import GARegressor

# Basic Usage

## Step 1. Load the dataset

In [None]:
# ### Column groups of the Housing dataset:
# covariates (tab_x), spatial coordinates (tab_l) and target (tab_y).
tab_x = [
    'bathrooms', 'sqft_living', 'sqft_lot',
    'grade', 'condition', 'waterfront',
    'view', 'age',
]
tab_l = ['UTM_X', 'UTM_Y']
tab_y = ['y']


def _min_max_scale(col):
    """Rescale one column to roughly [0, 1); the 1e-8 guards against a zero range."""
    lo, hi = col.min(), col.max()
    return (col - lo) / (hi - lo + 1e-8)


def _pow10_scaled(col):
    """Return 10 ** col / 1e5 (presumably undoes a log10 transform of the target - TODO confirm)."""
    return 10 ** col / 1e5


# ### Read the tabular dataset from disk.
df = pd.read_csv(r'./data/tabular_datasets/seattle_house_price_ds.csv')

# ### Min-max normalize the coordinate columns, each column independently.
df[tab_l] = df[tab_l].apply(_min_max_scale)

# ### Transform the target. Only the housing dataset needs this step.
df[tab_y] = df[tab_y].apply(_pow10_scaled)

# ### Hold out 30% of the rows for testing (fixed seed for a repeatable split).
X = df[tab_x + tab_l]
y = df[tab_y]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Step 2. sklearn-style training

In [None]:
# ### Hyperparameters of the GA model.
# See the docstring of the `GeoAggregator` class for details.
params = dict(
    x_cols=tab_x,
    spa_cols=tab_l,
    y_cols=tab_y,
    attn_variant='MCPA',
    # model_variant='mini',
    d_model=32,
    n_attn_layer=1,
    idu_points=1,
    seq_len=144,
    attn_dropout=0.1,
    attn_bias_factor=None,
    reg_lin_dims=[16, 1],
    epochs=17,
    lr=5e-3,
    batch_size=8,
    verbose=True,   # show model summary
)

# ### Build the regressor from the hyperparameter dictionary.
model = GARegressor(**params)

# ### Fit on the training split.
# The estimator takes covariates, spatial coordinates and the target separately.
model.fit(
    X=X_train[tab_x],
    l=X_train[tab_l],
    y=y_train,
)

## Step 3. sklearn-style testing

In [None]:
# ### Predict on the held-out test split.
# With get_std=True the call returns a (prediction, std) pair;
# n_estimate presumably sets how many estimates are aggregated - TODO confirm.
y_pred, y_pred_std = model.predict(
    X=X_test[tab_x],
    l=X_test[tab_l],
    n_estimate=8,
    get_std=True,
)

# ### Score the predictions against the true test targets.
test_r2 = r2_score(y_true=y_test[tab_y], y_pred=y_pred)
test_mae = mean_absolute_error(y_true=y_test[tab_y], y_pred=y_pred)

print(f'R-sq = {test_r2}')
print(f'MAE = {test_mae}')

# Hyperparameter Tuning [optional]

In [None]:
import time
import numpy as np
import optuna

from optuna.samplers import TPESampler
from sklearn.model_selection import KFold

In [None]:
def objective(trial, n_split=4, random_state=42):
    """Optuna objective: mean K-fold validation MAE of a ``GARegressor``.

    A hyperparameter set is sampled from ``trial``, one model is trained per
    fold of the notebook-level training split (``X_train`` / ``y_train``), and
    the validation mean absolute error is averaged over the folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``d_model``, ``seq_len``, ``dropout`` and ``epochs``.
    n_split : int, default 4
        Number of cross-validation folds.
    random_state : int or None, default 42
        Seed for the shuffled ``KFold``. With a fixed seed every trial is scored
        on the same folds, so trial values are directly comparable and the
        study can be reproduced. Pass ``None`` to get a different random
        partition on every call.

    Returns
    -------
    float
        Mean validation MAE over the ``n_split`` folds (lower is better).
    """
    params = {
        'x_cols': tab_x,
        'spa_cols': tab_l,
        'y_cols': tab_y,
        'attn_variant': 'MCPA',
        'd_model': trial.suggest_categorical('d_model', [32, 64]),
        'n_attn_layer': 1,
        'idu_points': 1,
        'seq_len': trial.suggest_categorical('seq_len', [100, 128, 144]),
        # NOTE(review): the Optuna name is 'dropout' while the model keyword is
        # 'attn_dropout', so `study.best_params` cannot be splatted straight
        # into GARegressor without renaming this key.
        'attn_dropout': trial.suggest_categorical('dropout', [0.05, 0.1, 0.2]),
        'attn_bias_factor': None,
        'reg_lin_dims': [16, 1],
        'epochs': trial.suggest_int('epochs', 15, 21),
        'lr': 5e-3,
        'batch_size': 8,
        'verbose': False,
    }
    loss = np.empty(n_split)
    # Seed the shuffle: without it each trial would see a different partition,
    # adding split noise to the comparison between hyperparameter sets.
    kf = KFold(n_splits=n_split, shuffle=True, random_state=random_state)

    for idx, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        trn_X, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_X, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # A fresh model per fold so that no fitted state carries over.
        model = GARegressor(**params)
        model.fit(
            X=trn_X[tab_x],
            l=trn_X[tab_l],
            y=trn_y
        )
        y_pred = model.predict(X=val_X[tab_x], l=val_X[tab_l])
        loss[idx] = mean_absolute_error(y_true=val_y, y_pred=y_pred)

    return np.mean(loss)

In [None]:
# ### Seed the sampler so the sequence of proposed hyperparameters is
# repeatable, in line with the fixed random_state used for the train-test split.
# The search is bounded by wall-clock time, so the number of completed
# trials can still differ between runs.
sampler = TPESampler(seed=42)
start_time = time.time()
study = optuna.create_study(
    direction='minimize',   # the objective returns an MAE, lower is better
    study_name='ga-hp!',
    sampler=sampler
)
# ### Run the search; `timeout` is in seconds (7200 s = 2 hours).
study.optimize(objective, timeout=7200)
end_time = time.time()

# ### Collect the outcome of the best trial.
best_params = study.best_params
best_value = study.best_value
best_trial = study.best_trial

print('Elapsed time = {:.4f}s'.format(end_time - start_time))
print('Best hyperparameters: ', best_params)
print('Best results: ', best_value)
print('Best trial: ', best_trial)