In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

from model.estimator import GARegressor

# Basic Usage

## Step 1. Load the dataset

In [None]:
# Column groups of the poverty dataset: covariates, spatial coordinates, target.
tab_x = [
    'ep_unem', 'ep_pci', 'ep_nohs', 'ep_sngp',
    'ep_lime', 'ep_crow', 'ep_nove', 'rent_1', 'rntov30p_1',
    'ep_unin', 'ep_minrty', 'ep_age65', 'ep_age17', 'ep_disabl',
]
tab_l = ['latitude', 'longitude']
tab_y = ['ep_pov']

# Read the tabular dataset from disk.
df = pd.read_csv(r'./data/tabular_datasets/us_sdoh_2014.csv')

# Features hold the covariates followed by the coordinates; the target is
# kept as a one-column DataFrame.
X = df[tab_x + tab_l]
y = df[tab_y]

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Step 2. sklearn-style training

In [None]:
# Hyperparameters of the GA model.
# See the docstring of the `GeoAggregator` class for details.
params = dict(
    x_cols=tab_x,
    spa_cols=tab_l,
    y_cols=tab_y,
    attn_variant='MCPA',
    model_variant='small',
    d_model=32,
    # Further settings, left commented out:
    # n_attn_layer=1,
    # idu_points=4,
    # seq_len=128,
    attn_dropout=0.2,
    attn_bias_factor=None,
    reg_lin_dims=[16, 1],
    epochs=20,
    lr=5e-3,
    batch_size=8,
    verbose=True,  # show model summary
)

# Build the GA model from the hyperparameters above.
model = GARegressor(**params)

# Fit the model; covariates, spatial coordinates and the target variable
# are passed as separate arguments.
model.fit(
    X=X_train[tab_x],
    l=X_train[tab_l],
    y=y_train,
)

## Step 3. sklearn-style testing

In [None]:
# Predict the target for the held-out test rows.
y_pred = model.predict(
    X=X_test[tab_x],
    l=X_test[tab_l],
)

# Score the predictions against the true target values and report each metric.
y_true = y_test[tab_y]

r2 = r2_score(y_true=y_true, y_pred=y_pred)
print(f'R-sq = {r2}')

mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print(f'MAE = {mae}')

# Hyperparameter Tuning [optional]

In [None]:
import time
import numpy as np
import optuna

from optuna.samplers import TPESampler
from sklearn.model_selection import KFold

In [None]:
def objective(trial, n_split=5, random_state=42):
    """Optuna objective: mean cross-validated MAE of a GA model.

    Hyperparameters are sampled from ``trial``. For each K-fold split of the
    module-level ``X_train``/``y_train``, a fresh ``GARegressor`` is fitted on
    the training part and scored with mean absolute error on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    n_split : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default 42
        Seed for the K-fold shuffling. A fixed seed scores every trial on the
        same folds, so trial values are comparable; pass ``None`` to draw new
        random folds on each call.

    Returns
    -------
    float
        Mean of the per-fold mean absolute errors (lower is better).
    """
    # Optuna's categorical choices are documented as None/bool/int/float/str
    # only; list-valued choices trigger a warning. Sample a string key and map
    # it back to the layer sizes instead.
    reg_lin_options = {'1': [1], '4,1': [4, 1], '16,1': [16, 1]}

    params = {
        'x_cols': tab_x,
        'spa_cols': tab_l,
        'y_cols': tab_y,
        'attn_variant': 'MCPA',
        'd_model': trial.suggest_categorical('d_model', [32, 64, 80]),
        'n_attn_layer': trial.suggest_int('n_attn_layer', 1, 3),
        'idu_points': trial.suggest_int('idu_points', 2, 8),
        'seq_len': trial.suggest_categorical('seq_len', [64, 81, 100, 144, 256, 400]),
        'attn_dropout': trial.suggest_float('attn_dropout', 0.01, 0.5),
        'attn_bias_factor': None,
        'reg_lin_dims': list(
            reg_lin_options[
                trial.suggest_categorical('reg_lin_dims', list(reg_lin_options))
            ]
        ),
        'epochs': trial.suggest_int('epochs', 3, 30),
        'lr': 5e-3,
        'batch_size': 8,
    }
    # Keep the decoded layer sizes on the trial, since `best_params` only
    # holds the sampled string key.
    trial.set_user_attr('reg_lin_dims', list(params['reg_lin_dims']))

    kf = KFold(n_splits=n_split, shuffle=True, random_state=random_state)
    fold_losses = []

    for trn_idx, val_idx in kf.split(X_train, y_train):
        trn_X, trn_y = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_X, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # A new model per fold, so no fitted state carries over between folds.
        model = GARegressor(**params)
        model.fit(
            X=trn_X[tab_x],
            l=trn_X[tab_l],
            y=trn_y
        )
        y_pred = model.predict(X=val_X[tab_x], l=val_X[tab_l])
        fold_losses.append(mean_absolute_error(y_true=val_y, y_pred=y_pred))

    return float(np.mean(fold_losses))

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable,
# in line with the fixed `random_state` used for the train/test split.
# NOTE(review): this fixes only the sampler's randomness; whether model
# training itself is deterministic is not visible from this notebook.
sampler = TPESampler(seed=42)

# Time the whole search, including study creation.
start_time = time.time()
study = optuna.create_study(
    direction='minimize',  # the objective returns a mean absolute error
    study_name='ga-hp!',
    sampler=sampler
)
study.optimize(objective, n_trials=300)
end_time = time.time()

# Collect the outcome of the best trial.
best_params = study.best_params
best_value = study.best_value
best_trial = study.best_trial

print('Elapsed time = {:.4f}s'.format(end_time - start_time))
print('Best hyperparameters: ', best_params)
print('Best results: ', best_value)
print('Best trial: ', best_trial)