## Example for running CARTE on single tables
In this example, we run CARTE on two datasets, one for regression and one for classification.

In [1]:
# Set the current working directory and import packages
import os
from pathlib import Path

# Move one level up so that the `src` and `configs` packages imported below
# resolve from the working directory (assumes the notebook is started from a
# sub-folder of the project root -- TODO confirm for other layouts).
# The guard keeps the cell idempotent: without it every re-run would climb one
# more directory and break the imports and data paths.
if not Path("src").is_dir():
    os.chdir(Path.cwd().parent)

import json
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import r2_score, roc_auc_score
from src.carte_table_to_graph import Table2GraphTransformer
from src.carte_estimator import CARTERegressor, CARTEClassifier
from configs.directory import config_directory

In [2]:
# Define necessary functions

# Load data
def _load_data(data_name):
    """Load the raw table and the data configuration of one dataset.

    Both files are read from
    ``{config_directory['data_singletable']}/{data_name}/``.

    Parameters
    ----------
    data_name : str
        Name of the dataset folder.

    Returns
    -------
    data_pd : pandas.DataFrame
        Content of ``raw.parquet`` with missing entries replaced by ``np.nan``.
    config_data
        Parsed content of ``config_data.json``.
    """
    data_dir = f"{config_directory['data_singletable']}/{data_name}"

    data_pd = pd.read_parquet(f"{data_dir}/raw.parquet")
    # Use np.nan as the single missing-value marker across all columns.
    data_pd.fillna(value=np.nan, inplace=True)

    # The context manager closes the file even when json.load raises.
    with open(f"{data_dir}/config_data.json") as config_file:
        config_data = json.load(config_file)

    return data_pd, config_data

# Set train/test split given the random state
def _set_split(data, data_config, num_train, random_state):
    """Draw one group-aware train/test split of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Full table, including the target column.
    data_config : dict
        Must provide ``"target_name"`` and ``"repeated"``; when ``"repeated"``
        is truthy, ``"entity_name"`` gives the column(s) used to group rows.
    num_train : int
        Number of groups kept for training; the remaining
        ``num_groups - num_train`` groups form the test set.
    random_state : int
        Seed passed to ``GroupShuffleSplit``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature rows (target column removed) of each partition.
    y_train, y_test : numpy.ndarray
        Matching target values.
    """
    target_col = data_config["target_name"]
    features = data.drop(columns=target_col)
    targets = np.array(data[target_col])

    # Rows of the same entity must end up on the same side of the split;
    # without repeated entities every row is its own group.
    group_key = (
        data_config["entity_name"]
        if data_config["repeated"]
        else np.arange(len(targets))
    )
    group_ids = np.array(data.groupby(group_key).ngroup())
    n_groups = len(np.unique(group_ids))

    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=int(n_groups - num_train),
        random_state=random_state,
    )
    train_idx, test_idx = next(iter(splitter.split(X=targets, groups=group_ids)))

    return (
        features.iloc[train_idx],
        features.iloc[test_idx],
        targets[train_idx],
        targets[test_idx],
    )

Let us first run an example of a regression task. The dataset we will be using is the Wine Poland dataset, which contains information about wines on the Polish market. The task is to predict the price.

The basic preparations are:
- set basic specifications
- load the prepared data and configs; set train/test split
- generate a graph for each table entry (row) using the Table2GraphTransformer

In [3]:
# Experiment settings
data_name = "wina_pl"  # Dataset to load
num_train = 128  # Size of the training set
random_state = 3  # Seed for the train/test split

# Read the table with its config, then draw the train/test split
data, data_config = _load_data(data_name)
X_train_, X_test_, y_train, y_test = _set_split(
    data=data,
    data_config=data_config,
    num_train=num_train,
    random_state=random_state,
)

# Convert the rows to graphs; the transformer is fitted on the training rows
# only and then applied unchanged to the test rows
preprocessor = Table2GraphTransformer()
X_train = preprocessor.fit_transform(X_train_, y=y_train)
X_test = preprocessor.transform(X_test_)

CARTE currently runs with the sklearn interface (fit/predict) and the process is:
- Define parameters
- Set the estimator
- Run 'fit' to train the model and 'predict' to make predictions

In [4]:
# Parameters handed to the estimator
fixed_params = {
    "loss": "squared_error",
    "scoring": "r2_score",
    "learning_rate": 2.5e-4,
    "max_epoch": 500,
    "early_stopping_patience": 40,
    "val_size": 0.2,
    "cross_validate": True,
    "batch_size": 16,
    "dropout": 0,
    "num_model": 10,  # 10 models for the bagging strategy
    "load_pretrain": True,
    "freeze_pretrain": True,
    "num_layers": 1,
    "disable_pbar": False,  # Set to True for a cleaner output
    "random_state": 0,
    "device": "cpu",
    "n_jobs": 10,
}

# Build the regressor, train it, and predict on the test graphs
estimator = CARTERegressor(**fixed_params)
estimator.fit(X=X_train, y=y_train)
y_pred = estimator.predict(X_test)

# R2 score of the predictions against the test targets
score = r2_score(y_test, y_pred)
print(f"\nThe R2 score for CARTE:", "{:.4f}".format(score))

Model No. xx:   9%|▊         | 43/500 [00:12<02:14,  3.39it/s]
Model No. xx:  10%|▉         | 48/500 [00:13<02:06,  3.58it/s]
Model No. xx:  11%|█         | 54/500 [00:15<02:07,  3.50it/s]
Model No. xx:  12%|█▏        | 58/500 [00:16<02:09,  3.42it/s]
Model No. xx:  12%|█▏        | 58/500 [00:16<02:03,  3.57it/s]
Model No. xx:  13%|█▎        | 67/500 [00:18<01:58,  3.65it/s]
Model No. xx:  14%|█▍        | 71/500 [00:18<01:51,  3.86it/s]
Model No. xx:  13%|█▎        | 67/500 [00:18<01:56,  3.71it/s]
Model No. xx:  13%|█▎        | 66/500 [00:18<02:03,  3.50it/s]
Model No. xx:  15%|█▍        | 73/500 [00:20<01:57,  3.64it/s]



The R2 score for CARTE: 0.4243


For classification, the dataset we will be using is the Spotify dataset, which contains generic information on Spotify tracks with some associated audio features. The task is to predict the popularity of the albums.

The procedure is similar to regression; the differences are the parameter definitions, the performance measure (AUROC), and the use of 'predict_proba' instead of 'predict', since AUROC is computed from predicted probabilities.

In [5]:
# Experiment settings
data_name = "spotify"  # Dataset to load
num_train = 128  # Size of the training set
random_state = 3  # Seed for the train/test split

# Read the table with its config, then draw the train/test split
data, data_config = _load_data(data_name)
X_train_, X_test_, y_train, y_test = _set_split(
    data=data,
    data_config=data_config,
    num_train=num_train,
    random_state=random_state,
)

# Convert the rows to graphs; the transformer is fitted on the training rows
# only and then applied unchanged to the test rows
preprocessor = Table2GraphTransformer()
X_train = preprocessor.fit_transform(X_train_, y=y_train)
X_test = preprocessor.transform(X_test_)

In [6]:
# Parameters handed to the estimator
fixed_params = {
    "loss": "binary_crossentropy",
    "scoring": "auroc",
    "learning_rate": 1e-3,
    "max_epoch": 500,
    "early_stopping_patience": 40,
    "val_size": 0.2,
    "cross_validate": False,
    "batch_size": 16,
    "dropout": 0,
    "num_model": 10,  # 10 models for the bagging strategy
    "load_pretrain": True,
    "freeze_pretrain": True,
    "num_layers": 1,
    "disable_pbar": False,  # Set to True for a cleaner output
    "random_state": 0,
    "device": "cpu",
    "n_jobs": 10,
}

# Build the classifier, train it, and get predicted probabilities on the test graphs
estimator = CARTEClassifier(**fixed_params)
estimator.fit(X=X_train, y=y_train)
y_pred = estimator.predict_proba(X_test)

# AUROC of the predicted probabilities against the test labels
score = roc_auc_score(y_test, y_pred)
print(f"\nThe AUROC for CARTE:", "{:.4f}".format(score))

Model No. xx:   9%|▉         | 44/500 [00:15<02:38,  2.88it/s]
Model No. xx:  10%|▉         | 49/500 [00:15<02:24,  3.13it/s]
Model No. xx:   9%|▉         | 44/500 [00:15<02:45,  2.75it/s]
Model No. xx:  11%|█         | 53/500 [00:16<02:18,  3.22it/s]
Model No. xx:  12%|█▏        | 59/500 [00:18<02:19,  3.16it/s]
Model No. xx:  11%|█         | 54/500 [00:18<02:35,  2.87it/s]
Model No. xx:  11%|█         | 56/500 [00:19<02:37,  2.82it/s]
Model No. xx:  13%|█▎        | 66/500 [00:20<02:16,  3.19it/s]
Model No. xx:  15%|█▌        | 76/500 [00:23<02:10,  3.25it/s]
Model No. xx:  18%|█▊        | 92/500 [00:25<01:53,  3.60it/s]



The AUROC for CARTE: 0.8718
