# Example for running CARTE on single tables
In this example, we run CARTE on two datasets, one for regression and one for classification.

In [1]:
# Set the current working directory and import packages
import os
from pathlib import Path
os.chdir(Path().cwd().parent)

import json
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import r2_score, roc_auc_score
from src.carte_table_to_graph import Table2GraphTransformer
from src.carte_estimator import CARTERegressor, CARTEClassifier
from configs.directory import config_directory

In [2]:
# Define necessary functions

def _load_data(data_name):
    """Load the preprocessed table and its configuration for one dataset.

    Both files are read from the `data_name` sub-folder of the directory
    registered under `config_directory['data_singletable']`.

    Parameters
    ----------
    data_name : str
        Name of the dataset folder to read from.

    Returns
    -------
    data_pd : pandas.DataFrame
        Contents of `raw.parquet`, with missing entries set to `np.nan`.
    config_data : dict
        Parsed contents of `config_data.json`.
    """
    data_root = f"{config_directory['data_singletable']}/{data_name}"

    data_pd = pd.read_parquet(f"{data_root}/raw.parquet")
    # Represent every missing entry with np.nan.
    data_pd.fillna(value=np.nan, inplace=True)

    # The context manager closes the file even if json.load raises.
    with open(f"{data_root}/config_data.json") as config_file:
        config_data = json.load(config_file)

    return data_pd, config_data

def _set_split(data, data_config, num_train, random_state):
    """Split a table into train and test parts with a group-aware shuffle.

    Rows are grouped by the column named in `data_config["entity_name"]`
    when `data_config["repeated"]` is truthy; otherwise each row forms its
    own group. The test side receives `num_groups - num_train` groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Full table, including the target column.
    data_config : dict
        Must provide "target_name" and "repeated", plus "entity_name" when
        "repeated" is truthy.
    num_train : int
        Number of groups kept for training.
    random_state : int
        Seed passed to GroupShuffleSplit.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature rows (target column removed) of each split.
    y_train, y_test : numpy.ndarray
        Target values of each split.
    """
    target_col = data_config["target_name"]
    features = data.drop(columns=target_col)
    targets = np.array(data[target_col])

    # Grouping key: the entity column for tables with repeated entities,
    # otherwise a unique id per row.
    group_key = (
        data_config["entity_name"]
        if data_config["repeated"]
        else np.arange(len(targets))
    )
    group_ids = np.array(data.groupby(group_key).ngroup())
    n_groups = len(np.unique(group_ids))

    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=int(n_groups - num_train),
        random_state=random_state,
    )
    train_idx, test_idx = next(iter(splitter.split(X=targets, groups=group_ids)))

    return (
        features.iloc[train_idx],
        features.iloc[test_idx],
        targets[train_idx],
        targets[test_idx],
    )

Let us first run an example of a regression task. The dataset we will be using is the Wine Poland dataset, which contains information about wines on the Polish market. The task is to predict the price.

The basic preparations are:
- preprocess raw data
- load the prepared data and configs; set train/test split
- generate a graph for each table entry (row) using the Table2GraphTransformer
- create an estimator and make inference

The code for preprocessing is provided in scripts/preprocess_raw.py. Here, we directly use the transformed data, which should be in data/data_singletable if you have successfully downloaded it with the instructions.

We transform each data point (row) with the Table2GraphTransformer.

In [3]:
# Experiment settings
data_name = "wina_pl"  # Name of the data
num_train = 128  # Train-size
random_state = 1  # Random_state

# Read the table and its config, then build the train/test split
data, data_config = _load_data(data_name)
X_train_, X_test_, y_train, y_test = _set_split(
    data=data,
    data_config=data_config,
    num_train=num_train,
    random_state=random_state,
)

# Fit the table-to-graph transformer on the training rows, then reuse it for the test rows
preprocessor = Table2GraphTransformer()
X_train = preprocessor.fit_transform(X_train_, y=y_train)
X_test = preprocessor.transform(X_test_)

In [5]:
# Original data: the first training row, before the graph transformation
print("Original Data:\n", X_train_.iloc[0])

# Graph data: the first element of the Table2GraphTransformer output for the training split
print("\nGraph Data:\n", X_train[0])

Original Data:
 name                   Achillée Crémant Soléra AOC Crémant d'Alsace NV
country                                                         France
region                                                          Alsace
appellation                                       Cremant d'Alsace AOC
vineyard                                                      Achillée
vintage                                                            NaN
volume                                                           750.0
ABV                                                               13.5
serving_temperature                                                  9
wine_type                                                          NaN
taste                                                              dry
style                                                          average
vegan                                                            False
natural                                                      

The result is a list of graph objects which can be used as inputs for the neural network in CARTE.

Each row is transformed into a graph data object with node features (x), an edge index (the graph structure), edge features, and the target y (not visible in the test set).

Also, this data point contains 13 columns (out of 15) which are not missing. Thus, the resulting graph will contain 14 node features (13 columns and center node), and 26 edge features (13 columns and 13 self-loops), as the graph is directed.

For learning, CARTE currently runs with the sklearn interface (fit/predict) and the process is:
- Define parameters
- Set the estimator
- Run 'fit' to train the model and 'predict' to make predictions

In [6]:
# Define some parameters
fixed_params = {
    "num_model": 10,  # 10 models for the bagging strategy
    "disable_pbar": False,  # True if you want cleaner output
    "random_state": 0,
    "device": "cpu",
    "n_jobs": 10,
}

# Define the estimator and run fit/predict
estimator = CARTERegressor(**fixed_params)  # CARTERegressor for regression
estimator.fit(X=X_train, y=y_train)
y_pred = estimator.predict(X_test)

# Obtain the R2 score on the test-set predictions
score = r2_score(y_test, y_pred)
# Format the score inside the f-string itself; the printed text is unchanged
print(f"\nThe R2 score for CARTE: {score:.4f}")

Model No. xx:   9%|▉         | 46/500 [00:13<02:08,  3.54it/s]
Model No. xx:  10%|█         | 51/500 [00:13<01:59,  3.74it/s]
Model No. xx:  10%|█         | 51/500 [00:14<02:06,  3.54it/s]
Model No. xx:  11%|█         | 54/500 [00:15<02:04,  3.58it/s]
Model No. xx:  13%|█▎        | 66/500 [00:15<01:43,  4.20it/s]
Model No. xx:  12%|█▏        | 62/500 [00:16<01:57,  3.73it/s]
Model No. xx:  12%|█▏        | 61/500 [00:16<02:00,  3.65it/s]
Model No. xx:  20%|█▉        | 98/500 [00:22<01:32,  4.36it/s]]
Model No. xx:  22%|██▏       | 112/500 [00:25<01:27,  4.46it/s]
Model No. xx:  23%|██▎       | 114/500 [00:25<01:25,  4.50it/s]



The R2 score for CARTE: 0.3460


For classification, the dataset we will be using is the Spotify dataset, which contains generic information on Spotify tracks with some associated audio features. The task is to predict the popularity of the albums.

The procedure is similar to regression; the only differences are the estimator and its parameters, the performance measure (AUROC), and the use of 'predict_proba' instead of 'predict' (since AUROC is computed from predicted probabilities).

In [7]:
# Experiment settings
data_name = "spotify"  # Name of the data
num_train = 128  # Train-size
random_state = 1  # Random_state

# Read the table and its config, then build the train/test split
data, data_config = _load_data(data_name)
X_train_, X_test_, y_train, y_test = _set_split(
    data=data,
    data_config=data_config,
    num_train=num_train,
    random_state=random_state,
)

# Fit the table-to-graph transformer on the training rows, then reuse it for the test rows
preprocessor = Table2GraphTransformer()
X_train = preprocessor.fit_transform(X_train_, y=y_train)
X_test = preprocessor.transform(X_test_)

In [8]:
# Define some parameters
fixed_params = {
    "num_model": 10,  # 10 models for the bagging strategy
    "disable_pbar": False,  # True if you want cleaner output
    "random_state": 0,
    "device": "cpu",
    "n_jobs": 10,
}

# Define the estimator and run fit/predict_proba
estimator = CARTEClassifier(**fixed_params)  # CARTEClassifier for classification
estimator.fit(X=X_train, y=y_train)
y_pred = estimator.predict_proba(X_test)

# Obtain the AUROC on the test-set predicted probabilities
score = roc_auc_score(y_test, y_pred)
# Format the score inside the f-string itself; the printed text is unchanged
print(f"\nThe AUROC for CARTE: {score:.4f}")

Model No. xx:  11%|█         | 53/500 [00:13<01:51,  3.99it/s]
Model No. xx:  10%|█         | 50/500 [00:13<02:04,  3.63it/s]
Model No. xx:  14%|█▍        | 71/500 [00:15<01:36,  4.44it/s]
Model No. xx:  13%|█▎        | 65/500 [00:15<01:42,  4.22it/s]
Model No. xx:  14%|█▍        | 70/500 [00:16<01:38,  4.36it/s]
Model No. xx:  12%|█▏        | 61/500 [00:16<02:00,  3.63it/s]
Model No. xx:  13%|█▎        | 63/500 [00:17<01:59,  3.66it/s]
Model No. xx:  13%|█▎        | 64/500 [00:17<01:59,  3.65it/s]
Model No. xx:  12%|█▏        | 60/500 [00:18<02:18,  3.18it/s]
Model No. xx:  14%|█▍        | 72/500 [00:19<01:53,  3.79it/s]



The AUROC for CARTE: 0.8864
