# Import Dependencies

In [None]:
import os
import random
random_state = int(random.random()*1e9)

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings('ignore', category=ConvergenceWarning)

import sys
sys.path.append('/'.join(os.getcwd().split('/')[:-1]))
sys.path.append('/'.join(os.getcwd().split('/')[:-2]))

from tensor_completion_models.ETC import ETC
from tensor_completion_models.utils import *
from tensor_completion_models.ensemble_costco import ensemble_costco as EnT

In [2]:
def one_hot_encode(df, col, target):
    dummies = pd.get_dummies(df[col], prefix=col)
    r_df = pd.concat([dummies.astype(int), df.drop(columns=[col])], axis=1)
    r_df = r_df[[c for c in r_df.columns if c != target] + [target]]
    return r_df

# Data

### Load Data

In [None]:
data_folder = 'data/'

### Setup Sampling

In [None]:
n_splits = 5

In [None]:
datasets = [
    'lattice',
    'crossed_barrel',
    'cogni_spin'
]

In [None]:
all_train_test_splits = list()
for dataset in datasets:
    t = 'none' if dataset == 'cogni_spin' else 'log'
    # t = 'log'

    if dataset == 'lattice':
        df = pd.read_csv(f'{data_folder}database_latticedesign.csv')
        features = ['lattice_type', 't', 'uc_x1', 'uc_x2', 'uc_x3']

        targets = [
            'E (MPa)',
            'E_specific (MPa/g)'
            ]

        if len(targets) > 1:
            df_list = list()
            for ti in range(len(targets)): 
                sub_df = df[features + [targets[ti]]].to_numpy()
                sub_df = np.concatenate((ti*np.ones((sub_df.shape[0], 1)), 
                                        sub_df), 
                                        axis = 1)

                df_list.append(sub_df)

            target = 'target'
            df = pd.DataFrame(np.concatenate(df_list), columns = ['task'] + features + [target])
            features = [x for x in list(df.columns) if x != target]
        else:
            target = targets[0]
            df = df[features + [target]]

    elif dataset == 'crossed_barrel':
        df = pd.read_csv(f'{data_folder}crossed_barrel_dataset_v2.csv')
        features = ['n', 'theta', 'r', 't']
        targets = ['toughness']
        target = targets[0]
        df = df[features + [target]]
        df = df.groupby(features, as_index=False).agg(target=(target, 'median'))
        target = 'target'

    elif dataset.lower() == 'cogni_spin':
        df = pd.read_csv(f'{data_folder}Cogni-e-SpinDB 1.0.csv')

        features = ['solution_concentration', 'voltage_kv', 'flow_rate_ml/h', 'tip_collector_distance_cm',
                    'polymer(s)']
        # features += ['needle_diameter_g']  # ~50% missing values!

        target = 'fiber_diameter_nm'

        df = df[df['polymer(s)'].map(lambda x: x in ['PVDF', 'PVA', 'PAN'])]
        codes, uniques = pd.factorize(df["polymer(s)"])
        df['polymer(s)'] = codes

        df = df[features + [target]].dropna()

        mask = (df['flow_rate_ml/h'] < 5) & (df['tip_collector_distance_cm'] < 45) & (df['voltage_kv'] < 40)
        df = df[mask]

    df = (
        df.groupby(features, as_index=False)
        .agg(
            value_mean=(target, 'mean'),
            target_std=(target, 'std'),
            target_min=(target, 'min'),
            target_max=(target, 'max'),
            num_duplicates=(target, 'count')
        )
    )

    df['target_std'] = df['target_std'].fillna(0)
    df.columns = [x if x != 'value_mean' else target for x in df.columns]

    df = df[df['target_std'] == 0]

    df = df[df.columns[:-4]]

    ts = {
        'none': lambda x: x,
        'log': lambda x: np.log(x),
        'sqrt': lambda x: np.sqrt(x),
        'loglog': lambda x: np.log( np.log(x) )
    }
    t = ts[t]

    df[target] = t(df[target])

    round_ = None
    if type(round_) in [float, int]: round_ = {feature:round_ for feature in features}
    elif round_ is not None:
        for feature in features:
            if feature not in round_: round_[feature] = round_['default']
    tensor_df = df.copy()
    for feature in features:
        if round_ is not None: unique = (tensor_df[feature]//round_[feature]).unique()
        else: unique = tensor_df[feature].unique()
        unique = np.sort(unique)
        conv = {unique[i]:i for i in range(len(unique))}
        if round_ is not None: tensor_df[feature] = (tensor_df[feature]//round_[feature]).map(lambda x: conv[x])
        else: tensor_df[feature] = tensor_df[feature].map(lambda x: conv[x])

    tensor_shape = torch.Size([tensor_df[feature].max()+1 for feature in features])

    indices = tensor_df[features].to_numpy()
    indices = torch.tensor(indices, dtype = torch.int32)
    values = torch.tensor(tensor_df[target].to_numpy())

    sparse_tensor = torch.sparse_coo_tensor(indices = indices.t(), 
                                            values = values, 
                                            size = tensor_shape
                                        ).coalesce()

    if 'lattice_type' in df.columns: df = one_hot_encode(df, 'lattice_type', target)
    if 'polymer(s)' in df.columns: df = one_hot_encode(df, 'polymer(s)', target)

    features = [x for x in df.columns if x != target]    

    if dataset == 'lattice':
        n_in,  n_out = 50, 5
        c1 = 'lattice_type'
        c2 = 't'
        # condition = ((tensor_df[c1] == 0) | (tensor_df[c1] == 1)) & (tensor_df[c2] == 1)

        c1_low, c1_high = 0, 1
        c2_low, c2_high = 1, 1
        condition = (tensor_df[c1] >= c1_low) & (tensor_df[c1] <= c1_high) & (tensor_df[c2] >= c2_low) & (tensor_df[c2] <= c2_high)

    if dataset == 'crossed_barrel':

        n_in,  n_out = 150, 75
        c1 = 'n'
        c2 = 't'
        # condition = (tensor_df[c1] <= 1) & (tensor_df[c2] == 1)

        c1_low, c1_high = 0, 1
        c2_low, c2_high = 1, 2
        condition = (tensor_df[c1] >= c1_low) & (tensor_df[c1] <= c1_high) & (tensor_df[c2] >= c2_low) & (tensor_df[c2] <= c2_high)

    if dataset == 'cogni_spin':
        # n_in,  n_out = 75, 35
        n_in, n_out = 140, 40
        c1 = 'flow_rate_ml/h'
        c2 = 'voltage_kv'

        c1_low, c1_high = 0, 8
        c2_low, c2_high = 10, 18
        condition = (tensor_df[c1] >= c1_low) & (tensor_df[c1] <= c1_high) & (tensor_df[c2] >= c2_low) & (tensor_df[c2] <= c2_high)

    train_test_splits = list()
    for _ in range(n_splits):

        df1 = df[condition]
        df2 = df[~condition]
        if _ == 0: print('\n' + str(df1.shape[0]), str(df2.shape[0]), '\n')

        df1_i = list(df1.index)
        df2_i = list(df2.index)

        random_seed = int(1_000_000 * random.random())
        random.seed(random_seed)
        
        print(f"Random Seed = {random_seed}")

        random.shuffle(df1_i)
        random.shuffle(df2_i)

        train_i = df1_i[:n_in] + df2_i[:n_out]
        test_i = df1_i[n_in:] + df2_i[n_out:]

        test_ood_i = df2_i[n_out:]
        test_ood_df = df.loc[test_ood_i].sample(frac = 1, random_state = random_state).reset_index(drop = True)
        tensor_test_ood_df = tensor_df.loc[test_ood_i].sample(frac = 1, random_state = random_state).reset_index(drop = True)

        # ________________________________________________________________________________________________________________

        train_df = df.loc[train_i].sample(frac = 1, random_state = random_state).reset_index(drop = True)
        test_df = df.loc[test_i].sample(frac = 1, random_state = random_state).reset_index(drop = True)
        X_train = train_df[[c for c in train_df.columns if c != target]].to_numpy()
        Y_train = train_df[target].to_numpy()

        X_test = test_df[[c for c in train_df.columns if c != target]].to_numpy()
        Y_test = test_df[target].to_numpy()

        X_test_ood = test_ood_df[[c for c in train_df.columns if c != target]].to_numpy()
        Y_test_ood = test_ood_df[target].to_numpy()

        X = np.concatenate((X_train, X_test))
        X_train = (X_train - X.min(axis = 0)) / (X.max(axis = 0) - X.min(axis = 0))
        X_test = (X_test - X.min(axis = 0)) / (X.max(axis = 0) - X.min(axis = 0))
        X_test_ood = (X_test_ood - X.min(axis = 0)) / (X.max(axis = 0) - X.min(axis = 0))
        del X

        Y_test = (Y_test - Y_train.min()) / (Y_train.max() - Y_train.min())
        Y_test_ood = (Y_test_ood - Y_train.min()) / (Y_train.max() - Y_train.min())
        Y_train = (Y_train - Y_train.min()) / (Y_train.max() - Y_train.min())

        # ________________________________________________________________________________________________________________

        tensor_train_df = tensor_df.loc[train_i].sample(frac = 1, random_state = random_state).reset_index(drop = True)
        tensor_test_df = tensor_df.loc[test_i].sample(frac = 1, random_state = random_state).reset_index(drop = True)

        tFeatures = [c for c in tensor_test_df.columns if c != target]

        tX_train = torch.tensor(tensor_train_df[tFeatures].to_numpy(), dtype = torch.int32).t()
        tX_test = torch.tensor(tensor_test_df[tFeatures].to_numpy(), dtype = torch.int32).t()
        tX_test_ood = torch.tensor(tensor_test_ood_df[tFeatures].to_numpy(), dtype = torch.int32)

        tY_train = torch.tensor(tensor_train_df[target].to_numpy())
        tY_test = torch.tensor(tensor_test_df[target].to_numpy())
        tY_test_ood = torch.tensor(tensor_test_ood_df[target].to_numpy())

        tY_test = (tY_test - tY_train.min()) / (tY_train.max() - tY_train.min())
        tY_test_ood = (tY_test_ood - tY_train.min()) / (tY_train.max() - tY_train.min())
        tY_train = (tY_train - tY_train.min()) / (tY_train.max() - tY_train.min())

        training_sparse_tensor = torch.sparse_coo_tensor(indices = tX_train,
                                                        values = tY_train, 
                                                        size = tensor_shape
                                                        ).coalesce()

        testing_sparse_tensor = torch.sparse_coo_tensor(indices = tX_test, 
                                                        values = tY_test, 
                                                        size = tensor_shape
                                                        ).coalesce()

        # ________________________________________________________________________________________________________________

        train_test_splits.append({
            'ML':(X_train, X_test, Y_train, Y_test),
            'Tensor':(training_sparse_tensor, testing_sparse_tensor),
            'Test_OOD':(X_test_ood, Y_test_ood, tX_test_ood, tY_test_ood),
        })

    all_train_test_splits.append({
        'dataset':dataset,
        'tensor_shape':tensor_shape,
        'train_test_splits':train_test_splits
    })


149 115 

Random Seed = 821460
Random Seed = 745511
Random Seed = 391005


# Experiments

### Experiment

In [7]:
def get_results():
    
    all_results = list()
    all_preds = list()
    
    for dataset_dict in all_train_test_splits:

        dataset = dataset_dict['dataset']
        tensor_shape = dataset_dict['tensor_shape']
        train_test_splits = dataset_dict['train_test_splits']

        it_list = list()
        pred_it_list = list()

        print(f"Dataset: {dataset}")
        print("Iteration:", end = " ")
        for it in range(len(train_test_splits)):

            random_state = int(random.random()*100_000)
            training_sparse_tensor, testing_sparse_tensor = train_test_splits[it]['Tensor']
            X_train, X_test, Y_train, Y_test = train_test_splits[it]['ML']
            tY_test = testing_sparse_tensor.values().numpy()
            X_test_ood, Y_test_ood, tX_test_ood, tY_test_ood = train_test_splits[it]['Test_OOD']
            tY_test_ood = tY_test_ood.numpy()

            metric_list = list()
            pred_list = list()

            # ________________________________________________________________________________________________________________________

            cpd = train_tensor_completion(model_type = 'cpd',
                                        rank = {'lattice':32, 'crossed_barrel':4, 'cogni_spin':6}[dataset],
                                        sparse_tensor = training_sparse_tensor,
                                        num_epochs = 500,
                                        batch_size = 96,
                                        loss_p = 2,
                                        lr = 1e-3,
                                        wd = 1e-4,
                                        val_size = None,
                                        early_stopping = False,
                                        verbose = False)

            preds = cpd(testing_sparse_tensor.indices().t()).detach().cpu().numpy()
            r2 = r2_score(tY_test, preds)
            mae = abs(preds - tY_test).mean()
            r_mse = (abs((preds - tY_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - tY_test) / tY_test).mean()
            smape = abs( 2 * ((preds - tY_test) / (abs(tY_test) + abs(preds)))).mean()

            ood_preds = cpd(tX_test_ood).detach().cpu().numpy()
            ood_mae = abs(ood_preds - tY_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, tY_test])
            del cpd, preds, r2, mae

            # ________________________________________________________________________________________________________________________

            cpd_s = train_tensor_completion(model_type = 'cpd.smooth',
                                            rank = {'lattice':24, 'crossed_barrel':6, 'cogni_spin':4}[dataset],
                                            sparse_tensor = training_sparse_tensor,
                                            non_smooth_modes = {'lattice':[0], 'crossed_barrel':[], 'cogni_spin':[0, 4]}[dataset],
                                            num_epochs = 500,
                                            batch_size = 96,
                                            loss_p = 2,
                                            lr = 1e-3,
                                            wd = 1e-4,
                                            val_size = None,
                                            early_stopping = False,
                                            verbose = False)

            preds = cpd_s(testing_sparse_tensor.indices().t()).detach().cpu().numpy()
            r2 = r2_score(tY_test, preds)
            mae = abs(preds - tY_test).mean()
            r_mse = (abs((preds - tY_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - tY_test) / tY_test).mean()
            smape = abs( 2 * ((preds - tY_test) / (abs(tY_test) + abs(preds)))).mean()

            ood_preds = cpd_s(tX_test_ood).detach().cpu().numpy()
            ood_mae = abs(ood_preds - tY_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, tY_test])
            del cpd_s, preds, r2, mae       

            # ________________________________________________________________________________________________________________________

            ent = EnT(tensor_shape = tensor_shape,
                      rank = 5 if dataset == 'lattice' else 32,
                      activation = 'relu',
                      dropout = [0.2, 0],
                      hidden_channels = 7,
                      n_decompositions = 10)

            ent.train_model(training_sparse_tensor.indices().t(),
                            training_sparse_tensor.values(),
                            batch_size = 32,
                            validation_portion = 0,
                            early_stopping = 0,
                            n_epochs = 1000, 
                            lr = 5e-3, 
                            wd = 5e-3,
                            verbose = False)

            preds = ent(testing_sparse_tensor.indices().t()).detach().cpu().numpy()
            r2 = r2_score(tY_test, preds)
            mae = abs(preds - tY_test).mean()
            r_mse = (abs((preds - tY_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - tY_test) / tY_test).mean()
            smape = abs( 2 * ((preds - tY_test) / (abs(tY_test) + abs(preds)))).mean()

            ood_preds = ent(tX_test_ood).detach().cpu().numpy()
            ood_mae = abs(ood_preds - tY_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, tY_test])
            del ent, preds, r2, mae

            # ________________________________________________________________________________________________________________________

            neat = train_tensor_completion(model_type = 'NeAT',
                                        rank = 32,
                                        sparse_tensor = training_sparse_tensor,
                                        num_epochs = 500,
                                        batch_size = 64,
                                        loss_p = 2,
                                        lr = 1e-3,
                                        wd = 1e-4,
                                        NeAT_hidden_dim = 32,
                                        NeAT_drop = 0.1,
                                        NeAT_drop2 = 0.5,
                                        val_size = None,
                                        early_stopping = False,
                                        verbose = False)

            preds = neat(testing_sparse_tensor.indices().t()).detach().cpu().numpy()
            r2 = r2_score(tY_test, preds)
            mae = abs(preds - tY_test).mean()
            r_mse = (abs((preds - tY_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - tY_test) / tY_test).mean()
            smape = abs( 2 * ((preds - tY_test) / (abs(tY_test) + abs(preds)))).mean()

            ood_preds = neat(tX_test_ood).detach().cpu().numpy()
            ood_mae = abs(ood_preds - tY_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, tY_test])
            del neat, preds, r2, mae

            # ________________________________________________________________________________________________________________________

            xgb = XGBRegressor(n_estimators = 100,
                               max_depth = None,
                               random_state = random_state)

            xgb.fit(X_train, Y_train)
            preds = xgb.predict(X_test)
            r2 = r2_score(Y_test, preds)
            mae = abs(preds - Y_test).mean()
            r_mse = (abs((preds - Y_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - Y_test) / Y_test).mean()
            smape = abs( 2 * ((preds - Y_test) / (abs(Y_test) + abs(preds)))).mean()
            
            ood_preds = xgb.predict(X_test_ood)
            ood_mae = abs(ood_preds - Y_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, Y_test])
            del xgb, preds, r2, mae

            # ________________________________________________________________________________________________________________________

            catb = CatBoostRegressor(iterations = 500,
                                     learning_rate = 5e-2,
                                     depth = 16,
                                     loss_function = 'RMSE',
                                     random_state = random_state,
                                     verbose = False)

            catb.fit(X_train, Y_train)
            preds = catb.predict(X_test)
            r2 = r2_score(Y_test, preds)
            mae = abs(preds - Y_test).mean()
            r_mse = (abs((preds - Y_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - Y_test) / Y_test).mean()
            smape = abs( 2 * ((preds - Y_test) / (abs(Y_test) + abs(preds)))).mean()

            ood_preds = catb.predict(X_test_ood)
            ood_mae = abs(ood_preds - Y_test_ood).mean()            

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, Y_test])
            del catb, preds, r2, mae     

            # ________________________________________________________________________________________________________________________

            mlp = MLPRegressor(hidden_layer_sizes = (32),
                               activation = 'relu',
                               learning_rate = 'constant',
                               solver = 'adam',
                               validation_fraction = 0,
                               early_stopping = False,
                               random_state = random_state)

            mlp.fit(X_train, Y_train)
            preds = mlp.predict(X_test)
            r2 = r2_score(Y_test, preds)
            mae = abs(preds - Y_test).mean()
            r_mse = (abs((preds - Y_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - Y_test) / Y_test).mean()
            smape = abs( 2 * ((preds - Y_test) / (abs(Y_test) + abs(preds)))).mean()

            ood_preds = mlp.predict(X_test_ood)
            ood_mae = abs(ood_preds - Y_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, Y_test])
            del mlp, preds, r2, mae     

            # ________________________________________________________________________________________________________________________

            kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(1e-3, (1e-5, 1e1))
            gp = GaussianProcessRegressor(kernel = kernel, 
                                          alpha = 1e-6, 
                                          n_restarts_optimizer = 3, 
                                          random_state = random_state)

            gp.fit(X_train, Y_train)
            preds = gp.predict(X_test)
            r2 = r2_score(Y_test, preds)
            mae = abs(preds - Y_test).mean()
            r_mse = (abs((preds - Y_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - Y_test) / Y_test).mean()
            smape = abs( 2 * ((preds - Y_test) / (abs(Y_test) + abs(preds)))).mean()

            ood_preds = gp.predict(X_test_ood)
            ood_mae = abs(ood_preds - Y_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, Y_test])
            del gp, preds, r2, mae

            # ________________________________________________________________________________________________________________________

            rf = RandomForestRegressor(n_estimators = 100, 
                                       max_depth = None, 
                                       random_state = random_state)

            rf.fit(X_train, Y_train)
            preds = rf.predict(X_test)
            r2 = r2_score(Y_test, preds)
            mae = abs(preds - Y_test).mean()
            r_mse = (abs((preds - Y_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - Y_test) / Y_test).mean()
            smape = abs( 2 * ((preds - Y_test) / (abs(Y_test) + abs(preds)))).mean()

            ood_preds = rf.predict(X_test_ood)
            ood_mae = abs(ood_preds - Y_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, Y_test])
            del rf, preds, r2, mae

            # ________________________________________________________________________________________________________________________

            svm = SVR(kernel = 'rbf', degree = 3, tol = 1e-4)

            svm.fit(X_train, Y_train)
            preds = svm.predict(X_test)
            r2 = r2_score(Y_test, preds)
            mae = abs(preds - Y_test).mean()
            r_mse = (abs((preds - Y_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - Y_test) / Y_test).mean()
            smape = abs( 2 * ((preds - Y_test) / (abs(Y_test) + abs(preds)))).mean()

            ood_preds = svm.predict(X_test_ood)
            ood_mae = abs(ood_preds - Y_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, Y_test])
            del svm, preds, r2, mae

            # ________________________________________________________________________________________________________________________

            lm = LinearRegression()

            lm.fit(X_train, Y_train)
            preds = lm.predict(X_test)
            r2 = r2_score(Y_test, preds)
            mae = abs(preds - Y_test).mean()
            r_mse = (abs((preds - Y_test) ** 2).mean()) ** (1/2)
            mape = abs((preds - Y_test) / Y_test).mean()
            smape = abs( 2 * ((preds - Y_test) / (abs(Y_test) + abs(preds)))).mean()

            ood_preds = lm.predict(X_test_ood)
            ood_mae = abs(ood_preds - Y_test_ood).mean()

            metric_list.append([r2, mae, r_mse, mape, smape, ood_mae])
            pred_list.append([preds, Y_test])
            del lm, preds, r2, mae

            # ________________________________________________________________________________________________________________________

            it_list.append(metric_list)
            pred_it_list.append(pred_list)

            if it == len(train_test_splits)-1: print(it+1, '\n')
            else: print(it+1, end = ", ")

        all_results.append(it_list)
        all_preds.append(pred_it_list)

    all_results = np.stack(all_results)
    print("All done!")

    return all_results, all_preds

### Display Progress

In [8]:
all_results, all_preds = get_results()

Dataset: cogni_spin
Iteration: 1, 2, 3 

All done!


# Results

In [9]:
models_colors = {
    'CPD':'deepskyblue',
    'CPD-S':'orangered',
    'CoSTCo':'limegreen',
    'NeAT':'magenta',
    'XGBoost':'gray',
    'CatBoost':'gray',
    'MLP':'gray',
    'GP':'gray',
    'RF':'gray',
    'SVM':'gray',
    'LR':'gray'
    }

models = list(models_colors.keys())
colors = list(models_colors.values())
pd.DataFrame(all_results[0].mean(axis = 0).round(3),
             columns = ['R²', 'MAE', 'RMSE', 'MAPE', 'sMAPE', 'OOD MAE'],
             index = models)

Unnamed: 0,R²,MAE,RMSE,MAPE,sMAPE,OOD MAE
CPD,-0.283,0.114,0.263,2.727,0.94,0.122
CPD-S,-0.238,0.107,0.257,2.067,0.641,0.116
CoSTCo,0.166,0.094,0.22,2.277,0.645,0.102
NeAT,-0.514,0.115,0.232,6.835,0.798,0.125
XGBoost,-0.003,0.104,0.244,2.445,0.585,0.115
CatBoost,0.375,0.089,0.206,2.286,0.57,0.097
MLP,-0.804,0.139,0.265,5.971,0.963,0.149
GP,0.011,0.106,0.234,2.527,0.856,0.116
RF,0.077,0.103,0.225,2.547,0.593,0.113
SVM,-0.364,0.128,0.235,6.011,0.872,0.136


In [10]:
# dataset_i = 2
# it_i = 0
# len(all_preds[dataset_i])

In [11]:
metric_order = ['R²', 'MAE', 'OOD MAE', 'MAPE']
model_order = ['LR', 'SVM', 'RF', 'XGBoost', 'CatBoost', 'GP', 'MLP', 
               'CPD', 'CPD-S', 'NeAT', 'CoSTCo']

mega_df = list()
for dataset_results in all_results:

    latex_df = list()
    for metr_c in range(dataset_results.shape[2]):
        if metr_c == 0: 
            arr = dataset_results[:, :, metr_c].mean(axis = 0)
            u = np.unique(arr)
            best, best2 = u[-1], u[-2]
        else: 
            arr = dataset_results[:, :, metr_c].mean(axis = 0)
            u = np.unique(arr)
            best, best2 = u[0], u[1]

        latex_df_row = list()
        for model_r in range(dataset_results.shape[1]):

            entry_mean = dataset_results[:, model_r, metr_c].mean()
            entry_std = dataset_results[:, model_r, metr_c].std()

            if entry_mean.round(2) == best.round(2): entry_str = "\\textbf{" + f"{entry_mean:.2f} ± {entry_std:.1f}" + "}"
            elif entry_mean.round(2) == best2.round(2): entry_str = "\\underline{" + f"{entry_mean:.2f} ± {entry_std:.1f}" + "}"
            else: entry_str = f"{entry_mean:.2f} ± {entry_std:.1f}"

            latex_df_row.append(entry_str)
        latex_df.append(latex_df_row)

    metric_names = ['R²', 'MAE', 'RMSE', 'MAPE', 'sMAPE', 'OOD MAE']
    latex_df = pd.DataFrame(np.stack(latex_df).T,
                            index = models,
                            columns = metric_names)

    latex_df = latex_df[metric_order]

    mega_df.append(latex_df)

mega_df = pd.concat(mega_df, axis = 1)
mega_df = mega_df.loc[model_order]

In [12]:
latex_str = mega_df.to_latex()
latex_str = '\n'.join(['\\begin{tabular}{r|cccc|cccc|cccc}'] + 
                      latex_str.split('\n')[1:2] + 
                      ['\multicolumn{1}{c}{} & \multicolumn{4}{c}{Lattice Dataset} & \multicolumn{4}{c}{Crossed Barrel Dataset} & \multicolumn{4}{c}{Cogni-e-Spin Dataset} \\'] +
                      latex_str.split('\n')[2:11] + 
                      ['\\midrule'] + latex_str.split('\n')[11:])
print(latex_str)

\begin{tabular}{r|cccc|cccc|cccc}
\toprule
\multicolumn{1}{c}{} & \multicolumn{4}{c}{Lattice Dataset} & \multicolumn{4}{c}{Crossed Barrel Dataset} & \multicolumn{4}{c}{Cogni-e-Spin Dataset} \
 & R² & MAE & OOD MAE & MAPE \\
\midrule
LR & -0.27 ± 0.7 & 0.12 ± 0.1 & 0.13 ± 0.1 & 4.08 ± 2.2 \\
SVM & -0.36 ± 0.8 & 0.13 ± 0.1 & 0.14 ± 0.1 & 6.01 ± 4.2 \\
RF & 0.08 ± 0.2 & 0.10 ± 0.1 & 0.11 ± 0.1 & 2.55 ± 1.3 \\
XGBoost & -0.00 ± 0.2 & 0.10 ± 0.1 & 0.12 ± 0.1 & 2.45 ± 0.6 \\
CatBoost & \textbf{0.37 ± 0.1} & \textbf{0.09 ± 0.1} & \textbf{0.10 ± 0.1} & 2.29 ± 1.3 \\
GP & 0.01 ± 0.4 & 0.11 ± 0.1 & 0.12 ± 0.1 & 2.53 ± 2.0 \\
MLP & -0.80 ± 1.1 & 0.14 ± 0.1 & 0.15 ± 0.1 & 5.97 ± 3.5 \\
\midrule
CPD & -0.28 ± 0.3 & 0.11 ± 0.1 & 0.12 ± 0.1 & 2.73 ± 1.5 \\
CPD-S & -0.24 ± 0.3 & 0.11 ± 0.1 & 0.12 ± 0.1 & \textbf{2.07 ± 1.0} \\
NeAT & -0.51 ± 1.2 & 0.12 ± 0.1 & 0.13 ± 0.1 & 6.84 ± 6.2 \\
CoSTCo & \underline{0.17 ± 0.1} & \textbf{0.09 ± 0.1} & \textbf{0.10 ± 0.1} & \underline{2.28 ± 1.1} \\
\bottomrule


In [13]:
def parity_plot(dataset_i, model_i, it_i = 0, title = False, figsize = (10, 6), metrics = ['r2', 'r']):

    preds  = all_preds[dataset_i][it_i][model_i][0]
    Y_test = all_preds[dataset_i][it_i][model_i][1]

    r2 = r2_score(Y_test, preds)
    r, pv = pearsonr(Y_test, preds)
    mae = abs(Y_test - preds).mean()
    rmse = np.sqrt( ((Y_test - preds) ** 2).mean() )
    mape = abs((Y_test - preds)/Y_test).mean()

    plt.figure(figsize=figsize)
    plt.scatter(Y_test, preds, color = 'limegreen', edgecolors = 'black')

    min_val = min(Y_test.min(), preds.min())
    max_val = max(Y_test.max(), preds.max())
    plt.plot([min_val, max_val], [min_val, max_val], color = 'black', linestyle="-", lw = 2, zorder = 0)

    plt.xlabel("True values", fontsize = 16)
    plt.ylabel("Predicted values", fontsize = 16)
    
    if title: plt.title(f"Dataset: {datasets[dataset_i]} | Model: {models[model_i]} | Iteration: {it_i}", fontsize = 20)

    textstr = (
        f"R² = {r2:.3f}\nr = {r:.3f}"
    )
    
    textstr = ""
    if 'r2' in metrics: textstr += f"R² = {r2:.3f}\n"
    if 'r' in metrics: textstr += f"r = {r:.3f}\n"
    if 'mae' in metrics: textstr += f"MAE = {mae:.3f}\n"
    if 'rmse' in metrics: textstr += f"RMSE = {rmse:.3f}\n"
    if 'mape' in metrics: textstr += f"MAPE = {mape:.3f}\n"
    textstr = textstr[:-1]


    plt.text(
        0.21 + (0.03 * ('mape' in metrics or 'rmse' in metrics)), 0.96, textstr,
        transform=plt.gca().transAxes,
        fontsize=20,
        verticalalignment="top",
        horizontalalignment="right",
        bbox=dict(
            boxstyle="round,pad=0.3",
            facecolor="white",
            edgecolor="black",
        )
    )

    plt.tight_layout()
    plt.show()

In [14]:
models

['CPD',
 'CPD-S',
 'CoSTCo',
 'NeAT',
 'XGBoost',
 'CatBoost',
 'MLP',
 'GP',
 'RF',
 'SVM',
 'LR']

In [15]:
it_i = 6
dataset_i = 0
figsize = (10, 6)
metrics = ['r2', 'mae', 'rmse', 'mape']

parity_plot(dataset_i, model_i = 2, it_i = it_i, title = True, figsize = figsize, metrics = metrics)
parity_plot(dataset_i, model_i = -3, it_i = it_i, title = True, figsize = figsize, metrics = metrics)

IndexError: list index out of range