In [1]:
import numpy as np
import networkx as nx
import pandas as pd

In [2]:
import os

In [3]:
import pickle as pkl

In [4]:
from utils import *



In [5]:
import glob

In [6]:
# Datasets looped over by every processing cell below.
DATASETS = [
    'contact-high-school',
    'contact-primary-school',
]

# Tag interpolated into the embedding and score file names.
SG = 'cbow'

# Hasse-diagram variants iterated as HASSE_TYPE; each one is part of a file name.
HASSE_LIST = [
    'uniform',
    'counts',
    'NObias',
    'LOexp',
]

# Prefix prepended to every input/output path built in this notebook.
WORK_FOLDER = './'

In [7]:
# Baseline metric names; each one is interpolated into a score file name.
# Order matters: the reconstruction report reads `metrics[:-1]`, i.e. it
# leaves out the last entry.
metrics = [
    "harm_mean",
    "geom_mean",
    "arith_mean",
    "adamic_adar",
    "simplex_PA",
    "WPKatz",
    "WPPR",
    "logreg_supervised",
]

# Construct reconstruction/prediction test sets

In [8]:
# For every dataset, label the candidate open triangles of both tasks and keep
# only the triples whose three nodes lie in the largest connected component of
# the graph built from the 2-node training simplices.
for DATASET in DATASETS:

    load_path = WORK_FOLDER + '3way-metrics-data/'
    hyperedges_path = WORK_FOLDER + 'processed-output/hyperedges/%s/'%(DATASET)
    save_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)
    os.makedirs(save_path, exist_ok=True)

    # Load simplices data and project the 2-node simplices onto a graph
    _, _, data_train, _ = make_train_test_data(DATASET)
    proj_g = nx.Graph([tuple(s) for s in data_train if len(s)==2])
    components = sorted(nx.connected_components(proj_g), key=len, reverse=True)
    nodes_train = components[0]

    # ---------------- reconstruction set ----------------
    open_train = np.load(load_path + '%s-open-tris-0-80.npz'% DATASET)
    # the first three columns hold the node ids of each triple
    triangles_train = open_train[:,:3].astype(str)

    # row indices of the triples fully contained in the largest component
    idx_train = [i for i, triangle in enumerate(triangles_train)
                 if nodes_train.issuperset(triangle)]

    positive_file = hyperedges_path+'%s_pos_%s_%dstring.npz'%('reconstruction', 'all', 3)
    positive_ex = set(np.load(positive_file, allow_pickle=True)['arr_0'])

    # a triple is looked up through the comma-joined, integer-sorted node ids
    keys_train = [','.join(str(v) for v in sorted(int(u) for u in tris))
                  for tris in triangles_train[idx_train]]
    y_train = np.array([key in positive_ex for key in keys_train]).astype(int)

    # the last column receives the 0/1 label before the kept rows are saved
    open_train[idx_train, -1] = y_train
    np.savez_compressed(save_path+'open-tris-0-80.npz', open_train[idx_train])

    # ---------------- prediction set ----------------
    open_test = np.load(load_path + '%s-open-tris-80-100.npz'% DATASET)
    triangles_test = open_test[:,:3].astype(str)

    # row indices of the triples fully contained in the largest component
    idx_test = [i for i, triangle in enumerate(triangles_test)
                if nodes_train.issuperset(triangle)]

    positive_file = hyperedges_path+'%s_pos_%s_%dstring.npz'%('prediction', 'all', 3)
    bounds_file = hyperedges_path+'%s_pos_%s_%dbounds.npz'%('prediction', 'all', 3)
    positive_ex = np.load(positive_file, allow_pickle=True)['arr_0']
    positive_bounds = np.load(bounds_file, allow_pickle=True)['arr_0']

    # only the positives whose `bounds` entry equals 3 are used as labels
    positive_ex = set(positive_ex[positive_bounds==3])

    keys_test = [','.join(str(v) for v in sorted(int(u) for u in tris))
                 for tris in triangles_test[idx_test]]
    y_test = np.array([key in positive_ex for key in keys_test]).astype(int)

    open_test[idx_test, -1] = y_test
    np.savez_compressed(save_path+'open-tris-80-100.npz', open_test[idx_test])

# Collect metrics from Benson et al., ref [9] of the paper

In [9]:
# Restrict the pre-computed baseline scores to the triples whose three nodes
# lie in the largest connected component of the training graph, and store them
# next to the labelled tables.
for DATASET in DATASETS:
    
    save_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)
    os.makedirs(save_path, exist_ok=True)

    load_path = WORK_FOLDER + '3way-metrics-data/'

    #Load Simplices Data
    _, _, data_train, _ = make_train_test_data(DATASET)
    proj_g = nx.Graph([tuple(s) for s in data_train if len(s)==2])
    nodes_train = sorted(nx.connected_components(proj_g), key=len, reverse=True)[0]

    ##############
    
    #reconstruction metrics
    open_train = np.load(load_path + '%s-open-tris-0-80.npz'% DATASET)
    triangles_train = open_train[:,:3].astype(str)
    
    #keep triples in the largest component
    idx_train = [i for i, triangle in enumerate(triangles_train) if set(triangle).issubset(nodes_train)]
    
    for m in metrics:
        scores_file = load_path+'%s-open-tris-scores-0-80-%s.npz'%(DATASET, m)
        try:
            y_pred = np.load(scores_file)
        except FileNotFoundError:
            # Best effort: a reconstruction score file may be absent (the
            # reconstruction report only reads metrics[:-1]). Say which one
            # was skipped instead of dropping it silently.
            print('Skipping missing reconstruction scores: %s' % scores_file)
            continue
        # Saving sits outside the try so that a failure to write is not swallowed.
        np.savez_compressed(save_path+'open-tris-%s-0-80.npz'%m, y_pred[idx_train])

    ##############
    
    #prediction metrics
    open_test = np.load(load_path + '%s-open-tris-80-100.npz'% DATASET)
    triangles_test = open_test[:,:3].astype(str)
    
    #keep triples in the largest component
    idx_test = [i for i, triangle in enumerate(triangles_test) if set(triangle).issubset(nodes_train)]

    # Every metric is required here: the prediction report reads all of `metrics`.
    for m in metrics:
        y_pred = np.load(load_path+'%s-open-tris-scores-80-100-%s.npz'%(DATASET, m))
        np.savez_compressed(save_path+'open-tris-%s-80-100.npz'%m, y_pred[idx_test])
        
    ##############

### Save Reconstruction Scores 

In [10]:
# Values interpolated into the PARAMS folder name ('dim.._n.._p.._walklen..')
# and, for SEED, into the embedding / score file names.
P = 1.0
N = 10
WALKLEN = 80
SEED = 0

In [11]:
# Embedding dimensions to evaluate: the powers of two from 8 up to 1024.
embdim_list = [2 ** exponent for exponent in range(3, 11)]

In [12]:
# Score every triple of the reconstruction table with the stored embeddings.
# For each (Hasse type, max order, embedding dimension) two score files are
# written: one built from node embeddings ("0simplex") and one built from edge
# embeddings ("1simplex").
for DATASET in DATASETS:

    load_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)

    # NOTE(review): nodes_train is computed here but never read in this cell.
    _, _, data_train, _ = make_train_test_data(DATASET)
    proj_g = nx.Graph([tuple(s) for s in data_train if len(s)==2])
    nodes_train = sorted(nx.connected_components(proj_g), key=len, reverse=True)[0]

    open_test = np.load(load_path + 'open-tris-0-80.npz')['arr_0']
    triangles_test = open_test[:,:3].astype(str)
    y_test = open_test[:,-1]

    # the label column must contain more than one distinct value
    assert(np.unique(y_test).shape[0]>1)
    
    for HASSE_TYPE in HASSE_LIST:

        # NOTE(review): MAX_ORDER and combinations are not defined in the cells
        # visible here; presumably they come from `from utils import *` - confirm.
        for max_order in range(1, MAX_ORDER+1):
            for EMBDIM in embdim_list:

                PARAMS = '%s_%s_%s_%s' %\
                                ( 'dim'+str(EMBDIM), 'n'+str(N), 'p'+str(P), 'walklen'+str(WALKLEN))

                load_path = WORK_FOLDER + 'processed-output/embeddings/%s/%s/'%(DATASET, PARAMS)

                save_path = WORK_FOLDER + 'processed-output/tables/%s/%s/'%(DATASET, PARAMS)

                # Skip settings without embeddings BEFORE creating the output
                # folder: the report cell globs every 'dim*' folder under
                # tables/ and loads score files from each, so an empty folder
                # left behind here would make it fail.
                if not os.path.isdir(load_path):
                    continue

                os.makedirs(save_path, exist_ok=True)

                #Load Embeddings
                # pickle can execute arbitrary code: only load files produced by this pipeline.
                with open(load_path+'s2vembs_%s_%s_maxorder%s.%s.pkl'%\
                            (SG, HASSE_TYPE, max_order, SEED), 'rb') as fh:
                    model_wv = pkl.load(fh)

                #node embedding
                # per triple: mean over the node pairs of the element-wise
                # product of their vectors, then summed over the dimensions
                X_test = np.array([np.mean([model_wv[h]*model_wv[k] for h,k in combinations(a,2)], axis=0)
                                   for a in triangles_test])
                y_pred = X_test.sum(axis=1)
                np.savez_compressed(save_path+'open-tris-hadamard-sim-0simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                        (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

                #edge embedding
                # same score, but over pairs of edges; an edge is looked up by
                # its comma-joined, integer-sorted node ids
                X_test = np.array([np.mean([model_wv[h]*model_wv[k] for h,k in
                                            combinations([','.join(map(str, sorted(map(int, edge))))
                                                          for edge in combinations(a,2)], 2)], axis=0)
                                   for a in triangles_test])
                y_pred = X_test.sum(axis=1)
                np.savez_compressed(save_path+'open-tris-hadamard-sim-1simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                        (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

### Reconstruction Search and Plot

In [13]:
# Print, for every dataset, the baseline scores on the reconstruction table and
# the best-scoring embedding dimension for each (Hasse type, max order).
for DATASET in DATASETS:
    
    load_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)

    y_test = np.load(load_path+'open-tris-0-80.npz')['arr_0'][:,-1]
    # sum of the label column divided by its length; not read again in this cell
    random_baseline = y_test.sum()/len(y_test)
    
    print(DATASET)
    print()

    # the last entry of `metrics` is left out for the reconstruction table
    scores_baseline = []
    for m in metrics[:-1]:
        y_pred = np.load(load_path+'open-tris-%s-0-80.npz'%m)['arr_0']
        scores_baseline.append((classification_score_from_y(y_test, y_pred), m))

    for s,m in scores_baseline:
        print(m + ' = ', round(s[0]*100,1) , u"\u00B1", round(s[1]*100, 1) )
    print()

    # one folder per parameter setting, named 'dim<EMBDIM>_...'
    params_folders = glob.glob(load_path+'dim*')
    
    for HASSE_TYPE in HASSE_LIST:

        # NOTE(review): this range stops at MAX_ORDER-1, whereas the cell that
        # saves the scores iterates up to MAX_ORDER - confirm the top order is
        # skipped on purpose. MAX_ORDER is not defined in the visible cells;
        # presumably it comes from `from utils import *`.
        for max_order in range(1, MAX_ORDER):

            print('H'+str(max_order)+':', HASSE_TYPE)

            scores_simplex0 = []
            scores_simplex1 = []
            for f in params_folders:
                # basename instead of splitting on '/', so the folder name is
                # also extracted when glob returns OS-specific separators
                PARAMS = os.path.basename(f)
                EMBDIM = PARAMS.split('_')[0].replace('dim', '')

                # SEED (not a literal 0) so the names match the files written
                # by the score-saving cell
                y_pred = np.load(f + '/open-tris-hadamard-sim-0simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, SEED))['arr_0']
                scores_simplex0.append((classification_score_from_y(y_test, y_pred), EMBDIM+'dims'))

                y_pred = np.load(f + '/open-tris-hadamard-sim-1simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, SEED))['arr_0']
                scores_simplex1.append((classification_score_from_y(y_test, y_pred), EMBDIM+'dims'))

            # descending sort on (score, label) tuples: entry 0 is the best setting
            scores_simplex0 = sorted(scores_simplex0, reverse=True)
            scores_simplex1 = sorted(scores_simplex1, reverse=True)
            print('s0 = ', round(scores_simplex0[0][0][0]*100,1), u"\u00B1", round(scores_simplex0[0][0][1]*100,1), '('+scores_simplex0[0][1]+')') 
            print('s1 = ',round(scores_simplex1[0][0][0]*100,1), u"\u00B1", round(scores_simplex1[0][0][1]*100,1), '('+scores_simplex1[0][1]+')') 
        print()

contact-high-school

harm_mean =  85.5 ± 1.5
geom_mean =  85.8 ± 1.1
arith_mean =  78.8 ± 1.1
adamic_adar =  56.5 ± 1.4
simplex_PA =  58.3 ± 1.4
WPKatz =  78.6 ± 1.1
WPPR =  76.9 ± 1.4

H1: uniform
s0 =  56.5 ± 1.9 (64dims)
s1 =  52.3 ± 1.6 (8dims)
H2: uniform
s0 =  55.2 ± 2.1 (8dims)
s1 =  99.5 ± 0.1 (512dims)

H1: counts
s0 =  79.8 ± 1.1 (128dims)
s1 =  84.4 ± 1.0 (8dims)
H2: counts
s0 =  56.6 ± 1.1 (128dims)
s1 =  91.3 ± 0.9 (128dims)

H1: NObias
s0 =  79.5 ± 1.0 (128dims)
s1 =  84.4 ± 0.9 (8dims)
H2: NObias
s0 =  73.0 ± 1.1 (512dims)
s1 =  89.1 ± 0.7 (256dims)

H1: LOexp
s0 =  81.7 ± 2.2 (16dims)
s1 =  89.5 ± 0.9 (8dims)
H2: LOexp
s0 =  84.4 ± 1.6 (16dims)
s1 =  91.9 ± 0.8 (8dims)

contact-primary-school

harm_mean =  88.2 ± 0.7
geom_mean =  88.9 ± 0.6
arith_mean =  83.9 ± 0.5
adamic_adar =  63.6 ± 0.9
simplex_PA =  51.6 ± 0.6
WPKatz =  83.9 ± 0.5
WPPR =  83.5 ± 0.4

H1: uniform
s0 =  64.9 ± 0.6 (64dims)
s1 =  58.0 ± 0.5 (8dims)
H2: uniform
s0 =  64.4 ± 0.6 (8dims)
s1 =  99.5 ± 0.1

### Save Prediction Scores

In [14]:
# Same values as the reconstruction section; they are interpolated into the
# PARAMS folder name and, for SEED, into the embedding / score file names.
P = 1.0
N = 10
WALKLEN = 80
SEED = 0

In [15]:
# Embedding dimensions to evaluate: the powers of two from 8 up to 1024.
embdim_list = [2 ** exponent for exponent in range(3, 11)]

In [16]:
# Score every triple of the prediction table with the stored embeddings.
# For each (Hasse type, max order, embedding dimension) two score files are
# written: one built from node embeddings ("0simplex") and one built from edge
# embeddings ("1simplex").
for DATASET in DATASETS:

    load_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)

    # NOTE(review): nodes_train is computed here but never read in this cell.
    _, _, data_train, _ = make_train_test_data(DATASET)
    proj_g = nx.Graph([tuple(s) for s in data_train if len(s)==2])
    nodes_train = sorted(nx.connected_components(proj_g), key=len, reverse=True)[0]

    open_test = np.load(load_path + 'open-tris-80-100.npz')['arr_0']
    triangles_test = open_test[:,:3].astype(str)
    y_test = open_test[:,-1]

    # the label column must contain more than one distinct value
    assert(np.unique(y_test).shape[0]>1)
    
    for HASSE_TYPE in HASSE_LIST:
        
        # NOTE(review): MAX_ORDER and combinations are not defined in the cells
        # visible here; presumably they come from `from utils import *` - confirm.
        for max_order in range(1, MAX_ORDER+1):
            for EMBDIM in embdim_list:

                PARAMS = '%s_%s_%s_%s' %\
                                ( 'dim'+str(EMBDIM), 'n'+str(N), 'p'+str(P), 'walklen'+str(WALKLEN))

                load_path = WORK_FOLDER + 'processed-output/embeddings/%s/%s/'%(DATASET, PARAMS)

                save_path = WORK_FOLDER + 'processed-output/tables/%s/%s/'%(DATASET, PARAMS)

                # Skip settings without embeddings BEFORE creating the output
                # folder: the report cell globs every 'dim*' folder under
                # tables/ and loads score files from each, so an empty folder
                # left behind here would make it fail.
                if not os.path.isdir(load_path):
                    continue

                os.makedirs(save_path, exist_ok=True)

                #Load Embeddings
                # pickle can execute arbitrary code: only load files produced by this pipeline.
                with open(load_path+'s2vembs_%s_%s_maxorder%s.%s.pkl'%\
                            (SG, HASSE_TYPE, max_order, SEED), 'rb') as fh:
                    model_wv = pkl.load(fh)

                #node embedding
                # per triple: mean over the node pairs of the element-wise
                # product of their vectors, then summed over the dimensions
                X_test = np.array([np.mean([model_wv[h]*model_wv[k] for h,k in combinations(a,2)], axis=0)
                                   for a in triangles_test])
                y_pred = X_test.sum(axis=1)
                np.savez_compressed(save_path+'open-tris-hadamard-sim-0simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                        (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

                #edge embedding
                # same score, but over pairs of edges; an edge is looked up by
                # its comma-joined, integer-sorted node ids
                X_test = np.array([np.mean([model_wv[h]*model_wv[k] for h,k in
                                            combinations([','.join(map(str, sorted(map(int, edge))))
                                                          for edge in combinations(a,2)], 2)], axis=0)
                                   for a in triangles_test])
                y_pred = X_test.sum(axis=1)
                np.savez_compressed(save_path+'open-tris-hadamard-sim-1simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                        (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

### Prediction Search and Plot

In [17]:
# Print, for every dataset, the baseline scores on the prediction table and
# the best-scoring embedding dimension for each (Hasse type, max order).
for DATASET in DATASETS:

    load_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)

    y_test = np.load(load_path+'open-tris-80-100.npz')['arr_0'][:,-1]
    # sum of the label column divided by its length; not read again in this cell
    random_baseline = y_test.sum()/len(y_test)
    
    print(DATASET)
    print()

    # every entry of `metrics` is reported for the prediction table
    scores_baseline = []
    for m in metrics:
        y_pred = np.load(load_path+'open-tris-%s-80-100.npz'%m)['arr_0']
        scores_baseline.append((classification_score_from_y(y_test, y_pred), m))

    for s,m in scores_baseline:
        print(m + ' = ', round(s[0]*100,1) , u"\u00B1", round(s[1]*100, 1) )
    print()

    # one folder per parameter setting, named 'dim<EMBDIM>_...'
    params_folders = glob.glob(load_path+'dim*')
    
    for HASSE_TYPE in HASSE_LIST:

        # NOTE(review): this range stops at MAX_ORDER-1, whereas the cell that
        # saves the scores iterates up to MAX_ORDER - confirm the top order is
        # skipped on purpose. MAX_ORDER is not defined in the visible cells;
        # presumably it comes from `from utils import *`.
        for max_order in range(1, MAX_ORDER):

            print('H'+str(max_order)+':', HASSE_TYPE)

            scores_simplex0 = []
            scores_simplex1 = []
            for f in params_folders:
                # basename instead of splitting on '/', so the folder name is
                # also extracted when glob returns OS-specific separators
                PARAMS = os.path.basename(f)
                EMBDIM = PARAMS.split('_')[0].replace('dim', '')

                # SEED (not a literal 0) so the names match the files written
                # by the score-saving cell
                y_pred = np.load(f + '/open-tris-hadamard-sim-0simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, SEED))['arr_0']
                scores_simplex0.append((classification_score_from_y(y_test, y_pred), EMBDIM+'dims'))

                y_pred = np.load(f + '/open-tris-hadamard-sim-1simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, SEED))['arr_0']
                scores_simplex1.append((classification_score_from_y(y_test, y_pred), EMBDIM+'dims'))

            # descending sort on (score, label) tuples: entry 0 is the best setting
            scores_simplex0 = sorted(scores_simplex0, reverse=True)
            scores_simplex1 = sorted(scores_simplex1, reverse=True)
            print('s0 = ', round(scores_simplex0[0][0][0]*100,1), u"\u00B1", round(scores_simplex0[0][0][1]*100,1), '('+scores_simplex0[0][1]+')') 
            print('s1 = ',round(scores_simplex1[0][0][0]*100,1), u"\u00B1", round(scores_simplex1[0][0][1]*100,1), '('+scores_simplex1[0][1]+')') 
        print()

contact-high-school

harm_mean =  71.4 ± 4.3
geom_mean =  73.1 ± 3.8
arith_mean =  69.3 ± 3.6
adamic_adar =  64.8 ± 5.6
simplex_PA =  54.2 ± 6.0
WPKatz =  69.3 ± 3.7
WPPR =  69.8 ± 3.9
logreg_supervised =  68.7 ± 3.1

H1: uniform
s0 =  62.6 ± 5.8 (64dims)
s1 =  49.8 ± 7.4 (8dims)
H2: uniform
s0 =  60.7 ± 3.7 (16dims)
s1 =  68.3 ± 4.7 (16dims)

H1: counts
s0 =  74.8 ± 3.3 (32dims)
s1 =  72.8 ± 3.6 (8dims)
H2: counts
s0 =  65.6 ± 3.5 (256dims)
s1 =  73.1 ± 3.5 (8dims)

H1: NObias
s0 =  74.1 ± 3.4 (32dims)
s1 =  73.1 ± 3.4 (8dims)
H2: NObias
s0 =  70.5 ± 3.4 (64dims)
s1 =  73.4 ± 3.8 (8dims)

H1: LOexp
s0 =  69.9 ± 2.7 (16dims)
s1 =  65.1 ± 5.1 (8dims)
H2: LOexp
s0 =  70.6 ± 3.3 (32dims)
s1 =  65.6 ± 5.0 (8dims)

contact-primary-school

harm_mean =  80.9 ± 1.3
geom_mean =  82.3 ± 1.4
arith_mean =  76.6 ± 1.8
adamic_adar =  65.5 ± 1.4
simplex_PA =  48.8 ± 1.9
WPKatz =  77.5 ± 1.7
WPPR =  79.9 ± 1.4
logreg_supervised =  80.7 ± 1.0

H1: uniform
s0 =  67.9 ± 1.6 (64dims)
s1 =  59.0 ± 1.7 (8di