In [1]:
import numpy as np
import networkx as nx
import pandas as pd

In [2]:
import os

In [3]:
import pickle as pkl

In [4]:
from utils import *



In [5]:
import glob

In [6]:
DATASETS =  ['contact-high-school', 'contact-primary-school']
SG = 'cbow'
HASSE_LIST = ['uniform', 'counts', 'NObias', 'LOexp'] 
WORK_FOLDER = './'

# Construct reconstruction/prediction test sets

In [7]:
for DATASET in DATASETS:
    
    hyperedges_path = WORK_FOLDER + 'processed-output/hyperedges/%s/'%(DATASET)

    save_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)
    os.makedirs(save_path, exist_ok=True)

    ##############
    
    #reconstruction sets
    positive_ex = np.load(hyperedges_path+'%s_pos_%s_%dstring.npz'%\
                            ('reconstruction', 'all', 4),
                                 allow_pickle=True)['arr_0']
    negative_ex = np.load(hyperedges_path+'%s_neg_%s_%dstring.npz'%\
                            ('reconstruction', 'all', 4),
                             allow_pickle=True)['arr_0']
    negative_bounds = np.load(hyperedges_path+'%s_neg_%s_%dbounds.npz'%\
                            ('reconstruction', 'all', 4),
                             allow_pickle=True)['arr_0']

    positive_ex = np.array(list(map(lambda x: x.split(','), positive_ex)))
    negative_ex = np.array(list(map(lambda x: x.split(','), negative_ex[negative_bounds==4])))

    open_train = np.concatenate((positive_ex, negative_ex)).astype(int)

    y_train = np.array([1,]*positive_ex.shape[0] + [0,]*negative_ex.shape[0]) 

    open_train = np.concatenate((open_train, y_train[:, np.newaxis]), axis=1)
    np.savez_compressed(save_path+'open-quadruples-0-80.npz', open_train)

    ##############
    
    #prediction sets
    positive_ex = np.load(hyperedges_path+'%s_pos_%s_%dstring.npz'%\
                            ('prediction', 'all', 4),
                             allow_pickle=True)['arr_0']
    positive_bounds = np.load(hyperedges_path+'%s_pos_%s_%dbounds.npz'%\
                            ('prediction', 'all', 4),
                             allow_pickle=True)['arr_0']
    negative_ex = np.load(hyperedges_path+'%s_neg_%s_%dstring.npz'%\
                            ('prediction', 'all', 4),
                             allow_pickle=True)['arr_0']
    negative_bounds = np.load(hyperedges_path+'%s_neg_%s_%dbounds.npz'%\
                            ('prediction', 'all', 4),
                                 allow_pickle=True)['arr_0']

    positive_ex = np.array(list(map(lambda x: x.split(','), positive_ex[positive_bounds==4])))
    negative_ex = np.array(list(map(lambda x: x.split(','), negative_ex[negative_bounds==4])))

    if positive_ex.shape[0]>0 and negative_ex.shape[0]>0:
        open_test = np.concatenate((positive_ex, negative_ex)).astype(int)
    elif positive_ex.shape[0]==0:
        open_test = negative_ex.astype(int)
    else:
        open_test = positive_ex.astype(int)

    y_test = np.array([1,]*positive_ex.shape[0] + [0,]*negative_ex.shape[0]) 

    open_test = np.concatenate((open_test, y_test[:, np.newaxis]), axis=1)
    np.savez_compressed(save_path+'open-quadruples-80-100.npz', open_test)
    
    #############

### Save Reconstruction Scores

In [8]:
P = 1.
N = 10
WALKLEN = 80
SEED = 0

In [9]:
embdim_list = [8, 16, 32, 64, 128, 256, 512, 1024]

In [10]:
for DATASET in DATASETS:

    load_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)

    _, _, data_train, _ = make_train_test_data(DATASET)
    proj_g = nx.Graph([tuple(s) for s in data_train if len(s)==2])
    nodes_train = sorted(nx.connected_components(proj_g), key=len, reverse=True)[0]

    open_test = np.load(load_path + 'open-quadruples-0-80.npz')['arr_0']
    tetrads_test = open_test[:,:4].astype(str)
    y_test = open_test[:,-1]

    assert(np.unique(y_test).shape[0]>1)
    
    for HASSE_TYPE in HASSE_LIST:

        for max_order in range(1, MAX_ORDER+1):
            for EMBDIM in embdim_list:

                PARAMS = '%s_%s_%s_%s' %\
                                ( 'dim'+str(EMBDIM), 'n'+str(N), 'p'+str(P), 'walklen'+str(WALKLEN))

                load_path = WORK_FOLDER + 'processed-output/embeddings/%s/%s/'%(DATASET, PARAMS)

                save_path = WORK_FOLDER + 'processed-output/tables/%s/%s/'%(DATASET, PARAMS)
                os.makedirs(save_path, exist_ok=True)

                if os.path.isdir(load_path):

                    #Load Embeddings
                    with open(load_path+'s2vembs_%s_%s_maxorder%s.%s.pkl'%\
                                (SG, HASSE_TYPE, max_order, SEED), 'rb') as fh:
                        model_wv = pkl.load(fh)

                    #node embedding
                    tf_arrays = map(lambda a: [(model_wv[h], model_wv[k]) for h,k in combinations(a,2)],
                                            tetrads_test)

                    X_test = np.array(list(map(lambda x: np.mean([a*b for a,b in x], axis=0), tf_arrays)))
                    y_pred = X_test.sum(axis=1)
                    np.savez_compressed(save_path+'open-quadruples-hadamard-sim-0simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                            (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

                    #edge embedding
                    tf_arrays = map(lambda a: [(model_wv[h], model_wv[k]) for h,k in
                            combinations([','.join(map(str, sorted(map(int, edge)))) for edge in combinations(a,2)], 2)],
                            tetrads_test)

                    X_test = np.array(list(map(lambda x: np.mean([a*b for a,b in x], axis=0), tf_arrays)))
                    y_pred = X_test.sum(axis=1)
                    np.savez_compressed(save_path+'open-quadruples-hadamard-sim-1simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                            (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

                    if max_order>1:

                        #triangle embedding
                        tf_arrays = map(lambda a: [(model_wv[h], model_wv[k]) for h,k in
                                combinations([','.join(map(str, sorted(map(int, tris)))) for tris in combinations(a,3)], 2)],
                                tetrads_test)

                        X_test = np.array(list(map(lambda x: np.mean([a*b for a,b in x], axis=0), tf_arrays)))
                        y_pred = X_test.sum(axis=1)
                        np.savez_compressed(save_path+'open-quadruples-hadamard-sim-2simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                            (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

### Reconstruction Search and Plot

In [11]:
for DATASET in DATASETS:
    
    load_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)

    y_test = np.load(load_path+'open-quadruples-0-80.npz')['arr_0'][:,-1]
    random_baseline = y_test.sum()/len(y_test)
    
    print(DATASET)
    
    print()

    params_folders = glob.glob(load_path+'dim*')
    
    for HASSE_TYPE in HASSE_LIST:

        for max_order in range(2, MAX_ORDER+1):

            print('H'+str(max_order)+':', HASSE_TYPE)

            scores_simplex0 = []
            scores_simplex1 = []
            scores_simplex2 = []
            for f in params_folders:
                PARAMS = f.split('/')[-1]
                EMBDIM = PARAMS.split('_')[0].replace('dim', '')

                y_pred = np.load(f + '/open-quadruples-hadamard-sim-0simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, 0))['arr_0']
                scores_simplex0.append((classification_score_from_y4(y_test, y_pred), EMBDIM+'dims'))

                y_pred = np.load(f + '/open-quadruples-hadamard-sim-1simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, 0))['arr_0']
                scores_simplex1.append((classification_score_from_y4(y_test, y_pred), EMBDIM+'dims'))

                if max_order>1:
                    y_pred = np.load(f + '/open-quadruples-hadamard-sim-2simplex-0-80-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, 0))['arr_0']
                    scores_simplex2.append((classification_score_from_y4(y_test, y_pred), EMBDIM+'dims'))

            scores_simplex0 = sorted(scores_simplex0, reverse=True)
            scores_simplex1 = sorted(scores_simplex1, reverse=True)
            scores_simplex2 = sorted(scores_simplex2, reverse=True)
            print('s0 = ', round(scores_simplex0[0][0][0]*100,1), u"\u00B1", round(scores_simplex0[0][0][1]*100,1), '('+scores_simplex0[0][1]+')') 
            print('s1 = ',round(scores_simplex1[0][0][0]*100,1), u"\u00B1", round(scores_simplex1[0][0][1]*100,1), '('+scores_simplex1[0][1]+')') 
            print('s2 = ',round(scores_simplex2[0][0][0]*100,1), u"\u00B1", round(scores_simplex2[0][0][1]*100,1), '('+scores_simplex1[0][1]+')')
        print()

contact-high-school

H2: uniform
s0 =  65.3 ± 3.8 (8dims)
s1 =  51.2 ± 3.8 (8dims)
s2 =  68.1 ± 5.1 (8dims)
H3: uniform
s0 =  67.7 ± 5.7 (16dims)
s1 =  48.2 ± 4.1 (16dims)
s2 =  100.0 ± 0.1 (16dims)

H2: counts
s0 =  64.0 ± 2.7 (8dims)
s1 =  49.0 ± 5.0 (1024dims)
s2 =  60.5 ± 4.4 (1024dims)
H3: counts
s0 =  59.8 ± 3.3 (8dims)
s1 =  47.0 ± 5.0 (16dims)
s2 =  56.0 ± 5.2 (16dims)

H2: NObias
s0 =  53.9 ± 4.4 (8dims)
s1 =  47.7 ± 4.6 (1024dims)
s2 =  58.6 ± 4.3 (1024dims)
H3: NObias
s0 =  57.4 ± 4.4 (8dims)
s1 =  47.1 ± 4.5 (64dims)
s2 =  83.3 ± 3.2 (64dims)

H2: LOexp
s0 =  55.5 ± 7.2 (16dims)
s1 =  64.7 ± 4.4 (8dims)
s2 =  85.0 ± 3.6 (8dims)
H3: LOexp
s0 =  57.7 ± 7.2 (16dims)
s1 =  73.2 ± 3.3 (8dims)
s2 =  90.8 ± 1.9 (8dims)

contact-primary-school

H2: uniform
s0 =  59.4 ± 3.7 (16dims)
s1 =  43.1 ± 3.9 (8dims)
s2 =  72.1 ± 3.1 (8dims)
H3: uniform
s0 =  57.7 ± 3.3 (1024dims)
s1 =  41.7 ± 3.9 (8dims)
s2 =  100.0 ± 0.0 (8dims)

H2: counts
s0 =  51.9 ± 3.9 (8dims)
s1 =  46.8 ± 3.1 (64dims)

### Save Prediction Scores

In [12]:
for DATASET in DATASETS:

    load_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)

    _, _, data_train, _ = make_train_test_data(DATASET)
    proj_g = nx.Graph([tuple(s) for s in data_train if len(s)==2])
    nodes_train = sorted(nx.connected_components(proj_g), key=len, reverse=True)[0]

    open_test = np.load(load_path + 'open-quadruples-80-100.npz')['arr_0']
    tetrads_test = open_test[:,:4].astype(str)
    y_test = open_test[:,-1]

    assert(np.unique(y_test).shape[0]>1)
    
    for HASSE_TYPE in HASSE_LIST:

        for max_order in range(1, MAX_ORDER+1):
            for EMBDIM in embdim_list:

                PARAMS = '%s_%s_%s_%s' %\
                                ( 'dim'+str(EMBDIM), 'n'+str(N), 'p'+str(P), 'walklen'+str(WALKLEN))

                load_path = WORK_FOLDER + 'processed-output/embeddings/%s/%s/'%(DATASET, PARAMS)

                save_path = WORK_FOLDER + 'processed-output/tables/%s/%s/'%(DATASET, PARAMS)
                os.makedirs(save_path, exist_ok=True)

                if os.path.isdir(load_path):

                    #Load Embeddings
                    with open(load_path+'s2vembs_%s_%s_maxorder%s.%s.pkl'%\
                                (SG, HASSE_TYPE, max_order, SEED), 'rb') as fh:
                        model_wv = pkl.load(fh)

                    #node embedding
                    tf_arrays = map(lambda a: [(model_wv[h], model_wv[k]) for h,k in combinations(a,2)],
                                            tetrads_test)

                    X_test = np.array(list(map(lambda x: np.mean([a*b for a,b in x], axis=0), tf_arrays)))
                    y_pred = X_test.sum(axis=1)
                    np.savez_compressed(save_path+'open-quadruples-hadamard-sim-0simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                            (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

                    #edge embedding
                    tf_arrays = map(lambda a: [(model_wv[h], model_wv[k]) for h,k in
                            combinations([','.join(map(str, sorted(map(int, edge)))) for edge in combinations(a,2)], 2)],
                            tetrads_test)

                    X_test = np.array(list(map(lambda x: np.mean([a*b for a,b in x], axis=0), tf_arrays)))
                    y_pred = X_test.sum(axis=1)
                    np.savez_compressed(save_path+'open-quadruples-hadamard-sim-1simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                            (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

                    if max_order>1:

                        #triangle embedding
                        tf_arrays = map(lambda a: [(model_wv[h], model_wv[k]) for h,k in
                                combinations([','.join(map(str, sorted(map(int, tris)))) for tris in combinations(a,3)], 2)],
                                tetrads_test)

                        X_test = np.array(list(map(lambda x: np.mean([a*b for a,b in x], axis=0), tf_arrays)))
                        y_pred = X_test.sum(axis=1)
                        np.savez_compressed(save_path+'open-quadruples-hadamard-sim-2simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                            (SG, 's2v', HASSE_TYPE, max_order, SEED), y_pred)

### Prediction Search and Plot

In [13]:
for DATASET in DATASETS:

    load_path = WORK_FOLDER + 'processed-output/tables/%s/'%(DATASET)

    y_test = np.load(load_path+'open-quadruples-80-100.npz')['arr_0'][:,-1]
    random_baseline = y_test.sum()/len(y_test)
    
    print(DATASET)
    
    print()

    params_folders = glob.glob(load_path+'dim*')
    
    for HASSE_TYPE in HASSE_LIST:

        for max_order in range(2, MAX_ORDER+1):

            print('H'+str(max_order)+':', HASSE_TYPE)

            scores_simplex0 = []
            scores_simplex1 = []
            scores_simplex2 = []
            for f in params_folders:
                PARAMS = f.split('/')[-1]
                EMBDIM = PARAMS.split('_')[0].replace('dim', '')

                y_pred = np.load(f + '/open-quadruples-hadamard-sim-0simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, 0))['arr_0']
                scores_simplex0.append((classification_score_from_y4(y_test, y_pred), EMBDIM+'dims'))

                y_pred = np.load(f + '/open-quadruples-hadamard-sim-1simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, 0))['arr_0']
                scores_simplex1.append((classification_score_from_y4(y_test, y_pred), EMBDIM+'dims'))

                if max_order>1:
                    y_pred = np.load(f + '/open-quadruples-hadamard-sim-2simplex-80-100-%s-%s-%s-maxorder%s.%s.npz'%\
                                (SG, 's2v', HASSE_TYPE, max_order, 0))['arr_0']
                    scores_simplex2.append((classification_score_from_y4(y_test, y_pred), EMBDIM+'dims'))

            scores_simplex0 = sorted(scores_simplex0, reverse=True)
            scores_simplex1 = sorted(scores_simplex1, reverse=True)
            scores_simplex2 = sorted(scores_simplex2, reverse=True)
            print('s0 = ', round(scores_simplex0[0][0][0]*100,1), u"\u00B1", round(scores_simplex0[0][0][1]*100,1), '('+scores_simplex0[0][1]+')') 
            print('s1 = ',round(scores_simplex1[0][0][0]*100,1), u"\u00B1", round(scores_simplex1[0][0][1]*100,1), '('+scores_simplex1[0][1]+')') 
            print('s2 = ',round(scores_simplex2[0][0][0]*100,1), u"\u00B1", round(scores_simplex2[0][0][1]*100,1), '('+scores_simplex1[0][1]+')')
        print()

contact-high-school

H2: uniform
s0 =  68.5 ± 17.3 (512dims)
s1 =  88.8 ± 13.0 (64dims)
s2 =  55.8 ± 11.5 (64dims)
H3: uniform
s0 =  77.5 ± 14.9 (1024dims)
s1 =  85.7 ± 12.2 (128dims)
s2 =  52.6 ± 10.1 (128dims)

H2: counts
s0 =  92.2 ± 9.8 (8dims)
s1 =  96.1 ± 4.6 (1024dims)
s2 =  89.2 ± 6.6 (1024dims)
H3: counts
s0 =  82.5 ± 10.4 (32dims)
s1 =  96.1 ± 4.6 (64dims)
s2 =  89.5 ± 12.1 (64dims)

H2: NObias
s0 =  91.0 ± 5.6 (512dims)
s1 =  91.0 ± 5.6 (64dims)
s2 =  74.3 ± 9.8 (64dims)
H3: NObias
s0 =  91.0 ± 5.6 (128dims)
s1 =  91.0 ± 5.6 (64dims)
s2 =  72.5 ± 9.8 (64dims)

H2: LOexp
s0 =  89.5 ± 12.1 (32dims)
s1 =  81.6 ± 15.0 (256dims)
s2 =  49.2 ± 6.4 (256dims)
H3: LOexp
s0 =  83.6 ± 12.2 (64dims)
s1 =  69.8 ± 13.9 (512dims)
s2 =  76.2 ± 15.1 (512dims)

contact-primary-school

H2: uniform
s0 =  65.3 ± 6.7 (16dims)
s1 =  59.5 ± 4.0 (16dims)
s2 =  83.3 ± 0.0 (16dims)
H3: uniform
s0 =  57.5 ± 8.1 (16dims)
s1 =  62.0 ± 4.2 (8dims)
s2 =  60.9 ± 9.1 (8dims)

H2: counts
s0 =  66.5 ± 9.4 (128d