In [None]:
%pylab inline
import networkx as nx
import pandas as pd

In [None]:
import os

In [None]:
import pickle as pkl

In [None]:
# Dataset names to process; each one is handed to make_train_test_data() and
# also names the per-dataset output sub-folder.
DATASETS = [
    'contact-high-school',
    'contact-primary-school',
]
# Root folder under which 'processed-output/hyperedges/<dataset>/' is created.
WORK_FOLDER = './'

In [None]:
from utils import * 

In [None]:
# Passed as `sample_size` to every negative-sampling routine.
SAMPLE_SIZE = 10 ** 7
# Passed as `tuple_size` to the samplers and embedded in the output file names.
TUPLE_SIZE = 3  # ,4

# Sample Node Groups

In [None]:
from multiprocessing import Pool

# For every dataset: build the projected graph of the training pairs, keep its
# largest connected component, then draw and store negative node tuples with
# each of the four sampling strategies.
for DATASET in DATASETS:

    #Load Simplices Data
    _, _, data_train, _ = make_train_test_data(DATASET)

    # Graph whose edges are the size-2 training simplices, restricted to its
    # largest connected component (first one found in case of a size tie).
    proj_g = nx.Graph([tuple(s) for s in data_train if len(s)==2])
    max_comp = max(nx.connected_components(proj_g), key=len)
    proj_g = nx.subgraph(proj_g, max_comp)

    nodes_train = set(proj_g.nodes())
    adj_dict = nx.convert.to_dict_of_lists(proj_g)

    # Keep only the simplices whose nodes all lie in the retained component.
    data_train = [s for s in data_train if s.issubset(nodes_train)]

    save_path = WORK_FOLDER + 'processed-output/hyperedges/%s/'%(DATASET)
    os.makedirs(save_path, exist_ok=True)

    # One row per strategy:
    #   (tag used in the file name, sampler, positional arguments, label printed when done).
    # Every sampler receives its own freshly built list, as separate calls would.
    sampling_jobs = [
        #NAIVE RANDOM NEGATIVE SAMPLING
        ('random', naive_negative_sampling, (list(nodes_train),), 'indep. sampling'),
        #MOTIFS NEGATIVE SAMPLING
        ('motifs', motifs_negative_sampling, (list(nodes_train), adj_dict), 'motifs sampling'),
        #STARS NEGATIVE SAMPLING
        ('stars', stars_negative_sampling, (list(nodes_train), adj_dict), 'stars sampling'),
        #CLIQUES NEGATIVE SAMPLING (seeded with the training simplices, not the node list)
        ('cliques', cliques_negative_sampling, (list(data_train), adj_dict), 'cliques sampling'),
    ]

    for sample_tag, sampler, sampler_args, done_label in sampling_jobs:
        samples = sampler(*sampler_args, sample_size=SAMPLE_SIZE, tuple_size=TUPLE_SIZE)
        # Stored as the single unnamed array of the archive (key 'arr_0').
        np.savez_compressed(save_path + 'negative_%s_%dtuple.npz'%(sample_tag, TUPLE_SIZE), samples)
        # Drop the reference before the next strategy runs so the sample can be freed.
        del samples
        print('DONE:', DATASET, TUPLE_SIZE, done_label)

# Construct Positive/Negative Examples for Classification Tasks

In [None]:
def _simplex_key(simplex):
    """Return the canonical string of a simplex: nodes cast to int, sorted, comma-joined."""
    return ','.join(map(str, sorted(map(int, simplex))))


def _count_known_faces(tuple_keys, known_faces, face_size):
    """Count, for each comma-separated tuple, how many of its `face_size`-subsets are in `known_faces`.

    Returns a numpy array with one count per entry of `tuple_keys`, in the same order.
    """
    # NOTE(review): `combinations` is not imported in the visible cells; presumably it is
    # itertools.combinations made available through `from utils import *` -- confirm.
    return np.array([
        np.sum([_simplex_key(face) in known_faces
                for face in combinations(key.split(','), face_size)])
        for key in tuple_keys
    ])


# For every dataset: turn the train/test simplices into canonical strings, pool
# the stored negative samples, and write positive/negative examples (plus their
# boundary counts) for the reconstruction and the prediction task.
for DATASET in DATASETS:

    #Load Simplices Data
    _, _, data_train, data_test = make_train_test_data(DATASET)

    # Graph of the size-2 training simplices, restricted to its largest
    # connected component (first one found in case of a size tie).
    proj_g = nx.Graph([tuple(s) for s in data_train if len(s)==2])
    max_comp = max(nx.connected_components(proj_g), key=len)
    proj_g = nx.subgraph(proj_g, max_comp)

    nodes_train = set(proj_g.nodes())

    # Keep only simplices fully contained in the retained component.
    data_train = [s for s in data_train if s.issubset(nodes_train)]
    data_test = [s for s in data_test if s.issubset(nodes_train)]

    #map from frozensets to strings
    train_simplices = np.array([_simplex_key(s) for s in data_train], dtype=object)
    train_sizes = np.array([len(key.split(',')) for key in train_simplices])
    test_simplices = np.array([_simplex_key(s) for s in data_test], dtype=object)
    test_sizes = np.array([len(key.split(',')) for key in test_simplices])

    save_path = WORK_FOLDER + 'processed-output/hyperedges/%s/'%(DATASET)
    load_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # Union of the negative samples written by the four sampling strategies.
    neg_tuple = set()
    for neg_sample in ['random', 'stars', 'motifs', 'cliques']:
        #load negative samples
        # allow_pickle=True lets numpy unpickle object arrays: only point this at the
        # archives written by the sampling step, never at untrusted files.
        # The `with` block closes the archive's file handle once the array is read.
        with np.load(load_path + 'negative_%s_%dtuple.npz'%(neg_sample, TUPLE_SIZE),
                     allow_pickle=True) as neg_archive:
            neg_tuple.update(neg_archive['arr_0'])

    # Lookup sets shared by both tasks.
    train_tuples = set(train_simplices[train_sizes==TUPLE_SIZE])
    test_tuples = set(test_simplices[test_sizes==TUPLE_SIZE])
    # Training simplices one node smaller than the tuples, i.e. candidate faces.
    train_boundary = set(train_simplices[train_sizes==TUPLE_SIZE-1])

    ########################

    #reconstruction
    # Positives: training simplices of the target size (kept as-is, in order).
    # Negatives: sampled tuples that are not training simplices.
    pos_ = train_simplices[train_sizes==TUPLE_SIZE]
    neg_ = np.array(list(neg_tuple - train_tuples), dtype=object)

    np.savez_compressed(save_path + '%s_pos_%s_%dstring.npz'%\
        ('reconstruction', 'all', TUPLE_SIZE), pos_)
    np.savez_compressed(save_path + '%s_neg_%s_%dstring.npz'%\
        ('reconstruction', 'all', TUPLE_SIZE), neg_)

    # Same order as neg_, so entry i describes negative example i.
    negative_boundaries = _count_known_faces(neg_, train_boundary, TUPLE_SIZE-1)
    np.savez_compressed(save_path + '%s_neg_%s_%dbounds.npz'%\
        ('reconstruction', 'all', TUPLE_SIZE), negative_boundaries)

    print('DONE:', DATASET, TUPLE_SIZE, 'reconstruction examples')

    ########################

    #prediction
    # Positives: test simplices of the target size that are absent from training.
    # Negatives: sampled tuples that occur in neither training nor test.
    pos_ = np.array(list(test_tuples - train_tuples), dtype=object)
    neg_ = np.array(list(neg_tuple - (train_tuples | test_tuples)), dtype=object)

    np.savez_compressed(save_path + '%s_pos_%s_%dstring.npz'%\
        ('prediction', 'all', TUPLE_SIZE), pos_)
    np.savez_compressed(save_path + '%s_neg_%s_%dstring.npz'%\
        ('prediction', 'all', TUPLE_SIZE), neg_)

    positive_boundaries = _count_known_faces(pos_, train_boundary, TUPLE_SIZE-1)
    negative_boundaries = _count_known_faces(neg_, train_boundary, TUPLE_SIZE-1)

    np.savez_compressed(save_path + '%s_pos_%s_%dbounds.npz'%\
        ('prediction', 'all', TUPLE_SIZE), positive_boundaries)
    np.savez_compressed(save_path + '%s_neg_%s_%dbounds.npz'%\
        ('prediction', 'all', TUPLE_SIZE), negative_boundaries)

    print('DONE:', DATASET, TUPLE_SIZE, 'prediction examples')