In [None]:
%pylab inline
import networkx as nx
import pandas as pd

In [None]:
import os

In [None]:
import pickle as pkl

In [None]:
import gc

In [None]:
# Datasets processed by every stage of this notebook.
DATASETS = [
    'contact-high-school',
    'contact-primary-school',
]
# Label written into the names of the saved embedding files.
SG = 'cbow'
# Hasse-diagram weighting schemes iterated over by the loops below.
HASSE_LIST = [
    'uniform',
    'counts',
    'NObias',
    'LOexp',
]
# Root folder; all outputs go under WORK_FOLDER + 'processed-output/'.
WORK_FOLDER = './'

In [None]:
from utils import *

# Build Hasse Diagram

In [None]:
from simplex2hasse import simplex2hasse_HOexponential, simplex2hasse_LOexponential, simplex2hasse_counts, simplex2hasse_uniform

In [None]:
# Build one weighted Hasse diagram per (dataset, weighting scheme, max order)
# and save it as a node-name array plus an integer-labelled weighted edge list.
for DATASET in DATASETS:

    # load train simplices data (the other three return values are not used here)
    cliques_train, _, _, _ = make_train_test_data(DATASET)

    save_path = WORK_FOLDER + 'processed-output/walks/%s/'%(DATASET)
    os.makedirs(save_path, exist_ok=True)

    # loop over different weighting
    for HASSE_TYPE in HASSE_LIST:

        # loop over interaction orders
        # NOTE(review): MAX_ORDER is not defined in this notebook; presumably it
        # is provided by `from utils import *` -- confirm.
        for max_order in range(1, MAX_ORDER+1):

            # build Hasse Diag; exactly one builder is selected per weighting.
            # 'counts' and 'NObias' receive the raw clique list, the other
            # schemes receive the de-duplicated cliques.
            if HASSE_TYPE == 'uniform':
                g_hasse = simplex2hasse_uniform(list(set(cliques_train)), max_order=max_order)
            elif HASSE_TYPE in ('counts', 'NObias'):
                g_hasse = simplex2hasse_counts(cliques_train, max_order=max_order)
            elif HASSE_TYPE == 'HOexp':
                g_hasse = simplex2hasse_HOexponential(list(set(cliques_train)), max_order=max_order)
            elif HASSE_TYPE == 'LOexp':
                g_hasse = simplex2hasse_LOexponential(list(set(cliques_train)), max_order=max_order)
            else:
                # Without this guard an unknown label would silently reuse the
                # graph built in the previous iteration (or hit a NameError).
                raise ValueError('unknown HASSE_TYPE: %r' % (HASSE_TYPE,))

            # convert to convenient format
            g_hasse = s2vhasse_to_n2vformat(g_hasse)

            # compute weights without bias towards lower-upper orders
            if HASSE_TYPE == 'NObias':
                g_hasse = unbias_weights_n2vformat(g_hasse)

            # Relabel nodes with consecutive integers; node_name[i] keeps the
            # original name of the node relabelled to i.
            node_name = np.array(list(g_hasse.nodes()))
            node_index = {node: index for index, node in enumerate(node_name)}
            g_hasse = nx.relabel_nodes(g_hasse, node_index)

            # save Hasse Diag: both files share the same stem
            out_stem = save_path + 'hasse_%s_maxorder%d' % (HASSE_TYPE, max_order)
            np.savez_compressed(out_stem + '.nodename.npz', node_name)
            nx.write_weighted_edgelist(g_hasse, out_stem + '.edgelist.gz')

# Sample Random Walks

In [None]:
from snap_node2vec import snap_node2vec

In [None]:
# Random-walk sampling parameters.
WALKLEN = 80  # passed to snap_node2vec as walk_len
N = 10        # passed to snap_node2vec as num_walks; also part of the walks file name
P = 1.0       # passed to snap_node2vec as ret_p; also part of the walks file name

In [None]:
# For every saved Hasse diagram, sample random walks with snap_node2vec and
# write them (compressed) into the same per-dataset folder.
for DATASET in DATASETS:

    # Diagrams are read from, and walks written to, the same folder.
    load_path = WORK_FOLDER + 'processed-output/walks/%s/'%(DATASET)
    save_path = WORK_FOLDER + 'processed-output/walks/%s/'%(DATASET)
    os.makedirs(save_path, exist_ok=True)

    for HASSE_TYPE in HASSE_LIST:          # weighting schemes
        for max_order in range(1, MAX_ORDER+1):   # interaction orders

            edgelist_file = load_path + 'hasse_%s_maxorder%d.edgelist.gz'%(HASSE_TYPE, max_order)
            g_hasse = nx.read_weighted_edgelist(edgelist_file, create_using=nx.DiGraph)

            # Cast every node label to int, relabelling the graph in place.
            int_labels = {i: int(i) for i in g_hasse.nodes()}
            nx.relabel_nodes(g_hasse, int_labels, copy=False)

            # Output name encodes the sampling parameters and the diagram variant.
            walks_prefix = 'n%s_p%s'%(str(N),str(P))
            walks_name = '%s_walks_simplex2vec_%s_maxorder%d.txt'%(walks_prefix, HASSE_TYPE, max_order)

            # sample random walks
            node2vec = snap_node2vec(
                d=2,
                max_iter=1,
                walk_len=WALKLEN,
                num_walks=N,
                con_size=5,
                ret_p=P,
                inout_p=1.,
            )
            _ = node2vec.save_random_walks(
                g_hasse,
                edge_f=None,
                is_weighted=True,
                no_python=True,
                directed=True,
                save_directory=save_path,
                file_name=walks_name,
                compress=True,
            )

            # Drop the sampler and empty the graph before the next iteration.
            del node2vec
            g_hasse.clear()

            print('DONE:', DATASET, HASSE_TYPE, max_order)

# Train simplex2vec

In [None]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [None]:
# Parameters for the training stage.
# N and P are used to rebuild the walks file name, so they must match the
# values that were in effect when the walks were sampled.
SEED = 0      # passed to Word2Vec and written into the embedding file name
WALKLEN = 80  # only used in the output folder label at this stage
N = 10
P = 1.0

In [None]:
# Embedding dimensions to train: powers of two from 8 up to 1024.
embdim_list = [2 ** exponent for exponent in range(3, 11)]

In [None]:
# Train one Word2Vec model per (dataset, weighting scheme, max order, dimension)
# on the sampled walks and pickle a {original node name: vector} dictionary.

# The SG label is written into the output file names, so the training
# algorithm is derived from it instead of being hard-coded; otherwise changing
# SG would produce mislabelled embeddings.
# NOTE(review): 'sg' is the label chosen here for skip-gram; only 'cbow' appears
# in this notebook -- adjust if another label is used elsewhere.
sg_flag = {'cbow': 0, 'sg': 1}.get(SG)
if sg_flag is None:
    raise ValueError("SG must be 'cbow' or 'sg', got %r" % (SG,))

for DATASET in DATASETS:

    load_path = WORK_FOLDER + 'processed-output/walks/%s/'%(DATASET)
    base_path = WORK_FOLDER + 'processed-output/embeddings/%s/'%(DATASET)
    os.makedirs(base_path, exist_ok=True)

    # loop over different weighting
    for HASSE_TYPE in HASSE_LIST:

        # loop over interaction orders
        for max_order in range(1, MAX_ORDER+1):

            # node_name[i] is the original name of the node labelled i in the walks.
            # The context manager closes the .npz archive once the array is read.
            with np.load(load_path +'hasse_%s_maxorder%d.nodename.npz' % (HASSE_TYPE, max_order)) as npz:
                node_name = npz['arr_0']

            walks_file = load_path + 'n%s_p%s_walks_simplex2vec_%s_maxorder%d.txt.gz'%(str(N), str(P), HASSE_TYPE, max_order)

            # The same walks are reused for every embedding dimension.
            sents = LineSentence(walks_file)

            for EMBDIM in embdim_list:

                PARAMS = '%s_%s_%s_%s' %\
                            ( 'dim'+str(EMBDIM), 'n'+str(N), 'p'+str(P), 'walklen'+str(WALKLEN))

                save_path = base_path + '%s/'%(PARAMS)
                os.makedirs(save_path, exist_ok=True)

                save_file = save_path + 's2vembs_%s_%s_maxorder%d.%s.pkl'\
                                          %(SG, HASSE_TYPE, max_order, SEED)

                # fit word2vec
                # NOTE(review): `size` and `index2word` are the gensim 3.x names
                # (gensim 4 uses `vector_size` / `index_to_key`) -- confirm the
                # installed version. With workers > 1 the seed alone presumably
                # does not make training fully reproducible; see the gensim docs.
                model = Word2Vec(sentences=sents, min_count=1, sg=sg_flag,
                                 size=EMBDIM, window=10, workers=30, seed=SEED)

                # Vocabulary tokens are integer node labels stored as strings;
                # map them back to the original node names.
                node_ids = [int(token) for token in model.wv.index2word]
                embeddings = dict(zip(node_name[node_ids], model.wv.vectors))

                with open(save_file, 'wb') as fh:
                    pkl.dump(embeddings, fh, protocol=pkl.HIGHEST_PROTOCOL)

                print('DONE:', DATASET, HASSE_TYPE, max_order, EMBDIM)