In [None]:
import numpy as np
import networkx as nx
import pandas as pd

In [None]:
import os

In [None]:
import pickle as pkl

In [None]:
import gc

In [None]:
from collections import Counter
from subprocess import call

In [None]:
# Root folder under which every 'processed-output/...' path is built.
WORK_FOLDER = './'

# Tag inserted into the embedding file names written by the training section.
SG = 'sg'

# Names of the datasets that both the walk-sampling and training loops iterate over.
DATASETS = [
    'contact-high-school',
    'contact-primary-school',
]

In [None]:
from utils import *

In [None]:
import gzip

# Sample Random Walks

In [None]:
from snap_node2vec import snap_node2vec

In [None]:
# Random-walk sampling settings.
# N, P and WALKLEN are also formatted into output file/folder names further down,
# so their string form matters (str(P) must stay '1.0').
SEED = 0
WALKLEN = 80
N = 10
P = 1.0

In [None]:
import gudhi
import k_simplex2vec as ks2v

In [None]:
# Sample random walks for every dataset and every order k in range(MAX_ORDER).
# For each (dataset, k) this writes, under processed-output/walks/<dataset>/:
#   * '<k>simplex2vec_uniform_maxorder<k>.nodename.npz' -- lookup from walk index to name
#   * 'n<N>_p<P>_walks_<k>simplex2vec_uniform_maxorder<k>.txt(.gz)' -- the walks
# NOTE(review): MAX_ORDER and make_train_test_data are not defined in this notebook;
# presumably they come from `from utils import *` -- confirm.
# NOTE(review): SEED is not passed to either walk sampler below.
for DATASET in DATASETS:

    save_path = WORK_FOLDER + 'processed-output/walks/%s/'%(DATASET)
    # The output folder must exist before anything is saved into it.
    os.makedirs(save_path, exist_ok=True)

    cliques_train, _, data_train, _ = make_train_test_data(DATASET)

    # Pairwise graph built from the size-2 entries only, with self-loops removed,
    # then restricted to its largest connected component.
    proj_g = nx.Graph([tuple(s) for s in data_train if len(s)==2])
    # Materialise the self-loop list first so the graph is not mutated while the
    # generator is still being consumed.
    proj_g.remove_edges_from(list(nx.selfloop_edges(proj_g)))
    max_comp = max(nx.connected_components(proj_g), key=len)
    proj_g = nx.subgraph(proj_g, max_comp)

    nodes_train = set(proj_g.nodes())

    # Keep only the entries whose members all lie in the largest component.
    simplices = [tuple(fs) for fs in data_train if fs.issubset(nodes_train)]

    # Prefix shared by every walks file; built from the same constants that drive
    # the samplers so the label always matches the walks actually generated.
    walk_tag = 'n%s_p%s'%(str(N), str(P))

    for k in range(MAX_ORDER):

        nodename_file = save_path + '%dsimplex2vec_%s_maxorder%d.nodename.npz'%(k, 'uniform', k)
        walks_txt = '%s_walks_%dsimplex2vec_%s_maxorder%d.txt'%(walk_tag, k, 'uniform', k)

        if k==0:
            # Relabel nodes to 0..n-1 and store the original labels so that walk
            # tokens can be mapped back to node names later.
            node_name = np.array(list(proj_g.nodes()))
            node_index = {node:index for index, node in enumerate(node_name)}
            proj_g = nx.relabel_nodes(proj_g, node_index)
            np.savez_compressed(nodename_file, node_name)

            # NOTE(review): P is wired to ret_p (the only "p"-like sampler argument);
            # inout_p stays at 1. because no matching constant exists -- confirm.
            node2vec = snap_node2vec(d=2, max_iter=1, walk_len=WALKLEN, num_walks=N,
                                     con_size=5, ret_p=P, inout_p=1.)
            # compress=True presumably yields '<file_name>.gz', which is the name the
            # training section loads -- confirm against snap_node2vec.
            _ = node2vec.save_random_walks(proj_g, edge_f = None, is_weighted=False,
                  no_python=True, directed=False, save_directory=save_path,
                  file_name=walks_txt,
                  compress=True)
        else:

            # Build a simplicial complex from the retained simplices
            st = gudhi.SimplexTree() #Gudhi simplex tree --> structure to store the simplices
            for simplex in simplices:
                st.insert(list(map(int, simplex)))

            ## build transition matrix for order k (dense float32 copy is fed to the walker)
            p1 = ks2v.assemble(cplx =st, k=k, scheme="uniform", laziness=None)
            P1 = p1.astype(np.float32).toarray()

            # Collect simplices in filtration order, stopping at the first one whose
            # filtration value is infinite.
            Simplices = list()
            for simplex in st.get_filtration():
                if simplex[1]!= np.inf:
                    Simplices.append(simplex[0])
                else:
                    break
            # Row i of the transition matrix must correspond to Simplices[i].
            assert len(Simplices)==p1.shape[0], \
                'simplex count %d does not match transition matrix size %d'%(len(Simplices), p1.shape[0])

            # Name of a simplex = its sorted vertex ids joined by commas.
            node_name = np.array([','.join(map(str, sorted(map(int, n)))) for n in Simplices])
            np.savez_compressed(nodename_file, node_name)

            ## Perform random walks on the k-simplices
            Walks = ks2v.RandomWalks(walk_length=WALKLEN, number_walks=N, P=P1)
            walks_name = save_path + walks_txt
            ks2v.save_random_walks(Walks, walks_name)

            # Compress the plain-text walks, then drop the uncompressed copy.
            # Context managers guarantee both handles are closed even on error.
            with open(walks_name) as f_in, gzip.open(walks_name + '.gz', 'wt') as f_out:
                f_out.writelines(f_in)
            os.remove(walks_name)

        print('DONE:', DATASET, k)

# Train k-simplex2vec

In [None]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [None]:
# Walk parameters for the training section (same values as in the walk-sampling
# section). N and P select which walks file is read; N, P and WALKLEN label the
# embedding output folder; SEED seeds Word2Vec and is part of the output file name.
P, N, WALKLEN, SEED = 1.0, 10, 80, 0

In [None]:
# Embedding dimensions to train: powers of two from 8 (2**3) up to 1024 (2**10).
embdim_list = [2 ** exponent for exponent in range(3, 11)]

In [None]:
# Train skip-gram Word2Vec embeddings on the sampled walks.
# For every dataset, order k in range(MAX_ORDER) and dimension in embdim_list, a
# pickle holding a {name: vector} dict is written under
# processed-output/embeddings/<dataset>/dim<D>_n<N>_p<P>_walklen<WALKLEN>/.
# NOTE(review): MAX_ORDER is not defined in this notebook; presumably it comes
# from `from utils import *` -- confirm.
for DATASET in DATASETS:

    load_path = WORK_FOLDER + 'processed-output/walks/%s/'%(DATASET)

    for k in range(MAX_ORDER):

        # Lookup array: walk token i (an integer written as text) -> original name.
        # The context manager closes the underlying .npz archive once the array is read.
        with np.load(load_path\
                         +'%dsimplex2vec_%s_maxorder%d.nodename.npz'%(k, 'uniform', k)) as npz:
            node_name = npz['arr_0']

        walks_file = load_path\
               +'%s_walks_%dsimplex2vec_%s_maxorder%d.txt.gz'%('n%s_p%s'%(str(N),str(P)), k, 'uniform', k)

        # LineSentence streams the file anew on every pass, so a single instance can
        # be shared by all embedding sizes instead of being rebuilt each time.
        sents = LineSentence(walks_file)

        for EMBDIM in embdim_list:

            PARAMS = '%s_%s_%s_%s' %\
                    ( 'dim'+str(EMBDIM), 'n'+str(N), 'p'+str(P), 'walklen'+str(WALKLEN))

            save_path = WORK_FOLDER + 'processed-output/embeddings/%s/%s/'%(DATASET, PARAMS)
            os.makedirs(save_path, exist_ok=True)

            save_file = save_path + '%d-s2vembs_%s_%s_maxorder%d.%s.pkl'\
                                        %(k, SG, 'uniform', k, SEED)

            # fit word2vec (sg=1 -> skip-gram)
            # NOTE(review): gensim documents that a fixed seed gives reproducible
            # results only with workers=1; with workers=30 runs may differ.
            w2v_kwargs = dict(sentences=sents, min_count=1, sg=1,
                              window=10,
                              seed=SEED, workers=30)
            try:
                # gensim 3.x keyword (the API this notebook was written against)
                model = Word2Vec(size=EMBDIM, **w2v_kwargs)
            except TypeError:
                # gensim >= 4 renamed `size` to `vector_size`
                model = Word2Vec(vector_size=EMBDIM, **w2v_kwargs)

            # gensim >= 4 renamed `index2word` to `index_to_key`; support both.
            wv = model.wv
            vocab = wv.index_to_key if hasattr(wv, 'index_to_key') else wv.index2word

            # Vocabulary tokens are integer indices into node_name; row i of
            # wv.vectors is the embedding of vocab[i].
            names = node_name[[int(token) for token in vocab]]

            with open(save_file, 'wb') as fh:
                pkl.dump(dict(zip(names, list(wv.vectors))), fh, protocol=pkl.HIGHEST_PROTOCOL)

            print('DONE:', DATASET, k, EMBDIM)