In [1]:
import graph as gf
import random
from gensim.models import Word2Vec
import walks as serialized_walks
from skipgram import Skipgram

In [2]:
# Run configuration for the notebook; every later cell reads these names.
# Path of the adjacency-list file handed to gf.load_adjacencylist.
input_G = './example_graphs/karate.adjlist'
# Destination for the saved embeddings; also the base name for on-disk walk files.
output = 'karate.embeddings'
# Passed as num_paths to the walk generators and multiplied by the node count
# to obtain the total number of walks.
number_walks = 10
# Passed as path_length to the walk generators.
walk_length = 40
# Threshold on (number of walks * walk length): below it the walks are kept in
# memory, otherwise they are written to disk before training.
max_memory_data_size = 100000
# Seed for the random.Random instance that drives walk generation.
seed = 0
# Passed as size= to the embedding model.
representation_size = 64
# Passed as window= to the embedding model.
window_size = 5
# Passed as workers= to the model and num_workers= to the on-disk walk writer.
workers = 1
# Only consulted on the on-disk path: when True, vertex frequencies come from
# G.degree(...) instead of being counted from the written walk files.
vertex_freq_degree = False

In [3]:
# Read the graph from the configured adjacency-list file, requesting an
# undirected graph, then report how many nodes were loaded.
G = gf.load_adjacencylist(
    input_G,
    undirected=True,
)
print(
    "Number of nodes: {}".format(
        len(G.nodes())
    )
)


Number of nodes: 34


In [4]:
# Size the walk corpus up front: the next cell compares data_size against
# max_memory_data_size to choose between in-memory and on-disk training.
num_walks = number_walks * len(G.nodes())   # node count times walks per node
data_size = walk_length * num_walks         # total walks times walk length

print("Number of walks: {}".format(num_walks))

print("Data size (walks*length): {}".format(data_size))

Number of walks: 340
Data size (walks*length): 13600


In [5]:
# Generate random walks over G and train the embedding model on them.
# Small corpora are held in memory; larger ones are streamed from disk.
if data_size < max_memory_data_size:
    print("Walking...")
    walks = gf.build_deepwalk_corpus(G, num_paths=number_walks,
                                     path_length=walk_length, alpha=0, rand=random.Random(seed))
    print("Training...")
    # sg=1 selects skip-gram and hs=1 hierarchical softmax in gensim's Word2Vec.
    # NOTE(review): `size=` is the gensim < 4.0 keyword (renamed `vector_size`
    # in 4.0) -- confirm the pinned gensim version before upgrading.
    model = Word2Vec(walks, size=representation_size, window=window_size, min_count=0, sg=1, hs=1, workers=workers)
else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size,max_memory_data_size))
    print("Walking...")

    # Walks are written to files named after the embeddings output path.
    walks_filebase = output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=number_walks,
                                                      path_length=walk_length, alpha=0, rand=random.Random(seed),
                                                      num_workers=workers)

    print("Counting vertex frequency...")
    if not vertex_freq_degree:
        # Count vertex occurrences from the walk files just written.
        vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
    else:
        # use degree distribution for frequency in tree
        # G.nodes() replaces the Python 2-only G.iterkeys(); it is the same
        # accessor the earlier cells already use to enumerate nodes.
        vertex_counts = G.degree(nodes=G.nodes())

    print("Training...")
    # WalksCorpus wraps the walk files so the model can read sentences from disk.
    walks_corpus = serialized_walks.WalksCorpus(walk_files)
    model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                     size=representation_size, window=window_size, min_count=0, trim_rule=None, workers=workers)

# Persist the learned vectors in word2vec text format at the configured path.
model.wv.save_word2vec_format(output)

Walking...
Training...
