In [21]:
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import random
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging

import graph
from skipgram import Skipgram
import walks as serialized_walks
from gensim.models import Word2Vec

from six import text_type as unicode
from six import iteritems
from six.moves import range

import psutil
from multiprocessing import cpu_count

# Best-effort: allow this process to run on every CPU core.
# psutil has exposed the setter under two different method names, so each is
# tried in turn; an AttributeError from one attempt moves on to the next, and
# if both attempts fail the affinity is simply left unchanged.
p = psutil.Process(os.getpid())
for _affinity_method in ("set_cpu_affinity", "cpu_affinity"):
    try:
        getattr(p, _affinity_method)(list(range(cpu_count())))
    except AttributeError:
        continue
    break

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Record layout handed to logging.basicConfig(); the ".19s" precision truncates
# the asctime field to its first 19 characters.
LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"


def debug(type_, value, tb):
    """Exception hook that drops into pdb when running on a terminal.

    Intended to be installed as ``sys.excepthook``.

    Args:
        type_: class of the uncaught exception.
        value: the exception instance.
        tb: traceback object associated with the exception.

    If the interpreter is interactive (``sys.ps1`` exists) or stderr is not a
    tty, the default hook ``sys.__excepthook__`` handles the exception.
    Otherwise the traceback is printed and a post-mortem pdb session starts.
    """
    use_default_hook = hasattr(sys, 'ps1') or not sys.stderr.isatty()
    if use_default_hook:
        # A debugger prompt would be useless (or already available) here.
        sys.__excepthook__(type_, value, tb)
        return

    # Imported lazily: only needed when a debugger session is actually opened.
    import traceback
    import pdb

    traceback.print_exception(type_, value, tb)
    print(u"\n")
    pdb.pm()


def process():
    """Load the configured graph, generate random walks, train and save embeddings.

    Every setting is read from a module-level name: ``format``, ``input``,
    ``undirected``, ``matfile_variable_name``, ``number_walks``,
    ``walk_length``, ``max_memory_data_size``, ``seed``,
    ``representation_size``, ``window_size``, ``workers``,
    ``vertex_freq_degree`` and ``output``.

    If the estimated corpus size (number of walks * walk length) is below
    ``max_memory_data_size``, the walks are built in memory and passed to
    ``Word2Vec``. Otherwise they are written to files named after ``output``
    and streamed into ``Skipgram``. In both cases the trained vectors are
    written to ``output`` with ``save_word2vec_format``.

    Raises:
        Exception: if ``format`` is not 'adjlist', 'edgelist' or 'mat'.
    """
    # Choose the loader that matches the configured input format.
    if format == "mat":
        G = graph.load_matfile(input, variable_name=matfile_variable_name, undirected=undirected)
    elif format == "edgelist":
        G = graph.load_edgelist(input, undirected=undirected)
    elif format == "adjlist":
        G = graph.load_adjacencylist(input, undirected=undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = number_walks * len(G.nodes())
    print("Number of walks: {}".format(num_walks))

    data_size = walk_length * num_walks
    print("Data size (walks*length): {}".format(data_size))

    if data_size >= max_memory_data_size:
        # Corpus too large to hold in memory: write the walks to disk and
        # train from the files.
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, max_memory_data_size))
        print("Walking...")

        walks_filebase = output + ".walks"
        walk_rng = random.Random(seed)
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=number_walks,
            path_length=walk_length,
            alpha=0,
            rand=walk_rng,
            num_workers=workers,
        )

        print("Counting vertex frequency...")
        if vertex_freq_degree:
            # Use degree distribution for frequency in tree.
            # NOTE(review): iterkeys() is a Python 2 dict method -- confirm
            # that the Graph type provides it on the interpreter in use.
            vertex_counts = G.degree(nodes=G.iterkeys())
        else:
            vertex_counts = serialized_walks.count_textfiles(walk_files, workers)

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(
            sentences=walks_corpus,
            vocabulary_counts=vertex_counts,
            size=representation_size,
            window=window_size,
            min_count=0,
            trim_rule=None,
            workers=workers,
        )
    else:
        # Corpus fits under the limit: keep the walks in memory.
        print("Walking...")
        walk_rng = random.Random(seed)
        walks = graph.build_deepwalk_corpus(
            G,
            num_paths=number_walks,
            path_length=walk_length,
            alpha=0,
            rand=walk_rng,
        )
        print("Training...")
        model = Word2Vec(
            walks,
            size=representation_size,
            window=window_size,
            min_count=0,
            sg=1,
            hs=1,
            workers=workers,
        )

    model.wv.save_word2vec_format(output)


# Start the main function here.
# The settings below are hard-coded in place of the "deepwalk" command-line
# parser; process() reads them as module-level globals, so their names must
# stay as they are.

# Drop a debugger if an exception is raised.
# This flag deliberately is NOT called ``debug``: that name belongs to the
# debug() exception hook, and rebinding it would install the flag value itself
# as sys.excepthook. It is also a real boolean -- the string 'False' is truthy.
debug_on_error = False

# File format of input file.
format = 'adjlist'

# Input graph file.
input = 'Adjlist_FR.tsv'

# Log verbosity level.
log = "INFO"

# Variable name of adjacency matrix inside a .mat file.
matfile_variable_name = 'network'

# Size to start dumping walks to disk, instead of keeping them in memory.
max_memory_data_size = 1000000000

# Number of random walks to start at each node.
number_walks = 80

# Output representation file.
output = 'success.txt'

# Number of latent dimensions to learn for each node.
representation_size = 128

# Seed for random walk generator.
seed = 0

# Treat graph as undirected.
undirected = True

# Use vertex degree to estimate the frequency of nodes in the random walks.
# This option is faster than calculating the vocabulary.
vertex_freq_degree = False

# Length of the random walk started at each node.
walk_length = 40

# Window size of skipgram model.
window_size = 10

# Number of parallel processes.
workers = 1

# Translate the textual level (e.g. "INFO") into logging's numeric constant and
# fail with a clear message instead of handing an invalid value to setLevel().
numeric_level = getattr(logging, log.upper(), None)
if not isinstance(numeric_level, int):
    raise ValueError("Invalid log level: %s" % log)
logging.basicConfig(format=LOGFORMAT)
logger.setLevel(numeric_level)

if debug_on_error:
    sys.excepthook = debug

process()


Number of nodes: 4998
Number of walks: 399840
Data size (walks*length): 15993600
Walking...
Training...
