In [1]:
import random
import networkx as nx
import os
from node2vec.edges import HadamardEmbedder
import numpy as np
from sklearn.linear_model import LogisticRegression
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_net(folder, graph_name):
    """Read a ``.net`` file into a directed graph.

    The file layout handled here (it looks like the Pajek format -- TODO
    confirm against the data source) is:

    * one header line, which is skipped;
    * one line per node, ``<id> "<label>"``, up to the first line that
      starts with ``*``;
    * one line per edge, ``<source id> <target id> [extra fields]``; only
      the first two fields are used.

    Every id read from the file is decreased by one before being used as a
    node key (presumably the file is 1-based). Blank lines in either
    section are ignored, so a trailing newline at the end of the file does
    not break parsing.

    Args:
        folder: Directory that contains the file.
        graph_name: File name without the ``.net`` extension.

    Returns:
        A ``networkx.DiGraph`` whose ``name`` is the file name and whose
        nodes carry a ``label`` attribute.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If an id is not an integer or an edge line has fewer
            than two fields.
        IndexError: If a node line has no double-quoted label.
    """
    file_name = graph_name + '.net'
    G = nx.DiGraph(name = file_name)
    with open(os.path.join(folder, file_name), 'r', encoding='utf8') as f:
        # Skip the header line that precedes the node list.
        f.readline()

        # add nodes
        for line in f:
            if line.startswith("*"):
                # Section marker: the node list is over, edges follow.
                break
            if not line.strip():
                continue
            # The label is whatever sits between the first pair of quotes.
            node_info = line.split("\"")
            node = int(node_info[0]) - 1
            label = node_info[1]
            G.add_node(node, label=label)

        # add edges (the same file iterator resumes right after the marker)
        for line in f:
            fields = line.split()
            if not fields:
                continue
            node1_str, node2_str = fields[:2]
            G.add_edge(int(node1_str)-1, int(node2_str)-1)
    return G

In [3]:
def train_graph(G, train = 0.8, seed = None):
    """Split the edges of ``G`` into train/test sets with sampled non-edges.

    The edges are shuffled and the first ``int(train * len(edges))`` become
    the training positives; the rest are the test positives. The same number
    of negative examples is drawn: ordered node pairs ``(i, j)`` with
    ``i != j`` for which ``G.has_edge(i, j)`` is false. Each negative pair
    is sampled at most once, so no pair appears in both the train and the
    test negatives.

    Args:
        G: Graph object exposing ``nodes()``, ``edges()``, ``has_edge()``,
            ``is_directed()``, ``copy()`` and ``remove_edges_from()``
            (e.g. a networkx graph). It is not modified.
        train: Fraction of the edges assigned to the training set.
        seed: Optional seed for a private random generator, making the split
            reproducible. With ``None`` the module-level ``random`` state is
            used.

    Returns:
        A tuple ``(G_train, split)``. ``G_train`` is a copy of ``G`` with the
        test edges removed. ``split`` has the form
        ``{"train": {1: [...], 0: [...]}, "test": {1: [...], 0: [...]}}``
        where ``1`` maps to edges and ``0`` to sampled non-edges.

    Raises:
        ValueError: If ``G`` does not contain as many distinct non-edges as
            it has edges (the sampling loop could otherwise never finish).
    """
    rng = random if seed is None else random.Random(seed)

    nodes = list(G.nodes())
    edges = list(G.edges())
    rng.shuffle(edges)

    # Count the ordered pairs (i, j), i != j, that are still free. In an
    # undirected graph each edge blocks both orientations.
    orientations = 1 if G.is_directed() else 2
    occupied = orientations * sum(1 for u, v in set(edges) if u != v)
    available = len(nodes) * (len(nodes) - 1) - occupied
    if len(edges) > available:
        raise ValueError(
            "cannot sample %d distinct non-edges: only %d node pairs are unconnected"
            % (len(edges), available)
        )

    non_edges = []
    sampled = set()
    while len(non_edges) < len(edges):
        i = rng.choice(nodes)
        j = rng.choice(nodes)
        if i != j and (i, j) not in sampled and not G.has_edge(i, j):
            sampled.add((i, j))
            non_edges.append((i, j))

    n_train = int(train * len(edges))
    # Work on a copy so the caller's graph keeps all of its edges.
    G_train = G.copy()
    G_train.remove_edges_from(edges[n_train:])

    split = {
        "train": {1: edges[:n_train], 0: non_edges[:n_train]},
        "test": {1: edges[n_train:], 0: non_edges[n_train:]},
    }
    return G_train, split

In [4]:
# Load the full link graph from <repo>/data/wikilinks_ids.net.
G = read_net(
    folder=os.path.join("..", "..", "data"),
    graph_name="wikilinks_ids",
)

In [5]:
# Hold out 20% of the edges for testing; G_train keeps only the training edges.
G_train, split = train_graph(G, train=0.8)

In [7]:
# Persist the *training* graph. Writing the full graph G here would put the
# held-out test edges into the training file.
nx.write_edgelist(G_train, "../../data/wikilinks_train.edg")

In [8]:
# Save the train/test split next to the data. JSON turns the integer keys
# 1/0 into the strings "1"/"0" and the edge tuples into lists.
# NOTE(review): the file name says "spit" (sic); kept unchanged because other
# code may read this exact path -- confirm before renaming.
split_path = os.path.join("..", "..", "data", "train_test_spit.json")
with open(split_path, "w") as split_file:
    json.dump(split, split_file)