In [1]:
import numpy as np
from data_process import *
import networkx as nx

# Cora

In [None]:
G = nx.Graph()
with open('data/cora/cora.content','r') as f:
    Lines=[x.strip().split('\t') for x in f]
for i in range(len(Lines)):
    lines = Lines[i]
    node = int(lines[0])
    features = list(map(int, lines[1:-1]))
    label = lines[-1]
    G.add_node(node, feature = features, label = label)

In [None]:
with open('data/cora/cora.cites','r') as f:
    Lines=[x.strip().split('\t') for x in f]
for i in range(len(Lines)):
    lines = Lines[i]
    G.add_edge(int(lines[0]), int(lines[1]), weight=1)

In [None]:
nx.write_gpickle(G, "data/cora.gpickle")

# Homo-sapiens PPI 

In [None]:
ppi = np.load("data/ppi/homo_sapiens.npz")

In [None]:
G = nx.Graph()
G.add_edges_from(ppi["edge_list"])

In [None]:
df = pd.DataFrame(ppi["group"])
num = np.arange(1, 51)
df.columns = [str(i) for i in np.arange(1, 51)]

In [None]:
df['label'] = df.idxmax(axis=1)

In [None]:
df.index.name = 'id'

In [None]:
nx.set_node_attributes(G, pd.Series(df.label, index=df.index).to_dict(), 'label')

In [None]:
nx.set_edge_attributes(G, values = 1, name = 'weight')

In [None]:
n_nodes = G.number_of_nodes()
data = [list(range(G.number_of_nodes()))]

def one_hot(data, n_nodes):
    targets = np.array(data).reshape(-1)
    return np.eye(n_nodes)[targets]

out = one_hot(data, n_nodes)

In [None]:
for i in range(out.shape[0]):
    features = list(out[i,:])
    nx.set_node_attributes(G, {i: features}, name="feature")

In [None]:
nx.write_gpickle(G, "data/ppi.gpickle")

# CiteSeer

In [75]:
f = open("data/citeseer/citeseer.content", "r")
Lines = f.readlines()

In [77]:
original_edges = []
for line in Lines:
    original_nodes.append(line.split()[0])

relabled_nodes = {k:original_nodes.index(k) for k in original_nodes}

labels = {}
features = {}
G = nx.Graph()
for line in Lines:
    node = relabled_nodes[line.split()[0]]
    G.add_node(node, feature=line.split()[1:-1], label=line.split()[-1])

In [78]:
f = open("data/citeseer/citeseer.cites", "r")
Lines = f.readlines()

In [79]:
# convert all unique strings to numeric values
edges = []
for line in Lines:
    start = str(relabled_dict[line.split()[0]])
    end = str(relabled_dict[line.split()[1]])
    edges.append([start, end])

In [80]:
G.add_edges_from(edges)
nx.set_edge_attributes(G, values = 1, name = 'weight')

In [92]:
len(nx.get_node_attributes(G, "feature")[0])

3703

In [93]:
G.number_of_nodes()

6639

In [82]:
nx.write_gpickle(G, "data/citeseer.gpickle")

# Pubmed Diabetes

In [None]:
with open('data/pubmed/Pubmed-Diabetes.NODE.paper.tab','r') as f:
    Lines=[x.strip().split('\t') for x in f]

In [None]:
keys = set()
for features in all_features:
    keys = keys | set(features.keys())

In [None]:
G = nx.Graph()
features = {}
all_features = []
for line in Lines[2:]:
    G.add_node(int(line[0]), label = line[1][line[1].find("=")+1:])
    features = dict.fromkeys(list(keys), 0)
    for feature in line[2:-1]:
        features[feature[:feature.find("=")]] = float(feature[feature.find("=")+1:])
    feature = list(features.values())
    nx.set_node_attributes(G, {int(line[0]): feature}, name="feature")

In [None]:
with open('data/pubmed/Pubmed-Diabetes.DIRECTED.cites.tab','r') as f:
    Lines=[x.strip().split('\t') for x in f]

In [None]:
edges = []
for line in Lines[2:]:
    edges.append([int(line[1][line[1].find(":")+1:]), int(line[3][line[3].find(":")+1:])])
G.add_edges_from(edges)
nx.set_edge_attributes(G, values = 1, name = 'weight')

In [None]:
nx.write_gpickle(G, "data/pubmed.gpickle")

# Amazon Co-Purchase

In [2]:
computers = np.load("data/amazon/amazon_electronics_computers.npz")

In [3]:
import dgl
dataset = dgl.data.AmazonCoBuyComputerDataset()

Using backend: pytorch


In [4]:
G = nx.Graph(dgl.to_networkx(dataset[0], node_attrs=dataset[0].ndata, edge_attrs=dataset[0].edata))
for node in G.nodes:
    nx.set_node_attributes(G, {node: G.nodes[node]['feat'].tolist()}, name="feature")
    del G.node[node]['feat']
    nx.set_node_attributes(G, {node: G.nodes[node]['label'].tolist()}, name="label")
nx.set_edge_attributes(G, values = 1, name = 'weight')

In [5]:
nx.write_gpickle(G, "data/amazon.gpickle")

# BlogCatalog3

In [None]:
G = nx.Graph()
file = open("data/blogcatalog/nodes.csv", "r")
csvreader = csv.reader(file)
for row in csvreader:
    G.add_node(int(row[0]))

In [None]:
for node in G.nodes:
    feature = [0] * len(G.nodes)
    feature[node-1] = 1
    nx.set_node_attributes(G, {node: feature}, name="feature")

In [None]:
file = open("data/blogcatalog/edges.csv", "r")
csvreader = csv.reader(file)
edges = []
for row in csvreader:
    edges.append(list(map(int, row)))
G.add_edges_from(edges, weight=1)

In [None]:
file = open("data/blogcatalog/group-edges.csv", "r")
csvreader = csv.reader(file)
for row in csvreader:
    nx.set_node_attributes(G, {int(row[0]): row[1]}, name="label")

In [None]:
nx.write_gpickle(G, "data/blogcatalog.gpickle")

# Github

In [None]:
github = np.load("data/github/github.npz")

In [None]:
G = nx.Graph()
G.add_edges_from(github["edges"])

In [None]:
features = pd.DataFrame(pd.np.column_stack([github["features"]])).T.to_dict('list')

In [None]:
label = {}
for i in range(len(list(github["target"]))):
    label[i] = list(github["target"])[i]

In [None]:
nx.set_node_attributes(G, features, "feature")
nx.set_node_attributes(G, label, "label")
nx.set_edge_attributes(G, values = 1, name = 'weight')

In [None]:
nx.write_gpickle(G, "data/github.gpickle")