From ad128707a8472e81a88ad6556699639ba7351e78 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 7 Aug 2019 16:59:40 +1000 Subject: [PATCH 01/82] add space --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 4ea9305e2..0f8c4fe20 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ pytest==3.9.3 pytest-benchmark>=3.1 pytest-cov>=2.6.0 coveralls>=1.5.1 + From ace4fec29121c894cddf167e76b45885ac8994da Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 7 Aug 2019 18:06:49 +1000 Subject: [PATCH 02/82] Add the definition of the attri2vecLinkGenerator, which helps generate a batch of (feature of target node, index of context node, label of node pair) pairs per iteration, for training the attri2vec model --- stellargraph/mapper/link_mappers.py | 125 ++++++++++++++++++++++++++-- 1 file changed, 119 insertions(+), 6 deletions(-) mode change 100644 => 100755 stellargraph/mapper/link_mappers.py diff --git a/stellargraph/mapper/link_mappers.py b/stellargraph/mapper/link_mappers.py old mode 100644 new mode 100755 index 06f19b3fd..5b086cda3 --- a/stellargraph/mapper/link_mappers.py +++ b/stellargraph/mapper/link_mappers.py @@ -24,6 +24,7 @@ "OnDemandLinkSequence", "GraphSAGELinkGenerator", "HinSAGELinkGenerator", + "attri2vecLinkGenerator" ] import random @@ -178,7 +179,7 @@ class OnDemandLinkSequence(Sequence): :class:`GraphSAGELinkGenerator` ` . Args: - generator: An instance of :class:`GraphSAGELinkGenerator`. + generator: An instance of :class:`GraphSAGELinkGenerator` or 'attri2vecLink Generator'. sampler: An instance of :class:`UnsupervisedSampler` that encapsulates the neighbourhood sampling of a graph. The generator method of this class returns `batch_size` of positive and negative samples on demand. 
""" @@ -205,11 +206,18 @@ def __init__(self, generator, walker): * self.walker.number_of_walks ) # an estimate of the upper bound on how many samples are generated in each epoch - print( - "Running GraphSAGELinkGenerator with an estimated {} batches generated on the fly per epoch.".format( - round(self.data_size / self.generator.batch_size) - ) - ) + if isinstance(self.generator, GraphSAGELinkGenerator): + print( + "Running GraphSAGELinkGenerator with an estimated {} batches generated on the fly per epoch.".format( + round(self.data_size / self.generator.batch_size) + ) + ) + else: + print( + "Running attri2vecLinkGenerator with an estimated {} batches generated on the fly per epoch.".format( + round(self.data_size / self.generator.batch_size) + ) + ) self._gen = self.walker.generator( self.generator.batch_size @@ -607,3 +615,108 @@ def flow(self, link_ids, targets=None, shuffle=False): ) return LinkSequence(self, link_ids, targets, shuffle) + +class attri2vecLinkGenerator: + """ + A data generator for link prediction with Homogeneous attri2vec models + + At minimum, supply the StellarGraph the batch size. + + The supplied graph should be a StellarGraph object that is ready for + machine learning. Currently the model requires node features for all + nodes in the graph. + + Use the :meth:`.flow` method supplying the nodes and (optionally) targets, + or an UnsupervisedSampler instance that generates node samples on demand, + to get an object that can be used as a Keras data generator. + + Example:: + + G_generator = attri2vecLinkGenerator(G, 50) + train_data_gen = G_generator.flow(edge_ids) + + Args: + G (StellarGraph): A machine-learning ready graph. + batch_size (int): Size of batch of links to return. + num_samples: for compatibility with GraphSAGE + seed (int or str), optional: Random seed for the sampling methods. 
+ name, optional: Name of generator + """ + + def __init__(self, G, batch_size, num_samples=[1,1], seed=None, name=None): + if not isinstance(G, StellarGraphBase): + raise TypeError("Graph must be a StellarGraph object.") + + G.check_graph_for_ml(features=True) + + self.graph = G + self.num_samples = num_samples + self.batch_size = batch_size + self.name = name + + # We need a schema for compatibility with HinSAGE + self.schema = G.create_graph_schema(create_type_maps=True) + + # The sampler used to generate random samples of neighbours + #self.sampler = SampledBreadthFirstWalk(G, graph_schema=self.schema, seed=seed) + + def sample_features(self, head_links, sampling_schema): + """ + Sample content features of the target nodes and the ids of the context nodes + and return these as a list of feature arrays for the attri2vec algorithm. + + Args: + head_links: An iterable of edges to perform sampling for. + sampling_schema: The sampling schema for the model, for compatibility + with GraphSAGE and HinSAGE + + Returns: + a list of feaure arrys, with each element being the feature of a + target node and the id of the corresponding context node + """ + + target_ids = [head_link[0] for head_link in head_links] + context_ids = [head_link[1] for head_link in head_links] + target_feats = self.graph.get_feature_for_nodes(target_ids) + batch_feats = [target_feats, np.array(context_ids)] + + return batch_feats + + def flow(self, link_ids, targets=None, shuffle=False): + """ + Creates a generator/sequence object for training or evaluation + with the supplied edge IDs and numeric targets. + + The edge IDs are the edges to train or inference on. They are + expected to by tuples of (source_id, destination_id). + + The targets are an array of numeric targets corresponding to the + supplied link_ids to be used by the downstream task. They should + be given in the same order as the list of link IDs. 
+ If they are not specified (for example, for use in prediction), + the targets will not be available to the downsteam task. + + Note that the shuffle argument should be True for training and + False for prediction. + + Args: + link_ids (list or UnsupervisedSampler): an iterable of (src_id, dst_id) tuples + specifying the edges or an UnsupervisedSampler object that has a generator + method to generate samples on the fly. + targets (optional, array): a 2D array of numeric targets with shape + `(len(link_ids), target_size)` + shuffle (optional, bool): If True the node_ids will be shuffled at each + epoch, if False the node_ids will be processed in order. + + Returns: + An OnDemandLinkSequence object to use with the attri2vec model. + """ + # Pass sampler to on-demand link sequence generation + if isinstance(link_ids, UnsupervisedSampler): + return OnDemandLinkSequence(self, link_ids) + + else: + raise TypeError( + "Argument to .flow not recognised. " + "Please pass a list of samples or a UnsupervisedSampler object." + ) From 8df75763855a7459beb266bb32ed09ada40b636c Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 7 Aug 2019 18:10:24 +1000 Subject: [PATCH 03/82] Add the implementation of attri2vec, and investigate its performance on out-of-sample node link prediction. 
--- .../stellargraph-attri2vec-DBLP.ipynb | 769 ++++++++++++++++++ 1 file changed, 769 insertions(+) create mode 100644 demos/embeddings/stellargraph-attri2vec-DBLP.ipynb diff --git a/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb new file mode 100644 index 000000000..8abfad166 --- /dev/null +++ b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb @@ -0,0 +1,769 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Infer Representations for Out-of-sample Nodes Through attri2vec" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the python implementation of the attri2vec algorithm outlined in paper ***[Attributed Network Embedding Via Subspace Discovery](https://arxiv.org/abs/1901.04095)*** D. Zhang, Y. Jie, X. Zhu and C. Zhang, arXiv:1901.04095, [cs.SI], 2019. The implementation uses the stellargraph libraries.\n", + "\n", + "## Dataset\n", + "This demo uses a DBLP citation network, a subgraph extracted from [DBLP-Citation-network V3](https://aminer.org/citation). To form this subgraph, papers from four subjects are extracted according to their venue information: *Database, Data Mining, Artificial Intelligence and Computer Vision*, and papers with no citations are removed. The DBLP network contains 18,448 papers and 45,661 citation relations. From paper titles, we construct 2,476-dimensional binary node feature vectors, with each element indicating the presence/absence of the corresponding word. By ignoring the citation direction, we take the DBLP subgraph as an undirected network.\n", + "\n", + "As papers in DBLP are attached with publication year, the DBLP network with the dynamic property can be used to study the problem of out-of-sample node representation learning. 
From the DBLP network, we construct four in-sample subgraphs using papers published before 2006, 2007, 2008 and 2009, and denote the four subgraphs as DBLP2006, DBLP2007, DBLP2008, and DBLP2009. For each subgraph, the remaining papers are taken as out-of-sample nodes. We consider the case where new coming nodes have no links. We predict the links of out-of-sample nodes using the learned out-of-sample node representations and compare its performance with the node content feature baseline.\n", + "\n", + "\n", + "\n", + "## attri2vec\n", + "\n", + "For networks attached with node content attributes, attri2vec infers node representations by discovering a latent node attribute subspace that respects network structure in a more consistent way. To transform network nodes from the original attribute space into the targeted subspace, a non-linear mapping is used. To make the mapped images respect structural similarity, [`DeepWalk`](https://dl.acm.org/citation.cfm?id=2623732)/[`node2vec`](https://snap.stanford.edu/node2vec) learning mechanism is used to make nodes sharing similar random walk context nodes represented closely in the subspace. Following [`DeepWalk`](https://dl.acm.org/citation.cfm?id=2623732)/[`node2vec`](https://snap.stanford.edu/node2vec), attri2vec learns node representations by maximizing the occurrence probability of context nodes conditioned on the representation of the target nodes. The probability is modelled by Softmax and negative sampling is used to speed up its calculation. This makes attri2vec equivalent to predict whether a node occurs in the given target node's context in random walks with the representation of the target node, by minimizing the cross-entropy loss. 
\n", + "\n", + "In implementation, node embeddings are learnt by solving a simple classification task: given a large set of \"positive\" `(target, context)` node pairs generated from random walks performed on the graph (i.e., node pairs that co-occur within a certain context window in random walks), and an equally large set of \"negative\" node pairs that are randomly selected from the graph according to a certain distribution, learn a binary classifier that predicts whether arbitrary node pairs are likely to co-occur in a random walk performed on the graph. Through learning this simple binary node-pair-classification task, the model automatically learns an inductive mapping from attributes of nodes to node embeddings in a low-dimensional vector space, which preserves structural and feature similarities of the nodes. Unlike the embeddings learned by DeepWalk/node2vec, the mapping is inductive. Different from the mapping generated by GraphSAGE, the mapping operates only on node content attributes and does not rely on any link information, which makes it possible to construct representations for newly arriving nodes that have little or no link information from their content attributes.\n", + "\n", + "To train the attri2vec model, we first construct a training set of node pairs, which is composed of an equal number of positive and negative `(target, context)` pairs from the graph. The positive `(target, context)` pairs are the node pairs co-occurring on random walks over the graph whereas the negative node pairs are sampled randomly from the global node degree distribution of the graph. In attri2vec, each node is attached with two kinds of embeddings: 1) the inductive 'input embedding', i.e., the objective embedding, obtained by performing a non-linear transformation on node content features, and 2) 'output embedding', i.e., the parameters used to predict its occurrence as a context node, obtained by looking up a parameter table. 
Given a `(target, context)` pair, attri2vec outputs a predictive value to indicate whether it is positive or negative, which is obtained by performing the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node, followed by a sigmoid activation. \n", + "\n", + "The entire model is trained end-to-end by minimizing the loss function of choice (e.g., binary cross-entropy between predicted node pair labels and true link labels) using stochastic gradient descent (SGD) updates of the model parameters, with minibatches of 'training' links generated on demand and fed into the model.\n", + "\n", + "In this demo, we first train the attri2vec model on the in-sample subgraph and obtain a mapping function from node attributes to node representations, then apply the mapping function to the content attributes of out-of-sample nodes and obtain the representations of out-of-sample nodes. We evaluate the quality of inferred out-of-sample node representations by using them to predict the links of out-of-sample nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "import networkx as nx\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import random\n", + "\n", + "import stellargraph as sg\n", + "from stellargraph.data import UniformRandomWalk\n", + "from stellargraph.data import UnsupervisedSampler\n", + "from stellargraph.mapper import attri2vecLinkGenerator\n", + "\n", + "import keras \n", + "from keras.utils import Sequence\n", + "from keras.models import Model\n", + "from keras.layers import Input, Dense, Reshape, merge\n", + "from keras.layers.embeddings import Embedding\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import roc_auc_score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading DBLP network data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset used in this demo can be downloaded from https://www.kaggle.com/daozhang/dblp-subgraph.\n", + "The following is the description of the dataset:\n", + "\n", + "> The content.txt file contains descriptions of the papers in the following format:\n", + "\n", + " \t\t \n", + " \n", + "> The first entry in each line contains the unique integer ID (ranging from 0 to 18,447) of the paper followed by > binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last two entries in the line are the class label and the publication year of the paper.\n", + "> The edgeList.txt file contains the citation relations. 
Each line describes a link in the following format:\n", + "\t\t\n", + " \n", + " \n", + "> Each line contains two paper IDs, with paper2 citing paper1 or paper1 citing paper2.\n", + "\n", + "\n", + "Download and unzip the dblp-subgraph.zip file to a location on your computer and set the `data_dir` variable to\n", + "point to the location of the dataset (the \"DBLP\" directory containing \"content.txt\" and \"edgeList.txt\")." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "data_dir = \"~/data/DBLP\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the graph from the edgelist." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "edgelist = pd.read_csv(os.path.join(data_dir, \"edgeList.txt\"), sep='\\t', header=None, names=[\"source\", \"target\"])\n", + "edgelist[\"label\"] = \"cites\" # set the edge type" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load paper content features, subjects and publishing years." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = [\"w_{}\".format(ii) for ii in range(2476)]\n", + "node_column_names = feature_names + [\"subject\", \"year\"]\n", + "node_data = pd.read_csv(os.path.join(data_dir, \"content.txt\"), sep='\\t', header=None, names=node_column_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get DBLP Subgraph \n", + "### with papers published before a threshold year" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the edge list connecting in-sample nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "year_thresh = 2006 # the threshold year for in-sample and out-of-sample set split, which can be 2007, 2008 and 2009\n", + "subgraph_edgelist = []\n", + "for ii in range(len(edgelist)):\n", + " source_index = edgelist[\"source\"][ii]\n", + " target_index = edgelist[\"target\"][ii]\n", + " source_year = int(node_data[\"year\"][source_index])\n", + " target_year = int(node_data[\"year\"][target_index])\n", + " if source_year < year_thresh and target_year < year_thresh:\n", + " subgraph_edgelist.append([source_index, target_index])\n", + "subgraph_edgelist = pd.DataFrame(np.array(subgraph_edgelist), columns=[\"source\", \"target\"])\n", + "subgraph_edgelist[\"label\"] = \"cites\" # set the edge type" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construct the network from the selected edge list." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "Gnx = nx.from_pandas_edgelist(subgraph_edgelist, edge_attr=\"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify node types." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "nx.set_node_attributes(Gnx, \"paper\", \"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get in-sample node features." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "subgraph_node_data = node_data[node_data[\"year\"]<2006]\n", + "subgraph_node_features = subgraph_node_data[feature_names]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the Stellargraph with node features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "G = sg.StellarGraph(Gnx, node_features=subgraph_node_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "StellarGraph: Undirected multigraph\n", + " Nodes: 11776, Edges: 28937\n", + "\n", + " Node types:\n", + " paper: [11776]\n", + " Edge types: paper-cites->paper\n", + "\n", + " Edge types:\n", + " paper-cites->paper: [28937]\n", + "\n" + ] + } + ], + "source": [ + "print(G.info())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train attri2vec on the DBLP Subgraph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the other optional parameter values: root nodes, the number of walks to take per node, and the length of each walk." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "nodes = list(G.nodes())\n", + "number_of_walks = 1\n", + "length = 5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the UnsupervisedSampler instance with the relevant parameters passed to it." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "unsupervised_samples = UnsupervisedSampler(G, nodes=nodes, length=length, number_of_walks=number_of_walks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the batch size and the number of epochs. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 50\n", + "epochs = 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define the attri2vec training generator, which generates a batch of (feature of target node, index of context node, label of node pair) pairs per iteration." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running attri2vecLinkGenerator with an estimated 2355 batches generated on the fly per epoch.\n" + ] + } + ], + "source": [ + "train_gen = attri2vecLinkGenerator(G, batch_size).flow(unsupervised_samples)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following is the Keras implementation of attri2vec. The objective of attri2vec is equivalent to minimizing the cross-entropy loss for predicting the context nodes with target nodes, with the predicted output being the dot product of the \"input embedding\" of target nodes and the \"output embedding\" of context nodes, followed by a sigmoid activation. The \"input embedding\", the objective node representation, is constructed from node content features through a linear transformation followed by a sigmoid activation. The \"output embedding\" is a look-up table for each context node." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Logging before flag parsing goes to stderr.\n", + "W0807 17:47:53.407229 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. 
Please use tf.compat.v1.get_default_graph instead.\n", + "\n", + "W0807 17:47:53.423400 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", + "\n", + "W0807 17:47:53.428951 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", + "\n", + "W0807 17:47:53.504508 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", + "\n", + "W0807 17:47:53.510684 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.\n", + "\n", + "W0807 17:47:53.517368 140736734368704 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" + ] + } + ], + "source": [ + "embedding_size = 256 # set the embedding dimension to 256.\n", + "input_target = Input((G.node_feature_sizes()[\"paper\"],)) # define the input for target nodes as node content features \n", + "input_context = Input((1,)) # define the input for context nodes as node ids\n", + "\n", + "target = Dense(embedding_size, activation='sigmoid')(input_target) # get \"input embedding\" for the target node\n", + "target = Reshape((embedding_size, 1))(target) \n", + "out_embedding = Embedding(len(node_data), embedding_size, input_length=1, name='out_embedding')\n", 
+ "context = out_embedding(input_context) # get the \"output embedding\" for the context node\n", + "context = Reshape((embedding_size, 1))(context)\n", + "\n", + "# perform dot product between the \"input embedding\" and the \"output embedding\"\n", + "dot_product = keras.layers.dot([target, context], axes = 1, normalize=False) \n", + "dot_product = Reshape((1,))(dot_product) \n", + "\n", + "output = Dense(1, activation='sigmoid')(dot_product) # get the predicted output\n", + "\n", + "model = Model(inputs=[input_target, input_context], outputs=output)\n", + "model.compile(\n", + " optimizer=keras.optimizers.Adam(lr=1e-3),\n", + " loss=keras.losses.binary_crossentropy,\n", + " metrics=[keras.metrics.binary_accuracy],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train the model." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0807 17:47:56.943290 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. 
Please use tf.compat.v1.assign_add instead.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/4\n", + "2356/2356 [==============================] - 136s 58ms/step - loss: 0.7146 - binary_accuracy: 0.5173\n", + "Epoch 2/4\n", + "2356/2356 [==============================] - 136s 58ms/step - loss: 0.6876 - binary_accuracy: 0.5374\n", + "Epoch 3/4\n", + "2356/2356 [==============================] - 136s 58ms/step - loss: 0.6759 - binary_accuracy: 0.5468\n", + "Epoch 4/4\n", + "2356/2356 [==============================] - 136s 58ms/step - loss: 0.6571 - binary_accuracy: 0.5578\n" + ] + } + ], + "source": [ + "history = model.fit_generator(\n", + " train_gen,\n", + " epochs=epochs,\n", + " verbose=1,\n", + " use_multiprocessing=False,\n", + " workers=0,\n", + " shuffle=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predicting links of out-of-sample nodes with the learned attri2vec model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the node embeddings, for both in-sample and out-of-sample nodes, by applying the learned mapping function to node content features." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_model = Model(inputs = input_target, outputs = Reshape((embedding_size,))(target))\n", + "node_embeddings = embedding_model.predict(node_data[feature_names].values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the positive and negative edges for in-sample nodes and out-of-sample nodes. The edges of the in-sample nodes only include the edges between in-sample nodes, and the edges of out-of-sample nodes refer to all the edges linked to out-of-sample nodes, including the edges connecting in-sample and out-of-sample nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "year_thresh = 2006\n", + "in_sample_edges = []\n", + "out_of_sample_edges = []\n", + "for ii in range(len(edgelist)):\n", + " source_index = edgelist[\"source\"][ii]\n", + " target_index = edgelist[\"target\"][ii]\n", + " if source_index > target_index: # neglect edge direction for the undirected graph\n", + " continue\n", + " source_year = int(node_data[\"year\"][source_index])\n", + " target_year = int(node_data[\"year\"][target_index])\n", + " if source_year < year_thresh and target_year < year_thresh:\n", + " in_sample_edges.append([source_index, target_index, 1]) # get the positive edge\n", + " negative_target_index = unsupervised_samples.random.choices(node_data.index.tolist(), k=1) # generate negative node\n", + " in_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative node\n", + " else:\n", + " out_of_sample_edges.append([source_index, target_index, 1]) # get the positive edge\n", + " negative_target_index = unsupervised_samples.random.choices(node_data.index.tolist(), k=1) # generate negative node\n", + " out_of_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative node\n", + "in_sample_edges = np.array(in_sample_edges)\n", + "out_of_sample_edges = np.array(out_of_sample_edges)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construct the edge features from the learned node representations with l2 normed difference, where edge features are the element-wise square of the difference between the embeddings of two head nodes. Other strategy like element-wise product can also be used to construct edge features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "in_sample_edge_feat_from_emb = (node_embeddings[in_sample_edges[:,0]]-node_embeddings[in_sample_edges[:,1]])**2\n", + "out_of_sample_edge_feat_from_emb = (node_embeddings[out_of_sample_edges[:,0]]-node_embeddings[out_of_sample_edges[:,1]])**2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train the Logistic Regression classifier from in-sample edges with the edge features constructed from attri2vec embeddings. " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=500,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf_edge_pred_from_emb = LogisticRegression(verbose=0, solver='lbfgs', multi_class=\"auto\", max_iter=500)\n", + "clf_edge_pred_from_emb.fit(in_sample_edge_feat_from_emb, in_sample_edges[:,2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Predict the edge existence probability with the trained Logistic Regression classifier." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "edge_pred_from_emb = clf_edge_pred_from_emb.predict_proba(out_of_sample_edge_feat_from_emb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the positive class index of `edge_pred_from_emb`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "if clf_edge_pred_from_emb.classes_[0] == 1:\n", + " positive_class_index = 0\n", + "else:\n", + " positive_class_index = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluate the AUC score for the prediction with attri2vec embeddings." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.736614362152765" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_auc_score(out_of_sample_edges[:,2], edge_pred_from_emb[:,positive_class_index])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the baseline, we also investigate the performance of node content features in predicting the edges of out-of-sample nodes. Firstly, we construct edge features from node content features with the same strategy." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "in_sample_edge_rep_from_feat = (node_data[feature_names].values[in_sample_edges[:,0]]-node_data[feature_names].values[in_sample_edges[:,1]])**2\n", + "out_of_sample_edge_rep_from_feat = (node_data[feature_names].values[out_of_sample_edges[:,0]]-node_data[feature_names].values[out_of_sample_edges[:,1]])**2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we train the Logistic Regression classifier from in-sample edges with the edge features constructed from node content features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=500,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf_edge_pred_from_feat = LogisticRegression(verbose=0, solver='lbfgs', multi_class=\"auto\", max_iter=500)\n", + "clf_edge_pred_from_feat.fit(in_sample_edge_rep_from_feat, in_sample_edges[:,2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Predict the edge existence probability with the trained Logistic Regression classifier." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "edge_pred_from_feat = clf_edge_pred_from_feat.predict_proba(out_of_sample_edge_rep_from_feat)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get positive class index of `clf_edge_pred_from_feat`." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "if clf_edge_pred_from_feat.classes_[0] == 1:\n", + " positive_class_index = 0\n", + "else:\n", + " positive_class_index = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluate the AUC score for the prediction with node content features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6651658318918301" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_auc_score(out_of_sample_edges[:,2], edge_pred_from_feat[:,positive_class_index])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "attri2vec performs much better than node content features in predicting the links of out-of-sample nodes." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 46a77dd4429925479d08229872e1a797794586c5 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Thu, 8 Aug 2019 12:52:25 +1000 Subject: [PATCH 04/82] reformat link_mappers.py with black --- .DS_Store | Bin 0 -> 6148 bytes .gitignore | 1 + demos/.DS_Store | Bin 0 -> 6148 bytes requirements.txt | 1 - stellargraph/.DS_Store | Bin 0 -> 6148 bytes stellargraph/mapper/link_mappers.py | 24 +++++++++++------------- 6 files changed, 12 insertions(+), 14 deletions(-) create mode 100644 .DS_Store create mode 100644 demos/.DS_Store create mode 100644 stellargraph/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a7c5630e292e8a22ff249f7a06d0ea30027f092a GIT binary patch literal 6148 zcmeHK%}&BV5T1pi#h=K*M2>s$#sLDz$)rI%cr(H1K@GGifhMI(Xpv~e^fmO2d;(v` z*&j?1xp*+9*=c6K*`L|Xe97)~0D!b-ZVjLW046F&V+WfbgwJzblbjkc5Q*3$bmrm2 z@tL!>Ru5<+kqYsl zI}Lmm3~`iu>_G?-hVSEN!xV<7HcViYddf0I#m0cY5woOEV^$eZ7slMpYJfT}&46a$ z*9`FYfueG>6ebeI(}9gB0TB6bgb=iOEx~=LLQ7#H5ob_{bVZb|M3opsy5l%i>nw$d 
zMClGh6(2;MOjLzJ#Ob&{70ZEG5_PK?&;F|J-O~(c1~SC}F*@B& z8$(jLbuBoaYbk0Ul?z{&NE9yEsB|n0o{Gz;LU4|91JP2LNW>8o{Uab~(2Zu`PZ{_G DyGxoe literal 0 HcmV?d00001 diff --git a/.gitignore b/.gitignore index 96931b78f..08e33bc8b 100644 --- a/.gitignore +++ b/.gitignore @@ -117,3 +117,4 @@ demos/graphsage-test/log # Vim session Session.vim +*.txt diff --git a/demos/.DS_Store b/demos/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..e03e0ae2fe6fa4a57617c72c9464fc7a55199887 GIT binary patch literal 6148 zcmeHKPfG$p6n~>uYY}+}@;H}n9uzfC!D4pk78rG?&|Ow^L3hPnlQLt!Mt!3`L0_l$ zW=2u8QxGBV!JFTk_h*Lr?J|D=K-9-x2_Odm7Aj#Xi_He1IO&?CjE7LDF+A|VhXA?| zt!1<0KQchyZXY@@02kY(@8?dUq1z8#jDNbrz8}TK;+IIJGdsI`RyJ#853R>$FPt=E zCmu(2XLPT6zSnV$eG`WLh3&YverM9k9bbe&?D#=vw3PmE=LcRmsfXjhi_|#RPTa#< zHBl>x*gXfp&>+J1aGc53DRB~i+4j*W2 zTQZ5z7($n$Uz$AIV4_i%gHSW$IA&&HUMND%j((}q zLD(9(Wd@jmRR+?!o1^pp^5^&eY7zIC0cK#U7!a98vr)$-nX`3madg&7)HW&!ud6b;-k1AofEH@Nsy*Z=?k literal 0 HcmV?d00001 diff --git a/requirements.txt b/requirements.txt index 0f8c4fe20..4ea9305e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,3 @@ pytest==3.9.3 pytest-benchmark>=3.1 pytest-cov>=2.6.0 coveralls>=1.5.1 - diff --git a/stellargraph/.DS_Store b/stellargraph/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..79bde8f3936a96fe721af05b76d9e85f538a6d49 GIT binary patch literal 6148 zcmeHK%}T>S5Z-N5O%*W*!5;VGt%sJh_9TQ@58i}`9#qZMLI|5vw~dg75K=)YDm&18BXpkXid4ji0*Kijv6Cee zCy1PtwQ@i*AQ||74A8qm66BN^NUft=HB1o_c4EqRdL| zbefoUe8YOa>p8-{jH2Mvww){A%LdxPX%wcmAA0e^^4n`abfe6SrlFg#eJ<>D1itK` zf!4@#UGFxD-su)ilJ|P8Ch6$MMWL#PN9~jI;du6#KNZg}oK*}@5R-j@b9e<~A;bqO z4udGXhf(UWg~ky41im#6e<{c>1k7oYxM>WC3#0CCTHrEX$$(_wR|e?&KtL%P0!xnU z>41in0D$y2f(h8PmY_aFpdqm22s0p-Fb%wx_BX>tw6dzcej75cl z+3B!9VapLVILcNsAQ@O@pvt=%&HraV*Z<2-vL_jk46GFcSm|4R6G9TXbtO2OYYFHO rlnPx}a%3*hh;+;hnu;5sOkj>s1JDpya)c2O`6Iw+kd0*EPZ{_C2;P+@ literal 0 HcmV?d00001 diff --git a/stellargraph/mapper/link_mappers.py 
b/stellargraph/mapper/link_mappers.py index 5b086cda3..a791fac85 100755 --- a/stellargraph/mapper/link_mappers.py +++ b/stellargraph/mapper/link_mappers.py @@ -24,7 +24,7 @@ "OnDemandLinkSequence", "GraphSAGELinkGenerator", "HinSAGELinkGenerator", - "attri2vecLinkGenerator" + "attri2vecLinkGenerator", ] import random @@ -208,16 +208,16 @@ def __init__(self, generator, walker): if isinstance(self.generator, GraphSAGELinkGenerator): print( - "Running GraphSAGELinkGenerator with an estimated {} batches generated on the fly per epoch.".format( - round(self.data_size / self.generator.batch_size) - ) - ) + "Running GraphSAGELinkGenerator with an estimated {} batches generated on the fly per epoch.".format( + round(self.data_size / self.generator.batch_size) + ) + ) else: print( - "Running attri2vecLinkGenerator with an estimated {} batches generated on the fly per epoch.".format( - round(self.data_size / self.generator.batch_size) - ) - ) + "Running attri2vecLinkGenerator with an estimated {} batches generated on the fly per epoch.".format( + round(self.data_size / self.generator.batch_size) + ) + ) self._gen = self.walker.generator( self.generator.batch_size @@ -616,6 +616,7 @@ def flow(self, link_ids, targets=None, shuffle=False): return LinkSequence(self, link_ids, targets, shuffle) + class attri2vecLinkGenerator: """ A data generator for link prediction with Homogeneous attri2vec models @@ -643,7 +644,7 @@ class attri2vecLinkGenerator: name, optional: Name of generator """ - def __init__(self, G, batch_size, num_samples=[1,1], seed=None, name=None): + def __init__(self, G, batch_size, num_samples=[1, 1], seed=None, name=None): if not isinstance(G, StellarGraphBase): raise TypeError("Graph must be a StellarGraph object.") @@ -657,9 +658,6 @@ def __init__(self, G, batch_size, num_samples=[1,1], seed=None, name=None): # We need a schema for compatibility with HinSAGE self.schema = G.create_graph_schema(create_type_maps=True) - # The sampler used to generate random 
samples of neighbours - #self.sampler = SampledBreadthFirstWalk(G, graph_schema=self.schema, seed=seed) - def sample_features(self, head_links, sampling_schema): """ Sample content features of the target nodes and the ids of the context nodes From 189a6122aece4b9d3fe9090ee6f213a9dafe5355 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Thu, 8 Aug 2019 13:03:10 +1000 Subject: [PATCH 05/82] Update .gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 08e33bc8b..96931b78f 100644 --- a/.gitignore +++ b/.gitignore @@ -117,4 +117,3 @@ demos/graphsage-test/log # Vim session Session.vim -*.txt From 158f32b6719b046bd3e02507b73f4716a953a630 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Thu, 8 Aug 2019 13:09:43 +1000 Subject: [PATCH 06/82] Delete .DS_Store --- stellargraph/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 stellargraph/.DS_Store diff --git a/stellargraph/.DS_Store b/stellargraph/.DS_Store deleted file mode 100644 index 79bde8f3936a96fe721af05b76d9e85f538a6d49..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5Z-N5O%*W*!5;VGt%sJh_9TQ@58i}`9#qZMLI|5vw~dg75K=)YDm&18BXpkXid4ji0*Kijv6Cee zCy1PtwQ@i*AQ||74A8qm66BN^NUft=HB1o_c4EqRdL| zbefoUe8YOa>p8-{jH2Mvww){A%LdxPX%wcmAA0e^^4n`abfe6SrlFg#eJ<>D1itK` zf!4@#UGFxD-su)ilJ|P8Ch6$MMWL#PN9~jI;du6#KNZg}oK*}@5R-j@b9e<~A;bqO z4udGXhf(UWg~ky41im#6e<{c>1k7oYxM>WC3#0CCTHrEX$$(_wR|e?&KtL%P0!xnU z>41in0D$y2f(h8PmY_aFpdqm22s0p-Fb%wx_BX>tw6dzcej75cl z+3B!9VapLVILcNsAQ@O@pvt=%&HraV*Z<2-vL_jk46GFcSm|4R6G9TXbtO2OYYFHO rlnPx}a%3*hh;+;hnu;5sOkj>s1JDpya)c2O`6Iw+kd0*EPZ{_C2;P+@ From 1c24d09321e9fadabf0579b43feeeca0f8c1f253 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Thu, 8 Aug 2019 13:11:23 +1000 Subject: [PATCH 07/82] Delete .DS_Store --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store 
b/.DS_Store deleted file mode 100644 index a7c5630e292e8a22ff249f7a06d0ea30027f092a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}&BV5T1pi#h=K*M2>s$#sLDz$)rI%cr(H1K@GGifhMI(Xpv~e^fmO2d;(v` z*&j?1xp*+9*=c6K*`L|Xe97)~0D!b-ZVjLW046F&V+WfbgwJzblbjkc5Q*3$bmrm2 z@tL!>Ru5<+kqYsl zI}Lmm3~`iu>_G?-hVSEN!xV<7HcViYddf0I#m0cY5woOEV^$eZ7slMpYJfT}&46a$ z*9`FYfueG>6ebeI(}9gB0TB6bgb=iOEx~=LLQ7#H5ob_{bVZb|M3opsy5l%i>nw$d zMClGh6(2;MOjLzJ#Ob&{70ZEG5_PK?&;F|J-O~(c1~SC}F*@B& z8$(jLbuBoaYbk0Ul?z{&NE9yEsB|n0o{Gz;LU4|91JP2LNW>8o{Uab~(2Zu`PZ{_G DyGxoe From c53a5cd2f7bd9ebc1067466f55b0bd57097a32e6 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Thu, 8 Aug 2019 13:11:51 +1000 Subject: [PATCH 08/82] Delete .DS_Store --- demos/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 demos/.DS_Store diff --git a/demos/.DS_Store b/demos/.DS_Store deleted file mode 100644 index e03e0ae2fe6fa4a57617c72c9464fc7a55199887..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKPfG$p6n~>uYY}+}@;H}n9uzfC!D4pk78rG?&|Ow^L3hPnlQLt!Mt!3`L0_l$ zW=2u8QxGBV!JFTk_h*Lr?J|D=K-9-x2_Odm7Aj#Xi_He1IO&?CjE7LDF+A|VhXA?| zt!1<0KQchyZXY@@02kY(@8?dUq1z8#jDNbrz8}TK;+IIJGdsI`RyJ#853R>$FPt=E zCmu(2XLPT6zSnV$eG`WLh3&YverM9k9bbe&?D#=vw3PmE=LcRmsfXjhi_|#RPTa#< zHBl>x*gXfp&>+J1aGc53DRB~i+4j*W2 zTQZ5z7($n$Uz$AIV4_i%gHSW$IA&&HUMND%j((}q zLD(9(Wd@jmRR+?!o1^pp^5^&eY7zIC0cK#U7!a98vr)$-nX`3madg&7)HW&!ud6b;-k1AofEH@Nsy*Z=?k From 4517c9f9748aecb33cf54b58d5c03289934471b0 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Thu, 8 Aug 2019 13:33:24 +1000 Subject: [PATCH 09/82] Update README.md --- demos/embeddings/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index db2c667f7..7f8544371 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -4,11 +4,13 @@ This folder contains three 
[Jupyter](http://jupyter.org/) python notebooks demon **Node2Vec** and **Metapath2Vec** notebooks demonstrate the combined use of `stellargraph` and `Gensim` [4] libraries for representation learning on homogeneous and heterogeneous graphs. **Unsupervised GraphSAGE** notebook demonstrates the use of `Stellargraph` library's GraphSAGE implementation for unsupervised learning of node embeddings for homogeneous graphs with node features. +**attri2vec** notebook demonstrates the implementation of attri2vec with the `Stellargraph` library for unsupervised inductive learning of node embeddings for homogeneous graphs with node features, and the evaluation for its ablity to infer the representations of out-of-sample nodes with the out-of-sample node link prediction task. The notebooks demonstrate the following algorithms. - `stellargraph-node2vec.ipynb` The **Node2Vec** algorithm [1] for representation learning on homogeneous graphs - `stellargraph-metapath2vec.ipynb` The **Metapath2Vec** algorithm [2] for representation learning on heterogeneous graphs. - `embeddings-unsupervised-graphsage-cora.ipynb` The **Unsupervised GraphSAGE** algorithm [5] for representation learning on homogeneous graphs with node features. +- `stellargraph-attri2vec-DBLP.ipynb` The **attri2vec** algorithm [6] for representation learning on homogeneous graphs with node features. All examples demonstrate how to calculate embedding vectors for a graph's nodes in just a few lines of Python code. The learned node representations can be used in numerous downstream tasks such as node attribute inference, link @@ -32,3 +34,5 @@ G. S. Corrado, and J. Dean. In Advances in Neural Information Processing System **5.** Inductive Representation Learning on Large Graphs. W.L. Hamilton, R. Ying, and J. Leskovec arXiv:1706.02216 [cs.SI], 2017. ([link](http://snap.stanford.edu/graphsage/)) + +**6.** Attributed Network Embedding Via Subspace Discovery. D. Zhang, Y. Jie, X. Zhu and C. 
Zhang, arXiv:1901.04095, [cs.SI], 2019. ([link](https://arxiv.org/abs/1901.04095)) From 947c843879eb4811e2c51a68483d7aa691e9a2b9 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Thu, 8 Aug 2019 13:33:46 +1000 Subject: [PATCH 10/82] Update README.md --- demos/embeddings/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 7f8544371..b62eb6a0a 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -35,4 +35,4 @@ G. S. Corrado, and J. Dean. In Advances in Neural Information Processing System **5.** Inductive Representation Learning on Large Graphs. W.L. Hamilton, R. Ying, and J. Leskovec arXiv:1706.02216 [cs.SI], 2017. ([link](http://snap.stanford.edu/graphsage/)) -**6.** Attributed Network Embedding Via Subspace Discovery. D. Zhang, Y. Jie, X. Zhu and C. Zhang, arXiv:1901.04095, [cs.SI], 2019. ([link](https://arxiv.org/abs/1901.04095)) +**6.** Attributed Network Embedding via Subspace Discovery. D. Zhang, Y. Jie, X. Zhu and C. Zhang, arXiv:1901.04095, [cs.SI], 2019. ([link](https://arxiv.org/abs/1901.04095)) From a37f67c3cc723b8b0c25e007f32fb70b77a4f0c9 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Thu, 8 Aug 2019 13:34:56 +1000 Subject: [PATCH 11/82] Update README.md --- demos/embeddings/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index b62eb6a0a..18f4aa190 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -4,7 +4,7 @@ This folder contains three [Jupyter](http://jupyter.org/) python notebooks demon **Node2Vec** and **Metapath2Vec** notebooks demonstrate the combined use of `stellargraph` and `Gensim` [4] libraries for representation learning on homogeneous and heterogeneous graphs. 
**Unsupervised GraphSAGE** notebook demonstrates the use of `Stellargraph` library's GraphSAGE implementation for unsupervised learning of node embeddings for homogeneous graphs with node features. -**attri2vec** notebook demonstrates the implementation of attri2vec with the `Stellargraph` library for unsupervised inductive learning of node embeddings for homogeneous graphs with node features, and the evaluation for its ablity to infer the representations of out-of-sample nodes with the out-of-sample node link prediction task. +**attri2vec** notebook demonstrates the implementation of attri2vec with the `Stellargraph` library for unsupervised inductive learning of node embeddings for homogeneous graphs with node features, and the evaluation for its ability to infer the representations of out-of-sample nodes with the out-of-sample node link prediction task. The notebooks demonstrate the following algorithms. - `stellargraph-node2vec.ipynb` The **Node2Vec** algorithm [1] for representation learning on homogeneous graphs From 50ccc0cf7cfa25cb68f45c22459ef08cebcec483 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:08:58 +1000 Subject: [PATCH 12/82] rewrite the attri2vec demo by using the Attri2Vec model, Attri2VecNodeGenerator and Attri2VecLinkGenerator --- .../stellargraph-attri2vec-DBLP.ipynb | 342 +++++++++++++----- 1 file changed, 256 insertions(+), 86 deletions(-) diff --git a/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb index 8abfad166..47aefb365 100644 --- a/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb +++ b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Infer Representations for Out-of-sample Nodes Through attri2vec" + "# Inferring Representations for Out-of-sample Nodes Through attri2vec" ] }, { @@ -26,9 +26,9 @@ "\n", "In implementation, node embeddings are learnt by solving a simple 
classification task: given a large set of \"positive\" `(target, context)` node pairs generated from random walks performed on the graph (i.e., node pairs that co-occur within a certain context window in random walks), and an equally large set of \"negative\" node pairs that are randomly selected from the graph according to a certain distribution, learn a binary classifier that predicts whether arbitrary node pairs are likely to co-occur in a random walk performed on the graph. Through learning this simple binary node-pair-classification task, the model automatically learns an inductive mapping from attributes of nodes to node embeddings in a low-dimensional vector space, which preserves structural and feature similarities of the nodes. Unlike the embeddings learned by DeepWalk/node2vec, the mapping is inductive. Different from the mapping generated by GraphSAGE, the mapping only performs on node content attributes and does not rely on any link information, which makes it possible to construct representations for new coming nodes having few or no link information from their content attributes.\n", "\n", - "To train the attri2vec model, we first construct a training set of nodes, which is composed of an equal number of positive and negative `(target, context)` pairs from the graph. The positive `(target, context)` pairs are the node pairs co-occurring on random walks over the graph whereas the negative node pairs are the sampled randomly from the global node degree distribution of the graph. In attri2vec, each node is attached with two kinds of embeddings: 1) the inductive 'input embedding', i.e, the objective embedding, obtained by perform a non-linear transformation on node content features, and 2) 'output embedding', i.e., the parameters used to predict its occurrence as a context node, obtained by looking up a parameter table. 
Given a `(target, context)`, attri2vec outputs a predictive value to indicate whether it is positive or negative, which is obtained by performing the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node, followed by a sigmoid activation. \n", + "To train the attri2vec model, we first construct a training set of nodes, which is composed of an equal number of positive and negative `(target, context)` pairs from the graph. The positive `(target, context)` pairs are the node pairs co-occurring on random walks over the graph whereas the negative node pairs are sampled randomly from the global node degree distribution of the graph. In attri2vec, each node is attached with two kinds of embeddings: 1) the inductive 'input embedding', i.e., the objective embedding, obtained by performing a non-linear transformation on node content features, and 2) 'output embedding', i.e., the parameter vector used to predict its occurrence as a context node, obtained by looking up a parameter table. Given a `(target, context)` pair, attri2vec outputs a predictive value to indicate whether it is positive or negative, which is obtained by performing the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node, followed by a sigmoid activation. 
\n", "\n", - "The entire model is trained end-to-end by minimizing the loss function of choice (e.g., binary cross-entropy between predicted node pair labels and true link labels) using stochastic gradient descent (SGD) updates of the model parameters, with minibatches of 'training' links generated on demand and fed into the model.\n", + "The entire model is trained end-to-end by minimizing the binary cross-entropy loss function with regards to predicted node pair labels and true node pair labels, using stochastic gradient descent (SGD) updates of the model parameters, with minibatches of 'training' node pairs generated on demand and fed into the model.\n", "\n", "In this demo, we first train the attri2vec model on the in-sample subgraph and obtain a mapping function from node attributes to node representations, then apply the mapping function to the content attributes of out-of-sample nodes and obtain the representations of out-of-sample nodes. We evaluate the quality of inferred out-of-sample node representations by using it to predict the links of out-of-sample nodes." 
] @@ -56,16 +56,13 @@ "import stellargraph as sg\n", "from stellargraph.data import UniformRandomWalk\n", "from stellargraph.data import UnsupervisedSampler\n", - "from stellargraph.mapper import attri2vecLinkGenerator\n", + "from stellargraph.mapper import Attri2VecLinkGenerator\n", + "from stellargraph.mapper.node_mappers import Attri2VecNodeGenerator\n", + "from stellargraph.layer import Attri2Vec, link_classification\n", "\n", - "import keras \n", - "from keras.utils import Sequence\n", - "from keras.models import Model\n", - "from keras.layers import Input, Dense, Reshape, merge\n", - "from keras.layers.embeddings import Embedding\n", + "import keras\n", "\n", "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import roc_auc_score" ] }, @@ -87,7 +84,7 @@ "\n", " \t\t \n", " \n", - "> The first entry in each line contains the unique integer ID (ranging from 0 to 18,447) of the paper followed by > binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last two entries in the line are the class label and the publication year of the paper.\n", + "> The first entry in each line contains the unique integer ID (ranging from 0 to 18,447) of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last two entries in the line are the class label and the publication year of the paper.\n", "> The edgeList.txt file contains the citation relations. Each line describes a link in the following format:\n", "\t\t\n", " \n", @@ -143,6 +140,96 @@ "node_data = pd.read_csv(os.path.join(data_dir, \"content.txt\"), sep='\\t', header=None, names=node_column_names)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construct the whole graph from edge list." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "G_all_nx = nx.from_pandas_edgelist(edgelist, edge_attr=\"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify node types." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "nx.set_node_attributes(G_all_nx, \"paper\", \"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get node features." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "all_node_features = node_data[feature_names]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the Stellargraph with node features." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "G_all = sg.StellarGraph(G_all_nx, node_features=all_node_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "StellarGraph: Undirected multigraph\n", + " Nodes: 18448, Edges: 45611\n", + "\n", + " Node types:\n", + " paper: [18448]\n", + " Edge types: paper-cites->paper\n", + "\n", + " Edge types:\n", + " paper-cites->paper: [45611]\n", + "\n" + ] + } + ], + "source": [ + "print(G_all.info())" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -160,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -186,11 +273,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "Gnx = nx.from_pandas_edgelist(subgraph_edgelist, edge_attr=\"label\")" + "G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist, edge_attr=\"label\")" ] }, { @@ -202,11 +289,11 @@ }, { "cell_type": "code", - 
"execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "nx.set_node_attributes(Gnx, \"paper\", \"label\")" + "nx.set_node_attributes(G_sub_nx, \"paper\", \"label\")" ] }, { @@ -218,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -235,16 +322,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "G = sg.StellarGraph(Gnx, node_features=subgraph_node_features)" + "G_sub = sg.StellarGraph(G_sub_nx, node_features=subgraph_node_features)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -265,7 +352,7 @@ } ], "source": [ - "print(G.info())" + "print(G_sub.info())" ] }, { @@ -284,12 +371,12 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "nodes = list(G.nodes())\n", - "number_of_walks = 1\n", + "nodes = list(G_sub.nodes())\n", + "number_of_walks = 2\n", "length = 5" ] }, @@ -302,11 +389,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "unsupervised_samples = UnsupervisedSampler(G, nodes=nodes, length=length, number_of_walks=number_of_walks)" + "unsupervised_samples = UnsupervisedSampler(G_sub, nodes=nodes, length=length, number_of_walks=number_of_walks)" ] }, { @@ -318,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -330,36 +417,38 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Define the attri2vec training generator, which generates a batch of (feature of target node, index of context node, label of node pair) pairs per iteration." 
+ "Define an attri2vec training generator, which generates a batch of (feature of target node, index of context node, label of node pair) pairs per iteration." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Running attri2vecLinkGenerator with an estimated 2355 batches generated on the fly per epoch.\n" + "Running Attri2VecLinkGenerator with an estimated 4710 batches generated on the fly per epoch.\n" ] } ], "source": [ - "train_gen = attri2vecLinkGenerator(G, batch_size).flow(unsupervised_samples)" + "train_gen = Attri2VecLinkGenerator(G_sub, batch_size).flow(unsupervised_samples)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The following is the keras implementation of attri2vec. The objective attri2vec is equivalent to minimize the cross-entropy loss for predicting the context nodes with target nodes, with the predicted output being the dot product of the \"input embedding\" of target nodes and the \"output embedding\" of target nodes, followed by a sigmoid activation. The \"input embedding\", the objective node representation, is constructed from node content features through a linear transformation followed by a sigmoid activation. The \"out embedding\" is a look-up table for each context node." + "Building the model: a 1-hidden-layer node representation ('input embedding') of the `target` node and the parameter vector ('output embedding') for predicting the existence of the `context` node for each `(target, context)` pair, with a link classification layer performed on the dot product of the 'input embedding' of the `target` node and the 'output embedding' of the `context` node.\n", + "\n", + "Attri2Vec part of the model, with a 128-dimensional hidden layer, no bias term, no dropout and no normalization. (Dropout can be switched on by specifying a positive dropout rate, 0 < dropout < 1 and normalization can be set to 'l2'). 
" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -367,40 +456,94 @@ "output_type": "stream", "text": [ "WARNING: Logging before flag parsing goes to stderr.\n", - "W0807 17:47:53.407229 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", - "\n", - "W0807 17:47:53.423400 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", - "\n", - "W0807 17:47:53.428951 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", + "W0821 16:57:15.264071 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", + "\n" + ] + } + ], + "source": [ + "layer_sizes = [128]\n", + "attri2vec = Attri2Vec(\n", + " layer_sizes=layer_sizes, generator=train_gen, bias=False, dropout=0.0, normalize=\"None\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0821 16:57:18.045172 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. 
Please use tf.compat.v1.placeholder instead.\n", "\n", - "W0807 17:47:53.504508 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", + "W0821 16:57:18.051191 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", + "\n" + ] + } + ], + "source": [ + "# Build the model and expose input and output sockets of attri2vec, for node pair inputs:\n", + "x_inp, x_out = attri2vec.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the link_classification function to generate the prediction, with the 'ip' edge embedding generation method and the 'sigmoid' activation, which actually performs the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node followed by a sigmoid activation. " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "link_classification: using 'ip' method to combine node embeddings into edge embeddings\n" + ] + } + ], + "source": [ + "prediction = link_classification(\n", + " output_dim=1, output_act=\"sigmoid\", edge_embedding_method='ip'\n", + ")(x_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Stack the Attri2Vec encoder and prediction layer into a Keras model, and specify the loss." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0821 16:57:23.894735 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. 
Please use tf.compat.v1.train.Optimizer instead.\n", "\n", - "W0807 17:47:53.510684 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.\n", + "W0821 16:57:23.901146 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.\n", "\n", - "W0807 17:47:53.517368 140736734368704 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "W0821 16:57:23.909626 140736102417344 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" ] } ], "source": [ - "embedding_size = 256 # set the embedding dimension to 256.\n", - "input_target = Input((G.node_feature_sizes()[\"paper\"],)) # define the input for target nodes as node content features \n", - "input_context = Input((1,)) # define the input for context nodes as node ids\n", - "\n", - "target = Dense(embedding_size, activation='sigmoid')(input_target) # get \"input embedding\" for the target node\n", - "target = Reshape((embedding_size, 1))(target) \n", - "out_embedding = Embedding(len(node_data), embedding_size, input_length=1, name='out_embedding')\n", - "context = out_embedding(input_context) # get the \"output embedding\" for the context node\n", - "context = Reshape((embedding_size, 1))(context)\n", + "model = keras.Model(inputs=x_inp, outputs=prediction)\n", "\n", - "# perform dot product between the 
\"input embedding\" and the \"output embedding\"\n", - "dot_product = keras.layers.dot([target, context], axes = 1, normalize=False) \n", - "dot_product = Reshape((1,))(dot_product) \n", - "\n", - "output = Dense(1, activation='sigmoid')(dot_product) # get the predicted output\n", - "\n", - "model = Model(inputs=[input_target, input_context], outputs=output)\n", "model.compile(\n", " optimizer=keras.optimizers.Adam(lr=1e-3),\n", " loss=keras.losses.binary_crossentropy,\n", @@ -417,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 24, "metadata": { "scrolled": true }, @@ -426,7 +569,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0807 17:47:56.943290 140736734368704 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.\n", + "W0821 16:57:27.992636 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. 
Please use tf.compat.v1.assign_add instead.\n", "\n" ] }, @@ -435,13 +578,13 @@ "output_type": "stream", "text": [ "Epoch 1/4\n", - "2356/2356 [==============================] - 136s 58ms/step - loss: 0.7146 - binary_accuracy: 0.5173\n", + "4711/4711 [==============================] - 93s 20ms/step - loss: 0.6506 - binary_accuracy: 0.6515\n", "Epoch 2/4\n", - "2356/2356 [==============================] - 136s 58ms/step - loss: 0.6876 - binary_accuracy: 0.5374\n", + "4711/4711 [==============================] - 94s 20ms/step - loss: 0.5718 - binary_accuracy: 0.6775\n", "Epoch 3/4\n", - "2356/2356 [==============================] - 136s 58ms/step - loss: 0.6759 - binary_accuracy: 0.5468\n", + "4711/4711 [==============================] - 95s 20ms/step - loss: 0.4567 - binary_accuracy: 0.7736\n", "Epoch 4/4\n", - "2356/2356 [==============================] - 136s 58ms/step - loss: 0.6571 - binary_accuracy: 0.5578\n" + "4711/4711 [==============================] - 94s 20ms/step - loss: 0.2897 - binary_accuracy: 0.8791\n" ] } ], @@ -450,8 +593,8 @@ " train_gen,\n", " epochs=epochs,\n", " verbose=1,\n", - " use_multiprocessing=False,\n", - " workers=0,\n", + " use_multiprocessing=True,\n", + " workers=4,\n", " shuffle=True,\n", ")" ] @@ -467,17 +610,44 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Get the node embeddings, for both in-sample and out-of-sample nodes, by applying the learned mapping function to node content features." + "Build the node based model for predicting node representations from node content attributes with the learned parameters. Below a Keras model is constructed, with x_inp[0] as input and x_out[0] as output. Note that this model's weights are the same as those of the corresponding node encoder in the previously trained node pair classifier." 
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "embedding_model = Model(inputs = input_target, outputs = Reshape((embedding_size,))(target))\n", - "node_embeddings = embedding_model.predict(node_data[feature_names].values)" + "x_inp_src = x_inp[0]\n", + "x_out_src = x_out[0]\n", + "embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the node embeddings, for both in-sample and out-of-sample nodes, by applying the learned mapping function to node content features." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "369/369 [==============================] - 1s 2ms/step\n" + ] + } + ], + "source": [ + "node_ids = node_data.index\n", + "node_gen = Attri2VecNodeGenerator(G_all, batch_size).flow(node_ids)\n", + "node_embeddings = embedding_model.predict_generator(node_gen, workers=4, verbose=1)" ] }, { @@ -489,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -524,7 +694,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -541,7 +711,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -554,7 +724,7 @@ " warm_start=False)" ] }, - "execution_count": 20, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -573,7 +743,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -589,7 +759,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -608,16 +778,16 @@ }, { "cell_type": "code", - "execution_count": 23, + 
"execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.736614362152765" + "0.8109590038749268" ] }, - "execution_count": 23, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -635,7 +805,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -652,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -665,7 +835,7 @@ " warm_start=False)" ] }, - "execution_count": 25, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -684,7 +854,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -700,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -719,16 +889,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6651658318918301" + "0.6638026335702683" ] }, - "execution_count": 28, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } From fb7ce3694e1bc8b9490f31d09692b51737c00d16 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:10:48 +1000 Subject: [PATCH 13/82] Add the module to define Attri2Vec class --- stellargraph/layer/attri2vec.py | 212 ++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 stellargraph/layer/attri2vec.py diff --git a/stellargraph/layer/attri2vec.py b/stellargraph/layer/attri2vec.py new file mode 100644 index 000000000..ec4774357 --- /dev/null +++ b/stellargraph/layer/attri2vec.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2018 Data61, CSIRO +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +attri2vec + +""" +__all__ = ["Attri2Vec"] + +from keras import Input +from keras import backend as K +from keras.layers import Input, Dense, Lambda, Dropout, Reshape +from keras.layers.embeddings import Embedding +import warnings + + +class Attri2Vec: + """ + Implementation of the attri2vec algorithm of Zhang et al. with Keras layers. + see: https://arxiv.org/abs/1901.04095 + + The model minimally requires specification of the layer sizes as a list of ints + corresponding to the feature dimensions for each hidden layer and a generator object. + + Args: + layer_sizes (list): Hidden feature dimensions for each layer + generator (Sequence): A NodeSequence or LinkSequence. + input_dim (int): The dimensions of the node features used as input to the model. + node_num (int): The number of nodes in the given graph. + bias (bool): If True a bias vector is learnt for each layer in the attri2vec model, default to False. + dropout (float): The dropout supplied to each layer in the attri2vec model, default to 0.0. + normalize (str or None): The normalization used after each layer, default to None. 
+ + """ + + def __init__( + self, + layer_sizes, + generator=None, + input_dim=None, + node_num=None, + bias=False, + dropout=0.0, + normalize=None, + ): + + if normalize == "l2": + self._normalization = Lambda(lambda x: K.l2_normalize(x, axis=-1)) + + elif normalize is None or normalize == "none" or normalize == "None": + self._normalization = Lambda(lambda x: x) + + else: + raise ValueError( + "Normalization should be either 'l2' or 'none'; received '{}'".format( + normalize + ) + ) + + # Get the input_dim and node_num from the generator if it is given + # Use both the schema and head node type from the generator + self.generator = generator + if generator is not None: + feature_sizes = generator.generator.graph.node_feature_sizes() + if len(feature_sizes) > 1: + raise RuntimeError( + "Attri2Vec called on graph with more than one node type." + ) + + self.input_feature_size = feature_sizes.popitem()[1] + self.input_node_num = generator.generator.graph.number_of_nodes() + + elif input_dim is not None and node_num is not None: + self.input_feature_size = input_dim + self.input_node_num = node_num + + else: + raise RuntimeError( + "If generator is not provided, input_dim and node_num must be specified." 
+ ) + + # Model parameters + self.n_layers = len(layer_sizes) + self.bias = bias + self.dropout = dropout + self.activation = "sigmoid" # the activation fuction used in hidden layers + self.initializer = ( + "glorot_uniform" + ) # the initializer for the weights to construct hidden layers + + # Feature dimensions for each layer + self.dims = [self.input_feature_size] + layer_sizes + + def __call__(self, xin): + """ + Construct node representations from node attributes through deep neural network + + Args: + xin (Keras Tensor): Batch input features + + Returns: + Output tensor + """ + # Form Attri2Vec layers iteratively + h_layer = xin + for layer in range(0, self.n_layers): + h_layer = Dropout(self.dropout)(h_layer) + h_layer = Dense( + self.dims[layer + 1], + activation=self.activation, + kernel_initializer=self.initializer, + use_bias=self.bias, + )(h_layer) + h_layer = self._normalization(h_layer) + + return h_layer + + def node_model(self): + """ + Builds a Attri2Vec model for node prediction + + Returns: + tuple: (x_inp, x_out) where ``x_inp`` is a Keras input tensor + for the Attri2Vec model and ``x_out`` is the Keras tensor + for the Attri2Vec model output. 
+ + """ + # Create tensor inputs + x_inp = Input(shape=(self.input_feature_size,)) + + # Output from Attri2Vec model + x_out = self(x_inp) + + return x_inp, x_out + + def link_model(self): + """ + Builds a Attri2Vec model for link or node pair prediction + + Returns: + tuple: (x_inp, x_out) where ``x_inp`` is a list of Keras input tensors for (src, dst) nodes in the node pairs + and ``x_out`` is a list of output tensors for (src, dst) nodes in the node pairs + + """ + # Expose input and output sockets of the model, for source node: + x_inp_src, x_out_src = self.node_model() + + # Expose input and out sockets of the model, for target node: + x_inp_dst = Input(shape=(1,)) + output_embedding = Embedding( + self.input_node_num, + self.dims[self.n_layers], + input_length=1, + name="output_embedding", + ) + x_out_dst = output_embedding(x_inp_dst) + x_out_dst = Reshape((self.dims[self.n_layers],))(x_out_dst) + + x_inp = [x_inp_src, x_inp_dst] + x_out = [x_out_src, x_out_dst] + return x_inp, x_out + + def build(self): + """ + Builds a Attri2Vec model for node or link/node pair prediction, depending on the generator used to construct + the model (whether it is a node or link/node pair generator). + + Returns: + tuple: (x_inp, x_out), where ``x_inp`` contains Keras input tensor(s) + for the specified Attri2Vec model (either node or link/node pair model) and ``x_out`` contains + model output tensor(s) of shape (batch_size, layer_sizes[-1]) + + """ + if self.generator is not None and hasattr(self.generator, "_sampling_schema"): + if len(self.generator._sampling_schema) == 1: + return self.node_model() + elif len(self.generator._sampling_schema) == 2: + return self.link_model() + else: + raise RuntimeError( + "The generator used for model creation is neither a node nor a link generator, " + "unable to figure out how to build the model. Consider using node_model or " + "link_model method explicitly to build node or link prediction model, respectively." 
+ ) + else: + raise RuntimeError( + "Suitable generator is not provided at model creation time, unable to figure out how to build the model. " + "Consider either providing a generator, or using node_model or link_model method explicitly to build node or " + "link prediction model, respectively." + ) + + def default_model(self, flatten_output=True): + warnings.warn( + "The .default_model() method will be deprecated in future versions. " + "Please use .build() method instead.", + PendingDeprecationWarning, + ) + return self.build() From 094650e713b2ad7f726f10aa2ccf90759ea7c0dc Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:12:43 +1000 Subject: [PATCH 14/82] Add the definition of the class: Attri2VecLinkGenerator --- stellargraph/mapper/link_mappers.py | 49 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/stellargraph/mapper/link_mappers.py b/stellargraph/mapper/link_mappers.py index a791fac85..1f3e5b750 100755 --- a/stellargraph/mapper/link_mappers.py +++ b/stellargraph/mapper/link_mappers.py @@ -16,7 +16,7 @@ """ Generators that create batches of data from a machine-learnign ready graph -for link prediction/link attribute inference problems using GraphSAGE and HinSAGE. +for link prediction/link attribute inference problems using GraphSAGE, HinSAGE and Attri2Vec. """ __all__ = [ @@ -24,7 +24,7 @@ "OnDemandLinkSequence", "GraphSAGELinkGenerator", "HinSAGELinkGenerator", - "attri2vecLinkGenerator", + "Attri2VecLinkGenerator", ] import random @@ -56,11 +56,12 @@ class LinkSequence(Sequence): :meth:`keras.Model.evaluate_generator`, and :meth:`keras.Model.predict_generator` This class generates data samples for link inference models and should be created using the :meth:`flow` method of - :class:`GraphSAGELinkGenerator` or :class:`HinSAGELinkGenerator` . + :class:`GraphSAGELinkGenerator` or :class:`HinSAGELinkGenerator` or :class:`Attri2VecLinkGenerator`. 
Args: - generator: An instance of :class:`GraphSAGELinkGenerator` or :class:`HinSAGELinkGenerator`. + generator: An instance of :class:`GraphSAGELinkGenerator` or :class:`HinSAGELinkGenerator` or + :class:`Attri2VecLinkGenerator`. ids (list or iterable): Link IDs to batch, each link id being a tuple of (src, dst) node ids. - (The graph nodes must have a "feature" attribute that is used as input to the GraphSAGE model.) + (The graph nodes must have a "feature" attribute that is used as input to the GraphSAGE/Attri2Vec model.) These are the links that are to be used to train or inference, and the embeddings calculated for these links via a binary operator applied to their source and destination nodes, are passed to the downstream task of link prediction or link attribute inference. @@ -176,10 +177,10 @@ class OnDemandLinkSequence(Sequence): This class generates data samples for link inference models and should be created using the :meth:`flow` method of - :class:`GraphSAGELinkGenerator` ` . + :class:`GraphSAGELinkGenerator` or :class:`Attri2VecLinkGenerator`. Args: - generator: An instance of :class:`GraphSAGELinkGenerator` or 'attri2vecLink Generator'. + generator: An instance of :class:`GraphSAGELinkGenerator` or 'Attri2VecLinkGenerator'. sampler: An instance of :class:`UnsupervisedSampler` that encapsulates the neighbourhood sampling of a graph. The generator method of this class returns `batch_size` of positive and negative samples on demand. 
""" @@ -214,7 +215,7 @@ def __init__(self, generator, walker): ) else: print( - "Running attri2vecLinkGenerator with an estimated {} batches generated on the fly per epoch.".format( + "Running Attri2VecLinkGenerator with an estimated {} batches generated on the fly per epoch.".format( round(self.data_size / self.generator.batch_size) ) ) @@ -617,11 +618,11 @@ def flow(self, link_ids, targets=None, shuffle=False): return LinkSequence(self, link_ids, targets, shuffle) -class attri2vecLinkGenerator: +class Attri2VecLinkGenerator: """ - A data generator for link prediction with Homogeneous attri2vec models + A data generator for link/node pair prediction with attri2vec models - At minimum, supply the StellarGraph the batch size. + At minimum, supply the StellarGraph and the batch size. The supplied graph should be a StellarGraph object that is ready for machine learning. Currently the model requires node features for all @@ -633,25 +634,23 @@ class attri2vecLinkGenerator: Example:: - G_generator = attri2vecLinkGenerator(G, 50) + G_generator = Attri2VecLinkGenerator(G, 50) train_data_gen = G_generator.flow(edge_ids) Args: G (StellarGraph): A machine-learning ready graph. batch_size (int): Size of batch of links to return. - num_samples: for compatibility with GraphSAGE - seed (int or str), optional: Random seed for the sampling methods. - name, optional: Name of generator + name, optional: Name of generator. """ - def __init__(self, G, batch_size, num_samples=[1, 1], seed=None, name=None): + def __init__(self, G, batch_size, name=None): if not isinstance(G, StellarGraphBase): raise TypeError("Graph must be a StellarGraph object.") G.check_graph_for_ml(features=True) self.graph = G - self.num_samples = num_samples + self.num_samples = [0] # for compatibility with GraphSAGE self.batch_size = batch_size self.name = name @@ -665,16 +664,16 @@ def sample_features(self, head_links, sampling_schema): Args: head_links: An iterable of edges to perform sampling for. 
- sampling_schema: The sampling schema for the model, for compatibility - with GraphSAGE and HinSAGE + sampling_schema: The sampling schema for the model, for compatibility with GraphSAGE and HinSAGE. Returns: - a list of feaure arrys, with each element being the feature of a - target node and the id of the corresponding context node + A list of feaure arrys, with each element being the feature of a + target node and the id of the corresponding context node. """ + all_nodes = list(self.graph.nodes()) target_ids = [head_link[0] for head_link in head_links] - context_ids = [head_link[1] for head_link in head_links] + context_ids = [all_nodes.index(head_link[1]) for head_link in head_links] target_feats = self.graph.get_feature_for_nodes(target_ids) batch_feats = [target_feats, np.array(context_ids)] @@ -702,7 +701,7 @@ def flow(self, link_ids, targets=None, shuffle=False): specifying the edges or an UnsupervisedSampler object that has a generator method to generate samples on the fly. targets (optional, array): a 2D array of numeric targets with shape - `(len(link_ids), target_size)` + `(len(link_ids), target_size)`. shuffle (optional, bool): If True the node_ids will be shuffled at each epoch, if False the node_ids will be processed in order. @@ -713,6 +712,10 @@ def flow(self, link_ids, targets=None, shuffle=False): if isinstance(link_ids, UnsupervisedSampler): return OnDemandLinkSequence(self, link_ids) + # Otherwise pass iterable to standard LinkSequence + elif isinstance(link_ids, collections.Iterable): + return LinkSequence(self, link_ids, targets, shuffle) + else: raise TypeError( "Argument to .flow not recognised. 
" From b9a81c976250eb9712fe42bae907217a980d55c5 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:13:14 +1000 Subject: [PATCH 15/82] Add the definition of the class: Attri2VecNodeGenerator --- stellargraph/mapper/node_mappers.py | 142 ++++++++++++++++++++++++++-- 1 file changed, 133 insertions(+), 9 deletions(-) mode change 100644 => 100755 stellargraph/mapper/node_mappers.py diff --git a/stellargraph/mapper/node_mappers.py b/stellargraph/mapper/node_mappers.py old mode 100644 new mode 100755 index 1e9518b9c..b94fc0db4 --- a/stellargraph/mapper/node_mappers.py +++ b/stellargraph/mapper/node_mappers.py @@ -22,6 +22,7 @@ "NodeSequence", "GraphSAGENodeGenerator", "HinSAGENodeGenerator", + "Attri2VecNodeGenerator", "FullBatchNodeGenerator", "FullBatchNodeSequence", "SparseFullBatchNodeSequence", @@ -54,17 +55,22 @@ class NodeSequence(Sequence): This class generated data samples for node inference models and should be created using the `.flow(...)` method of - :class:`GraphSAGENodeGenerator` or :class:`HinSAGENodeGenerator`. - - These Generators are classes that capture the graph structure - and the feature vectors of each node. These generator classes - are used within the NodeSequence to generate samples of k-hop - neighbourhoods in the graph and to return to this class the - features from the sampled neighbourhoods. + :class:`GraphSAGENodeGenerator` or :class:`HinSAGENodeGenerator` + or :class:`Attri2vecNodeGenerator`. + + GraphSAGENodeGenerator and HinSAGENodeGenerator are classes that + capture the graph structure and the feature vectors of each node. + These generator classes are used within the NodeSequence to generate + samples of k-hop neighbourhoods in the graph and to return to this + class the features from the sampled neighbourhoods. + + Attri2VecNodeGenerator is the class that captures node feature vectors + of each node. 
Args: - generator: GraphSAGENodeGenerator or HinSAGENodeGenerator - The generator object containing the graph information. + generator: GraphSAGENodeGenerator or HinSAGENodeGenerator or + Attri2VecNodeGenerator. The generator object containing the + graph information. ids: list A list of the node_ids to be used as head-nodes in the downstream task. @@ -508,6 +514,124 @@ def flow_from_dataframe(self, node_targets, shuffle=False): ) +class Attri2VecNodeGenerator: + """ + A data generator for node prediction with attri2vec models + + At minimum, supply the StellarGraph and the batch size. + + The supplied graph should be a StellarGraph object that is ready for + machine learning. Currently the model requires node features for all + nodes in the graph. + + Use the :meth:`flow` method supplying the nodes and (optionally) targets + to get an object that can be used as a Keras data generator. + + Example:: + + G_generator = Attri2VecNodeGenerator(G, 50) + train_data_gen = G_generator.flow(train_node_ids, train_node_labels) + test_data_gen = G_generator.flow(test_node_ids) + + Args: + G (StellarGraph): The machine-learning ready graph. + batch_size (int): Size of batch to return. + name (str or None): Name of the generator (optional). 
+ """ + + def __init__(self, G, batch_size, name=None): + if not isinstance(G, StellarGraphBase): + raise TypeError("Graph must be a StellarGraph object.") + + self.graph = G + self.num_samples = [0] # for compatibility with GraphSAGE + self.batch_size = batch_size + self.name = name + + # Check if the graph has features + G.check_graph_for_ml() + + # We need a schema for compatibility with HinSAGE + self.schema = G.create_graph_schema(create_type_maps=True) + + # Check that there is only a single node type for Attri2Vec + if len(self.schema.node_types) > 1: + print( + "Warning: running homogeneous Attri2Vec on a graph with multiple node types" + ) + + def sample_features(self, head_nodes, sampling_schema): + """ + Sample content features of the head nodes, and return these as a list of feature + arrays for the attri2vec algorithm. + + Args: + head_nodes: An iterable of head nodes to perform sampling on. + sampling_schema: The sampling schema for the model, for compatibility with GraphSAGE and HinSAGE. + + Returns: + A list of feaure arrys, with each element being the feature of a + head node. + """ + + batch_feats = self.graph.get_feature_for_nodes(head_nodes) + return batch_feats + + def flow(self, node_ids, targets=None, shuffle=False): + """ + Creates a generator/sequence object for training or evaluation + with the supplied node ids and numeric targets. + + The node IDs are the nodes to train or inference on: the embeddings + calculated for these nodes are passed to the downstream task. These + are a subset of the nodes in the graph. + + The targets are an array of numeric targets corresponding to the + supplied node_ids to be used by the downstream task. They should + be given in the same order as the list of node IDs. + If they are not specified (for example, for use in prediction), + the targets will not be available to the downsteam task. + + Note that the shuffle argument should be True for training and + False for prediction. 
+ + Args: + node_ids: an iterable of node IDs. + targets: a 2D array of numeric targets with shape + `(len(node_ids), target_size)`. + shuffle (bool): If True the node_ids will be shuffled at each + epoch, if False the node_ids will be processed in order. + + Returns: + A NodeSequence object to use with the Attri2Vec model + in Keras methods ``fit_generator``, ``evaluate_generator``, + and ``predict_generator``. + + """ + return NodeSequence(self, node_ids, targets, shuffle=shuffle) + + def flow_from_dataframe(self, node_targets, shuffle=False): + """ + Creates a generator/sequence object for training or evaluation + with the supplied node ids and numeric targets. + + Args: + node_targets: a Pandas DataFrame of numeric targets indexed + by the node ID for that target. + shuffle (bool): If True the node_ids will be shuffled at each + epoch, if False the node_ids will be processed in order. + + Returns: + A NodeSequence object to use with the Attri2Vec model + in Keras methods ``fit_generator``, ``evaluate_generator``, + and ``predict_generator``. 
+ + """ + return NodeSequence( + self, node_targets.index, node_targets.values, shuffle=shuffle + ) + + class SparseFullBatchNodeSequence(Sequence): """ Keras-compatible data generator for for node inference models From 04dbc46b3ed587848263bec4b0e65da998d22dc6 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:14:55 +1000 Subject: [PATCH 16/82] Add the test for Attri2VecNodeGenerator --- tests/mapper/test_node_mappers.py | 153 ++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) mode change 100644 => 100755 tests/mapper/test_node_mappers.py diff --git a/tests/mapper/test_node_mappers.py b/tests/mapper/test_node_mappers.py old mode 100644 new mode 100755 index 39c66dc24..9002bf32b --- a/tests/mapper/test_node_mappers.py +++ b/tests/mapper/test_node_mappers.py @@ -682,6 +682,159 @@ def test_hinnodemapper_no_neighbors(): assert np.all(batch_feats[3][:, 0, 0] == np.array([12, 0, 0])) +def test_attri2vec_nodemapper_constructor_nx(): + """ + Attri2VecNodeGenerator requires a StellarGraph object + """ + G = nx.Graph() + G.add_nodes_from(range(4)) + + with pytest.raises(TypeError): + Attri2VecNodeGenerator(G, batch_size=2) + + +def test_attri2vec_nodemapper_constructor_no_feats(): + """ + Attri2VecNodeGenerator requires the graph to have features + """ + + G = example_graph_1() + with pytest.raises(RuntimeError): + Attri2VecNodeGenerator(G, batch_size=2) + + +def test_attri2vec_nodemapper_constructor(): + n_feat = 4 + + G = example_graph_1(feature_size=n_feat) + + generator = Attri2VecNodeGenerator(G, batch_size=2) + + mapper = generator.flow(list(G)) + + assert generator.batch_size == 2 + assert mapper.data_size == 4 + assert len(mapper.ids) == 4 + + +def test_attri2vec_nodemapper_1(): + n_feat = 4 + n_batch = 2 + + # test graph + G1 = example_graph_1(n_feat) + + mapper1 = Attri2VecNodeGenerator(G1, batch_size=n_batch).flow(G1.nodes()) + assert len(mapper1) == 2 + + G2 = example_graph_2(n_feat) + + mapper2 = Attri2VecNodeGenerator(G2, 
batch_size=n_batch).flow(G2.nodes()) + assert len(mapper2) == 3 + + for mapper in [mapper1, mapper2]: + for ii in range(2): + nf, nl = mapper[ii] + assert nf.shape == (n_batch, n_feat) + assert nl is None + + # Check beyond the graph lengh + with pytest.raises(IndexError): + nf, nl = mapper1[len(mapper1)] + + # Check the last batch + nf, nl = mapper2[len(mapper2) - 1] + assert nf.shape == (1, n_feat) + + # This will fail as the nodes are not in the graph + with pytest.raises(KeyError): + Attri2VecNodeGenerator(G1, batch_size=2).flow(["A", "B"]) + + +def test_attri2vec_nodemapper_shuffle(): + n_feat = 1 + n_batch = 2 + + G = example_graph_2(feature_size=n_feat) + nodes = list(G.nodes()) + + # With shuffle + random.seed(15) + mapper = Attri2VecNodeGenerator(G, batch_size=n_batch).flow( + nodes, nodes, shuffle=True + ) + + expected_node_batches = [[5, 4], [3, 1], [2]] + assert len(mapper) == 3 + for ii in range(len(mapper)): + nf, nl = mapper[ii] + assert all(np.ravel(nf) == expected_node_batches[ii]) + assert all(np.array(nl) == expected_node_batches[ii]) + + # This should re-shuffle the IDs + mapper.on_epoch_end() + expected_node_batches = [[4, 3], [1, 5], [2]] + assert len(mapper) == 3 + for ii in range(len(mapper)): + nf, nl = mapper[ii] + assert all(np.ravel(nf) == expected_node_batches[ii]) + assert all(np.array(nl) == expected_node_batches[ii]) + + # With no shuffle + mapper = Attri2VecNodeGenerator(G, batch_size=n_batch).flow( + nodes, nodes, shuffle=False + ) + expected_node_batches = [[1, 2], [3, 4], [5]] + assert len(mapper) == 3 + for ii in range(len(mapper)): + nf, nl = mapper[ii] + assert all(np.ravel(nf) == expected_node_batches[ii]) + assert all(np.array(nl) == expected_node_batches[ii]) + + +def test_attri2vec_nodemapper_with_labels(): + n_feat = 4 + n_batch = 2 + + # test graph + G2 = example_graph_2(n_feat) + nodes = list(G2) + labels = [n * 2 for n in nodes] + + gen = Attri2VecNodeGenerator(G2, batch_size=n_batch).flow(nodes, labels) + assert 
len(gen) == 3 + + for ii in range(3): + nf, nl = gen[ii] + + # Check sizes - note batch sizes are (2,2,1) for each iteration + assert nf.shape[1:] == (n_feat,) + + # Check labels + assert all(int(a) == int(2 * b) for a, b in zip(nl, nf[:, 0])) + + # Check beyond the graph lengh + with pytest.raises(IndexError): + nf, nl = gen[len(gen)] + + +def test_attri2vec_nodemapper_incorrect_targets(): + """ + Tests checks on target shape + """ + n_feat = 4 + n_batch = 2 + + # test graph + G = example_graph_1(feature_size=n_feat) + + with pytest.raises(TypeError): + Attri2VecNodeGenerator(G, batch_size=n_batch).flow(list(G), 1) + + with pytest.raises(ValueError): + Attri2VecNodeGenerator(G, batch_size=n_batch).flow(list(G), targets=[]) + + def create_graph_features(): G = nx.Graph() G.add_nodes_from(["a", "b", "c"]) From 3e906417312a6e5e539ccedfe9dd04218f0b1557 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:15:17 +1000 Subject: [PATCH 17/82] Add test for Attri2VecLinkGenerator --- tests/mapper/test_link_mappers.py | 151 ++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) mode change 100644 => 100755 tests/mapper/test_link_mappers.py diff --git a/tests/mapper/test_link_mappers.py b/tests/mapper/test_link_mappers.py old mode 100644 new mode 100755 index 7a033a623..b4a428f11 --- a/tests/mapper/test_link_mappers.py +++ b/tests/mapper/test_link_mappers.py @@ -695,3 +695,154 @@ def test_HinSAGELinkGenerator_isolates(self): # With two isolates, all features are zero assert all(pytest.approx(0) == x for x in ne[2:]) + + +class Test_Attri2VecLinkGenerator: + """ + Tests of Attri2VecLinkGenerator class + """ + + n_feat = 4 + batch_size = 2 + + def test_LinkMapper_constructor(self): + + G = example_Graph_1(self.n_feat) + edge_labels = [0] * G.number_of_edges() + + generator = Attri2VecLinkGenerator(G, batch_size=self.batch_size) + mapper = generator.flow(G.edges(), edge_labels) + assert generator.batch_size == self.batch_size + assert mapper.data_size 
== G.number_of_edges() + assert len(mapper.ids) == G.number_of_edges() + + G = example_DiGraph_1(self.n_feat) + edge_labels = [0] * G.number_of_edges() + generator = Attri2VecLinkGenerator(G, batch_size=self.batch_size) + mapper = generator.flow(G.edges(), edge_labels) + assert generator.batch_size == self.batch_size + assert mapper.data_size == G.number_of_edges() + assert len(mapper.ids) == G.number_of_edges() + + def test_Attri2VecLinkGenerator_1(self): + + G = example_Graph_2(self.n_feat) + data_size = G.number_of_edges() + edge_labels = [0] * data_size + + mapper = Attri2VecLinkGenerator(G, batch_size=self.batch_size).flow( + G.edges(), edge_labels + ) + + assert len(mapper) == 2 + + for batch in range(len(mapper)): + nf, nl = mapper[batch] + assert len(nf) == 2 + assert nf[0].shape == (min(self.batch_size, data_size), self.n_feat) + assert nf[1].shape == (min(self.batch_size, data_size),) + assert len(nl) == min(self.batch_size, data_size) + assert all(nl == 0) + + with pytest.raises(IndexError): + nf, nl = mapper[2] + + def test_Attri2VecLinkGenerator_shuffle(self): + def test_edge_consistency(shuffle): + G = example_Graph_2(1) + edges = list(G.edges()) + nodes = list(G.nodes()) + edge_labels = list(range(len(edges))) + + mapper = Attri2VecLinkGenerator(G, batch_size=2).flow( + edges, edge_labels, shuffle=shuffle + ) + + assert len(mapper) == 2 + + for batch in range(len(mapper)): + nf, nl = mapper[batch] + e1 = edges[nl[0]] + e2 = edges[nl[1]] + assert nf[0][0, 0] == e1[0] + assert nf[1][0] == nodes.index(e1[1]) + assert nf[0][1, 0] == e2[0] + assert nf[1][1] == nodes.index(e2[1]) + + test_edge_consistency(True) + test_edge_consistency(False) + + def test_Attri2VecLinkGenerator_not_Stellargraph(self): + G = nx.Graph() + elist = [(1, 2), (2, 3), (1, 4), (3, 2)] + G.add_edges_from(elist) + + # Add example features + for v in G.nodes(): + G.node[v]["feature"] = np.ones(1) + + with pytest.raises(TypeError): + Attri2VecLinkGenerator(G, 
batch_size=self.batch_size) + + def test_Attri2VecLinkGenerator_no_targets(self): + """ + This tests link generator's iterator for prediction, i.e., without targets provided + """ + G = example_Graph_2(self.n_feat) + gen = Attri2VecLinkGenerator(G, batch_size=self.batch_size).flow(G.edges()) + for i in range(len(gen)): + assert gen[i][1] is None + + def test_Attri2VecLinkGenerator_unsupervisedSampler_flow(self): + """ + This tests link generator's initialization for on demand link generation i.e. there is no pregenerated list of samples provided to it. + """ + n_feat = 4 + n_batch = 2 + + # test graph + G = example_graph_random( + feature_size=n_feat, n_nodes=6, n_isolates=2, n_edges=10 + ) + + unsupervisedSamples = UnsupervisedSampler(G, nodes=G.nodes) + + gen = Attri2VecLinkGenerator(G, batch_size=n_batch).flow(unsupervisedSamples) + + # The flow method is not passed UnsupervisedSampler object or a list of samples is not passed + with pytest.raises(TypeError): + gen = Attri2VecLinkGenerator(G, batch_size=n_batch).flow( + "not_a_list_of_samples_or_a_sample_generator" + ) + + # The flow method is not passed nothing + with pytest.raises(TypeError): + gen = Attri2VecLinkGenerator(G, batch_size=n_batch).flow() + + def test_Attri2VecLinkGenerator_unsupervisedSampler_sample_generation(self): + + G = example_Graph_2(self.n_feat) + + unsupervisedSamples = UnsupervisedSampler(G) + + mapper = Attri2VecLinkGenerator(G, batch_size=self.batch_size).flow( + unsupervisedSamples + ) + + assert mapper.data_size == 16 + assert self.batch_size == 2 + assert len(mapper) == 8 + + for batch in range(len(mapper)): + nf, nl = mapper[batch] + + assert len(nf) == 2 + assert len(set(mapper.head_node_types)) == 1 + + assert nf[0].shape == (min(self.batch_size, mapper.data_size), self.n_feat) + assert nf[1].shape == (min(self.batch_size, mapper.data_size),) + assert len(nl) == min(self.batch_size, mapper.data_size) + assert sorted(nl) == [0, 1] + + with pytest.raises(IndexError): + nf, nl = 
mapper[8] From 1a8a981891b07e966fc1fc93af887137025e4929 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:16:15 +1000 Subject: [PATCH 18/82] Add tests for Attri2Vec class --- tests/layer/test_attri2vec.py | 138 ++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 tests/layer/test_attri2vec.py diff --git a/tests/layer/test_attri2vec.py b/tests/layer/test_attri2vec.py new file mode 100644 index 000000000..84d188219 --- /dev/null +++ b/tests/layer/test_attri2vec.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2018 Data61, CSIRO +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +""" +Attri2Vec tests + +""" +from stellargraph.core.graph import StellarGraph +from stellargraph.mapper.node_mappers import Attri2VecNodeGenerator +from stellargraph.layer.attri2vec import * + +import keras +import numpy as np +import networkx as nx +import pytest + +from keras.engine import saving + + +def example_graph_1(feature_size=None): + G = nx.Graph() + elist = [(1, 2), (2, 3), (1, 4), (3, 2)] + G.add_nodes_from([1, 2, 3, 4], label="default") + G.add_edges_from(elist, label="default") + + # Add example features + if feature_size is not None: + for v in G.nodes(): + G.node[v]["feature"] = np.ones(feature_size) + return StellarGraph(G, node_features="feature") + + else: + return StellarGraph(G) + + +def test_attri2vec_constructor(): + attri2vec = Attri2Vec(layer_sizes=[4], input_dim=2, node_num=4, normalize="l2") + assert attri2vec.dims == [2, 4] + assert attri2vec.input_node_num == 4 + assert attri2vec.n_layers == 1 + assert attri2vec.bias == False + + # Check incorrect normalization flag + with pytest.raises(ValueError): + Attri2Vec(layer_sizes=[4], input_dim=2, node_num=4, normalize=lambda x: x) + + with pytest.raises(ValueError): + Attri2Vec(layer_sizes=[4], input_dim=2, node_num=4, normalize="unknown") + + # Check requirement for generator or input_dim and node_num + with pytest.raises(RuntimeError): + Attri2Vec(layer_sizes=[4]) + + # Construction from generator + G = example_graph_1(feature_size=3) + gen = Attri2VecNodeGenerator(G, batch_size=2).flow([1, 2]) + attri2vec = Attri2Vec(layer_sizes=[4, 8], generator=gen, bias=True) + + assert attri2vec.dims == [3, 4, 8] + assert attri2vec.input_node_num == 4 + assert attri2vec.n_layers == 2 + assert attri2vec.bias + + +def test_attri2vec_apply(): + attri2vec = Attri2Vec( + layer_sizes=[4], bias=False, input_dim=2, node_num=4, normalize=None + ) + + inp = keras.Input(shape=(1, 2)) + out = attri2vec(inp) + model = keras.Model(inputs=inp, outputs=out) + + +def test_attri2vec_apply_1(): + attri2vec = 
Attri2Vec( + layer_sizes=[2, 2, 2], bias=False, input_dim=2, node_num=4, normalize=None + ) + attri2vec.activation = "linear" + attri2vec.initializer = "ones" + + inp = keras.Input(shape=(2,)) + out = attri2vec(inp) + model = keras.Model(inputs=inp, outputs=out) + + x = np.array([[1, 2]]) + expected = np.array([[12, 12]]) + + actual = model.predict(x) + assert expected == pytest.approx(actual) + + # Use the node model: + xinp, xout = attri2vec.node_model() + model2 = keras.Model(inputs=xinp, outputs=xout) + assert pytest.approx(expected) == model2.predict(x) + + +def test_attri2vec_serialize(): + attri2vec = Attri2Vec( + layer_sizes=[4], bias=False, input_dim=2, node_num=4, normalize=None + ) + attri2vec.activation = "linear" + attri2vec.initializer = "ones" + + inp = keras.Input(shape=(2,)) + out = attri2vec(inp) + model = keras.Model(inputs=inp, outputs=out) + + # Save model + model_json = model.to_json() + + # Set all weights to one + model_weights = [np.ones_like(w) for w in model.get_weights()] + + # Load model from json & set all weights + model2 = keras.models.model_from_json(model_json) + model2.set_weights(model_weights) + + # Test loaded model + x = np.array([[1, 2]]) + expected = np.array([[3, 3, 3, 3]]) + + actual = model2.predict(x) + assert expected == pytest.approx(actual) From 4085790f643df36836be68240d3a0dafe93adc4a Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:17:46 +1000 Subject: [PATCH 19/82] Add the Attri2Vec, Attri2VecNodeGenerator and Attri2VecLinkGenerator to api --- docs/api.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) mode change 100644 => 100755 docs/api.txt diff --git a/docs/api.txt b/docs/api.txt old mode 100644 new mode 100755 index c206b3efc..2479042df --- a/docs/api.txt +++ b/docs/api.txt @@ -19,7 +19,7 @@ Generators ----------- .. 
automodule:: stellargraph.mapper - :members: FullBatchNodeGenerator, GraphSAGENodeGenerator, GraphSAGELinkGenerator, HinSAGENodeGenerator, HinSAGELinkGenerator + :members: FullBatchNodeGenerator, GraphSAGENodeGenerator, GraphSAGELinkGenerator, HinSAGENodeGenerator, HinSAGELinkGenerator, Attri2VecNodeGenerator, Attri2VecLinkGenerator GraphSAGE model @@ -36,6 +36,13 @@ HinSAGE model :members: HinSAGE, MeanHinAggregator +Attri2Vec model +---------------- + +.. automodule:: stellargraph.layer.attri2vec + :members: Attri2Vec + + GCN model ------------- From 37ce44df007c65892a1ef0c453294937cf955325 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:20:18 +1000 Subject: [PATCH 20/82] Add attri2vec module to the stellargraph/layer/__init__.py file --- stellargraph/layer/__init__.py | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 stellargraph/layer/__init__.py diff --git a/stellargraph/layer/__init__.py b/stellargraph/layer/__init__.py old mode 100644 new mode 100755 index 87926b106..228c7ce0b --- a/stellargraph/layer/__init__.py +++ b/stellargraph/layer/__init__.py @@ -27,5 +27,6 @@ from .graph_attention import * from .link_inference import * from .gcn import * +from .attri2vec import * from .misc import SqueezedSparseConversion from .preprocessing_layer import GraphPreProcessingLayer From 208835e60368a2908d32ef23f9e3fd04aeab2a43 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 21 Aug 2019 17:33:54 +1000 Subject: [PATCH 21/82] rerun the attri2vec demo --- .../stellargraph-attri2vec-DBLP.ipynb | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb index 47aefb365..52bd0b6ba 100644 --- a/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb +++ b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb @@ -456,7 +456,7 @@ "output_type": "stream", "text": [ "WARNING: Logging before flag parsing goes to 
stderr.\n", - "W0821 16:57:15.264071 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", + "W0821 17:24:39.472835 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", "\n" ] } @@ -477,9 +477,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0821 16:57:18.045172 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", + "W0821 17:24:40.300625 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", "\n", - "W0821 16:57:18.051191 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", + "W0821 17:24:40.306243 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", "\n" ] } @@ -531,11 +531,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0821 16:57:23.894735 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. 
Please use tf.compat.v1.train.Optimizer instead.\n", + "W0821 17:24:43.567921 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", "\n", - "W0821 16:57:23.901146 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.\n", + "W0821 17:24:43.574287 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.\n", "\n", - "W0821 16:57:23.909626 140736102417344 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "W0821 17:24:43.582134 140736102417344 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" ] @@ -569,7 +569,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0821 16:57:27.992636 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.\n", + "W0821 17:24:45.546332 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. 
Please use tf.compat.v1.assign_add instead.\n", "\n" ] }, @@ -578,13 +578,13 @@ "output_type": "stream", "text": [ "Epoch 1/4\n", - "4711/4711 [==============================] - 93s 20ms/step - loss: 0.6506 - binary_accuracy: 0.6515\n", + "4711/4711 [==============================] - 92s 20ms/step - loss: 0.6903 - binary_accuracy: 0.6251\n", "Epoch 2/4\n", - "4711/4711 [==============================] - 94s 20ms/step - loss: 0.5718 - binary_accuracy: 0.6775\n", + "4711/4711 [==============================] - 93s 20ms/step - loss: 0.6135 - binary_accuracy: 0.6346\n", "Epoch 3/4\n", - "4711/4711 [==============================] - 95s 20ms/step - loss: 0.4567 - binary_accuracy: 0.7736\n", + "4711/4711 [==============================] - 93s 20ms/step - loss: 0.5289 - binary_accuracy: 0.7100\n", "Epoch 4/4\n", - "4711/4711 [==============================] - 94s 20ms/step - loss: 0.2897 - binary_accuracy: 0.8791\n" + "4711/4711 [==============================] - 94s 20ms/step - loss: 0.3581 - binary_accuracy: 0.8372\n" ] } ], @@ -784,7 +784,7 @@ { "data": { "text/plain": [ - "0.8109590038749268" + "0.7857757735930484" ] }, "execution_count": 32, @@ -895,7 +895,7 @@ { "data": { "text/plain": [ - "0.6638026335702683" + "0.6631950060001668" ] }, "execution_count": 37, From 6179a3adc08bffe76ec74074b0b02c828d637940 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Thu, 22 Aug 2019 10:52:27 +1000 Subject: [PATCH 22/82] update the attri2vec demo by fixing some typos --- .../stellargraph-attri2vec-DBLP.ipynb | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb index 52bd0b6ba..618865352 100644 --- a/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb +++ b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb @@ -56,8 +56,7 @@ "import stellargraph as sg\n", "from stellargraph.data import UniformRandomWalk\n", "from stellargraph.data import 
UnsupervisedSampler\n", - "from stellargraph.mapper import Attri2VecLinkGenerator\n", - "from stellargraph.mapper.node_mappers import Attri2VecNodeGenerator\n", + "from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator\n", "from stellargraph.layer import Attri2Vec, link_classification\n", "\n", "import keras\n", @@ -456,7 +455,7 @@ "output_type": "stream", "text": [ "WARNING: Logging before flag parsing goes to stderr.\n", - "W0821 17:24:39.472835 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", + "W0822 10:43:54.482249 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", "\n" ] } @@ -477,9 +476,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0821 17:24:40.300625 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", + "W0822 10:43:56.449973 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", "\n", - "W0821 17:24:40.306243 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", + "W0822 10:43:56.455691 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. 
Please use tf.random.uniform instead.\n", "\n" ] } @@ -531,11 +530,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0821 17:24:43.567921 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", + "W0822 10:44:02.083713 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", "\n", - "W0821 17:24:43.574287 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.\n", + "W0822 10:44:02.091721 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.\n", "\n", - "W0821 17:24:43.582134 140736102417344 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "W0822 10:44:02.102672 140736102417344 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" ] @@ -569,7 +568,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0821 17:24:45.546332 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. 
Please use tf.compat.v1.assign_add instead.\n", + "W0822 10:44:04.704549 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.\n", "\n" ] }, @@ -578,13 +577,13 @@ "output_type": "stream", "text": [ "Epoch 1/4\n", - "4711/4711 [==============================] - 92s 20ms/step - loss: 0.6903 - binary_accuracy: 0.6251\n", + "4711/4711 [==============================] - 89s 19ms/step - loss: 0.6678 - binary_accuracy: 0.6432\n", "Epoch 2/4\n", - "4711/4711 [==============================] - 93s 20ms/step - loss: 0.6135 - binary_accuracy: 0.6346\n", + "4711/4711 [==============================] - 92s 19ms/step - loss: 0.5859 - binary_accuracy: 0.6642\n", "Epoch 3/4\n", - "4711/4711 [==============================] - 93s 20ms/step - loss: 0.5289 - binary_accuracy: 0.7100\n", + "4711/4711 [==============================] - 91s 19ms/step - loss: 0.4810 - binary_accuracy: 0.7541\n", "Epoch 4/4\n", - "4711/4711 [==============================] - 94s 20ms/step - loss: 0.3581 - binary_accuracy: 0.8372\n" + "4711/4711 [==============================] - 93s 20ms/step - loss: 0.3283 - binary_accuracy: 0.8544\n" ] } ], @@ -676,11 +675,11 @@ " if source_year < year_thresh and target_year < year_thresh:\n", " in_sample_edges.append([source_index, target_index, 1]) # get the positive edge\n", " negative_target_index = unsupervised_samples.random.choices(node_data.index.tolist(), k=1) # generate negative node\n", - " in_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative node\n", + " in_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative edge\n", " else:\n", " out_of_sample_edges.append([source_index, target_index, 1]) # get the positive edge\n", " negative_target_index = unsupervised_samples.random.choices(node_data.index.tolist(), k=1) # generate negative node\n", - " 
out_of_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative node\n", + " out_of_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative edge\n", "in_sample_edges = np.array(in_sample_edges)\n", "out_of_sample_edges = np.array(out_of_sample_edges)" ] @@ -784,7 +783,7 @@ { "data": { "text/plain": [ - "0.7857757735930484" + "0.798202033278801" ] }, "execution_count": 32, @@ -895,7 +894,7 @@ { "data": { "text/plain": [ - "0.6631950060001668" + "0.6638897794650095" ] }, "execution_count": 37, From f959e65469f05ab8e620be732ae7f0d56c266ae1 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 10:57:25 +1000 Subject: [PATCH 23/82] code update --- tests/mapper/test_link_mappers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/mapper/test_link_mappers.py b/tests/mapper/test_link_mappers.py index b4a428f11..8442cd088 100755 --- a/tests/mapper/test_link_mappers.py +++ b/tests/mapper/test_link_mappers.py @@ -837,7 +837,6 @@ def test_Attri2VecLinkGenerator_unsupervisedSampler_sample_generation(self): nf, nl = mapper[batch] assert len(nf) == 2 - assert len(set(mapper.head_node_types)) == 1 assert nf[0].shape == (min(self.batch_size, mapper.data_size), self.n_feat) assert nf[1].shape == (min(self.batch_size, mapper.data_size),) From 36a7cd8b2ca384e153687a1ce0298ae527ebc466 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 10:57:46 +1000 Subject: [PATCH 24/82] code update --- tests/layer/test_attri2vec.py | 68 ++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/tests/layer/test_attri2vec.py b/tests/layer/test_attri2vec.py index 84d188219..0701128a6 100644 --- a/tests/layer/test_attri2vec.py +++ b/tests/layer/test_attri2vec.py @@ -23,13 +23,11 @@ from stellargraph.mapper.node_mappers import Attri2VecNodeGenerator from stellargraph.layer.attri2vec import * -import keras +from tensorflow import keras import numpy as np import networkx as nx 
import pytest -from keras.engine import saving - def example_graph_1(feature_size=None): G = nx.Graph() @@ -78,43 +76,65 @@ def test_attri2vec_constructor(): def test_attri2vec_apply(): attri2vec = Attri2Vec( - layer_sizes=[4], bias=False, input_dim=2, node_num=4, normalize=None - ) - - inp = keras.Input(shape=(1, 2)) - out = attri2vec(inp) - model = keras.Model(inputs=inp, outputs=out) - - -def test_attri2vec_apply_1(): - attri2vec = Attri2Vec( - layer_sizes=[2, 2, 2], bias=False, input_dim=2, node_num=4, normalize=None + layer_sizes=[2, 2, 2], + bias=False, + input_dim=2, + node_num=4, + activation="linear", + normalize=None, ) - attri2vec.activation = "linear" - attri2vec.initializer = "ones" - - inp = keras.Input(shape=(2,)) - out = attri2vec(inp) - model = keras.Model(inputs=inp, outputs=out) x = np.array([[1, 2]]) expected = np.array([[12, 12]]) - actual = model.predict(x) + inp = keras.Input(shape=(2,)) + out = attri2vec(inp) + model1 = keras.Model(inputs=inp, outputs=out) + model_weights1 = [np.ones_like(w) for w in model1.get_weights()] + model1.set_weights(model_weights1) + actual = model1.predict(x) assert expected == pytest.approx(actual) # Use the node model: xinp, xout = attri2vec.node_model() model2 = keras.Model(inputs=xinp, outputs=xout) + model_weights2 = [np.ones_like(w) for w in model2.get_weights()] + model2.set_weights(model_weights2) assert pytest.approx(expected) == model2.predict(x) + x1 = np.array([[3, 1]]) + x2 = np.array([[2]]) + y1 = np.array([[16, 16]]) + y2 = np.array([[1, 1]]) + + # Test the build function: + xinp, xout = attri2vec.build() + model3 = keras.Model(inputs=xinp, outputs=xout) + model_weights3 = [np.ones_like(w) for w in model3.get_weights()] + model3.set_weights(model_weights3) + actual = model3.predict([x1, x2]) + assert pytest.approx(y1) == actual[0] + assert pytest.approx(y2) == actual[1] + + # Use the link model: + xinp, xout = attri2vec.link_model() + model4 = keras.Model(inputs=xinp, outputs=xout) + model_weights4 
= [np.ones_like(w) for w in model4.get_weights()] + model4.set_weights(model_weights4) + actual = model4.predict([x1, x2]) + assert pytest.approx(y1) == actual[0] + assert pytest.approx(y2) == actual[1] + def test_attri2vec_serialize(): attri2vec = Attri2Vec( - layer_sizes=[4], bias=False, input_dim=2, node_num=4, normalize=None + layer_sizes=[4], + bias=False, + input_dim=2, + node_num=4, + activation="linear", + normalize=None, ) - attri2vec.activation = "linear" - attri2vec.initializer = "ones" inp = keras.Input(shape=(2,)) out = attri2vec(inp) From a50390f943170b5d5b5014985d3e507bc4cae777 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 10:58:46 +1000 Subject: [PATCH 25/82] add attri2vec link prediction demo --- .../stellargraph-attri2vec-DBLP.ipynb | 933 ++++++++++++++++++ 1 file changed, 933 insertions(+) create mode 100644 demos/link-prediction/attri2vec/stellargraph-attri2vec-DBLP.ipynb diff --git a/demos/link-prediction/attri2vec/stellargraph-attri2vec-DBLP.ipynb b/demos/link-prediction/attri2vec/stellargraph-attri2vec-DBLP.ipynb new file mode 100644 index 000000000..984cbb562 --- /dev/null +++ b/demos/link-prediction/attri2vec/stellargraph-attri2vec-DBLP.ipynb @@ -0,0 +1,933 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inductive Node Representation Learning through attri2vec" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the python implementation of the attri2vec algorithm outlined in paper ***[Attributed Network Embedding Via Subspace Discovery](https://arxiv.org/abs/1901.04095)*** D. Zhang, Y. Jie, X. Zhu and C. Zhang, arXiv:1901.04095, [cs.SI], 2019. The implementation uses the stellargraph libraries.\n", + "\n", + "## attri2vec\n", + "\n", + "attri2vec learns node representations by performing a linear/non-linear mapping on node content attributes. 
To make the learned node representations respect structural similarity, the [`DeepWalk`](https://dl.acm.org/citation.cfm?id=2623732)/[`node2vec`](https://snap.stanford.edu/node2vec) learning mechanism is used to make nodes sharing similar random walk context nodes represented closely in the subspace, which is achieved by maximizing the occurrence probability of context nodes conditioned on the representation of the target nodes. The probability is modelled by Softmax and negative sampling is used to speed up its calculation. This makes attri2vec equivalent to predicting whether a node occurs in the given target node's context in random walks with the representation of the target node, by minimizing the cross-entropy loss. \n", + "\n", + "In implementation, node embeddings are learnt by solving a simple classification task: given a large set of \"positive\" `(target, context)` node pairs generated from random walks performed on the graph (i.e., node pairs that co-occur within a certain context window in random walks), and an equally large set of \"negative\" node pairs that are randomly selected from the graph according to a certain distribution, learn a binary classifier that predicts whether arbitrary node pairs are likely to co-occur in a random walk performed on the graph. Through learning this simple binary node-pair-classification task, the model automatically learns an inductive mapping from attributes of nodes to node embeddings in a low-dimensional vector space, which preserves structural and feature similarities of the nodes.\n", + "\n", + "To train the attri2vec model, we first construct a training set of nodes, which is composed of an equal number of positive and negative `(target, context)` pairs from the graph. The positive `(target, context)` pairs are the node pairs co-occurring on random walks over the graph whereas the negative node pairs are sampled randomly from the global node degree distribution of the graph. 
In attri2vec, each node is attached with two kinds of embeddings: 1) the inductive 'input embedding', i.e., the objective embedding, obtained by performing a non-linear transformation on node content features, and 2) 'output embedding', i.e., the parameter vector used to predict its occurrence as a context node, obtained by looking up a parameter table. Given a `(target, context)` pair, attri2vec outputs a predictive value to indicate whether it is positive or negative, which is obtained by performing the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node, followed by a sigmoid activation. \n", + "\n", + "The entire model is trained end-to-end by minimizing the binary cross-entropy loss function with regard to predicted node pair labels and true node pair labels, using stochastic gradient descent (SGD) updates of the model parameters, with minibatches of 'training' node pairs generated on demand and fed into the model.\n", + "\n", + "In this demo, we first train the attri2vec model on the in-sample subgraph and obtain a mapping function from node attributes to node representations, then apply the mapping function to the content attributes of out-of-sample nodes and obtain the representations of out-of-sample nodes. We evaluate the quality of inferred out-of-sample node representations by using them to predict the links of out-of-sample nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "import networkx as nx\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import random\n", + "\n", + "import stellargraph as sg\n", + "from stellargraph.data import UnsupervisedSampler\n", + "from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator\n", + "from stellargraph.layer import Attri2Vec, link_classification\n", + "\n", + "from tensorflow import keras\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import roc_auc_score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading DBLP network data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This demo uses a DBLP citation network, a subgraph extracted from [DBLP-Citation-network V3](https://aminer.org/citation). To form this subgraph, papers from four subjects are extracted according to their venue information: *Database, Data Mining, Artificial Intelligence and Computer Vision*, and papers with no citations are removed. The DBLP network contains 18,448 papers and 45,611 citation relations. From paper titles, we construct 2,476-dimensional binary node feature vectors, with each element indicating the presence/absence of the corresponding word. By ignoring the citation direction, we take the DBLP subgraph as an undirected network.\n", + "\n", + "As papers in DBLP are attached with publication year, the DBLP network with the dynamic property can be used to study the problem of out-of-sample node representation learning. From the DBLP network, we construct four in-sample subgraphs using papers published before 2006, 2007, 2008 and 2009, and denote the four subgraphs as DBLP2006, DBLP2007, DBLP2008, and DBLP2009. 
For each subgraph, the remaining papers are taken as out-of-sample nodes. We consider the case where new coming nodes have no links. We predict the links of out-of-sample nodes using the learned out-of-sample node representations and compare its performance with the node content feature baseline.\n", + "\n", + "The dataset used in this demo can be downloaded from https://www.kaggle.com/daozhang/dblp-subgraph.\n", + "The following is the description of the dataset:\n", + "\n", + "> The content.txt file contains descriptions of the papers in the following format:\n", + "\n", + " \t\t \n", + " \n", + "> The first entry in each line contains the unique integer ID (ranging from 0 to 18,447) of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last two entries in the line are the class label and the publication year of the paper.\n", + "> The edgeList.txt file contains the citation relations. Each line describes a link in the following format:\n", + "\t\t\n", + " \n", + " \n", + "> Each line contains two paper IDs, with paper2 citing paper1 or paper1 citing paper2.\n", + "\n", + "\n", + "Download and unzip the dblp-subgraph.zip file to a location on your computer and set the `data_dir` variable to\n", + "point to the location of the dataset (the \"DBLP\" directory containing \"content.txt\" and \"edgeList.txt\")." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "data_dir = \"~/data/DBLP\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the graph from the edgelist." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "edgelist = pd.read_csv(os.path.join(data_dir, \"edgeList.txt\"), sep='\\t', header=None, names=[\"source\", \"target\"])\n", + "edgelist[\"label\"] = \"cites\" # set the edge type" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load paper content features, subjects and publishing years." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = [\"w_{}\".format(ii) for ii in range(2476)]\n", + "node_column_names = feature_names + [\"subject\", \"year\"]\n", + "node_data = pd.read_csv(os.path.join(data_dir, \"content.txt\"), sep='\\t', header=None, names=node_column_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construct the whole graph from edge list." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "G_all_nx = nx.from_pandas_edgelist(edgelist, edge_attr=\"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify node types." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "nx.set_node_attributes(G_all_nx, \"paper\", \"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get node features." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "all_node_features = node_data[feature_names]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the Stellargraph with node features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "G_all = sg.StellarGraph(G_all_nx, node_features=all_node_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "StellarGraph: Undirected multigraph\n", + " Nodes: 18448, Edges: 45611\n", + "\n", + " Node types:\n", + " paper: [18448]\n", + " Edge types: paper-cites->paper\n", + "\n", + " Edge types:\n", + " paper-cites->paper: [45611]\n", + "\n" + ] + } + ], + "source": [ + "print(G_all.info())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get DBLP Subgraph \n", + "### with papers published before a threshold year" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the edge list connecting in-sample nodes." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "year_thresh = 2006 # the threshold year for in-sample and out-of-sample set split, which can be 2007, 2008 and 2009\n", + "subgraph_edgelist = []\n", + "for ii in range(len(edgelist)):\n", + " source_index = edgelist[\"source\"][ii]\n", + " target_index = edgelist[\"target\"][ii]\n", + " source_year = int(node_data[\"year\"][source_index])\n", + " target_year = int(node_data[\"year\"][target_index])\n", + " if source_year < year_thresh and target_year < year_thresh:\n", + " subgraph_edgelist.append([source_index, target_index])\n", + "subgraph_edgelist = pd.DataFrame(np.array(subgraph_edgelist), columns=[\"source\", \"target\"])\n", + "subgraph_edgelist[\"label\"] = \"cites\" # set the edge type" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construct the network from the selected edge list." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist, edge_attr=\"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify node types." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "nx.set_node_attributes(G_sub_nx, \"paper\", \"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the ids of the nodes in the selected subgraph." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "subgraph_node_ids = sorted(list(G_sub_nx.nodes))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the node features of the selected subgraph." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "subgraph_node_features = node_data[feature_names].reindex(subgraph_node_ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the Stellargraph with node features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "G_sub = sg.StellarGraph(G_sub_nx, node_features=subgraph_node_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "StellarGraph: Undirected multigraph\n", + " Nodes: 11776, Edges: 28937\n", + "\n", + " Node types:\n", + " paper: [11776]\n", + " Edge types: paper-cites->paper\n", + "\n", + " Edge types:\n", + " paper-cites->paper: [28937]\n", + "\n" + ] + } + ], + "source": [ + "print(G_sub.info())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train attri2vec on the DBLP Subgraph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "nodes = list(G_sub.nodes())\n", + "number_of_walks = 2\n", + "length = 5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the UnsupervisedSampler instance with the relevant parameters passed to it." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "unsupervised_samples = UnsupervisedSampler(G_sub, nodes=nodes, length=length, number_of_walks=number_of_walks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the batch size and the number of epochs. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 50\n", + "epochs = 6" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define an attri2vec training generator, which generates a batch of (feature of target node, index of context node, label of node pair) pairs per iteration." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running Attri2VecLinkGenerator with an estimated 4710 batches generated on the fly per epoch.\n" + ] + } + ], + "source": [ + "train_gen = Attri2VecLinkGenerator(G_sub, batch_size).flow(unsupervised_samples)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Building the model: a 1-hidden-layer node representation ('input embedding') of the `target` node and the parameter vector ('output embedding') for predicting the existence of `context node` for each `(target context)` pair, with a link classification layer performed on the dot product of the 'input embedding' of the `target` node and the 'output embedding' of the `context` node.\n", + "\n", + "Attri2Vec part of the model, with a 128-dimenssion hidden layer, no bias term, no dropout and no normalization. (Dropout can be switched on by specifying a positive dropout rate, 0 < dropout < 1 and normalization can be set to 'l2'). 
" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "layer_sizes = [128]\n", + "attri2vec = Attri2Vec(\n", + " layer_sizes=layer_sizes, generator=train_gen, bias=False, normalize=None\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Logging before flag parsing goes to stderr.\n", + "W0925 10:21:27.091693 140736388883392 deprecation.py:506] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "W0925 10:21:27.113364 140736388883392 deprecation.py:506] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n" + ] + } + ], + "source": [ + "# Build the model and expose input and output sockets of attri2vec, for node pair inputs:\n", + "x_inp, x_out = attri2vec.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the link_classification function to generate the prediction, with the 'ip' edge embedding generation method and the 'sigmoid' activation, which actually performs the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node followed by a sigmoid activation. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "link_classification: using 'ip' method to combine node embeddings into edge embeddings\n" + ] + } + ], + "source": [ + "prediction = link_classification(\n", + " output_dim=1, output_act=\"sigmoid\", edge_embedding_method='ip'\n", + ")(x_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Stack the Attri2Vec encoder and prediction layer into a Keras model, and specify the loss." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "model = keras.Model(inputs=x_inp, outputs=prediction)\n", + "\n", + "model.compile(\n", + " optimizer=keras.optimizers.Adam(lr=1e-3),\n", + " loss=keras.losses.binary_crossentropy,\n", + " metrics=[keras.metrics.binary_accuracy],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0925 10:21:27.574651 140736388883392 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/6\n", + "4711/4711 [==============================] - 72s 15ms/step - loss: 0.7680 - binary_accuracy: 0.5814\n", + "Epoch 2/6\n", + "4711/4711 [==============================] - 72s 15ms/step - loss: 0.6447 - binary_accuracy: 0.6101\n", + "Epoch 3/6\n", + "4711/4711 [==============================] - 75s 16ms/step - loss: 0.5866 - binary_accuracy: 0.6483\n", + "Epoch 4/6\n", + "4711/4711 [==============================] - 75s 16ms/step - loss: 0.4925 - binary_accuracy: 0.7284\n", + "Epoch 5/6\n", + "4711/4711 [==============================] - 76s 16ms/step - loss: 0.3574 - binary_accuracy: 0.8300\n", + "Epoch 6/6\n", + "4711/4711 [==============================] - 76s 16ms/step - loss: 0.2242 - binary_accuracy: 0.9105\n" + ] + } + ], + "source": [ + "history = model.fit_generator(\n", + " train_gen,\n", + " epochs=epochs,\n", + " verbose=1,\n", + " use_multiprocessing=True,\n", + " workers=4,\n", + " shuffle=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predicting links of out-of-sample nodes with the learned attri2vec model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build the node based model for predicting node representations from node content attributes with the learned parameters. 
Below a Keras model is constructed, with x_inp[0] as input and x_out[0] as output. Note that this model's weights are the same as those of the corresponding node encoder in the previously trained node pair classifier." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "x_inp_src = x_inp[0]\n", + "x_out_src = x_out[0]\n", + "embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the node embeddings, for both in-sample and out-of-sample nodes, by applying the learned mapping function to node content features." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "369/369 [==============================] - 1s 1ms/step\n" + ] + } + ], + "source": [ + "node_ids = node_data.index\n", + "node_gen = Attri2VecNodeGenerator(G_all, batch_size).flow(node_ids)\n", + "node_embeddings = embedding_model.predict_generator(node_gen, workers=4, verbose=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the positive and negative edges for in-sample nodes and out-of-sample nodes. The edges of the in-sample nodes only include the edges between in-sample nodes, and the edges of out-of-sample nodes refer to all the edges linked to out-of-sample nodes, including the edges connecting in-sample and out-of-sample nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "year_thresh = 2006\n", + "in_sample_edges = []\n", + "out_of_sample_edges = []\n", + "for ii in range(len(edgelist)):\n", + " source_index = edgelist[\"source\"][ii]\n", + " target_index = edgelist[\"target\"][ii]\n", + " if source_index > target_index: # neglect edge direction for the undirected graph\n", + " continue\n", + " source_year = int(node_data[\"year\"][source_index])\n", + " target_year = int(node_data[\"year\"][target_index])\n", + " if source_year < year_thresh and target_year < year_thresh:\n", + " in_sample_edges.append([source_index, target_index, 1]) # get the positive edge\n", + " negative_target_index = unsupervised_samples.random.choices(node_data.index.tolist(), k=1) # generate negative node\n", + " in_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative edge\n", + " else:\n", + " out_of_sample_edges.append([source_index, target_index, 1]) # get the positive edge\n", + " negative_target_index = unsupervised_samples.random.choices(node_data.index.tolist(), k=1) # generate negative node\n", + " out_of_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative edge\n", + "in_sample_edges = np.array(in_sample_edges)\n", + "out_of_sample_edges = np.array(out_of_sample_edges)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construct the edge features from the learned node representations with l2 normed difference, where edge features are the element-wise square of the difference between the embeddings of two head nodes. Other strategy like element-wise product can also be used to construct edge features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "in_sample_edge_feat_from_emb = (node_embeddings[in_sample_edges[:,0]]-node_embeddings[in_sample_edges[:,1]])**2\n", + "out_of_sample_edge_feat_from_emb = (node_embeddings[out_of_sample_edges[:,0]]-node_embeddings[out_of_sample_edges[:,1]])**2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train the Logistic Regression classifier from in-sample edges with the edge features constructed from attri2vec embeddings. " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=500,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf_edge_pred_from_emb = LogisticRegression(verbose=0, solver='lbfgs', multi_class=\"auto\", max_iter=500)\n", + "clf_edge_pred_from_emb.fit(in_sample_edge_feat_from_emb, in_sample_edges[:,2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Predict the edge existence probability with the trained Logistic Regression classifier." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "edge_pred_from_emb = clf_edge_pred_from_emb.predict_proba(out_of_sample_edge_feat_from_emb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the positive class index of `edge_pred_from_emb`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "if clf_edge_pred_from_emb.classes_[0] == 1:\n", + " positive_class_index = 0\n", + "else:\n", + " positive_class_index = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluate the AUC score for the prediction with attri2vec embeddings." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7842243817182476" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_auc_score(out_of_sample_edges[:,2], edge_pred_from_emb[:,positive_class_index])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the baseline, we also investigate the performance of node content features in predicting the edges of out-of-sample nodes. Firstly, we construct edge features from node content features with the same strategy." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "in_sample_edge_rep_from_feat = (node_data[feature_names].values[in_sample_edges[:,0]]-node_data[feature_names].values[in_sample_edges[:,1]])**2\n", + "out_of_sample_edge_rep_from_feat = (node_data[feature_names].values[out_of_sample_edges[:,0]]-node_data[feature_names].values[out_of_sample_edges[:,1]])**2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we train the Logistic Regression classifier from in-sample edges with the edge features constructed from node content features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=500,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf_edge_pred_from_feat = LogisticRegression(verbose=0, solver='lbfgs', multi_class=\"auto\", max_iter=500)\n", + "clf_edge_pred_from_feat.fit(in_sample_edge_rep_from_feat, in_sample_edges[:,2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Predict the edge existence probability with the trained Logistic Regression classifier." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "edge_pred_from_feat = clf_edge_pred_from_feat.predict_proba(out_of_sample_edge_rep_from_feat)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get positive class index of `clf_edge_pred_from_feat`." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "if clf_edge_pred_from_feat.classes_[0] == 1:\n", + " positive_class_index = 0\n", + "else:\n", + " positive_class_index = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluate the AUC score for the prediction with node content features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6666971120688185" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_auc_score(out_of_sample_edges[:,2], edge_pred_from_feat[:,positive_class_index])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "attri2vec can inductively infer the representations of out-of-sample nodes from their content attributes. As the inferred node representations well capture both structure and node content information, they perform much better than node content features in predicting the links of out-of-sample nodes." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 181c75e7f57abac9955145091222e081764a3438 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 10:59:42 +1000 Subject: [PATCH 26/82] add attri2vec DBLP embeddings demo --- .../stellargraph-attri2vec-DBLP.ipynb | 541 ++++-------------- 1 file changed, 103 insertions(+), 438 deletions(-) diff --git a/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb index 618865352..5b9148316 100644 --- a/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb +++ b/demos/embeddings/stellargraph-attri2vec-DBLP.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Inferring Representations for Out-of-sample Nodes Through attri2vec" + "# Node Representation Learning with attri2vec on DBLP" ] }, { @@ -13,24 +13,16 @@ "source": [ "This is the python implementation 
of the attri2vec algorithm outlined in paper ***[Attributed Network Embedding Via Subspace Discovery](https://arxiv.org/abs/1901.04095)*** D. Zhang, Y. Jie, X. Zhu and C. Zhang, arXiv:1901.04095, [cs.SI], 2019. The implementation uses the stellargraph libraries.\n", "\n", - "## Dataset\n", - "This demo uses a DBLP citation network, a subgraph extracted from [DBLP-Citation-network V3](https://aminer.org/citation). To form this subgraph, papers from four subjects are extracted according to their venue information: *Database, Data Mining, Artificial Intelligence and Computer Vision*, and papers with no citations are removed. The DBLP network contains 18,448 papers and 45,661 citation relations. From paper titles, we construct 2,476-dimensional binary node feature vectors, with each element indicating the presence/absence of the corresponding word. By ignoring the citation direction, we take the DBLP subgraph as an undirected network.\n", - "\n", - "As papers in DBLP are attached with publication year, the DBLP network with the dynamic property can be used to study the problem of out-of-sample node representation learning. From the DBLP network, we construct four in-sample subgraphs using papers published before 2006, 2007, 2008 and 2009, and denote the four subgraphs as DBLP2006, DBLP2007, DBLP2008, and DBLP2009. For each subgraph, the remaining papers are taken as out-of-sample nodes. We consider the case where new coming nodes have no links. We predict the links of out-of-sample nodes using the learned out-of-sample node representations and compare its performance with the node content feature baseline.\n", - "\n", - "\n", "\n", "## attri2vec\n", "\n", - "For networks attached with node content attributes, attri2vec infers node representations by discovering a latent node attribute subspace that respects network structure in a more consistent way. To transform network nodes from the original attribute space into the targeted subspace, a non-linear mapping is used. 
To make the mapped images respect structural similarity, [`DeepWalk`](https://dl.acm.org/citation.cfm?id=2623732)/[`node2vec`](https://snap.stanford.edu/node2vec) learning mechanism is used to make nodes sharing similar random walk context nodes represented closely in the subspace. Following [`DeepWalk`](https://dl.acm.org/citation.cfm?id=2623732)/[`node2vec`](https://snap.stanford.edu/node2vec), attri2vec learns node representations by maximizing the occurrence probability of context nodes conditioned on the representation of the target nodes. The probability is modelled by Softmax and negative sampling is used to speed up its calculation. This makes attri2vec equivalent to predict whether a node occurs in the given target node's context in random walks with the representation of the target node, by minimizing the cross-entropy loss. \n", - "\n", - "In implementation, node embeddings are learnt by solving a simple classification task: given a large set of \"positive\" `(target, context)` node pairs generated from random walks performed on the graph (i.e., node pairs that co-occur within a certain context window in random walks), and an equally large set of \"negative\" node pairs that are randomly selected from the graph according to a certain distribution, learn a binary classifier that predicts whether arbitrary node pairs are likely to co-occur in a random walk performed on the graph. Through learning this simple binary node-pair-classification task, the model automatically learns an inductive mapping from attributes of nodes to node embeddings in a low-dimensional vector space, which preserves structural and feature similarities of the nodes. Unlike the embeddings learned by DeepWalk/node2vec, the mapping is inductive. 
Different from the mapping generated by GraphSAGE, the mapping only performs on node content attributes and does not rely on any link information, which makes it possible to construct representations for new coming nodes having few or no link information from their content attributes.\n", + "attri2vec learns node representations by performing a linear/non-linear mapping on node content attributes. To make the learned node representations respect structural similarity, the [`DeepWalk`](https://dl.acm.org/citation.cfm?id=2623732)/[`node2vec`](https://snap.stanford.edu/node2vec) learning mechanism is used to make nodes sharing similar random walk context nodes represented closely in the subspace, which is achieved by maximizing the occurrence probability of context nodes conditioned on the representation of the target nodes. The probability is modelled by Softmax and negative sampling is used to speed up its calculation. This makes attri2vec equivalent to predicting whether a node occurs in the given target node's context in random walks with the representation of the target node, by minimizing the cross-entropy loss. \n", "\n", - "To train the attri2vec model, we first construct a training set of nodes, which is composed of an equal number of positive and negative `(target, context)` pairs from the graph. The positive `(target, context)` pairs are the node pairs co-occurring on random walks over the graph whereas the negative node pairs are the sampled randomly from the global node degree distribution of the graph. In attri2vec, each node is attached with two kinds of embeddings: 1) the inductive 'input embedding', i.e, the objective embedding, obtained by perform a non-linear transformation on node content features, and 2) 'output embedding', i.e., the parameter vector used to predict its occurrence as a context node, obtained by looking up a parameter table. 
Given a `(target, context)` pair, attri2vec outputs a predictive value to indicate whether it is positive or negative, which is obtained by performing the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node, followed by a sigmoid activation. \n", + "In the implementation, node embeddings are learnt by solving a simple classification task: given a large set of \"positive\" `(target, context)` node pairs generated from random walks performed on the graph (i.e., node pairs that co-occur within a certain context window in random walks), and an equally large set of \"negative\" node pairs that are randomly selected from the graph according to a certain distribution, learn a binary classifier that predicts whether arbitrary node pairs are likely to co-occur in a random walk performed on the graph. Through learning this simple binary node-pair-classification task, the model automatically learns an inductive mapping from attributes of nodes to node embeddings in a low-dimensional vector space, which preserves structural and feature similarities of the nodes. \n", "\n", - "The entire model is trained end-to-end by minimizing the binary cross-entropy loss function with regards to predicted node pair labels and true node pair labels, using stochastic gradient descent (SGD) updates of the model parameters, with minibatches of 'training' node pairs generated on demand and fed into the model.\n", + "To train the attri2vec model, we first construct a training set of nodes, which is composed of an equal number of positive and negative `(target, context)` pairs from the graph. The positive `(target, context)` pairs are the node pairs co-occurring on random walks over the graph whereas the negative node pairs are sampled randomly from the global node degree distribution of the graph. 
In attri2vec, each node is attached with two kinds of embeddings: 1) the inductive 'input embedding', i.e., the objective embedding, obtained by performing a linear/non-linear transformation on node content features, and 2) 'output embedding', i.e., the parameter vector used to predict its occurrence as a context node, obtained by looking up a parameter table. Given a `(target, context)` pair, attri2vec outputs a predictive value to indicate whether it is positive or negative, which is obtained by performing the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node, followed by a sigmoid activation. \n", "\n", - "In this demo, we first train the attri2vec model on the in-sample subgraph and obtain a mapping function from node attributes to node representations, then apply the mapping function to the content attributes of out-of-sample nodes and obtain the representations of out-of-sample nodes. We evaluate the quality of inferred out-of-sample node representations by using it to predict the links of out-of-sample nodes." + "The entire model is trained end-to-end by minimizing the binary cross-entropy loss function with regard to predicted node pair labels and true node pair labels, using stochastic gradient descent (SGD) updates of the model parameters, with minibatches of 'training' node pairs generated on demand and fed into the model." 
] }, { @@ -54,15 +46,19 @@ "import random\n", "\n", "import stellargraph as sg\n", - "from stellargraph.data import UniformRandomWalk\n", "from stellargraph.data import UnsupervisedSampler\n", "from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator\n", "from stellargraph.layer import Attri2Vec, link_classification\n", "\n", - "import keras\n", + "from tensorflow import keras\n", "\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import roc_auc_score" + "import matplotlib.pyplot as plt\n", + "from sklearn.manifold import TSNE\n", + "from sklearn.decomposition import PCA\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegressionCV\n", + "from sklearn.metrics import accuracy_score" ] }, { @@ -76,6 +72,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "This demo uses a DBLP citation network, a subgraph extracted from [DBLP-Citation-network V3](https://aminer.org/citation). To form this subgraph, papers from four subjects are extracted according to their venue information: *Database, Data Mining, Artificial Intelligence and Computer Vision*, and papers with no citations are removed. The DBLP network contains 18,448 papers and 45,661 citation relations. From paper titles, we construct 2,476-dimensional binary node feature vectors, with each element indicating the presence/absence of the corresponding word. By ignoring the citation direction, we take the DBLP subgraph as an undirected network.\n", + "\n", "The dataset used in this demo can be downloaded from https://www.kaggle.com/daozhang/dblp-subgraph.\n", "The following is the description of the dataset:\n", "\n", @@ -143,7 +141,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Construct the whole graph from edge list." + "Construct the graph from edge list." 
] }, { @@ -152,7 +150,7 @@ "metadata": {}, "outputs": [], "source": [ - "G_all_nx = nx.from_pandas_edgelist(edgelist, edge_attr=\"label\")" + "G_nx = nx.from_pandas_edgelist(edgelist, edge_attr=\"label\")" ] }, { @@ -168,7 +166,7 @@ "metadata": {}, "outputs": [], "source": [ - "nx.set_node_attributes(G_all_nx, \"paper\", \"label\")" + "nx.set_node_attributes(G_nx, \"paper\", \"label\")" ] }, { @@ -184,7 +182,7 @@ "metadata": {}, "outputs": [], "source": [ - "all_node_features = node_data[feature_names]" + "node_features = node_data[feature_names]" ] }, { @@ -200,7 +198,7 @@ "metadata": {}, "outputs": [], "source": [ - "G_all = sg.StellarGraph(G_all_nx, node_features=all_node_features)" + "G = sg.StellarGraph(G_nx, node_features=node_features)" ] }, { @@ -226,132 +224,7 @@ } ], "source": [ - "print(G_all.info())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get DBLP Subgraph \n", - "### with papers published before a threshold year" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get the edge list connecting in-sample nodes." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "year_thresh = 2006 # the threshold year for in-sample and out-of-sample set split, which can be 2007, 2008 and 2009\n", - "subgraph_edgelist = []\n", - "for ii in range(len(edgelist)):\n", - " source_index = edgelist[\"source\"][ii]\n", - " target_index = edgelist[\"target\"][ii]\n", - " source_year = int(node_data[\"year\"][source_index])\n", - " target_year = int(node_data[\"year\"][target_index])\n", - " if source_year < year_thresh and target_year < year_thresh:\n", - " subgraph_edgelist.append([source_index, target_index])\n", - "subgraph_edgelist = pd.DataFrame(np.array(subgraph_edgelist), columns=[\"source\", \"target\"])\n", - "subgraph_edgelist[\"label\"] = \"cites\" # set the edge type" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Construct the network from the selected edge list." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist, edge_attr=\"label\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Specify node types." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "nx.set_node_attributes(G_sub_nx, \"paper\", \"label\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get in-sample node features." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "subgraph_node_data = node_data[node_data[\"year\"]<2006]\n", - "subgraph_node_features = subgraph_node_data[feature_names]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create the Stellargraph with node features." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "G_sub = sg.StellarGraph(G_sub_nx, node_features=subgraph_node_features)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "StellarGraph: Undirected multigraph\n", - " Nodes: 11776, Edges: 28937\n", - "\n", - " Node types:\n", - " paper: [11776]\n", - " Edge types: paper-cites->paper\n", - "\n", - " Edge types:\n", - " paper-cites->paper: [28937]\n", - "\n" - ] - } - ], - "source": [ - "print(G_sub.info())" + "print(G.info())" ] }, { @@ -365,17 +238,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk, and random seed." + "Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk." ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "nodes = list(G_sub.nodes())\n", - "number_of_walks = 2\n", + "nodes = list(G.nodes())\n", + "number_of_walks = 4\n", "length = 5" ] }, @@ -388,11 +261,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "unsupervised_samples = UnsupervisedSampler(G_sub, nodes=nodes, length=length, number_of_walks=number_of_walks)" + "unsupervised_samples = UnsupervisedSampler(G, nodes=nodes, length=length, number_of_walks=number_of_walks)" ] }, { @@ -404,12 +277,12 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "batch_size = 50\n", - "epochs = 4" + "epochs = 8" ] }, { @@ -421,19 +294,19 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": 
"stream", "text": [ - "Running Attri2VecLinkGenerator with an estimated 4710 batches generated on the fly per epoch.\n" + "Running Attri2VecLinkGenerator with an estimated 14758 batches generated on the fly per epoch.\n" ] } ], "source": [ - "train_gen = Attri2VecLinkGenerator(G_sub, batch_size).flow(unsupervised_samples)" + "train_gen = Attri2VecLinkGenerator(G, batch_size).flow(unsupervised_samples)" ] }, { @@ -442,44 +315,37 @@ "source": [ "Building the model: a 1-hidden-layer node representation ('input embedding') of the `target` node and the parameter vector ('output embedding') for predicting the existence of `context node` for each `(target context)` pair, with a link classification layer performed on the dot product of the 'input embedding' of the `target` node and the 'output embedding' of the `context` node.\n", "\n", - "Attri2Vec part of the model, with a 128-dimenssion hidden layer, no bias term, no dropout and no normalization. (Dropout can be switched on by specifying a positive dropout rate, 0 < dropout < 1 and normalization can be set to 'l2'). " + "Attri2Vec part of the model, with a 128-dimenssion hidden layer, no bias term and no normalization. (Normalization can be set to 'l2'). " ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 14, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: Logging before flag parsing goes to stderr.\n", - "W0822 10:43:54.482249 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. 
Please use tf.compat.v1.get_default_graph instead.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "layer_sizes = [128]\n", "attri2vec = Attri2Vec(\n", - " layer_sizes=layer_sizes, generator=train_gen, bias=False, dropout=0.0, normalize=\"None\"\n", + " layer_sizes=layer_sizes, generator=train_gen, bias=False, normalize=None\n", ")" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "W0822 10:43:56.449973 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", - "\n", - "W0822 10:43:56.455691 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", - "\n" + "WARNING: Logging before flag parsing goes to stderr.\n", + "W0924 14:51:32.580700 140736388883392 deprecation.py:506] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "W0924 14:51:32.618021 140736388883392 deprecation.py:506] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n" ] } ], @@ -497,7 +363,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 16, 
"metadata": {}, "outputs": [ { @@ -523,23 +389,9 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 17, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "W0822 10:44:02.083713 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", - "\n", - "W0822 10:44:02.091721 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.\n", - "\n", - "W0822 10:44:02.102672 140736102417344 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" - ] - } - ], + "outputs": [], "source": [ "model = keras.Model(inputs=x_inp, outputs=prediction)\n", "\n", @@ -559,31 +411,40 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 18, "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "W0822 10:44:04.704549 140736102417344 deprecation_wrapper.py:119] From /anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. 
Please use tf.compat.v1.assign_add instead.\n", - "\n" + "W0924 14:51:33.217601 140736388883392 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1/4\n", - "4711/4711 [==============================] - 89s 19ms/step - loss: 0.6678 - binary_accuracy: 0.6432\n", - "Epoch 2/4\n", - "4711/4711 [==============================] - 92s 19ms/step - loss: 0.5859 - binary_accuracy: 0.6642\n", - "Epoch 3/4\n", - "4711/4711 [==============================] - 91s 19ms/step - loss: 0.4810 - binary_accuracy: 0.7541\n", - "Epoch 4/4\n", - "4711/4711 [==============================] - 93s 20ms/step - loss: 0.3283 - binary_accuracy: 0.8544\n" + "Epoch 1/8\n", + "14759/14759 [==============================] - 384s 26ms/step - loss: 0.7016 - binary_accuracy: 0.5404\n", + "Epoch 2/8\n", + "14759/14759 [==============================] - 397s 27ms/step - loss: 0.6156 - binary_accuracy: 0.6309\n", + "Epoch 3/8\n", + "14759/14759 [==============================] - 380s 26ms/step - loss: 0.5221 - binary_accuracy: 0.7362\n", + "Epoch 4/8\n", + "14759/14759 [==============================] - 407s 28ms/step - loss: 0.4527 - binary_accuracy: 0.7909\n", + "Epoch 5/8\n", + "14759/14759 [==============================] - 389s 26ms/step - loss: 0.4029 - binary_accuracy: 0.8219\n", + "Epoch 6/8\n", + "14759/14759 [==============================] - 409s 28ms/step - loss: 0.3673 - binary_accuracy: 0.8424\n", + "Epoch 7/8\n", + "14759/14759 [==============================] - 399s 27ms/step - loss: 0.3387 - binary_accuracy: 0.8573\n", + "Epoch 8/8\n", + "14759/14759 [==============================] - 387s 26ms/step - loss: 0.3189 - binary_accuracy: 
0.8679\n" ] } ], @@ -592,8 +453,8 @@ " train_gen,\n", " epochs=epochs,\n", " verbose=1,\n", - " use_multiprocessing=True,\n", - " workers=4,\n", + " use_multiprocessing=False,\n", + " workers=1,\n", " shuffle=True,\n", ")" ] @@ -602,7 +463,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Predicting links of out-of-sample nodes with the learned attri2vec model" + "## Visualise Node Embeddings" ] }, { @@ -614,7 +475,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -627,12 +488,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Get the node embeddings, for both in-sample and out-of-sample nodes, by applying the learned mapping function to node content features." + "Get the node embeddings by applying the learned mapping function to node content features." ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -645,272 +506,76 @@ ], "source": [ "node_ids = node_data.index\n", - "node_gen = Attri2VecNodeGenerator(G_all, batch_size).flow(node_ids)\n", - "node_embeddings = embedding_model.predict_generator(node_gen, workers=4, verbose=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get the positive and negative edges for in-sample nodes and out-of-sample nodes. The edges of the in-sample nodes only include the edges between in-sample nodes, and the edges of out-of-sample nodes are referred to all the edges linked to out-of-sample nodes, including the edges connecting in-sample and out-of-sample edges." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "year_thresh = 2006\n", - "in_sample_edges = []\n", - "out_of_sample_edges = []\n", - "for ii in range(len(edgelist)):\n", - " source_index = edgelist[\"source\"][ii]\n", - " target_index = edgelist[\"target\"][ii]\n", - " if source_index > target_index: # neglect edge direction for the undirected graph\n", - " continue\n", - " source_year = int(node_data[\"year\"][source_index])\n", - " target_year = int(node_data[\"year\"][target_index])\n", - " if source_year < year_thresh and target_year < year_thresh:\n", - " in_sample_edges.append([source_index, target_index, 1]) # get the positive edge\n", - " negative_target_index = unsupervised_samples.random.choices(node_data.index.tolist(), k=1) # generate negative node\n", - " in_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative edge\n", - " else:\n", - " out_of_sample_edges.append([source_index, target_index, 1]) # get the positive edge\n", - " negative_target_index = unsupervised_samples.random.choices(node_data.index.tolist(), k=1) # generate negative node\n", - " out_of_sample_edges.append([source_index, negative_target_index[0], 0]) # get the negative edge\n", - "in_sample_edges = np.array(in_sample_edges)\n", - "out_of_sample_edges = np.array(out_of_sample_edges)" + "node_gen = Attri2VecNodeGenerator(G, batch_size).flow(node_ids)\n", + "node_embeddings = embedding_model.predict_generator(node_gen, workers=1, verbose=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Construct the edge features from the learned node representations with l2 normed difference, where edge features are the element-wise square of the difference between the embeddings of two head nodes. Other strategy like element-wise product can also be used to construct edge features." + "Get node subjects." 
] }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "in_sample_edge_feat_from_emb = (node_embeddings[in_sample_edges[:,0]]-node_embeddings[in_sample_edges[:,1]])**2\n", - "out_of_sample_edge_feat_from_emb = (node_embeddings[out_of_sample_edges[:,0]]-node_embeddings[out_of_sample_edges[:,1]])**2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Train the Logistic Regression classifier from in-sample edges with the edge features constructed from attri2vec embeddings. " - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", - " intercept_scaling=1, l1_ratio=None, max_iter=500,\n", - " multi_class='auto', n_jobs=None, penalty='l2',\n", - " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", - " warm_start=False)" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "clf_edge_pred_from_emb = LogisticRegression(verbose=0, solver='lbfgs', multi_class=\"auto\", max_iter=500)\n", - "clf_edge_pred_from_emb.fit(in_sample_edge_feat_from_emb, in_sample_edges[:,2])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Predict the edge existence probability with the trained Logistic Regression classifier." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "edge_pred_from_emb = clf_edge_pred_from_emb.predict_proba(out_of_sample_edge_feat_from_emb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get the positive class index of `edge_pred_from_emb`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 31, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "if clf_edge_pred_from_emb.classes_[0] == 1:\n", - " positive_class_index = 0\n", - "else:\n", - " positive_class_index = 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Evaluate the AUC score for the prediction with attri2vec embeddings." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.798202033278801" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roc_auc_score(out_of_sample_edges[:,2], edge_pred_from_emb[:,positive_class_index])" + "node_targets = [ node_data[\"subject\"][node_id] for node_id in node_ids ]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As the baseline, we also investigate the performance of node content features in predicting the edges of out-of-sample nodes. Firstly, we construct edge features from node content features with the same strategy." + "Transform the embeddings to 2d space for visualisation." ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "in_sample_edge_rep_from_feat = (node_data[feature_names].values[in_sample_edges[:,0]]-node_data[feature_names].values[in_sample_edges[:,1]])**2\n", - "out_of_sample_edge_rep_from_feat = (node_data[feature_names].values[out_of_sample_edges[:,0]]-node_data[feature_names].values[out_of_sample_edges[:,1]])**2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we train the Logistic Regression classifier from in-sample edges with the edge features constructed from node content features." 
+ "transform = TSNE # PCA\n", + "\n", + "trans = transform(n_components=2)\n", + "node_embeddings_2d = trans.fit_transform(node_embeddings)" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { + "image/png": "\n", "text/plain": [ - "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", - " intercept_scaling=1, l1_ratio=None, max_iter=500,\n", - " multi_class='auto', n_jobs=None, penalty='l2',\n", - " random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n", - " warm_start=False)" + "
" ] }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "clf_edge_pred_from_feat = LogisticRegression(verbose=0, solver='lbfgs', multi_class=\"auto\", max_iter=500)\n", - "clf_edge_pred_from_feat.fit(in_sample_edge_rep_from_feat, in_sample_edges[:,2])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Predict the edge existence probability with the trained Logistic Regression classifier." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "edge_pred_from_feat = clf_edge_pred_from_feat.predict_proba(out_of_sample_edge_rep_from_feat)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get positive class index of `clf_edge_pred_from_feat`." - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "if clf_edge_pred_from_feat.classes_[0] == 1:\n", - " positive_class_index = 0\n", - "else:\n", - " positive_class_index = 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Evaluate the AUC score for the prediction with node content features." - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.6638897794650095" - ] + "metadata": { + "needs_background": "light" }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "roc_auc_score(out_of_sample_edges[:,2], edge_pred_from_feat[:,positive_class_index])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "attri2vec performs much better than node content features in predicting the links of out-of-sample nodes." 
+ "# draw the embedding points, coloring them by the target label (paper subject)\n", + "alpha = 0.7\n", + "label_map = { l: i for i, l in enumerate(np.unique(node_targets)) }\n", + "node_colours = [ label_map[target] for target in node_targets ]\n", + "\n", + "plt.figure(figsize=(7,7))\n", + "plt.axes().set(aspect=\"equal\")\n", + "plt.scatter(node_embeddings_2d[:,0], \n", + " node_embeddings_2d[:,1], \n", + " c=node_colours, cmap=\"jet\", alpha=alpha)\n", + "plt.title('{} visualization of node embeddings'.format(transform.__name__))\n", + "plt.show()" ] } ], From 5e122413bb49dcedf588334b512db3cbf0d0c21a Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 11:00:04 +1000 Subject: [PATCH 27/82] Add attrivec citeseer embeddings demo --- .../stellargraph-attri2vec-citeseer.ipynb | 684 ++++++++++++++++++ 1 file changed, 684 insertions(+) create mode 100644 demos/embeddings/stellargraph-attri2vec-citeseer.ipynb diff --git a/demos/embeddings/stellargraph-attri2vec-citeseer.ipynb b/demos/embeddings/stellargraph-attri2vec-citeseer.ipynb new file mode 100644 index 000000000..41f6b60ad --- /dev/null +++ b/demos/embeddings/stellargraph-attri2vec-citeseer.ipynb @@ -0,0 +1,684 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Node Representation Learning with attri2vec on Citeseer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the python implementation of the attri2vec algorithm outlined in paper ***[Attributed Network Embedding Via Subspace Discovery](https://arxiv.org/abs/1901.04095)*** D. Zhang, J. Yin, X. Zhu and C. Zhang, arXiv:1901.04095, [cs.SI], 2019. The implementation uses the stellargraph libraries.\n", + "\n", + "\n", + "## attri2vec\n", + "\n", + "attri2vec learns node representations by performing a linear/non-linear mapping on node content attributes. 
To make the learned node representations respect structural similarity, the [`DeepWalk`](https://dl.acm.org/citation.cfm?id=2623732)/[`node2vec`](https://snap.stanford.edu/node2vec) learning mechanism is used to make nodes sharing similar random walk context nodes represented closely in the subspace, which is achieved by maximizing the occurrence probability of context nodes conditioned on the representation of the target nodes. The probability is modelled by Softmax and negative sampling is used to speed up its calculation. This makes attri2vec equivalent to predicting whether a node occurs in the given target node's context in random walks with the representation of the target node, by minimizing the cross-entropy loss. \n", + "\n", + "In implementation, node embeddings are learnt by solving a simple classification task: given a large set of \"positive\" `(target, context)` node pairs generated from random walks performed on the graph (i.e., node pairs that co-occur within a certain context window in random walks), and an equally large set of \"negative\" node pairs that are randomly selected from the graph according to a certain distribution, learn a binary classifier that predicts whether arbitrary node pairs are likely to co-occur in a random walk performed on the graph. Through learning this simple binary node-pair-classification task, the model automatically learns an inductive mapping from attributes of nodes to node embeddings in a low-dimensional vector space, which preserves structural and feature similarities of the nodes. \n", + "\n", + "To train the attri2vec model, we first construct a training set of nodes, which is composed of an equal number of positive and negative `(target, context)` pairs from the graph. The positive `(target, context)` pairs are the node pairs co-occurring on random walks over the graph whereas the negative node pairs are sampled randomly from the global node degree distribution of the graph. 
In attri2vec, each node is attached with two kinds of embeddings: 1) the inductive 'input embedding', i.e., the objective embedding, obtained by performing a linear/non-linear transformation on node content features, and 2) 'output embedding', i.e., the parameter vector used to predict its occurrence as a context node, obtained by looking up a parameter table. Given a `(target, context)` pair, attri2vec outputs a predictive value to indicate whether it is positive or negative, which is obtained by performing the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node, followed by a sigmoid activation. \n", + "\n", + "The entire model is trained end-to-end by minimizing the binary cross-entropy loss function with regard to predicted node pair labels and true node pair labels, using stochastic gradient descent (SGD) updates of the model parameters, with minibatches of 'training' node pairs generated on demand and fed into the model." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "import networkx as nx\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import random\n", + "\n", + "import stellargraph as sg\n", + "from stellargraph.data import UnsupervisedSampler\n", + "from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator\n", + "from stellargraph.layer import Attri2Vec, link_classification\n", + "\n", + "from tensorflow import keras\n", + "\n", + "from pandas.core.indexes.base import Index\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.manifold import TSNE\n", + "from sklearn.decomposition import PCA\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegressionCV\n", + "from sklearn.metrics import accuracy_score" + ] + }, + 
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset is the citation network Citeseer.\n", + "\n", + "It can be downloaded by clicking [here](https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz)\n", + "\n", + "The following is the description of the dataset from the publisher:\n", + "> The CiteSeer dataset consists of 3312 scientific publications classified into one of six classes. The citation network consists of 4732 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 3703 unique words. The README file in the dataset provides more details.\n", + "\n", + "Download and unzip the citeseer.tgz file to a location on your computer. \n", + "\n", + "We assume that the dataset is stored in the directory\n", + "\n", + "`~/data/citeseer/`\n", + "\n", + "where the files `citeseer.cites` and `citeseer.content` can be located.\n", + "\n", + "We are going to load the data into a networkx object." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "data_dir = \"~/data/citeseer\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load edges in order 'cited-paper' <- 'citing-paper'." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "citeseer_location = os.path.expanduser(os.path.join(data_dir, \"citeseer.cites\"))\n", + "g_nx = nx.read_edgelist(path=citeseer_location, create_using=nx.DiGraph()).reverse()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert the graph to an undirected graph." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "g_nx = g_nx.to_undirected()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the node attribute data." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " interactivity=interactivity, compiler=compiler, result=result)\n" + ] + } + ], + "source": [ + "citeseer_data_location = os.path.expanduser(os.path.join(data_dir, \"citeseer.content\"))\n", + "attr_names = [\"w_{}\".format(ii) for ii in range(3703)]\n", + "node_column_names = attr_names + [\"subject\"]\n", + "node_attr = pd.read_csv(citeseer_data_location, sep='\\t', header=None, names=node_column_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Change the type of the indexes of node_attr to str." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "node_attr.index = Index(list(map(str, list(node_attr.index))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The original graph contains some nodes with no attributes. We remove them here." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "g_nx = g_nx.subgraph(list(node_attr.index))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Select the largest connected component. For clarity we ignore isolated nodes and subgraphs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Largest subgraph statistics: 2110 nodes, 3720 edges\n" + ] + } + ], + "source": [ + "g_nx_ccs = (g_nx.subgraph(c).copy() for c in nx.connected_components(g_nx))\n", + "g_nx = max(g_nx_ccs, key=len)\n", + "print(\"Largest subgraph statistics: {} nodes, {} edges\".format(\n", + " g_nx.number_of_nodes(), g_nx.number_of_edges()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify node types." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "nx.set_node_attributes(g_nx, \"paper\", \"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the ids of the nodes in the selected largest connected component. " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "node_ids = sorted(list(g_nx.nodes))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get node features." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "node_features = node_attr[attr_names].reindex(node_ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the Stellargraph with node features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "G = sg.StellarGraph(g_nx, node_features=node_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "StellarGraph: Undirected multigraph\n", + " Nodes: 2110, Edges: 3720\n", + "\n", + " Node types:\n", + " paper: [2110]\n", + " Edge types: paper-default->paper\n", + "\n", + " Edge types:\n", + " paper-default->paper: [3720]\n", + "\n" + ] + } + ], + "source": [ + "print(G.info())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train attri2vec on Citeseer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "nodes = list(G.nodes())\n", + "number_of_walks = 4\n", + "length = 5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the UnsupervisedSampler instance with the relevant parameters passed to it." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "unsupervised_samples = UnsupervisedSampler(G, nodes=nodes, length=length, number_of_walks=number_of_walks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the batch size and the number of epochs. " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 50\n", + "epochs = 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define an attri2vec training generator, which generates a batch of (feature of target node, index of context node, label of node pair) pairs per iteration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running Attri2VecLinkGenerator with an estimated 1688 batches generated on the fly per epoch.\n" + ] + } + ], + "source": [ + "train_gen = Attri2VecLinkGenerator(G, batch_size).flow(unsupervised_samples)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Building the model: a 1-hidden-layer node representation ('input embedding') of the `target` node and the parameter vector ('output embedding') for predicting the existence of `context node` for each `(target, context)` pair, with a link classification layer performed on the dot product of the 'input embedding' of the `target` node and the 'output embedding' of the `context` node.\n", + "\n", + "Attri2Vec part of the model, with a 128-dimensional hidden layer, no bias term and no normalization. (Normalization can be set to 'l2'). " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "layer_sizes = [128]\n", + "attri2vec = Attri2Vec(\n", + " layer_sizes=layer_sizes, generator=train_gen, bias=False, normalize=None\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Logging before flag parsing goes to stderr.\n", + "W0925 10:19:03.073729 140736388883392 deprecation.py:506] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "W0925 10:19:03.096633 140736388883392 deprecation.py:506] From 
/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n" + ] + } + ], + "source": [ + "# Build the model and expose input and output sockets of attri2vec, for node pair inputs:\n", + "x_inp, x_out = attri2vec.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the link_classification function to generate the prediction, with the 'ip' edge embedding generation method and the 'sigmoid' activation, which actually performs the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node followed by a sigmoid activation. " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "link_classification: using 'ip' method to combine node embeddings into edge embeddings\n" + ] + } + ], + "source": [ + "prediction = link_classification(\n", + " output_dim=1, output_act=\"sigmoid\", edge_embedding_method='ip'\n", + ")(x_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Stack the Attri2Vec encoder and prediction layer into a Keras model, and specify the loss." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "model = keras.Model(inputs=x_inp, outputs=prediction)\n", + "\n", + "model.compile(\n", + " optimizer=keras.optimizers.Adam(lr=1e-3),\n", + " loss=keras.losses.binary_crossentropy,\n", + " metrics=[keras.metrics.binary_accuracy],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/4\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0925 10:19:03.339447 140736388883392 deprecation.py:323] From /anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1688/1688 [==============================] - 12s 7ms/step - loss: 0.6820 - binary_accuracy: 0.5580\n", + "Epoch 2/4\n", + "1688/1688 [==============================] - 11s 7ms/step - loss: 0.5126 - binary_accuracy: 0.7584\n", + "Epoch 3/4\n", + "1688/1688 [==============================] - 11s 6ms/step - loss: 0.3356 - binary_accuracy: 0.8780\n", + "Epoch 4/4\n", + "1688/1688 [==============================] - 11s 7ms/step - loss: 0.2456 - binary_accuracy: 0.9161\n" + ] + } + ], + "source": [ + "history = model.fit_generator(\n", + " train_gen,\n", + " epochs=epochs,\n", + " verbose=1,\n", + " use_multiprocessing=False,\n", + " workers=1,\n", + " shuffle=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualise Node Embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build the node based model for predicting node representations from node content attributes with the learned parameters. Below a Keras model is constructed, with x_inp[0] as input and x_out[0] as output. Note that this model's weights are the same as those of the corresponding node encoder in the previously trained node pair classifier." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "x_inp_src = x_inp[0]\n", + "x_out_src = x_out[0]\n", + "embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the node embeddings by applying the learned mapping function to node content features." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "43/43 [==============================] - 0s 2ms/step\n" + ] + } + ], + "source": [ + "node_gen = Attri2VecNodeGenerator(G, batch_size).flow(node_ids)\n", + "node_embeddings = embedding_model.predict_generator(node_gen, workers=1, verbose=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get node subjects." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "node_targets = [ node_attr[\"subject\"][node_id] for node_id in node_ids ]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform the embeddings to 2d space for visualisation." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "transform = TSNE # PCA\n", + "\n", + "trans = transform(n_components=2)\n", + "node_embeddings_2d = trans.fit_transform(node_embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# draw the embedding points, coloring them by the target label (paper subject)\n", + "alpha = 0.7\n", + "label_map = { l: i for i, l in enumerate(np.unique(node_targets)) }\n", + "node_colours = [ label_map[target] for target in node_targets ]\n", + "\n", + "plt.figure(figsize=(7,7))\n", + "plt.axes().set(aspect=\"equal\")\n", + "plt.scatter(node_embeddings_2d[:,0], \n", + " node_embeddings_2d[:,1], \n", + " c=node_colours, cmap=\"jet\", alpha=alpha)\n", + "plt.title('{} visualization of node embeddings'.format(transform.__name__))\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 67c3c452e89e1f7efbff4d254f12bdbd2c1da846 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 11:00:48 +1000 Subject: [PATCH 28/82] Add demos/node-classification/attri2vec/README.md file --- demos/node-classification/attri2vec/README.md | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 demos/node-classification/attri2vec/README.md diff --git a/demos/node-classification/attri2vec/README.md b/demos/node-classification/attri2vec/README.md new file mode 100644 index 000000000..38bab7ac9 --- /dev/null +++ b/demos/node-classification/attri2vec/README.md @@ -0,0 +1,63 @@ +## Node classification using attri2vec [1] + +This folder contains two [Jupyter](http://jupyter.org/) python notebooks demonstrating the combined use of +`stellargraph` (this library), `Scikit-learn` [3] libraries for node classification in two homogeneous graphs +attached 
with node attributes. + +The first example demonstrates node representation learning and node classification using the citeseer +paper citation network. This demo is included in the Jupyter notebook +`attri2vec-citeseer-node-classification-example.ipynb`. + +The second example demonstrates node representation learning and node classification using the DBLP +paper citation network. This demo is included in the Jupyter notebook +`attri2vec-DBLP-node-classification-example.ipynb`. + +The two Jupyter notebooks include all the information for downloading the corresponding datasets, training the attri2vec +models and using them to classify nodes with unknown (to the training algorithm) labels. + +To run the notebooks, install Jupyter to the same Python 3.6 environment as StellarGraph, following the instructions on +the Jupyter project website: http://jupyter.org/install.html + +After starting the Jupyter server on your computer, load either of the two notebooks and follow the instructions inside. + +## Requirements + +All examples use Python 3.6 and the StellarGraph library. To install the StellarGraph library +follow the instructions at: https://github.com/stellargraph/stellargraph + +Additional requirements are Pandas, Numpy and Scikit-Learn which are installed as dependencies +of the StellarGraph library. In addition Jupyter is required to run the notebook version of +the example. + +## Dataset + +The examples in this directory use the citeseer and DBLP datasets. + +The citeseer dataset can be downloaded from [here](https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz). + +The following is the description of the dataset: +> The CiteSeer dataset consists of 3312 scientific publications classified into one of six classes. +> The citation network consists of 4732 links. Each publication in the dataset is described by a +> 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. 
+> The dictionary consists of 3703 unique words. The README file in the dataset provides more details. + +Download and unzip the [citeseer.tgz](https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz) file to a location on your +computer and pass this location as a command line argument to this script. + +The DBLP dataset can be downloaded from [here](https://www.kaggle.com/daozhang/dblp-subgraph). + +The following is the description of the dataset: +> The DBLP citation network is a subgraph extracted from DBLP-Citation-network V3 (https://aminer.org/citation). +> To form this subgraph, papers from four subjects are extracted according to their venue information: +> Database, Data Mining, Artificial Intelligence and Computer Vision, and papers with no citations are removed. +> The DBLP network contains 18,448 papers and 45,661 citation relations. From paper titles, we construct +> 2,476-dimensional binary node feature vectors, with each element indicating the presence/absence of the corresponding word. +> By ignoring the citation direction, we take the DBLP subgraph as an undirected network. + +Download and unzip the [DBLP.zip](https://www.kaggle.com/daozhang/dblp-subgraph) file to a location on your computer +and pass this location as a command line argument to this script. + +## References + +**1.** Attributed Network Embedding via Subspace Discovery. D. Zhang, J. Yin, X. Zhu and C. Zhang, arXiv:1901.04095, +[cs.SI], 2019. 
([link](https://arxiv.org/abs/1901.04095)) From 8fb54c1141dfa4d3cbe9182f39471ceb05d33db4 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 11:01:25 +1000 Subject: [PATCH 29/82] add attri2vec description to demos/node-classification/README.md file --- demos/node-classification/README.md | 33 +++++++++++++++++------------ 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/demos/node-classification/README.md b/demos/node-classification/README.md index 9dbfd31c6..f3d42bcc1 100644 --- a/demos/node-classification/README.md +++ b/demos/node-classification/README.md @@ -6,17 +6,22 @@ examples demonstrate using the `StellarGraph` library to build machine learning workflows on both homogeneous and heterogeneous networks. Each folder contains one or more examples of using the StellarGraph implementations of the -state-of-the-art algorithms, GraphSAGE [3], HinSAGE, GCN [5], GAT [6], SGC [8], Node2Vec [1], and Metapath2Vec [2]. -GraphSAGE, HinSAGE, and GAT are variants of Graph Convolutional Neural networks [5]. Node2Vec and +state-of-the-art algorithms, attri2vec [4], GraphSAGE [3], HinSAGE, GCN [6], GAT [7], SGC [9], Node2Vec [1], and Metapath2Vec [2]. +GraphSAGE, HinSAGE, and GAT are variants of Graph Convolutional Neural networks [6]. Node2Vec and Metapath2Vec are methods based on graph random walks and representation learning using the -Word2Vec [4] algorithm. +Word2Vec [5] algorithm. attri2vec [4] is also based on graph random walks and learns node +representations by performing a mapping on node attributes. The examples folder structure is shown below. +* [`/attri2vec`](https://github.com/stellargraph/stellargraph/tree/master/demos/node-classification/attri2vec) + + Examples of supervised node classification for two homogeneous networks with attributes, using the attri2vec algorithm [4]. 
+ * [`/graphsage`](https://github.com/stellargraph/stellargraph/tree/master/demos/node-classification/graphsage) Example of supervised node classification for a homogeneous network with attributed nodes, using the GraphSAGE algorithm [3]. - + * [`/gcn`](https://github.com/stellargraph/stellargraph/tree/master/demos/node-classification/gcn) Example of semi-supervised node classification for a homogeneous network, using the GCN algorithm [5]. @@ -42,28 +47,30 @@ The examples folder structure is shown below. ## References -1. Node2Vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference on +1. Node2Vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2016. ([link](https://snap.stanford.edu/node2vec/)) -2. Metapath2Vec: Scalable Representation Learning for Heterogeneous Networks. Yuxiao Dong, Nitesh V. Chawla, and +2. Metapath2Vec: Scalable Representation Learning for Heterogeneous Networks. Yuxiao Dong, Nitesh V. Chawla, and Ananthram Swami. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 135–144, 2017 ([link](https://ericdongyx.github.io/metapath2vec/m2v.html)) -3. Inductive Representation Learning on Large Graphs. W.L. Hamilton, R. Ying, and J. Leskovec arXiv:1706.02216 +3. Inductive Representation Learning on Large Graphs. W.L. Hamilton, R. Ying, and J. Leskovec arXiv:1706.02216 [cs.SI], 2017. ([link](http://snap.stanford.edu/graphsage/)) -4. Distributed representations of words and phrases and their compositionality. T. Mikolov, +4. Attributed Network Embedding via Subspace Discovery. D. Zhang, J. Yin, X. Zhu and C. Zhang, arXiv:1901.04095, +[cs.SI], 2019. ([link](https://arxiv.org/abs/1901.04095)) + +5. Distributed representations of words and phrases and their compositionality. T. Mikolov, I. Sutskever, K. Chen, G. S. Corrado, and J. Dean. 
In Advances in Neural Information Processing Systems (NIPS), pp. 3111-3119, 2013. ([link](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)) -5. Semi-Supervised Classification with Graph Convolutional Networks. T. Kipf, M. Welling. +6. Semi-Supervised Classification with Graph Convolutional Networks. T. Kipf, M. Welling. ICLR 2017. arXiv:1609.02907 ([link](https://arxiv.org/abs/1609.02907)) -6. Graph Attention Networks. P. Velickovic et al. ICLR 2018 ([link](https://arxiv.org/abs/1710.10903)) +7. Graph Attention Networks. P. Velickovic et al. ICLR 2018 ([link](https://arxiv.org/abs/1710.10903)) -7. On Calibration of Modern Neural Networks. C. Guo, G. Pleiss, Y. Sun, and K. Q. Weinberger. +8. On Calibration of Modern Neural Networks. C. Guo, G. Pleiss, Y. Sun, and K. Q. Weinberger. ICML 2017. ([link](https://geoffpleiss.com/nn_calibration)) -8. Simplifying Graph Convolutional Networks. F. Wu, T. Zhang, A. H. de Souza, C. Fifty, T. Yu, and K. Q. Weinberger. +9. Simplifying Graph Convolutional Networks. F. Wu, T. Zhang, A. H. de Souza, C. Fifty, T. Yu, and K. Q. Weinberger. arXiv:1902.07153. 
([link](https://arxiv.org/abs/1902.07153)) - From 20046ec788389744c96e1be49fde47df7dfbd944 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 11:01:54 +1000 Subject: [PATCH 30/82] add demos/link-prediction/attri2vec/README.md file --- demos/link-prediction/attri2vec/README.md | 51 +++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 demos/link-prediction/attri2vec/README.md diff --git a/demos/link-prediction/attri2vec/README.md b/demos/link-prediction/attri2vec/README.md new file mode 100644 index 000000000..1fd1b57ba --- /dev/null +++ b/demos/link-prediction/attri2vec/README.md @@ -0,0 +1,51 @@ +# attri2vec Link Prediction for Out-of-sample Nodes + +This is an example of using the attri2vec [1] model, with a link classifier on top, +to predict links for out-of-sample nodes in a homogeneous citation network. + +In this demo, we first train the attri2vec model on the in-sample subgraph and infer +representations for out-of-sample nodes with the trained attri2vec model. Then we use the +obtained node representations to perform link prediction for out-of-sample nodes. + +The link prediction problem is treated as a supervised binary classification problem for +`(src, dst)` node pairs that make up links in the graph, with positive examples +representing links that do exist in the graph, and negative examples representing +links that don't. + +In this example, we learn to predict citation links between papers in a DBLP dataset (see below). + +## Requirements +This example assumes the `stellargraph` library and its requirements have been +installed by following the installation instructions in the README +of the library's [root directory](https://github.com/stellargraph/stellargraph). + +## DBLP dataset + +This example is tested on the DBLP dataset. The attri2vec model assumes that node +features are available. 
+ +The following is the description of the dataset: +> The DBLP citation network is a subgraph extracted from DBLP-Citation-network V3 (https://aminer.org/citation). +> To form this subgraph, papers from four subjects are extracted according to their venue information: +> Database, Data Mining, Artificial Intelligence and Computer Vision, and papers with no citations are removed. +> The DBLP network contains 18,448 papers and 45,661 citation relations. From paper titles, we construct +> 2,476-dimensional binary node feature vectors, with each element indicating the presence/absence of the corresponding word. +> By ignoring the citation direction, we take the DBLP subgraph as an undirected network. + +Download and unzip the [DBLP.zip](https://www.kaggle.com/daozhang/dblp-subgraph) file to a location on your computer +and pass this location as a command line argument to this script. + +## Running the notebook +The narrated version of this example is available in the `stellargraph-attri2vec-DBLP.ipynb` notebook. +To run the notebook: + - Activate the python 3.6 environment in which the +`stellargraph` library is installed + - Start `jupyter-notebook` + - note: you may need to first install `jupyter` by running `pip install jupyter` in your python environment + - Navigate to the notebook (`/demos/link-prediction/attri2vec/stellargraph-attri2vec-DBLP.ipynb`), and click on + it to launch the notebook. + +## References + +[1] Attributed Network Embedding via Subspace Discovery. D. Zhang, J. Yin, X. Zhu and C. Zhang, arXiv:1901.04095, +[cs.SI], 2019.
([link](https://arxiv.org/abs/1901.04095)) From 4121e333c56e7c960fc6d319f67bf30d3965e626 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 11:02:23 +1000 Subject: [PATCH 31/82] add attri2vec description to demos/link-prediction/README.md file --- demos/link-prediction/README.md | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/demos/link-prediction/README.md b/demos/link-prediction/README.md index b639e6515..77a98fb58 100644 --- a/demos/link-prediction/README.md +++ b/demos/link-prediction/README.md @@ -6,10 +6,11 @@ examples demonstrate using the `StellarGraph` library to build machine learning workflows on both homogeneous and heterogeneous networks. Each folder contains one or more examples of using the StellarGraph implementations of the -state-of-the-art algorithms, GraphSAGE [3], HinSAGE, GCN [5], GAT [6], Node2Vec [1], and Metapath2Vec [2]. -GraphSAGE, HinSAGE, and GAT are variants of Graph Convolutional Neural networks [5]. Node2Vec and +state-of-the-art algorithms, attri2vec [4], GraphSAGE [3], HinSAGE, GCN [6], GAT [7], Node2Vec [1], and Metapath2Vec [2]. +GraphSAGE, HinSAGE, and GAT are variants of Graph Convolutional Neural networks [6]. Node2Vec and Metapath2Vec are methods based on graph random walks and representation learning using the -Word2Vec [4] algorithm. +Word2Vec [5] algorithm. attri2vec [4] is also based on graph random walks and learns node +representations by performing a mapping on node attributes. The examples folder structure is shown below. @@ -28,27 +29,34 @@ The examples folder structure is shown below. Example of supervised link attribute prediction for a heterogeneous network with attributed nodes of different types, using the HinSAGE algorithm.
+* [`/attri2vec`](https://github.com/stellargraph/stellargraph/tree/master/demos/link-prediction/attri2vec) + + Example of link prediction for out-of-sample nodes for a homogeneous network with attributed nodes, + using the attri2vec algorithm. ## References -1. Node2Vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference on +1. Node2Vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2016. ([link](https://snap.stanford.edu/node2vec/)) -2. Metapath2Vec: Scalable Representation Learning for Heterogeneous Networks. Yuxiao Dong, Nitesh V. Chawla, and +2. Metapath2Vec: Scalable Representation Learning for Heterogeneous Networks. Yuxiao Dong, Nitesh V. Chawla, and Ananthram Swami. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 135–144, 2017 ([link](https://ericdongyx.github.io/metapath2vec/m2v.html)) -3. Inductive Representation Learning on Large Graphs. W.L. Hamilton, R. Ying, and J. Leskovec arXiv:1706.02216 +3. Inductive Representation Learning on Large Graphs. W.L. Hamilton, R. Ying, and J. Leskovec arXiv:1706.02216 [cs.SI], 2017. ([link](http://snap.stanford.edu/graphsage/)) -4. Distributed representations of words and phrases and their compositionality. T. Mikolov, +4. Attributed Network Embedding via Subspace Discovery. D. Zhang, J. Yin, X. Zhu and C. Zhang, arXiv:1901.04095, +[cs.SI], 2019. ([link](https://arxiv.org/abs/1901.04095)) + +5. Distributed representations of words and phrases and their compositionality. T. Mikolov, I. Sutskever, K. Chen, G. S. Corrado, and J. Dean. In Advances in Neural Information Processing Systems (NIPS), pp. 3111-3119, 2013. ([link](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)) -5. Semi-Supervised Classification with Graph Convolutional Networks. T. Kipf, M. Welling. +6.
Semi-Supervised Classification with Graph Convolutional Networks. T. Kipf, M. Welling. ICLR 2017. arXiv:1609.02907 ([link](https://arxiv.org/abs/1609.02907)) -6. Graph Attention Networks. P. Velickovic et al. ICLR 2018 ([link](https://arxiv.org/abs/1710.10903)) +7. Graph Attention Networks. P. Velickovic et al. ICLR 2018 ([link](https://arxiv.org/abs/1710.10903)) -7. On Calibration of Modern Neural Networks. C. Guo, G. Pleiss, Y. Sun, and K. Q. Weinberger. +8. On Calibration of Modern Neural Networks. C. Guo, G. Pleiss, Y. Sun, and K. Q. Weinberger. ICML 2017. ([link](https://geoffpleiss.com/nn_calibration)) From 7b65cab5026f850f9c6f90f07792d856e329e48b Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 11:02:53 +1000 Subject: [PATCH 32/82] add attri2vec description to demos/embeddings/README.md file --- demos/embeddings/README.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md index 18f4aa190..1a7ce176d 100644 --- a/demos/embeddings/README.md +++ b/demos/embeddings/README.md @@ -1,8 +1,8 @@ ## Representation Learning Examples -This folder contains three [Jupyter](http://jupyter.org/) python notebooks demonstrating the use of unsupervised graph representation learning methods implemented in the `stellargraph` library for homogeneous and hetrogenous graphs with or without node features. The original works are referenced below. +This folder contains three [Jupyter](http://jupyter.org/) python notebooks demonstrating the use of unsupervised graph representation learning methods implemented in the `stellargraph` library for homogeneous and heterogeneous graphs with or without node features. The original works are referenced below. -**Node2Vec** and **Metapath2Vec** notebooks demonstrate the combined use of `stellargraph` and `Gensim` [4] libraries for representation learning on homogeneous and heterogeneous graphs.
+**Node2Vec** and **Metapath2Vec** notebooks demonstrate the combined use of `stellargraph` and `Gensim` [4] libraries for representation learning on homogeneous and heterogeneous graphs. **Unsupervised GraphSAGE** notebook demonstrates the use of `Stellargraph` library's GraphSAGE implementation for unsupervised learning of node embeddings for homogeneous graphs with node features. **attri2vec** notebook demonstrates the implementation of attri2vec with the `Stellargraph` library for unsupervised inductive learning of node embeddings for homogeneous graphs with node features, and the evaluation for its ability to infer the representations of out-of-sample nodes with the out-of-sample node link prediction task. @@ -10,24 +10,25 @@ The notebooks demonstrate the following algorithms. - `stellargraph-node2vec.ipynb` The **Node2Vec** algorithm [1] for representation learning on homogeneous graphs - `stellargraph-metapath2vec.ipynb` The **Metapath2Vec** algorithm [2] for representation learning on heterogeneous graphs. - `embeddings-unsupervised-graphsage-cora.ipynb` The **Unsupervised GraphSAGE** algorithm [5] for representation learning on homogeneous graphs with node features. -- `stellargraph-attri2vec-DBLP.ipynb` The **attri2vec** algorithm [6] for representation learning on homogeneous graphs with node features. +- `stellargraph-attri2vec-citeseer.ipynb` The **attri2vec** algorithm [6] for representation learning on the homogeneous graph citeseer with node features. +- `stellargraph-attri2vec-DBLP.ipynb` The **attri2vec** algorithm [6] for representation learning on the homogeneous graph DBLP with node features. -All examples demonstrate how to calculate embedding vectors for a graph's nodes in just a few lines of Python code. +All examples demonstrate how to calculate embedding vectors for a graph's nodes in just a few lines of Python code. 
The learned node representations can be used in numerous downstream tasks such as node attribute inference, link prediction, and community detection. ## References -**1.** Node2Vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference +**1.** Node2Vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2016. ([link](https://snap.stanford.edu/node2vec/)) -**2.** Metapath2Vec: Scalable Representation Learning for Heterogeneous Networks. Yuxiao Dong, Nitesh V. Chawla, and -Ananthram Swami. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 135–144, 2017. +**2.** Metapath2Vec: Scalable Representation Learning for Heterogeneous Networks. Yuxiao Dong, Nitesh V. Chawla, and +Ananthram Swami. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 135–144, 2017. ([link](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf)) -**3.** Distributed representations of words and phrases and their compositionality. T. Mikolov, I. Sutskever, K. Chen, -G. S. Corrado, and J. Dean. In Advances in Neural Information Processing Systems (NIPS), pp. 3111-3119, 2013. +**3.** Distributed representations of words and phrases and their compositionality. T. Mikolov, I. Sutskever, K. Chen, +G. S. Corrado, and J. Dean. In Advances in Neural Information Processing Systems (NIPS), pp. 3111-3119, 2013. ([link](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)) **4.** Gensim: Topic modelling for humans. ([link](https://radimrehurek.com/gensim/)) @@ -35,4 +36,5 @@ G. S. Corrado, and J. Dean. In Advances in Neural Information Processing System **5.** Inductive Representation Learning on Large Graphs. W.L. Hamilton, R. Ying, and J. Leskovec arXiv:1706.02216 [cs.SI], 2017. 
([link](http://snap.stanford.edu/graphsage/)) -**6.** Attributed Network Embedding via Subspace Discovery. D. Zhang, Y. Jie, X. Zhu and C. Zhang, arXiv:1901.04095, [cs.SI], 2019. ([link](https://arxiv.org/abs/1901.04095)) +**6.** Attributed Network Embedding via Subspace Discovery. D. Zhang, J. Yin, X. Zhu and C. Zhang, arXiv:1901.04095, +[cs.SI], 2019. ([link](https://arxiv.org/abs/1901.04095)) From c68a62d4cd044066494d012247e23e8a534eb4e9 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 11:04:06 +1000 Subject: [PATCH 33/82] add attri2vec description to demos/README.md file --- demos/README.md | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/demos/README.md b/demos/README.md index 970549801..12c117cd5 100644 --- a/demos/README.md +++ b/demos/README.md @@ -7,10 +7,10 @@ examples demonstrate using the `StellarGraph` library to build machine learning workflows on both homogeneous and heterogeneous networks. Each folder contains one or more examples of using the StellarGraph implementations of the -state-of-the-art algorithms, GraphSAGE [3], HinSAGE, GCN [5], GAT [6], Node2Vec [1], and Metapath2Vec [2]. -GraphSAGE, HinSAGE, and GAT are variants of Graph Convolutional Neural networks [5]. Node2Vec and +state-of-the-art algorithms, attri2vec [4], GraphSAGE [3], HinSAGE, GCN [6], GAT [7], Node2Vec [1], and Metapath2Vec [2]. +GraphSAGE, HinSAGE, and GAT are variants of Graph Convolutional Neural networks [6]. Node2Vec and Metapath2Vec are methods based on graph random walks and representation learning using the -Word2Vec [4] algorithm. +Word2Vec [5] algorithm. The examples folder structure is shown below. @@ -30,8 +30,8 @@ The examples folder structure is shown below. * [`/ensembles`](https://github.com/stellargraph/stellargraph/tree/master/demos/ensembles) Examples of using ensembles of graph convolutional neural networks, e.g., GraphSAGE, GCN, HinSAGE, etc., for - node classification and link prediction.
Model ensembles usually yield better predictions than single models, - while also providing estimates of prediction uncertainty as a bonus. + node classification and link prediction. Model ensembles usually yield better predictions than single models, + while also providing estimates of prediction uncertainty as a bonus. * [`/calibration`](https://github.com/stellargraph/stellargraph/tree/master/demos/calibration) @@ -49,24 +49,27 @@ The examples folder structure is shown below. ## References -1. Node2Vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference on +1. Node2Vec: Scalable Feature Learning for Networks. A. Grover, J. Leskovec. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2016. ([link](https://snap.stanford.edu/node2vec/)) -2. Metapath2Vec: Scalable Representation Learning for Heterogeneous Networks. Yuxiao Dong, Nitesh V. Chawla, and +2. Metapath2Vec: Scalable Representation Learning for Heterogeneous Networks. Yuxiao Dong, Nitesh V. Chawla, and Ananthram Swami. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 135–144, 2017 ([link](https://ericdongyx.github.io/metapath2vec/m2v.html)) -3. Inductive Representation Learning on Large Graphs. W.L. Hamilton, R. Ying, and J. Leskovec arXiv:1706.02216 +3. Inductive Representation Learning on Large Graphs. W.L. Hamilton, R. Ying, and J. Leskovec arXiv:1706.02216 [cs.SI], 2017. ([link](http://snap.stanford.edu/graphsage/)) -4. Distributed representations of words and phrases and their compositionality. T. Mikolov, +4. Attributed Network Embedding via Subspace Discovery. D. Zhang, J. Yin, X. Zhu and C. Zhang, arXiv:1901.04095, +[cs.SI], 2019. ([link](https://arxiv.org/abs/1901.04095)) + +5. Distributed representations of words and phrases and their compositionality. T. Mikolov, I. Sutskever, K. Chen, G. S. Corrado, and J. Dean. In Advances in Neural Information Processing Systems (NIPS), pp.
3111-3119, 2013. ([link](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)) -5. Semi-Supervised Classification with Graph Convolutional Networks. T. Kipf, M. Welling. +6. Semi-Supervised Classification with Graph Convolutional Networks. T. Kipf, M. Welling. ICLR 2017. arXiv:1609.02907 ([link](https://arxiv.org/abs/1609.02907)) -6. Graph Attention Networks. P. Velickovic et al. ICLR 2018 ([link](https://arxiv.org/abs/1710.10903)) +7. Graph Attention Networks. P. Velickovic et al. ICLR 2018 ([link](https://arxiv.org/abs/1710.10903)) -7. On Calibration of Modern Neural Networks. C. Guo, G. Pleiss, Y. Sun, and K. Q. Weinberger. +8. On Calibration of Modern Neural Networks. C. Guo, G. Pleiss, Y. Sun, and K. Q. Weinberger. ICML 2017. ([link](https://geoffpleiss.com/nn_calibration)) From a00b4cd3c42fe9753879bdf47526c2ac1ff22419 Mon Sep 17 00:00:00 2001 From: Daokun Zhang Date: Wed, 25 Sep 2019 11:04:27 +1000 Subject: [PATCH 34/82] add attri2vec description to README.md file --- README.md | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 63c55800c..18bf87413 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ The StellarGraph library can be used to solve tasks using graph-structured data, - Representation learning for nodes and edges, to be used for visualisation and various downstream machine learning tasks; - Classification and attribute inference of nodes or edges; - Link prediction; -- Interpretation of node classification through calculated importances of edges and neighbours for selected nodes [7]. +- Interpretation of node classification through calculated importances of edges and neighbours for selected nodes [8]. We provide [examples](https://github.com/stellargraph/stellargraph/tree/master/demos/) of using `StellarGraph` to solve such tasks using several real-world datasets. 
@@ -130,7 +130,7 @@ can be downloaded and installed from [python.org](https://python.org/). Alternat environment, available from [anaconda.com](https://www.anaconda.com/download/). *Note*: while the library works on Python 3.7 it is based on Keras which does not officially support Python 3.7. -Therefore, there may be unforseen bugs and you there are many warnings from the Python libraries that +Therefore, there may be unforeseen bugs, and there are many warnings from the Python libraries that StellarGraph depends upon.