From 56e7bdfac232695eac3073ebed38b01d44684a35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?= Date: Sat, 18 Apr 2026 22:15:53 +0800 Subject: [PATCH 1/8] Update CI TF matrix and add example smoke coverage --- .github/workflows/ci.yml | 136 +++++++++++++++++++---------------- README.md | 29 ++++++-- examples/alias.py | 42 ++++++----- examples/deepwalk_wiki.py | 86 +++++++++++++--------- examples/line_wiki.py | 81 ++++++++++++--------- examples/node2vec_flight.py | 124 +++++++++++++++----------------- examples/node2vec_wiki.py | 93 +++++++++++++++--------- examples/sdne_wiki.py | 86 +++++++++++++--------- examples/struc2vec_flight.py | 133 +++++++++++++++++----------------- ge/__init__.py | 9 ++- ge/models/__init__.py | 17 ++++- ge/models/line.py | 41 +++++++---- ge/models/sdne.py | 65 ++++++++++------- setup.py | 51 +++++-------- tests/deepwalk_test.py | 21 ++++-- tests/examples_test.py | 47 ++++++++++++ tests/line_test.py | 20 ++++-- tests/node2vec_test.py | 34 ++++++--- tests/sdne_test.py | 23 ++++-- tests/struct2vec_test.py | 33 +++++++-- 20 files changed, 712 insertions(+), 459 deletions(-) create mode 100644 tests/examples_test.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cc92791..fd004c0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,74 +1,84 @@ name: CI -on: +on: push: - path: - - 'ge/*' - - 'tests/*' + paths: + - "ge/**" + - "tests/**" + - "examples/**" + - ".github/workflows/**" + - "setup.py" + - "README.md" pull_request: - path: - - 'ge/*' - - 'tests/*' - + paths: + - "ge/**" + - "tests/**" + - "examples/**" + - ".github/workflows/**" + - "setup.py" + - "README.md" + jobs: build: - - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 timeout-minutes: 180 strategy: + fail-fast: false matrix: - python-version: [3.6,3.7,3.8] - tf-version: [1.4.0,1.15.0,2.5.0,2.6.0,2.7.0,2.8.0,2.9.0] - - exclude: - - python-version: 3.7 - tf-version: 1.4.0 - - python-version: 3.7 - tf-version: 1.15.0 - - python-version: 3.8 - tf-version: 1.4.0 - - python-version: 3.8 - tf-version: 1.14.0 - - python-version: 3.8 - tf-version: 1.15.0 - - python-version: 3.6 - tf-version: 2.7.0 - - python-version: 3.6 - tf-version: 2.8.0 - - python-version: 3.6 - tf-version: 2.9.0 - - python-version: 3.9 - tf-version: 1.4.0 - - python-version: 3.9 - tf-version: 1.15.0 - - python-version: 3.9 - tf-version: 2.2.0 + include: + - python-version: "3.7" + tf-version: "1.15.5" + use-legacy-keras: "0" + - python-version: "3.10" + tf-version: "2.10.0" + use-legacy-keras: "0" + - python-version: "3.10" + tf-version: "2.15.0" + use-legacy-keras: "0" + - python-version: "3.11" + tf-version: "2.15.0" + use-legacy-keras: "0" + - python-version: "3.10" + tf-version: "2.20.0" + use-legacy-keras: "1" + - python-version: "3.11" + tf-version: "2.20.0" + use-legacy-keras: "1" + - python-version: "3.12" + tf-version: "2.20.0" + use-legacy-keras: "0" + steps: - - - uses: actions/checkout@v3 - - - name: Setup python environment - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} + - uses: actions/checkout@v4 + + - name: Setup Python environment + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -q "tensorflow==${{ matrix.tf-version }}" + if [[ "${{ matrix.tf-version }}" == 1.* ]]; then + python -m pip install -q "protobuf==3.20.3" + fi + if [[ "${{ matrix.use-legacy-keras }}" == "1" ]]; then + python -m pip install -q "tf-keras~=2.20" + fi + python -m pip install -e ".[test]" + + - name: Test with pytest + timeout-minutes: 180 + env: + TF_USE_LEGACY_KERAS: ${{ matrix.use-legacy-keras }} + run: | + pytest --cov=ge --cov=examples --cov-report=xml - - name: Install dependencies - run: | - pip3 install -q tensorflow==${{ matrix.tf-version }} - pip install -q protobuf==3.19.0 - pip install -q requests - pip install -e . - - name: Test with pytest - timeout-minutes: 180 - run: | - pip install -q pytest - pip install -q pytest-cov - pip install -q python-coveralls - pytest --cov=ge --cov-report=xml - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3.1.0 - with: - token: ${{secrets.CODECOV_TOKEN}} - file: ./coverage.xml - flags: pytest - name: py${{ matrix.python-version }}-tf${{ matrix.tf-version }} + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ./coverage.xml + flags: pytest + name: py${{ matrix.python-version }}-tf${{ matrix.tf-version }} diff --git a/README.md b/README.md index 2a17812..d0dca71 100644 --- a/README.md +++ b/README.md @@ -21,15 +21,34 @@ | Struc2Vec | [KDD 2017][struc2vec: Learning Node Representations from Structural Identity](https://arxiv.org/pdf/1704.03165.pdf) | [【Graph Embedding】Struc2Vec:算法原理,实现和应用](https://zhuanlan.zhihu.com/p/56733145) | +# CI Compatibility Matrix + +The CI matrix currently validates the following Python/TensorFlow combinations: + +| Python | TensorFlow | `TF_USE_LEGACY_KERAS` | +| :----: | :--------: | :-------------------: | +| 3.7 | 1.15.5 | 0 | +| 3.10 | 2.10.0 | 0 | +| 3.10 | 2.15.0 | 0 | +| 3.11 | 2.15.0 | 0 | +| 3.10 | 2.20.0 | 1 | +| 3.11 | 2.20.0 | 1 | +| 3.12 | 2.20.0 | 0 | + +For TensorFlow 2.16+ jobs that need legacy Keras behavior, CI installs `tf-keras` and sets `TF_USE_LEGACY_KERAS=1`. + # How to run examples -1. clone the repo and make sure you have installed `tensorflow` or `tensorflow-gpu` on your local machine. -2. run following commands + +1. Clone the repo and install dependencies. +2. Run one example script. + ```bash -python setup.py install -cd examples -python deepwalk_wiki.py +pip install -e .[cpu] +python examples/deepwalk_wiki.py ``` +Each example now exposes `main(smoke=False, show=True)`. CI executes all `examples/*.py` in smoke mode (`smoke=True`, `show=False`) to keep runtime short while still validating the training/import paths. + ## DisscussionGroup & Related Projects diff --git a/examples/alias.py b/examples/alias.py index 8d2324e..e3f2233 100644 --- a/examples/alias.py +++ b/examples/alias.py @@ -4,27 +4,37 @@ from ge.alias import alias_sample, create_alias_table -def gen_prob_dist(N): - p = np.random.randint(0, 100, N) - return p/np.sum(p) +def gen_prob_dist(size): + probabilities = np.random.randint(0, 100, size) + return probabilities / np.sum(probabilities) -def simulate(N=100, k=10000,): +def simulate(size=100, sample_count=10000): + truth = gen_prob_dist(size) + accept, alias = create_alias_table(truth) - truth = gen_prob_dist(N) + sampled = np.zeros(size) + for _ in range(sample_count): + sampled[alias_sample(accept, alias)] += 1 + return sampled / np.sum(sampled), truth - area_ratio = truth - accept, alias = create_alias_table(area_ratio) - ans = np.zeros(N) - for _ in range(k): - i = alias_sample(accept, alias) - ans[i] += 1 - return ans/np.sum(ans), truth +def main(smoke=False, show=True): + size = 20 if smoke else 100 + sample_count = 300 if smoke else 10000 + alias_result, truth = simulate(size=size, sample_count=sample_count) + + assert np.isclose(alias_result.sum(), 1.0) + assert np.isclose(truth.sum(), 1.0) + + if show: + plt.bar(list(range(len(alias_result))), alias_result, label="alias_result") + plt.bar(list(range(len(truth))), truth, label="truth") + plt.legend() + plt.show() + + return alias_result, truth if __name__ == "__main__": - alias_result, truth = simulate() - plt.bar(list(range(len(alias_result))), alias_result, label='alias_result') - plt.bar(list(range(len(truth))), truth, label='truth') - plt.legend() + main() diff --git a/examples/deepwalk_wiki.py b/examples/deepwalk_wiki.py index e81e025..ad84c3a 100644 --- a/examples/deepwalk_wiki.py +++ b/examples/deepwalk_wiki.py @@ -1,53 +1,73 @@ +from pathlib import Path +import matplotlib.pyplot as plt +import networkx as nx import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.manifold import TSNE -from ge.classify import read_node_label, Classifier from ge import DeepWalk -from sklearn.linear_model import LogisticRegression +from ge.classify import Classifier, read_node_label -import matplotlib.pyplot as plt -import networkx as nx -from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" +WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" -def evaluate_embeddings(embeddings): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - tr_frac = 0.8 - print("Training classifier using {:.2f}% nodes...".format( - tr_frac * 100)) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path)) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - clf.split_train_evaluate(X, Y, tr_frac) - + clf.split_train_evaluate(x_data, y_data, train_fraction) -def plot_embeddings(embeddings,): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - emb_list = [] - for k in X: - emb_list.append(embeddings[k]) - emb_list = np.array(emb_list) +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path)) - model = TSNE(n_components=2) - node_pos = model.fit_transform(emb_list) + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} - for i in range(len(X)): - color_idx.setdefault(Y[i][0], []) - color_idx[Y[i][0]].append(i) + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for c, idx in color_idx.items(): - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() - plt.show() + if show: + plt.show() + else: + plt.close() -if __name__ == "__main__": - G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else WIKI_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = DeepWalk(G, walk_length=10, num_walks=80, workers=1) - model.train(window_size=5, iter=3) + model = DeepWalk( + graph, + walk_length=3 if smoke else 10, + num_walks=2 if smoke else 80, + workers=1, + ) + model.train(window_size=2 if smoke else 5, iter=1 if smoke else 3, workers=1) embeddings = model.get_embeddings() + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, WIKI_LABEL_PATH) + plot_embeddings(embeddings, WIKI_LABEL_PATH, show=show) - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) + return embeddings + + +if __name__ == "__main__": + main() diff --git a/examples/line_wiki.py b/examples/line_wiki.py index 5771d99..b31ace1 100644 --- a/examples/line_wiki.py +++ b/examples/line_wiki.py @@ -1,53 +1,68 @@ +from pathlib import Path +import matplotlib.pyplot as plt +import networkx as nx import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.manifold import TSNE -from ge.classify import read_node_label, Classifier from ge import LINE -from sklearn.linear_model import LogisticRegression +from ge.classify import Classifier, read_node_label -import matplotlib.pyplot as plt -import networkx as nx -from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" +WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" -def evaluate_embeddings(embeddings): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - tr_frac = 0.8 - print("Training classifier using {:.2f}% nodes...".format( - tr_frac * 100)) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path)) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - clf.split_train_evaluate(X, Y, tr_frac) - + clf.split_train_evaluate(x_data, y_data, train_fraction) -def plot_embeddings(embeddings,): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - emb_list = [] - for k in X: - emb_list.append(embeddings[k]) - emb_list = np.array(emb_list) +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path)) - model = TSNE(n_components=2) - node_pos = model.fit_transform(emb_list) + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} - for i in range(len(X)): - color_idx.setdefault(Y[i][0], []) - color_idx[Y[i][0]].append(i) + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for c, idx in color_idx.items(): - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() - plt.show() + if show: + plt.show() + else: + plt.close() -if __name__ == "__main__": - G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else WIKI_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = LINE(G, embedding_size=128, order='second') - model.train(batch_size=1024, epochs=50, verbose=2) + model = LINE(graph, embedding_size=8 if smoke else 128, order="second") + model.train(batch_size=2 if smoke else 1024, epochs=1 if smoke else 50, verbose=0 if smoke else 2) embeddings = model.get_embeddings() + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, WIKI_LABEL_PATH) + plot_embeddings(embeddings, WIKI_LABEL_PATH, show=show) - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) + return embeddings + + +if __name__ == "__main__": + main() diff --git a/examples/node2vec_flight.py b/examples/node2vec_flight.py index a37a880..a2ddaec 100644 --- a/examples/node2vec_flight.py +++ b/examples/node2vec_flight.py @@ -1,88 +1,76 @@ -import numpy as np - - - -from ge.classify import read_node_label,Classifier - -from ge import Node2Vec - -from sklearn.linear_model import LogisticRegression - - +from pathlib import Path import matplotlib.pyplot as plt - import networkx as nx - +import numpy as np +from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +from ge import Node2Vec +from ge.classify import Classifier, read_node_label +PROJECT_ROOT = Path(__file__).resolve().parents[1] +FLIGHT_GRAPH_PATH = PROJECT_ROOT / "data" / "flight" / "brazil-airports.edgelist" +FLIGHT_LABEL_PATH = PROJECT_ROOT / "data" / "flight" / "labels-brazil-airports.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" -def evaluate_embeddings(embeddings): - - X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',skip_head=True) - - tr_frac = 0.8 - - print("Training classifier using {:.2f}% nodes...".format( - - tr_frac * 100)) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path), skip_head=True) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - - clf.split_train_evaluate(X, Y, tr_frac) - - - + clf.split_train_evaluate(x_data, y_data, train_fraction) -def plot_embeddings(embeddings,): - - X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',skip_head=True) - - - - emb_list = [] - - for k in X: - - emb_list.append(embeddings[k]) - - emb_list = np.array(emb_list) - - - - model = TSNE(n_components=2) - - node_pos = model.fit_transform(emb_list) - +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path), skip_head=True) + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for i in range(len(X)): - - color_idx.setdefault(Y[i][0], []) - - color_idx[Y[i][0]].append(i) - - - - for c, idx in color_idx.items(): - - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) # c=node_colors) - + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() + if show: + plt.show() + else: + plt.close() + + +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else FLIGHT_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) + + model = Node2Vec( + graph, + walk_length=3 if smoke else 10, + num_walks=2 if smoke else 80, + workers=1, + p=0.25, + q=2, + use_rejection_sampling=False, + ) + model.train(window_size=2 if smoke else 5, iter=1 if smoke else 3, workers=1) + embeddings = model.get_embeddings() + assert len(embeddings) > 0 - plt.show() + if not smoke: + evaluate_embeddings(embeddings, FLIGHT_LABEL_PATH) + plot_embeddings(embeddings, FLIGHT_LABEL_PATH, show=show) -if __name__ == "__main__": - G = nx.read_edgelist('../data/flight/brazil-airports.edgelist', create_using=nx.DiGraph(), nodetype=None, - data=[('weight', int)]) + return embeddings - model = Node2Vec(G, 10, 80, workers=1, p=0.25, q=2, use_rejection_sampling=0) - model.train() - embeddings = model.get_embeddings() - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) +if __name__ == "__main__": + main() diff --git a/examples/node2vec_wiki.py b/examples/node2vec_wiki.py index 45ea1c0..00f922f 100644 --- a/examples/node2vec_wiki.py +++ b/examples/node2vec_wiki.py @@ -1,53 +1,76 @@ +from pathlib import Path +import matplotlib.pyplot as plt +import networkx as nx import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.manifold import TSNE -from ge.classify import read_node_label, Classifier from ge import Node2Vec -from sklearn.linear_model import LogisticRegression +from ge.classify import Classifier, read_node_label -import matplotlib.pyplot as plt -import networkx as nx -from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" +WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" -def evaluate_embeddings(embeddings): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - tr_frac = 0.8 - print("Training classifier using {:.2f}% nodes...".format( - tr_frac * 100)) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path)) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - clf.split_train_evaluate(X, Y, tr_frac) - + clf.split_train_evaluate(x_data, y_data, train_fraction) -def plot_embeddings(embeddings,): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - emb_list = [] - for k in X: - emb_list.append(embeddings[k]) - emb_list = np.array(emb_list) +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path)) - model = TSNE(n_components=2) - node_pos = model.fit_transform(emb_list) + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} - for i in range(len(X)): - color_idx.setdefault(Y[i][0], []) - color_idx[Y[i][0]].append(i) + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for c, idx in color_idx.items(): - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() - plt.show() + if show: + plt.show() + else: + plt.close() + + +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else WIKI_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) + + model = Node2Vec( + graph, + walk_length=3 if smoke else 10, + num_walks=2 if smoke else 80, + p=0.25, + q=4, + workers=1, + use_rejection_sampling=False, + ) + model.train(window_size=2 if smoke else 5, iter=1 if smoke else 3, workers=1) + embeddings = model.get_embeddings() + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, WIKI_LABEL_PATH) + plot_embeddings(embeddings, WIKI_LABEL_PATH, show=show) + + return embeddings if __name__ == "__main__": - G=nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', - create_using = nx.DiGraph(), nodetype = None, data = [('weight', int)]) - model = Node2Vec(G, walk_length=10, num_walks=80, - p=0.25, q=4, workers=1, use_rejection_sampling=0) - model.train(window_size = 5, iter = 3) - embeddings=model.get_embeddings() - - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) + main() diff --git a/examples/sdne_wiki.py b/examples/sdne_wiki.py index 9cfc467..aa65c55 100644 --- a/examples/sdne_wiki.py +++ b/examples/sdne_wiki.py @@ -1,54 +1,72 @@ +from pathlib import Path +import matplotlib.pyplot as plt +import networkx as nx import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.manifold import TSNE -from ge.classify import read_node_label, Classifier from ge import SDNE -from sklearn.linear_model import LogisticRegression +from ge.classify import Classifier, read_node_label -import matplotlib.pyplot as plt -import networkx as nx -from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" +WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" -def evaluate_embeddings(embeddings): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - tr_frac = 0.8 - print("Training classifier using {:.2f}% nodes...".format( - tr_frac * 100)) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path)) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - clf.split_train_evaluate(X, Y, tr_frac) - + clf.split_train_evaluate(x_data, y_data, train_fraction) -def plot_embeddings(embeddings,): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - emb_list = [] - for k in X: - emb_list.append(embeddings[k]) - emb_list = np.array(emb_list) +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path)) - model = TSNE(n_components=2) - node_pos = model.fit_transform(emb_list) + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} - for i in range(len(X)): - color_idx.setdefault(Y[i][0], []) - color_idx[Y[i][0]].append(i) + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for c, idx in color_idx.items(): - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], - label=c) # c=node_colors) + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() - plt.show() + if show: + plt.show() + else: + plt.close() -if __name__ == "__main__": - G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else WIKI_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = SDNE(G, hidden_size=[256, 128],) - model.train(batch_size=3000, epochs=40, verbose=2) + model = SDNE(graph, hidden_size=[8, 4] if smoke else [256, 128]) + model.train( + batch_size=2 if smoke else 3000, + epochs=1 if smoke else 40, + verbose=0 if smoke else 2, + ) embeddings = model.get_embeddings() + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, WIKI_LABEL_PATH) + plot_embeddings(embeddings, WIKI_LABEL_PATH, show=show) - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) + return embeddings + + +if __name__ == "__main__": + main() diff --git a/examples/struc2vec_flight.py b/examples/struc2vec_flight.py index 8863675..10454cd 100644 --- a/examples/struc2vec_flight.py +++ b/examples/struc2vec_flight.py @@ -1,88 +1,83 @@ -import numpy as np - - - -from ge.classify import read_node_label,Classifier - -from ge import Struc2Vec - -from sklearn.linear_model import LogisticRegression - - +from pathlib import Path +import tempfile import matplotlib.pyplot as plt - import networkx as nx - +import numpy as np +from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +from ge import Struc2Vec +from ge.classify import Classifier, read_node_label +PROJECT_ROOT = Path(__file__).resolve().parents[1] +FLIGHT_GRAPH_PATH = PROJECT_ROOT / "data" / "flight" / "brazil-airports.edgelist" +FLIGHT_LABEL_PATH = PROJECT_ROOT / "data" / "flight" / "labels-brazil-airports.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" -def evaluate_embeddings(embeddings): - - X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',skip_head=True) - - tr_frac = 0.8 - - print("Training classifier using {:.2f}% nodes...".format( - - tr_frac * 100)) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path), skip_head=True) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - - clf.split_train_evaluate(X, Y, tr_frac) - - + clf.split_train_evaluate(x_data, y_data, train_fraction) +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path), skip_head=True) -def plot_embeddings(embeddings,): - - X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',skip_head=True) - - - - emb_list = [] - - for k in X: - - emb_list.append(embeddings[k]) - - emb_list = np.array(emb_list) - - - - model = TSNE(n_components=2) - - node_pos = model.fit_transform(emb_list) - - + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for i in range(len(X)): - - color_idx.setdefault(Y[i][0], []) - - color_idx[Y[i][0]].append(i) - - - - for c, idx in color_idx.items(): - - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) # c=node_colors) - + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() + if show: + plt.show() + else: + plt.close() + + +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else FLIGHT_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) + + with tempfile.TemporaryDirectory(prefix="struc2vec-") as temp_dir: + model = Struc2Vec( + graph, + walk_length=3 if smoke else 10, + num_walks=1 if smoke else 80, + workers=1 if smoke else 4, + verbose=0 if smoke else 40, + temp_path=temp_dir + "/", + ) + model.train( + embed_size=8 if smoke else 128, + window_size=2 if smoke else 5, + workers=1, + iter=1 if smoke else 3, + ) + embeddings = model.get_embeddings() + + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, FLIGHT_LABEL_PATH) + plot_embeddings(embeddings, FLIGHT_LABEL_PATH, show=show) + + return embeddings - plt.show() if __name__ == "__main__": - G = nx.read_edgelist('../data/flight/brazil-airports.edgelist', create_using=nx.DiGraph(), nodetype=None, - data=[('weight', int)]) - - model = Struc2Vec(G, 10, 80, workers=4, verbose=40, ) - model.train() - embeddings = model.get_embeddings() - - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) \ No newline at end of file + main() diff --git a/ge/__init__.py b/ge/__init__.py index cf4f59d..89d2335 100644 --- a/ge/__init__.py +++ b/ge/__init__.py @@ -1 +1,8 @@ -from .models import * \ No newline at end of file +from .alias import alias_sample, create_alias_table + +__all__ = ["alias_sample", "create_alias_table"] + +try: + from .models import * # noqa: F401,F403 +except ImportError: + pass diff --git a/ge/models/__init__.py b/ge/models/__init__.py index d2375e9..c008e9f 100644 --- a/ge/models/__init__.py +++ b/ge/models/__init__.py @@ -1,8 +1,19 @@ from .deepwalk import DeepWalk from .node2vec import Node2Vec -from .line import LINE -from .sdne import SDNE from .struc2vec import Struc2Vec +__all__ = ["DeepWalk", "Node2Vec", "Struc2Vec"] -__all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec"] +try: + from .line import LINE + + __all__.append("LINE") +except ImportError: + LINE = None + +try: + from .sdne import SDNE + + __all__.append("SDNE") +except ImportError: + SDNE = None diff --git a/ge/models/line.py b/ge/models/line.py index 993a5aa..981324d 100644 --- a/ge/models/line.py +++ b/ge/models/line.py @@ -21,17 +21,17 @@ import random import numpy as np -from deepctr.layers.utils import reduce_sum -from tensorflow.python.keras import backend as K -from tensorflow.python.keras.layers import Embedding, Input, Lambda -from tensorflow.python.keras.models import Model +import tensorflow as tf +from tensorflow.keras import backend as K +from tensorflow.keras.layers import Embedding, Input, Lambda +from tensorflow.keras.models import Model from ..alias import create_alias_table, alias_sample from ..utils import preprocess_nxgraph def line_loss(y_true, y_pred): - return -K.mean(K.log(K.sigmoid(y_true * y_pred))) + return -K.mean(tf.math.log_sigmoid(y_true * y_pred)) def create_model(numNodes, embedding_size, order='second'): @@ -48,10 +48,14 @@ def create_model(numNodes, embedding_size, order='second'): v_i_emb_second = second_emb(v_i) v_j_context_emb = context_emb(v_j) - first = Lambda(lambda x: reduce_sum( - x[0] * x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb]) - second = Lambda(lambda x: reduce_sum( - x[0] * x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb]) + first = Lambda( + lambda x: tf.reduce_sum(x[0] * x[1], axis=-1, keepdims=False), + name='first_order', + )([v_i_emb, v_j_emb]) + second = Lambda( + lambda x: tf.reduce_sum(x[0] * x[1], axis=-1, keepdims=False), + name='second_order', + )([v_i_emb_second, v_j_context_emb]) if order == 'first': output_list = [first] @@ -205,8 +209,21 @@ def get_embeddings(self, ): def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1): self.reset_training_config(batch_size, times) - hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, - steps_per_epoch=self.steps_per_epoch, - verbose=verbose) + try: + hist = self.model.fit( + self.batch_it, + epochs=epochs, + initial_epoch=initial_epoch, + steps_per_epoch=self.steps_per_epoch, + verbose=verbose, + ) + except TypeError: + hist = self.model.fit_generator( + self.batch_it, + epochs=epochs, + initial_epoch=initial_epoch, + steps_per_epoch=self.steps_per_epoch, + verbose=verbose, + ) return hist diff --git a/ge/models/sdne.py b/ge/models/sdne.py index 923586d..a09c82f 100644 --- a/ge/models/sdne.py +++ b/ge/models/sdne.py @@ -22,39 +22,43 @@ import numpy as np import scipy.sparse as sp import tensorflow as tf -from tensorflow.python.keras import backend as K -from tensorflow.python.keras.callbacks import History -from tensorflow.python.keras.layers import Dense, Input -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.regularizers import l1_l2 +from tensorflow.keras.callbacks import History +from tensorflow.keras.layers import Dense, Input +from tensorflow.keras.models import Model +from tensorflow.keras.regularizers import l1_l2 from ..utils import preprocess_nxgraph def l_2nd(beta): def loss_2nd(y_true, y_pred): - b_ = np.ones_like(y_true) - b_[y_true != 0] = beta - x = K.square((y_true - y_pred) * b_) - t = K.sum(x, axis=-1, ) - return K.mean(t) + beta_weight = tf.cast(beta, y_true.dtype) + ones = tf.ones_like(y_true) + b_ = tf.where(tf.not_equal(y_true, 0), beta_weight * ones, ones) + x = tf.square((y_true - y_pred) * b_) + return tf.reduce_mean(tf.reduce_sum(x, axis=-1)) return loss_2nd def l_1st(alpha): def loss_1st(y_true, y_pred): - L = y_true - Y = y_pred - batch_size = tf.to_float(K.shape(L)[0]) - return alpha * 2 * tf.linalg.trace(tf.matmul(tf.matmul(Y, L, transpose_a=True), Y)) / batch_size + laplacian = y_true + embeddings = y_pred + batch_size = tf.cast(tf.shape(laplacian)[0], embeddings.dtype) + alpha_weight = tf.cast(alpha, embeddings.dtype) + return ( + alpha_weight + * 2.0 + * tf.linalg.trace(tf.matmul(tf.matmul(embeddings, laplacian, transpose_a=True), embeddings)) + / batch_size + ) return loss_1st def create_model(node_size, hidden_size=[256, 128], l1=1e-5, l2=1e-4): A = Input(shape=(node_size,)) - L = Input(shape=(None,)) fc = A for i in range(len(hidden_size)): if i == len(hidden_size) - 1: @@ -69,7 +73,7 @@ def create_model(node_size, hidden_size=[256, 128], l1=1e-5, l2=1e-4): kernel_regularizer=l1_l2(l1, l2))(fc) A_ = Dense(node_size, 'relu', name='2nd')(fc) - model = Model(inputs=[A, L], outputs=[A_, Y]) + model = Model(inputs=A, outputs=[A_, Y]) emb = Model(inputs=A, outputs=Y) return model, emb @@ -90,7 +94,6 @@ def __init__(self, graph, hidden_size=[32, 16], alpha=1e-6, beta=5., nu1=1e-5, n self.A, self.L = _create_A_L(self.graph, self.node2idx) # Adj Matrix,L Matrix self.reset_model() - self.inputs = [self.A, self.L] self._embeddings = {} def reset_model(self, opt='adam'): @@ -101,14 +104,22 @@ def reset_model(self, opt='adam'): self.get_embeddings() def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1): + adjacency = self.A.toarray().astype(np.float32) + laplacian = self.L.toarray().astype(np.float32) if batch_size >= self.node_size: if batch_size > self.node_size: print('batch_size({0}) > node_size({1}),set batch_size = {1}'.format( batch_size, self.node_size)) batch_size = self.node_size - return self.model.fit([self.A.todense(), self.L.todense()], [self.A.todense(), self.L.todense()], - batch_size=batch_size, epochs=epochs, initial_epoch=initial_epoch, verbose=verbose, - shuffle=False, ) + return self.model.fit( + adjacency, + [adjacency, laplacian], + batch_size=batch_size, + epochs=epochs, + initial_epoch=initial_epoch, + verbose=verbose, + shuffle=False, + ) else: steps_per_epoch = (self.node_size - 1) // batch_size + 1 hist = History() @@ -120,10 +131,9 @@ def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1): for i in range(steps_per_epoch): index = np.arange( i * batch_size, min((i + 1) * batch_size, self.node_size)) - A_train = self.A[index, :].todense() - L_mat_train = self.L[index][:, index].todense() - inp = [A_train, L_mat_train] - batch_losses = self.model.train_on_batch(inp, inp) + A_train = adjacency[index, :] + L_mat_train = laplacian[index][:, index] + batch_losses = np.asarray(self.model.train_on_batch(A_train, [A_train, L_mat_train])) losses += batch_losses losses = losses / steps_per_epoch @@ -139,11 +149,14 @@ def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1): return hist def evaluate(self, ): - return self.model.evaluate(x=self.inputs, y=self.inputs, batch_size=self.node_size) + adjacency = self.A.toarray().astype(np.float32) + laplacian = self.L.toarray().astype(np.float32) + return self.model.evaluate(x=adjacency, y=[adjacency, laplacian], batch_size=self.node_size) def get_embeddings(self): self._embeddings = {} - embeddings = self.emb_model.predict(self.A.todense(), batch_size=self.node_size) + adjacency = self.A.toarray().astype(np.float32) + embeddings = self.emb_model.predict(adjacency, batch_size=self.node_size, verbose=0) look_back = self.idx2node for i, embedding in enumerate(embeddings): self._embeddings[look_back[i]] = embedding diff --git a/setup.py b/setup.py index 616afb6..72f4737 100644 --- a/setup.py +++ b/setup.py @@ -2,55 +2,40 @@ with open("README.md", "r") as fh: - long_description = fh.read() REQUIRED_PACKAGES = [ - # 'tensorflow>=1.4.0', - 'gensim>=4.0.0', - 'networkx', - 'joblib', - 'fastdtw', - 'tqdm', - 'numpy', - 'scikit-learn', - 'pandas', - 'matplotlib', - 'deepctr' + "gensim>=4.0.0", + "networkx", + "joblib", + "fastdtw", + "tqdm", + "numpy", + "scikit-learn", + "pandas", + "matplotlib", ] setuptools.setup( - name="ge", - version="0.0.0", - author="Weichen Shen", - author_email="weichenswc@163.com", - url="https://github.com/shenweichen/GraphEmbedding", - packages=setuptools.find_packages(exclude=[]), - - python_requires='>=3.5', # 3.4.6 - + python_requires=">=3.7", install_requires=REQUIRED_PACKAGES, - extras_require={ - - "cpu": ['tensorflow>=1.4.0,!=1.7.*,!=1.8.*'], - - "gpu": ['tensorflow-gpu>=1.4.0,!=1.7.*,!=1.8.*'], - - }, - - entry_points={ - + "cpu": ["tensorflow>=1.15.5"], + "gpu": ["tensorflow>=1.15.5"], + "test": [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", + "python-coveralls>=2.9.3", + ], }, + entry_points={}, license="MIT license", - - ) diff --git a/tests/deepwalk_test.py b/tests/deepwalk_test.py index 10a83a6..d0c034d 100644 --- a/tests/deepwalk_test.py +++ b/tests/deepwalk_test.py @@ -1,15 +1,28 @@ +from pathlib import Path + import networkx as nx +import pytest +pytest.importorskip("gensim") +pytest.importorskip("pandas") from ge import DeepWalk +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + def test_DeepWalk(): - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = DeepWalk(G, walk_length=3, num_walks=2, workers=1) - model.train(window_size=3, iter=1) + model = DeepWalk(graph, walk_length=3, num_walks=2, workers=1) + model.train(embed_size=8, window_size=2, iter=1, workers=1) embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 8 for vector in embeddings.values()) if __name__ == "__main__": diff --git a/tests/examples_test.py b/tests/examples_test.py new file mode 100644 index 0000000..aec32ed --- /dev/null +++ b/tests/examples_test.py @@ -0,0 +1,47 @@ +import importlib.util +from pathlib import Path + +import pytest + +EXAMPLES_DIR = Path(__file__).resolve().parents[1] / "examples" +EXAMPLE_FILES = [ + "alias.py", + "deepwalk_wiki.py", + "line_wiki.py", + "node2vec_flight.py", + "node2vec_wiki.py", + "sdne_wiki.py", + "struc2vec_flight.py", +] +TF_EXAMPLES = {"line_wiki.py", "sdne_wiki.py"} +GENSIM_EXAMPLES = {"deepwalk_wiki.py", "node2vec_flight.py", "node2vec_wiki.py", "struc2vec_flight.py"} + + +def load_example_module(example_file): + module_path = EXAMPLES_DIR / example_file + spec = importlib.util.spec_from_file_location(f"example_{module_path.stem}", module_path) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +@pytest.mark.parametrize("example_file", EXAMPLE_FILES) +def test_examples_smoke(example_file): + if example_file in TF_EXAMPLES: + pytest.importorskip("tensorflow") + if example_file in GENSIM_EXAMPLES: + pytest.importorskip("gensim") + pytest.importorskip("pandas") + if example_file == "struc2vec_flight.py": + pytest.importorskip("fastdtw") + + module = load_example_module(example_file) + result = module.main(smoke=True, show=False) + + if isinstance(result, dict): + assert len(result) > 0 + elif isinstance(result, tuple): + assert all(item is not None for item in result) + else: + assert result is not None diff --git a/tests/line_test.py b/tests/line_test.py index 2b2e2b7..832320d 100644 --- a/tests/line_test.py +++ b/tests/line_test.py @@ -1,15 +1,27 @@ +from pathlib import Path + import networkx as nx +import pytest +pytest.importorskip("tensorflow") from ge import LINE +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + def test_LINE(): - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = LINE(G, embedding_size=2, order='second') - model.train(batch_size=2, epochs=1, verbose=2) + model = LINE(graph, embedding_size=4, order="second") + model.train(batch_size=2, epochs=1, verbose=0) embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 4 for vector in embeddings.values()) if __name__ == "__main__": diff --git a/tests/node2vec_test.py b/tests/node2vec_test.py index 3ca9756..26215c1 100644 --- a/tests/node2vec_test.py +++ b/tests/node2vec_test.py @@ -1,21 +1,39 @@ +from pathlib import Path + import networkx as nx import pytest +pytest.importorskip("gensim") +pytest.importorskip("pandas") from ge import Node2Vec +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + @pytest.mark.parametrize( - 'use_rejection_sampling', - [True, False - ] + "use_rejection_sampling", + [True, False], ) def test_Node2Vec(use_rejection_sampling): - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) - model = Node2Vec(G, walk_length=10, num_walks=80, - p=0.25, q=4, workers=1, use_rejection_sampling=use_rejection_sampling) - model.train(window_size=5, iter=3) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) + model = Node2Vec( + graph, + walk_length=3, + num_walks=2, + p=0.25, + q=4, + workers=1, + use_rejection_sampling=use_rejection_sampling, + ) + model.train(embed_size=8, window_size=2, iter=1, workers=1) embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 8 for vector in embeddings.values()) if __name__ == "__main__": diff --git a/tests/sdne_test.py b/tests/sdne_test.py index 5393414..1dac226 100644 --- a/tests/sdne_test.py +++ b/tests/sdne_test.py @@ -1,18 +1,27 @@ +from pathlib import Path + import networkx as nx -import tensorflow as tf +import pytest +pytest.importorskip("tensorflow") from ge import SDNE +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + def test_SDNE(): - if tf.__version__ >= '1.15.0': - return #todo - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = SDNE(G, hidden_size=[8, 4], ) - model.train(batch_size=2, epochs=1, verbose=2) + model = SDNE(graph, hidden_size=[8, 4]) + model.train(batch_size=2, epochs=1, verbose=0) embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 4 for vector in embeddings.values()) if __name__ == "__main__": diff --git a/tests/struct2vec_test.py b/tests/struct2vec_test.py index 4bf408e..ce3685d 100644 --- a/tests/struct2vec_test.py +++ b/tests/struct2vec_test.py @@ -1,15 +1,38 @@ +from pathlib import Path +import tempfile + import networkx as nx +import pytest +pytest.importorskip("fastdtw") +pytest.importorskip("gensim") +pytest.importorskip("pandas") from ge import Struc2Vec +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + def test_Struc2Vec(): - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', create_using=nx.DiGraph(), nodetype=None, - data=[('weight', int)]) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = Struc2Vec(G, 3, 1, workers=1, verbose=40, ) - model.train() - embeddings = model.get_embeddings() + with tempfile.TemporaryDirectory(prefix="struc2vec-test-") as temp_dir: + model = Struc2Vec( + graph, + walk_length=3, + num_walks=1, + workers=1, + verbose=0, + temp_path=temp_dir + "/", + ) + model.train(embed_size=8, window_size=2, workers=1, iter=1) + embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 8 for vector in embeddings.values()) if __name__ == "__main__": From 9f1a81203c7839fdd986f1f81ef3842c4296913b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?= Date: Sat, 18 Apr 2026 22:36:19 +0800 Subject: [PATCH 2/8] Fix SDNE custom History callback in manual training loop --- ge/models/sdne.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ge/models/sdne.py b/ge/models/sdne.py index a09c82f..4e55bb0 100644 --- a/ge/models/sdne.py +++ b/ge/models/sdne.py @@ -123,6 +123,7 @@ def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1): else: steps_per_epoch = (self.node_size - 1) // batch_size + 1 hist = History() + hist.set_model(self.model) hist.on_train_begin() logs = {} for epoch in range(initial_epoch, epochs): From 168e91c200d860cd69cbcbc87f6f9f592bc187ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?= Date: Sat, 18 Apr 2026 22:41:20 +0800 Subject: [PATCH 3/8] Fix LINE generator output and pin numpy for TF 2.10 CI --- .github/workflows/ci.yml | 6 ++++++ ge/models/line.py | 12 ++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fd004c0..ac5e794 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,6 +59,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + if [[ "${{ matrix.tf-version }}" == "2.10.0" ]]; then + python -m pip install -q "numpy<2" + fi python -m pip install -q "tensorflow==${{ matrix.tf-version }}" if [[ "${{ matrix.tf-version }}" == 1.* ]]; then python -m pip install -q "protobuf==3.20.3" @@ -67,6 +70,9 @@ jobs: python -m pip install -q "tf-keras~=2.20" fi python -m pip install -e ".[test]" + if [[ "${{ matrix.tf-version }}" == "2.10.0" ]]; then + python -m pip install -q "numpy<2" + fi - name: Test with pytest timeout-minutes: 180 diff --git a/ge/models/line.py b/ge/models/line.py index 981324d..6bae314 100644 --- a/ge/models/line.py +++ b/ge/models/line.py @@ -166,18 +166,20 @@ def batch_iter(self, node2idx): cur_t = edges[shuffle_indices[i]][1] h.append(cur_h) t.append(cur_t) - sign = np.ones(len(h)) + sign = np.ones(len(h), dtype=np.float32) else: - sign = np.ones(len(h)) * -1 + sign = np.ones(len(h), dtype=np.float32) * -1 t = [] for i in range(len(h)): t.append(alias_sample( self.node_accept, self.node_alias)) + heads = np.asarray(h, dtype=np.int32) + tails = np.asarray(t, dtype=np.int32) if self.order == 'all': - yield ([np.array(h), np.array(t)], [sign, sign]) + yield ((heads, tails), (sign, sign)) else: - yield ([np.array(h), np.array(t)], [sign]) + yield ((heads, tails), (sign,)) mod += 1 mod %= mod_size if mod == 0: @@ -218,6 +220,8 @@ def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1): verbose=verbose, ) except TypeError: + if not hasattr(self.model, "fit_generator"): + raise hist = self.model.fit_generator( self.batch_it, epochs=epochs, From 1d8b167ecd4e89806979c23e4ab7bd5154333ba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?= Date: Sat, 18 Apr 2026 23:09:17 +0800 Subject: [PATCH 4/8] Prefer local package imports in examples and clarify TF env errors --- examples/alias.py | 7 +++++++ examples/deepwalk_wiki.py | 6 +++++- examples/line_wiki.py | 14 ++++++++++++-- examples/node2vec_flight.py | 6 +++++- examples/node2vec_wiki.py | 6 +++++- examples/sdne_wiki.py | 14 ++++++++++++-- examples/struc2vec_flight.py | 6 +++++- 7 files changed, 51 insertions(+), 8 deletions(-) diff --git a/examples/alias.py b/examples/alias.py index e3f2233..b345a14 100644 --- a/examples/alias.py +++ b/examples/alias.py @@ -1,6 +1,13 @@ +import sys +from pathlib import Path + import matplotlib.pyplot as plt import numpy as np +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + from ge.alias import alias_sample, create_alias_table diff --git a/examples/deepwalk_wiki.py b/examples/deepwalk_wiki.py index ad84c3a..a452d60 100644 --- a/examples/deepwalk_wiki.py +++ b/examples/deepwalk_wiki.py @@ -1,4 +1,5 @@ from pathlib import Path +import sys import matplotlib.pyplot as plt import networkx as nx @@ -6,10 +7,13 @@ from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + from ge import DeepWalk from ge.classify import Classifier, read_node_label -PROJECT_ROOT = Path(__file__).resolve().parents[1] WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" diff --git a/examples/line_wiki.py b/examples/line_wiki.py index b31ace1..bc30233 100644 --- a/examples/line_wiki.py +++ b/examples/line_wiki.py @@ -1,4 +1,5 @@ from pathlib import Path +import sys import matplotlib.pyplot as plt import networkx as nx @@ -6,10 +7,19 @@ from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE -from ge import LINE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +try: + from ge import LINE +except ImportError as exc: + raise ImportError( + "Unable to import LINE. Use a supported Python/TensorFlow environment " + "(for example Python 3.10-3.12 with tensorflow installed)." + ) from exc from ge.classify import Classifier, read_node_label -PROJECT_ROOT = Path(__file__).resolve().parents[1] WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" diff --git a/examples/node2vec_flight.py b/examples/node2vec_flight.py index a2ddaec..e3127ef 100644 --- a/examples/node2vec_flight.py +++ b/examples/node2vec_flight.py @@ -1,4 +1,5 @@ from pathlib import Path +import sys import matplotlib.pyplot as plt import networkx as nx @@ -6,10 +7,13 @@ from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + from ge import Node2Vec from ge.classify import Classifier, read_node_label -PROJECT_ROOT = Path(__file__).resolve().parents[1] FLIGHT_GRAPH_PATH = PROJECT_ROOT / "data" / "flight" / "brazil-airports.edgelist" FLIGHT_LABEL_PATH = PROJECT_ROOT / "data" / "flight" / "labels-brazil-airports.txt" SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" diff --git a/examples/node2vec_wiki.py b/examples/node2vec_wiki.py index 00f922f..fb6db4d 100644 --- a/examples/node2vec_wiki.py +++ b/examples/node2vec_wiki.py @@ -1,4 +1,5 @@ from pathlib import Path +import sys import matplotlib.pyplot as plt import networkx as nx @@ -6,10 +7,13 @@ from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + from ge import Node2Vec from ge.classify import Classifier, read_node_label -PROJECT_ROOT = Path(__file__).resolve().parents[1] WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" diff --git a/examples/sdne_wiki.py b/examples/sdne_wiki.py index aa65c55..09553ae 100644 --- a/examples/sdne_wiki.py +++ b/examples/sdne_wiki.py @@ -1,4 +1,5 @@ from pathlib import Path +import sys import matplotlib.pyplot as plt import networkx as nx @@ -6,10 +7,19 @@ from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE -from ge import SDNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +try: + from ge import SDNE +except ImportError as exc: + raise ImportError( + "Unable to import SDNE. Use a supported Python/TensorFlow environment " + "(for example Python 3.10-3.12 with tensorflow installed)." + ) from exc from ge.classify import Classifier, read_node_label -PROJECT_ROOT = Path(__file__).resolve().parents[1] WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" diff --git a/examples/struc2vec_flight.py b/examples/struc2vec_flight.py index 10454cd..dedb578 100644 --- a/examples/struc2vec_flight.py +++ b/examples/struc2vec_flight.py @@ -1,4 +1,5 @@ from pathlib import Path +import sys import tempfile import matplotlib.pyplot as plt @@ -7,10 +8,13 @@ from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + from ge import Struc2Vec from ge.classify import Classifier, read_node_label -PROJECT_ROOT = Path(__file__).resolve().parents[1] FLIGHT_GRAPH_PATH = PROJECT_ROOT / "data" / "flight" / "brazil-airports.edgelist" FLIGHT_LABEL_PATH = PROJECT_ROOT / "data" / "flight" / "labels-brazil-airports.txt" SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" From a0580c664cc7e842bcb190b6566383a03192dd01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?= Date: Sat, 18 Apr 2026 23:13:05 +0800 Subject: [PATCH 5/8] Add Python 3.13 with TensorFlow 2.20 to CI matrix --- .github/workflows/ci.yml | 3 +++ README.md | 1 + 2 files changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac5e794..0b73c91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,6 +47,9 @@ jobs: - python-version: "3.12" tf-version: "2.20.0" use-legacy-keras: "0" + - python-version: "3.13" + tf-version: "2.20.0" + use-legacy-keras: "0" steps: - uses: actions/checkout@v4 diff --git a/README.md b/README.md index d0dca71..f675bdf 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ The CI matrix currently validates the following Python/TensorFlow combinations: | 3.10 | 2.20.0 | 1 | | 3.11 | 2.20.0 | 1 | | 3.12 | 2.20.0 | 0 | +| 3.13 | 2.20.0 | 0 | For TensorFlow 2.16+ jobs that need legacy Keras behavior, CI installs `tf-keras` and sets `TF_USE_LEGACY_KERAS=1`. From efe52e84febd61515d5778044763ec4d81774fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?= Date: Sat, 18 Apr 2026 23:20:41 +0800 Subject: [PATCH 6/8] Simplify extras_require to tf and test --- README.md | 17 +---------------- setup.py | 5 ++--- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index f675bdf..f99f37e 100644 --- a/README.md +++ b/README.md @@ -21,22 +21,7 @@ | Struc2Vec | [KDD 2017][struc2vec: Learning Node Representations from Structural Identity](https://arxiv.org/pdf/1704.03165.pdf) | [【Graph Embedding】Struc2Vec:算法原理,实现和应用](https://zhuanlan.zhihu.com/p/56733145) | -# CI Compatibility Matrix -The CI matrix currently validates the following Python/TensorFlow combinations: - -| Python | TensorFlow | `TF_USE_LEGACY_KERAS` | -| :----: | :--------: | :-------------------: | -| 3.7 | 1.15.5 | 0 | -| 3.10 | 2.10.0 | 0 | -| 3.10 | 2.15.0 | 0 | -| 3.11 | 2.15.0 | 0 | -| 3.10 | 2.20.0 | 1 | -| 3.11 | 2.20.0 | 1 | -| 3.12 | 2.20.0 | 0 | -| 3.13 | 2.20.0 | 0 | - -For TensorFlow 2.16+ jobs that need legacy Keras behavior, CI installs `tf-keras` and sets `TF_USE_LEGACY_KERAS=1`. # How to run examples @@ -44,7 +29,7 @@ For TensorFlow 2.16+ jobs that need legacy Keras behavior, CI installs `tf-keras 2. Run one example script. ```bash -pip install -e .[cpu] +pip install -e .[tf] python examples/deepwalk_wiki.py ``` diff --git a/setup.py b/setup.py index 72f4737..990b387 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ setuptools.setup( name="ge", - version="0.0.0", + version="0.1.0", author="Weichen Shen", author_email="weichenswc@163.com", url="https://github.com/shenweichen/GraphEmbedding", @@ -28,8 +28,7 @@ python_requires=">=3.7", install_requires=REQUIRED_PACKAGES, extras_require={ - "cpu": ["tensorflow>=1.15.5"], - "gpu": ["tensorflow>=1.15.5"], + "tf": ["tensorflow>=1.15.5"], "test": [ "pytest>=7.0.0", "pytest-cov>=4.0.0", From 180472897a39cb4074cc9ab12da8b5d63ace0923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?= Date: Sat, 18 Apr 2026 23:30:32 +0800 Subject: [PATCH 7/8] Remove README note about example CI behavior Delete the paragraph describing examples' `main(smoke=False, show=True)` signature and CI executing examples in smoke mode. The note was outdated or redundant and has been removed to tidy the README. --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index f99f37e..f369c36 100644 --- a/README.md +++ b/README.md @@ -33,8 +33,6 @@ pip install -e .[tf] python examples/deepwalk_wiki.py ``` -Each example now exposes `main(smoke=False, show=True)`. CI executes all `examples/*.py` in smoke mode (`smoke=True`, `show=False`) to keep runtime short while still validating the training/import paths. - ## DisscussionGroup & Related Projects From bc7a32ae755c9f9603857d62e44c5c0ce1d67c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?= Date: Sat, 18 Apr 2026 23:32:20 +0800 Subject: [PATCH 8/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f369c36..91ab902 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![GitHub Issues](https://img.shields.io/github/issues/shenweichen/graphembedding.svg )](https://github.com/shenweichen/graphembedding/issues) -![CI status](https://github.com/shenweichen/graphembedding/workflows/CI/badge.svg) +[![CI status](https://github.com/shenweichen/graphembedding/actions/workflows/ci.yml/badge.svg)](https://github.com/shenweichen/graphembedding/actions/workflows/ci.yml) [![codecov](https://codecov.io/gh/shenweichen/graphembedding/branch/master/graph/badge.svg)](https://codecov.io/gh/shenweichen/graphembedding) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/c46407f5931f40048e28860dccf7dabc)](https://www.codacy.com/gh/shenweichen/GraphEmbedding/dashboard?utm_source=github.com&utm_medium=referral&utm_content=shenweichen/GraphEmbedding&utm_campaign=Badge_Grade) [![Disscussion](https://img.shields.io/badge/chat-wechat-brightgreen?style=flat)](./README.md#disscussiongroup--related-projects)