In [28]:
import os
import re
import sys

from graphdatascience import GraphDataScience

# import streamlit as st
# from streamlit_jupyter import StreamlitPatcher, tqdm
# StreamlitPatcher().jupyter()  # register streamlit with jupyter-compatible wrappers

# neo4j desktop v5.11.0

In [29]:
# --- Neo4j connection ---
# Connection settings and the API key may be supplied through environment
# variables; the literals are kept only as fallbacks so the notebook still runs
# unchanged where those variables are not set.
# NOTE(review): the fallback password and API key are committed in plain text --
# rotate them and drop the fallbacks once the environment variables are in use.
host = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "j4oenj4oen")

gds = GraphDataScience(host, auth=(user, password))
print(gds.version())
# st.write(gds.version())

# --- params ---
KEY = os.environ.get("GCP_API_KEY", "AIzaSyAPQNUpCCFrsJhX2A-CgvOG4fDWlxuA8ec") # api key handed to apoc.nlp.gcp.entities.stream
nphrase = 50 # number of leading phrases of each article/query that are linked to Noun nodes
DATA_CLASS = "DNP" # DNP, WIKI_FP100 or WIKI_P100
DATA_TYPE = "TXT" # TXT or URL (only consulted when DATA_CLASS is DNP)
DATA_URL = "" # input data (directory prefix for TXT, CSV url otherwise)
QUERY_DICT = {} # query dict {QUERY_NAME: QUERY_URL}
if DATA_CLASS == "DNP":
    if DATA_TYPE == "TXT":
        DATA_URL = "data/newsrelease_B-1-100_C-1-4/"
        QUERY_DICT["C-1"] = DATA_URL + "C-1.txt"
        QUERY_DICT["C-2"] = DATA_URL + "C-2.txt"
        QUERY_DICT["C-3"] = DATA_URL + "C-3.txt"
        QUERY_DICT["C-4"] = DATA_URL + "C-4.txt"
    elif DATA_TYPE == "URL":
        DATA_URL = "https://raw.githubusercontent.com/smallcat9603/graph/main/dnp/kg/data/articles.csv"
        QUERY_DICT["C-1"] = "https://www.holdings.toppan.com/ja/news/2023/10/newsrelease231004_1.html"
        QUERY_DICT["C-2"] = "https://www.holdings.toppan.com/ja/news/2023/10/newsrelease231004_2.html"
        QUERY_DICT["C-3"] = "https://www.holdings.toppan.com/ja/news/2023/10/newsrelease231004_3.html"
        QUERY_DICT["C-4"] = "https://www.holdings.toppan.com/ja/news/2023/10/newsrelease231003_1.html"
    else:
        # An unknown DATA_TYPE used to fall through with an empty DATA_URL and
        # an empty QUERY_DICT; fail the same way as an unknown DATA_CLASS.
        print("DATA ERROR")
        sys.exit(1)
elif DATA_CLASS == "WIKI_FP100":
    DATA_URL = "https://raw.githubusercontent.com/smallcat9603/graph/main/dnp/kg/data/wikidata_footballplayer_100.csv"
    QUERY_DICT["Thierry Henry"] = "https://en.wikipedia.org/wiki/Thierry_Henry"
elif DATA_CLASS == "WIKI_P100":
    DATA_URL = "https://raw.githubusercontent.com/smallcat9603/graph/main/dnp/kg/data/wikidata_persons_100.csv"
    QUERY_DICT["Joe Biden"] = "https://en.wikipedia.org/wiki/Joe_Biden"
else:
    print("DATA ERROR")
    sys.exit(1)

# Article nodes are identified by their url; create the uniqueness constraint
# once (IF NOT EXISTS makes the statement safe to re-run).
query = """
CREATE CONSTRAINT id_unique IF NOT EXISTS
For (a:Article) REQUIRE a.url IS UNIQUE;
"""
gds.run_cypher(query)

2.5.4


# Create Article-[Noun]-Article Graph

## create url nodes (article, person, ...)

In [30]:
if DATA_CLASS == "DNP" and DATA_TYPE == "TXT":
  # Local corpus: one Article node per file B-1.txt ... B-100.txt under DATA_URL.
  # Name, path and text are sent as query parameters instead of being spliced
  # into the Cypher source, so quotes or backslashes inside an article can no
  # longer break (or alter) the statement.
  query = """
  MERGE (a:Article { name: $name, url: $url, body: $body })
  """
  for idx in range(1, 101):
    node = "B-" + str(idx)
    file = DATA_URL + node + ".txt"
    # NOTE(review): open() uses the platform default encoding -- confirm it
    # matches the encoding of the data files.
    with open(file, 'r') as f:
      # collapse runs of newlines into single spaces
      content = re.sub('\n+', ' ', f.read())
    gds.run_cypher(query, {"name": node, "url": file, "body": content})
else:
  # CSV-driven corpus: for every row, merge an Article node, copy the optional
  # occupation/nationality columns to grp/grp1, then download the page at
  # a.url and store the concatenated <p> texts as body and the first <title>
  # text as title. Rows are processed in batches of 5, in parallel.
  query = f"""
  CALL apoc.periodic.iterate(
    "LOAD CSV WITH HEADERS FROM '{DATA_URL}' AS row
    RETURN row",
    "MERGE (a:Article {{name: row.id, url: row.url}})
    SET a.grp = CASE WHEN 'occupation' IN keys(row) THEN row.occupation ELSE null END
    SET a.grp1 = CASE WHEN 'nationality' IN keys(row) THEN row.nationality ELSE null END
    WITH a
    CALL apoc.load.html(a.url, {{
      title: 'title',
      h2: 'h2',
      body: 'body p'
    }})
    YIELD value
    WITH a,
          reduce(texts = '', n IN range(0, size(value.body)-1) | texts + ' ' + coalesce(value.body[n].text, '')) AS body,
          value.title[0].text AS title
    SET a.body = body, a.title = title",
    {{batchSize: 5, parallel: true}}
  )
  YIELD batches, total, timeTaken, committedOperations
  RETURN batches, total, timeTaken, committedOperations
  """
  gds.run_cypher(query)

## set phrase and salience properties

In [31]:
# Entity extraction for articles: every Article that has no `processed` flag
# yet is sent (in batches of 10) to apoc.nlp.gcp.entities.stream, which reads
# the `body` property. The node is flagged as processed and each returned
# entity's name / salience is appended to the node's phrase / salience lists.
annotate_articles_cypher = f"""
CALL apoc.periodic.iterate(
  "MATCH (a:Article)
   WHERE a.processed IS NULL
   RETURN a",
  "CALL apoc.nlp.gcp.entities.stream([item in $_batch | item.a], {{
     nodeProperty: 'body',
     key: '{KEY}'
   }})
   YIELD node, value
   SET node.processed = true
   WITH node, value
   UNWIND value.entities AS entity
   SET node.phrase = coalesce(node.phrase, []) + entity['name']
   SET node.salience = coalesce(node.salience, []) + entity['salience']",
  {{batchMode: "BATCH_SINGLE", batchSize: 10}})
YIELD batches, total, timeTaken, committedOperations
RETURN batches, total, timeTaken, committedOperations
"""
annotate_articles_stats = gds.run_cypher(annotate_articles_cypher)
annotate_articles_stats

Unnamed: 0,batches,total,timeTaken,committedOperations
0,10,100,113,100


## create noun-url relationships

In [32]:
# Connect each processed Article to a Noun node for every phrase among its
# first `nphrase` phrases. The CONTAINS relationship stores:
#   rank   - 1-based position of the phrase's first occurrence in a.phrase
#   score  - the salience stored at that position
#   weight - nphrase minus the 0-based position (earlier phrases weigh more)
link_article_nouns_cypher = """
MATCH (a:Article)
WHERE a.processed IS NOT NULL
FOREACH (word IN a.phrase[0..$nphrase] |
  MERGE (n:Noun {name: word})
  MERGE (a)-[r:CONTAINS]-(n)
  SET r.rank = apoc.coll.indexOf(a.phrase, word) + 1
  SET r.score = a.salience[apoc.coll.indexOf(a.phrase, word)]
  SET r.weight = $nphrase - apoc.coll.indexOf(a.phrase, word)
)
"""
gds.run_cypher(link_article_nouns_cypher, params={"nphrase": nphrase})

## query

In [33]:
if DATA_CLASS == "DNP" and DATA_TYPE == "TXT":
  # Local query documents: one Query node per entry of QUERY_DICT, with the
  # file content as body. Values travel as query parameters so that quotes or
  # backslashes in the text cannot break the Cypher statement.
  query = """
  MERGE (q:Query { name: $name, url: $url, body: $body })
  """
  for QUERY_NAME, QUERY_URL in QUERY_DICT.items():
    # NOTE(review): open() uses the platform default encoding -- confirm it
    # matches the encoding of the data files.
    with open(QUERY_URL, 'r') as f:
      # collapse runs of newlines into single spaces
      content = re.sub('\n+', ' ', f.read())
    gds.run_cypher(query, {"name": QUERY_NAME, "url": QUERY_URL, "body": content})
else:
  # Remote query documents: merge the Query node, download the page at q.url
  # and store the concatenated <p> texts as body and the first <title> text as
  # title. (The original passed `i.url`, but no variable `i` exists in this
  # statement; the freshly merged node is `q`.)
  query = """
    MERGE (q:Query {name: $name, url: $url})
    WITH q
    CALL apoc.load.html(q.url, {
    title: "title",
    h2: "h2",
    body: "body p"
    })
    YIELD value
    WITH q,
        reduce(texts = "", n IN range(0, size(value.body)-1) | texts + " " + coalesce(value.body[n].text, "")) AS body,
        value.title[0].text AS title
    SET q.body = body, q.title = title
    RETURN q.title, q.body
    """
  for QUERY_NAME, QUERY_URL in QUERY_DICT.items():
    gds.run_cypher(query, {"name": QUERY_NAME, "url": QUERY_URL})
    
# set phrase and salience properties (Query)
# Only Query nodes without a `processed` flag are sent to the entity API, the
# same guard the Article step uses. Without it, re-running this cell would
# append every entity to phrase/salience a second time.
query = f"""
MATCH (q:Query)
WHERE q.processed IS NULL
CALL apoc.nlp.gcp.entities.stream(q, {{
 nodeProperty: 'body',
 key: '{KEY}'
}})
YIELD node, value
SET node.processed = true
WITH node, value
UNWIND value.entities AS entity
SET node.phrase = coalesce(node.phrase, []) + entity['name']
SET node.salience = coalesce(node.salience, []) + entity['salience']
"""
gds.run_cypher(query)

# Noun relationships for Query nodes: each processed Query is linked to a Noun
# for every phrase among its first `nphrase` phrases. The CONTAINS relationship
# carries rank (1-based first position in q.phrase), score (salience at that
# position) and weight (nphrase minus the 0-based position).
link_query_nouns_cypher = """
MATCH (q:Query)
WHERE q.processed IS NOT NULL
FOREACH (word IN q.phrase[0..$nphrase] |
  MERGE (n:Noun {name: word})
  MERGE (q)-[r:CONTAINS]-(n)
  SET r.rank = apoc.coll.indexOf(q.phrase, word) + 1
  SET r.score = q.salience[apoc.coll.indexOf(q.phrase, word)]
  SET r.weight = $nphrase - apoc.coll.indexOf(q.phrase, word)
)
"""
gds.run_cypher(link_query_nouns_cypher, params={"nphrase": nphrase})

## evaluate (naive by rank)

In [34]:
# Rank-based similarity: for each (Query, Article) pair that shares at least
# one Noun, sum (1 / rank in query) * (1 / rank in article) over the shared
# nouns and list those nouns as Common.
# The LIMIT applies to the whole ordered result, so only the first ten rows
# overall (ordered by query name, then similarity) are returned.
rank_similarity_cypher = """
MATCH (q:Query)-[r:CONTAINS]-(n:Noun)-[c:CONTAINS]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, collect(n.name) AS Common, SUM((1.0/r.rank)*(1.0/c.rank)) AS Similarity 
ORDER BY Query, Similarity DESC
LIMIT 10
"""
rank_similarity = gds.run_cypher(rank_similarity_cypher)
rank_similarity

Unnamed: 0,Query,Article,URL,Group,Group1,Common,Similarity
0,C-1,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,"[CO2排出量, パッケージ, 以下, 削減, 本社]",0.094219
1,C-1,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,"[パッケージ, 製造, 以下, 削減, 本社, 開発]",0.03161
2,C-1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,"[CO2排出量, 製造, 削減, 環境配慮型, 開発]",0.025126
3,C-1,B-50,data/newsrelease_B-1-100_C-1-4/B-50.txt,,,"[CO2排出量, 削減]",0.018691
4,C-1,B-5,data/newsrelease_B-1-100_C-1-4/B-5.txt,,,"[以下, 東京都, 本社]",0.01867
5,C-1,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,"[CO2排出量, 以下, 本社]",0.015085
6,C-1,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,"[製造, 以下, 本社, 開発]",0.013842
7,C-1,B-27,data/newsrelease_B-1-100_C-1-4/B-27.txt,,,"[グループ会社, 以下, 削減, 本社]",0.012094
8,C-1,B-17,data/newsrelease_B-1-100_C-1-4/B-17.txt,,,"[代表取締役, 以下, 東京都, 本社, 開発]",0.010256
9,C-1,B-19,data/newsrelease_B-1-100_C-1-4/B-19.txt,,,"[CO2排出量, ブース, 幕張メッセ, 社会]",0.009834


## create article-article relationships

In [35]:
# Both statements below use the same parameter map.
phrase_window = {"nphrase": nphrase}

# Article <-> Article: connect two distinct articles when their first
# `nphrase` phrases overlap; r.common holds the overlapping phrases in the
# order (and multiplicity) they have in a1.phrase.
correlate_articles_cypher = """
MATCH (a1:Article), (a2:Article)
WHERE a1 <> a2 AND any(x IN a1.phrase[0..$nphrase] WHERE x IN a2.phrase[0..$nphrase])
MERGE (a1)-[r:CORRELATES]-(a2)
SET r.common = [x IN a1.phrase[0..$nphrase] WHERE x IN a2.phrase[0..$nphrase]]
"""
gds.run_cypher(correlate_articles_cypher, params=phrase_window)

# Query <-> Article: same rule, with r.common taken from the query's phrases.
correlate_queries_cypher = """
MATCH (q:Query), (a:Article)
WHERE any(x IN q.phrase[0..$nphrase] WHERE x IN a.phrase[0..$nphrase])
MERGE (q)-[r:CORRELATES]-(a)
SET r.common = [x IN q.phrase[0..$nphrase] WHERE x IN a.phrase[0..$nphrase]]
"""
gds.run_cypher(correlate_queries_cypher, params=phrase_window)

## evaluate (still naive by salience)

In [36]:
# Salience-based similarity: for every Query-Article CORRELATES relationship,
# walk r.common and add, per entry, the salience found at the phrase's first
# position in the query plus the one at its first position in the article.
# Entries that occur several times in r.common are added several times.
# The LIMIT applies to the whole ordered result (first ten rows overall).
salience_similarity_cypher = """
MATCH (q:Query)-[r:CORRELATES]-(a:Article)
WITH q, r, a, reduce(s = 0.0, word IN r.common | 
s + q.salience[apoc.coll.indexOf(q.phrase, word)] + a.salience[apoc.coll.indexOf(a.phrase, word)]) AS Similarity
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.common, Similarity 
ORDER BY Query, Similarity DESC
LIMIT 10
"""
salience_similarity = gds.run_cypher(salience_similarity_cypher)
salience_similarity

Unnamed: 0,Query,Article,URL,Group,Group1,r.common,Similarity
0,C-1,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,"[CO2排出量, 削減, CO2排出量, 本社, 以下, パッケージ, CO2排出量]",0.387933
1,C-1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,"[CO2排出量, 削減, CO2排出量, 環境配慮型, 製造, 開発, CO2排出量]",0.367524
2,C-1,B-50,data/newsrelease_B-1-100_C-1-4/B-50.txt,,,"[CO2排出量, 削減, CO2排出量, CO2排出量]",0.278183
3,C-1,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,"[削減, 本社, 以下, パッケージ, 製造, 開発]",0.203138
4,C-1,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,"[CO2排出量, CO2排出量, 本社, 以下, CO2排出量]",0.17776
5,C-1,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,"[本社, 以下, 製造, 開発]",0.142443
6,C-1,B-19,data/newsrelease_B-1-100_C-1-4/B-19.txt,,,"[CO2排出量, CO2排出量, ブース, CO2排出量, 社会, 幕張メッセ]",0.139062
7,C-1,B-71,data/newsrelease_B-1-100_C-1-4/B-71.txt,,,"[CO2排出量, CO2排出量, 本社, 以下, パッケージ, CO2排出量, 社会]",0.117298
8,C-1,B-36,data/newsrelease_B-1-100_C-1-4/B-36.txt,,,"[CO2排出量, CO2排出量, パッケージ, 開発, CO2排出量]",0.116453
9,C-1,B-5,data/newsrelease_B-1-100_C-1-4/B-5.txt,,,"[東京都, 本社, 以下]",0.111021


## project graph to memory

In [37]:
# In-memory projection "testgraph": Query, Article and Noun nodes joined by
# undirected CONTAINS relationships carrying rank / score / weight.
#
# Known limitations recorded from earlier attempts:
# - projecting the string-list node property `phrase`, e.g.
#     {"Query": {"properties": 'phrase'}, "Article": {"properties": 'phrase'}, "Noun": {}}
#   raised "java.lang.UnsupportedOperationException: Loading of values of type
#   StringArray is currently not supported";
# - projecting CORRELATES with its `common` property, e.g.
#     "CORRELATES": {"orientation": "UNDIRECTED", "properties": ["common"]}
#   failed with "Unsupported type [TEXT_ARRAY] of value StringArray[DNP].
#   Please use a numeric property.";
# - open question: how to project node properties, e.g. via
#     {"nodeProperties": {"phrase": {"defaultValue": []}, "salience": {"defaultValue": []}}}
graph_name = "testgraph"
node_projection = ["Query", "Article", "Noun"]
relationship_projection = {
    "CONTAINS": {
        "orientation": "UNDIRECTED",
        "properties": ["rank", "score", "weight"],
    },
}

# A projection left over from a previous run is dropped before re-projecting.
if gds.graph.exists(graph_name)["exists"]:
    G = gds.graph.get(graph_name)
    G.drop()

G, result = gds.graph.project(graph_name, node_projection, relationship_projection)
print(f"The projection took {result['projectMillis']} ms")

# (label, accessor) pairs, printed in this order.
graph_statistics = (
    ("node count", G.node_count),
    ("node labels", G.node_labels),
    ("relationship count", G.relationship_count),
    ("degree distribution", G.degree_distribution),
    ("density", G.density),
    ("size in bytes", G.size_in_bytes),
    ("memory_usage", G.memory_usage),
)
for label, accessor in graph_statistics:
    print(f"Graph '{G.name()}' {label}: {accessor()}")

The projection took 13 ms
Graph 'testgraph' node count: 2966
Graph 'testgraph' node labels: ['Query', 'Article', 'Noun']
Graph 'testgraph' relationship count: 9328
Graph 'testgraph' degree distribution: p99     141.000000
min       3.000000
max     261.000000
mean      9.434929
p90       9.000000
p999    150.000000
p50       3.000000
p95      27.000000
p75       3.000000
dtype: float64
Graph 'testgraph' density: 0.001060700303268408
Graph 'testgraph' size in bytes: 19816411
Graph 'testgraph' memory_usage: 18 MiB


## node similarity (JACCARD)

In [38]:
# Filtered node similarity (Jaccard) from Query nodes to Article nodes on the
# projected graph, weighted by the `weight` relationship property. Results are
# written back to the database as SIMILAR_JACCARD relationships with `score`.
jaccard_config = {
    "similarityMetric": 'JACCARD', # default
    "writeRelationshipType": 'SIMILAR_JACCARD',
    "writeProperty": 'score',
    "relationshipWeightProperty": "weight",
    "sourceNodeFilter": "Query",
    "targetNodeFilter": "Article",
}
result = gds.nodeSimilarity.filtered.write(G, **jaccard_config)

jaccard_summary = {
    "Relationships produced": result['relationshipsWritten'],
    "Nodes compared": result['nodesCompared'],
    "Mean similarity": result['similarityDistribution']['mean'],
}
for label, value in jaccard_summary.items():
    print(f"{label}: {value}")

Relationships produced: 40
Nodes compared: 4
Mean similarity: 0.041773059964179994


## evaluate (jaccard similarity)

In [39]:
# List Query-Article pairs connected by SIMILAR_JACCARD, highest score first
# within each query; the LIMIT keeps the first ten rows of the whole result.
jaccard_ranking_cypher = """
MATCH (q:Query)-[r:SIMILAR_JACCARD]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Query, Similarity DESC
LIMIT 10
"""
jaccard_ranking = gds.run_cypher(jaccard_ranking_cypher)
jaccard_ranking

Unnamed: 0,Query,Article,URL,Group,Group1,Similarity
0,C-1,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,0.06044
1,C-1,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,0.057193
2,C-1,B-27,data/newsrelease_B-1-100_C-1-4/B-27.txt,,,0.04961
3,C-1,B-17,data/newsrelease_B-1-100_C-1-4/B-17.txt,,,0.047939
4,C-1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,0.041741
5,C-1,B-54,data/newsrelease_B-1-100_C-1-4/B-54.txt,,,0.033241
6,C-1,B-13,data/newsrelease_B-1-100_C-1-4/B-13.txt,,,0.033118
7,C-1,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,0.030973
8,C-1,B-5,data/newsrelease_B-1-100_C-1-4/B-5.txt,,,0.030688
9,C-1,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,0.03029


## node similarity (OVERLAP)

In [40]:
# Filtered node similarity (overlap) from Query nodes to Article nodes on the
# projected graph, weighted by the `weight` relationship property. Results are
# written back to the database as SIMILAR_OVERLAP relationships with `score`.
overlap_config = {
    "similarityMetric": 'OVERLAP',
    "writeRelationshipType": 'SIMILAR_OVERLAP',
    "writeProperty": 'score',
    "relationshipWeightProperty": "weight",
    "sourceNodeFilter": "Query",
    "targetNodeFilter": "Article",
}
result = gds.nodeSimilarity.filtered.write(G, **overlap_config)

overlap_summary = {
    "Relationships produced": result['relationshipsWritten'],
    "Nodes compared": result['nodesCompared'],
    "Mean similarity": result['similarityDistribution']['mean'],
}
for label, value in overlap_summary.items():
    print(f"{label}: {value}")

Relationships produced: 40
Nodes compared: 4
Mean similarity: 0.08149468898773193


## evaluate (overlap similarity)

In [41]:
# List Query-Article pairs connected by SIMILAR_OVERLAP, highest score first
# within each query; the LIMIT keeps the first ten rows of the whole result.
overlap_ranking_cypher = """
MATCH (q:Query)-[r:SIMILAR_OVERLAP]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Query, Similarity DESC
LIMIT 10
"""
overlap_ranking = gds.run_cypher(overlap_ranking_cypher)
overlap_ranking

Unnamed: 0,Query,Article,URL,Group,Group1,Similarity
0,C-1,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,0.114088
1,C-1,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,0.11236
2,C-1,B-27,data/newsrelease_B-1-100_C-1-4/B-27.txt,,,0.095745
3,C-1,B-17,data/newsrelease_B-1-100_C-1-4/B-17.txt,,,0.092481
4,C-1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,0.08038
5,C-1,B-54,data/newsrelease_B-1-100_C-1-4/B-54.txt,,,0.066605
6,C-1,B-13,data/newsrelease_B-1-100_C-1-4/B-13.txt,,,0.066551
7,C-1,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,0.061366
8,C-1,B-95,data/newsrelease_B-1-100_C-1-4/B-95.txt,,,0.061366
9,C-1,B-5,data/newsrelease_B-1-100_C-1-4/B-5.txt,,,0.060501


## node similarity (COSINE)

In [42]:
# Filtered node similarity (cosine) from Query nodes to Article nodes on the
# projected graph, weighted by the `weight` relationship property. Results are
# written back to the database as SIMILAR_COSINE relationships with `score`.
cosine_config = {
    "similarityMetric": 'COSINE',
    "writeRelationshipType": 'SIMILAR_COSINE',
    "writeProperty": 'score',
    "relationshipWeightProperty": "weight",
    "sourceNodeFilter": "Query",
    "targetNodeFilter": "Article",
}
result = gds.nodeSimilarity.filtered.write(G, **cosine_config)

cosine_summary = {
    "Relationships produced": result['relationshipsWritten'],
    "Nodes compared": result['nodesCompared'],
    "Mean similarity": result['similarityDistribution']['mean'],
}
for label, value in cosine_summary.items():
    print(f"{label}: {value}")

Relationships produced: 40
Nodes compared: 4
Mean similarity: 0.0887689471244812


## evaluate (cosine similarity)

In [43]:
# List Query-Article pairs connected by SIMILAR_COSINE, highest score first
# within each query; the LIMIT keeps the first ten rows of the whole result.
cosine_ranking_cypher = """
MATCH (q:Query)-[r:SIMILAR_COSINE]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Query, Similarity DESC
LIMIT 10
"""
cosine_ranking = gds.run_cypher(cosine_ranking_cypher)
cosine_ranking

Unnamed: 0,Query,Article,URL,Group,Group1,Similarity
0,C-1,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,0.126199
1,C-1,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,0.104104
2,C-1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,0.099071
3,C-1,B-27,data/newsrelease_B-1-100_C-1-4/B-27.txt,,,0.095018
4,C-1,B-17,data/newsrelease_B-1-100_C-1-4/B-17.txt,,,0.086985
5,C-1,B-5,data/newsrelease_B-1-100_C-1-4/B-5.txt,,,0.069384
6,C-1,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,0.067478
7,C-1,B-13,data/newsrelease_B-1-100_C-1-4/B-13.txt,,,0.063633
8,C-1,B-50,data/newsrelease_B-1-100_C-1-4/B-50.txt,,,0.059378
9,C-1,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,0.056054


## ppr (personalized pagerank)

In [44]:
# Personalized PageRank, one run per query: the Query node named after the
# idx-th key of QUERY_DICT is the only source node, and the resulting scores
# are written to the node property "pr<idx>" (pr0, pr1, ...).
for idx, name in enumerate(QUERY_DICT):
    nodeid = gds.find_node_id(labels=["Query"], properties={"name": name})
    ppr_config = {
        "writeProperty": "pr" + str(idx),
        "maxIterations": 20,
        "dampingFactor": 0.85,
        "relationshipWeightProperty": 'weight',
        "sourceNodes": [nodeid],
    }
    result = gds.pageRank.write(G, **ppr_config)
    written = result['nodePropertiesWritten']
    mean_score = result['centralityDistribution']['mean']
    print(f"Node properties written: {written}")
    print(f"Mean: {mean_score}")


Node properties written: 2966
Mean: 0.0003240621538858453
Node properties written: 2966
Mean: 0.00032406426363880387
Node properties written: 2966
Mean: 0.00032406917958461597
Node properties written: 2966
Mean: 0.00032405480290595727


## evaluate (ppr)

In [45]:
# Show the PPR ranking for one query: query_idx selects the key of QUERY_DICT
# and, with it, the node property pr<query_idx> written by the PageRank step.
# Only articles that have a CORRELATES relationship to the query are listed.
query_idx = 0
query_one = list(QUERY_DICT.keys())[query_idx]
# The query name is passed as a parameter instead of being spliced into the
# Cypher text, so names containing quotes cannot break the statement. The
# property name is built from the integer index and cannot be a parameter in
# this position.
query = f"""
MATCH (q:Query)-[r:CORRELATES]-(a:Article) WHERE q.name = $name
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, a.pr{query_idx} AS ppr
ORDER BY Query, ppr DESC
LIMIT 10
"""
gds.run_cypher(query, {"name": query_one})

Unnamed: 0,Query,Article,URL,Group,Group1,ppr
0,C-1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,0.010232
1,C-1,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,0.007261
2,C-1,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,0.00645
3,C-1,B-27,data/newsrelease_B-1-100_C-1-4/B-27.txt,,,0.005489
4,C-1,B-17,data/newsrelease_B-1-100_C-1-4/B-17.txt,,,0.004911
5,C-1,B-36,data/newsrelease_B-1-100_C-1-4/B-36.txt,,,0.004668
6,C-1,B-50,data/newsrelease_B-1-100_C-1-4/B-50.txt,,,0.004248
7,C-1,B-13,data/newsrelease_B-1-100_C-1-4/B-13.txt,,,0.004067
8,C-1,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,0.003645
9,C-1,B-55,data/newsrelease_B-1-100_C-1-4/B-55.txt,,,0.003284


## 1. node embedding

In [46]:
# Stream node embeddings from three algorithms in turn. Each call overwrites
# `result`, so only the last one (HashGNN) is printed below.

# fastrp
fastrp_stream_config = {
    "randomSeed": 42,
    "embeddingDimension": 16,
    "relationshipWeightProperty": "weight",
    "iterationWeights": [1, 1, 1],
}
result = gds.fastRP.stream(G, **fastrp_stream_config)

# node2vec
node2vec_stream_config = {
    "randomSeed": 42,
    "embeddingDimension": 16,
    "relationshipWeightProperty": "weight",
    "iterations": 3,
}
result = gds.node2vec.stream(G, **node2vec_stream_config)

# hashgnn
hashgnn_stream_config = {
    "iterations": 3,
    "embeddingDensity": 8,
    "generateFeatures": {"dimension": 16, "densityLevel": 1},
    "randomSeed": 42,
}
result = gds.beta.hashgnn.stream(G, **hashgnn_stream_config)

embedding_vectors = result['embedding']
print(f"Embedding vectors: {embedding_vectors}")

Embedding vectors: 0       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
1       [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...
2       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...
3       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...
4       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, ...
                              ...                        
2961    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...
2962    [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...
2963    [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
2964    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...
2965    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...
Name: embedding, Length: 2966, dtype: object


In [47]:
# Store the three embeddings as node properties of the in-memory graph
# (embedding_fastrp, embedding_node2vec, embedding_hashgnn). `result` is
# overwritten by each call, so the count printed below is HashGNN's.

# fastrp
fastrp_mutate_config = {
    "mutateProperty": "embedding_fastrp",
    "randomSeed": 42,
    "embeddingDimension": 16,
    "relationshipWeightProperty": "weight", # each relationship should have
    "iterationWeights": [1, 1, 1],
}
result = gds.fastRP.mutate(G, **fastrp_mutate_config)

# node2vec
node2vec_mutate_config = {
    "mutateProperty": "embedding_node2vec",
    "randomSeed": 42,
    "embeddingDimension": 16,
    "relationshipWeightProperty": "weight",
    "iterations": 3,
}
result = gds.node2vec.mutate(G, **node2vec_mutate_config)

# hashgnn
# opt1 (used): let the algorithm generate random input features.
# opt2 (did not work): binarize existing node properties instead, i.e.
#   binarizeFeatures={"dimension": 16, "threshold": 0},
#   featureProperties=['phrase', 'salience'],  # each node should have
hashgnn_mutate_config = {
    "mutateProperty": "embedding_hashgnn",
    "randomSeed": 42,
    "heterogeneous": True,
    "iterations": 3,
    "embeddingDensity": 8,
    "generateFeatures": {"dimension": 16, "densityLevel": 1},
}
result = gds.beta.hashgnn.mutate(G, **hashgnn_mutate_config)

vectors_written = result['nodePropertiesWritten']
print(f"Number of embedding vectors produced: {vectors_written}")

Number of embedding vectors produced: 2966


## 2. kNN

In [48]:
# Filtered kNN (Query -> Article, top 10) on each stored embedding, written to
# the database as SIMILAR_<ALGO> relationships with a `score` property.
# Runs in this order: fastrp, node2vec, hashgnn. `result` keeps the last run
# only, so the statistics printed below belong to hashgnn.
knn_targets = {
    "embedding_fastrp": "SIMILAR_FASTRP",
    "embedding_node2vec": "SIMILAR_NODE2VEC",
    "embedding_hashgnn": "SIMILAR_HASHGNN",
}
for embedding_property, relationship_type in knn_targets.items():
    result = gds.knn.filtered.write(
        G,
        topK=10,
        nodeProperties=[embedding_property],
        randomSeed=42, # Note that concurrency must be set to 1 when setting this parameter.
        concurrency=1,
        sampleRate=1.0,
        deltaThreshold=0.0,
        writeRelationshipType=relationship_type,
        writeProperty="score",
        sourceNodeFilter="Query",
        targetNodeFilter="Article",
    )

knn_summary = {
    "Relationships produced": result['relationshipsWritten'],
    "Nodes compared": result['nodesCompared'],
    "Mean similarity": result['similarityDistribution']['mean'],
}
for label, value in knn_summary.items():
    print(f"{label}: {value}")

Relationships produced: 40
Nodes compared: 2966
Mean similarity: 0.9916296005249023


## evaluate (node embedding + knn)

In [49]:
# fastrp
# Query-Article pairs linked by SIMILAR_FASTRP, best score first within each
# query; the LIMIT keeps the first ten rows of the whole result.
fastrp_ranking_cypher = """
MATCH (q:Query)-[r:SIMILAR_FASTRP]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Query, Similarity DESC
LIMIT 10
"""
fastrp_ranking = gds.run_cypher(fastrp_ranking_cypher)
fastrp_ranking

Unnamed: 0,Query,Article,URL,Group,Group1,Similarity
0,C-1,B-58,data/newsrelease_B-1-100_C-1-4/B-58.txt,,,0.751734
1,C-1,B-85,data/newsrelease_B-1-100_C-1-4/B-85.txt,,,0.672669
2,C-1,B-19,data/newsrelease_B-1-100_C-1-4/B-19.txt,,,0.661049
3,C-1,B-62,data/newsrelease_B-1-100_C-1-4/B-62.txt,,,0.620701
4,C-1,B-99,data/newsrelease_B-1-100_C-1-4/B-99.txt,,,0.58615
5,C-1,B-70,data/newsrelease_B-1-100_C-1-4/B-70.txt,,,0.516265
6,C-1,B-74,data/newsrelease_B-1-100_C-1-4/B-74.txt,,,0.514406
7,C-1,B-53,data/newsrelease_B-1-100_C-1-4/B-53.txt,,,0.502302
8,C-1,B-33,data/newsrelease_B-1-100_C-1-4/B-33.txt,,,0.484571
9,C-1,B-45,data/newsrelease_B-1-100_C-1-4/B-45.txt,,,0.481028


In [50]:
# node2vec
# Query-Article pairs linked by SIMILAR_NODE2VEC, best score first within each
# query; the LIMIT keeps the first ten rows of the whole result.
node2vec_ranking_cypher = """
MATCH (q:Query)-[r:SIMILAR_NODE2VEC]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Query, Similarity DESC
LIMIT 10
"""
node2vec_ranking = gds.run_cypher(node2vec_ranking_cypher)
node2vec_ranking

Unnamed: 0,Query,Article,URL,Group,Group1,Similarity
0,C-1,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,0.869317
1,C-1,B-36,data/newsrelease_B-1-100_C-1-4/B-36.txt,,,0.813916
2,C-1,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,0.796198
3,C-1,B-54,data/newsrelease_B-1-100_C-1-4/B-54.txt,,,0.755824
4,C-1,B-73,data/newsrelease_B-1-100_C-1-4/B-73.txt,,,0.734588
5,C-1,B-33,data/newsrelease_B-1-100_C-1-4/B-33.txt,,,0.731524
6,C-1,B-78,data/newsrelease_B-1-100_C-1-4/B-78.txt,,,0.724026
7,C-1,B-39,data/newsrelease_B-1-100_C-1-4/B-39.txt,,,0.704462
8,C-1,B-93,data/newsrelease_B-1-100_C-1-4/B-93.txt,,,0.698007
9,C-1,B-11,data/newsrelease_B-1-100_C-1-4/B-11.txt,,,0.63626


In [51]:
# hashgnn
# Query-Article pairs linked by SIMILAR_HASHGNN, best score first within each
# query; the LIMIT keeps the first ten rows of the whole result.
hashgnn_ranking_cypher = """
MATCH (q:Query)-[r:SIMILAR_HASHGNN]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Query, Similarity DESC
LIMIT 10
"""
hashgnn_ranking = gds.run_cypher(hashgnn_ranking_cypher)
hashgnn_ranking

Unnamed: 0,Query,Article,URL,Group,Group1,Similarity
0,C-1,B-48,data/newsrelease_B-1-100_C-1-4/B-48.txt,,,1.0
1,C-1,B-53,data/newsrelease_B-1-100_C-1-4/B-53.txt,,,1.0
2,C-1,B-89,data/newsrelease_B-1-100_C-1-4/B-89.txt,,,1.0
3,C-1,B-31,data/newsrelease_B-1-100_C-1-4/B-31.txt,,,1.0
4,C-1,B-97,data/newsrelease_B-1-100_C-1-4/B-97.txt,,,1.0
5,C-1,B-76,data/newsrelease_B-1-100_C-1-4/B-76.txt,,,1.0
6,C-1,B-43,data/newsrelease_B-1-100_C-1-4/B-43.txt,,,1.0
7,C-1,B-86,data/newsrelease_B-1-100_C-1-4/B-86.txt,,,1.0
8,C-1,B-66,data/newsrelease_B-1-100_C-1-4/B-66.txt,,,1.0
9,C-1,B-64,data/newsrelease_B-1-100_C-1-4/B-64.txt,,,1.0


# (postprocessing) free up memory

In [52]:
# Tear-down, in order: drop the in-memory projection, delete the stored graph,
# close the client connection.
# WARNING: the Cypher statement matches every node without any label filter
# and detach-deletes it, i.e. it removes all nodes and relationships in the
# connected database, not only the ones created by this notebook.
G.drop()
wipe_database_cypher = """
MATCH (n) DETACH DELETE n
"""
gds.run_cypher(wipe_database_cypher)
gds.close()