In [82]:
import os
import re
import sys

from graphdatascience import GraphDataScience

# neo4j desktop v5.11.0

In [83]:
host = "bolt://localhost:7687"
user = "neo4j"
password= "j4oenj4oen"

gds = GraphDataScience(host, auth=(user, password))
print(gds.version())

# params
KEY = "AIzaSyAPQNUpCCFrsJhX2A-CgvOG4fDWlxuA8ec" # api key
nphrase = 100 # number of nouns extracted from each article
DATA_CLASS = "DNP" # DNP or WIKI
DATA_TYPE = "TXT" # TXT or URL (currently txt is used for dnp data)
DATA_URL = "" # input data
QUERY_DICT = {} # query dict {QUERY_NAME: QUERY_URL}
if DATA_CLASS == "DNP":
    if DATA_TYPE == "TXT":
        DATA_URL = "data/newsrelease_B-1-100_C-1-4/"
        QUERY_DICT["C-1"] = DATA_URL + "C-1.txt"
        QUERY_DICT["C-2"] = DATA_URL + "C-2.txt"
        QUERY_DICT["C-3"] = DATA_URL + "C-3.txt"
        QUERY_DICT["C-4"] = DATA_URL + "C-4.txt"
    elif DATA_TYPE == "URL":
        DATA_URL = "https://raw.githubusercontent.com/smallcat9603/graph/main/dnp/kg/data/articles.csv"
        QUERY_DICT["C-1"] = "https://www.holdings.toppan.com/ja/news/2023/10/newsrelease231004_1.html"
        QUERY_DICT["C-2"] = "https://www.holdings.toppan.com/ja/news/2023/10/newsrelease231004_2.html"
        QUERY_DICT["C-3"] = "https://www.holdings.toppan.com/ja/news/2023/10/newsrelease231004_3.html"
        QUERY_DICT["C-4"] = "https://www.holdings.toppan.com/ja/news/2023/10/newsrelease231003_1.html"
elif DATA_CLASS == "WIKI_FP100":
    DATA_URL = "https://raw.githubusercontent.com/smallcat9603/graph/main/dnp/kg/data/wikidata_footballplayer_100.csv"
    QUERY_DICT["Thierry Henry"] = "https://en.wikipedia.org/wiki/Thierry_Henry"
elif DATA_CLASS == "WIKI_P100":
    DATA_URL = "https://raw.githubusercontent.com/smallcat9603/graph/main/dnp/kg/data/wikidata_persons_100.csv"  
    QUERY_DICT["Joe Biden"] = "https://en.wikipedia.org/wiki/Joe_Biden"
else:
    print("DATA ERROR")
    sys.exit(1)

query = """
CREATE CONSTRAINT id_unique IF NOT EXISTS 
For (a:Article) REQUIRE a.url IS UNIQUE;
"""
gds.run_cypher(query)

2.5.4


# Create Article-[Noun]-Article Graph

## create url nodes (article, person, ...)

In [88]:
if DATA_CLASS == "DNP" and DATA_TYPE == "TXT":
  for idx in range(1, 101):
    node = "B-" + str(idx)
    file = DATA_URL + node + ".txt"
    content = ""
    with open(file, 'r') as f:
      content = f.read()
      content = re.sub('\n+', ' ', content)
    query = f"""
    MERGE (a:Article {{ name: "{node}", url: "{file}", body: "{content}" }})
    """
    gds.run_cypher(query)
else:
  query = f"""
  CALL apoc.periodic.iterate(
    "LOAD CSV WITH HEADERS FROM '{DATA_URL}' AS row
    RETURN row",
    "MERGE (a:Article {{name: row.id, url: row.url}})
    SET a.grp = CASE WHEN 'occupation' IN keys(row) THEN row.occupation ELSE null END
    SET a.grp1 = CASE WHEN 'nationality' IN keys(row) THEN row.nationality ELSE null END
    WITH a
    CALL apoc.load.html(a.url, {{
      title: 'title',
      h2: 'h2',
      body: 'body p'
    }})
    YIELD value
    WITH a,
          reduce(texts = '', n IN range(0, size(value.body)-1) | texts + ' ' + coalesce(value.body[n].text, '')) AS body,
          value.title[0].text AS title
    SET a.body = body, a.title = title",
    {{batchSize: 5, parallel: true}}
  )
  YIELD batches, total, timeTaken, committedOperations
  RETURN batches, total, timeTaken, committedOperations
  """
  gds.run_cypher(query)

## set phrase and salience properties

In [89]:
query = f"""
CALL apoc.periodic.iterate(
  "MATCH (a:Article)
   WHERE a.processed IS NULL
   RETURN a",
  "CALL apoc.nlp.gcp.entities.stream([item in $_batch | item.a], {{
     nodeProperty: 'body',
     key: '{KEY}'
   }})
   YIELD node, value
   SET node.processed = true
   WITH node, value
   UNWIND value.entities AS entity
   SET node.phrase = coalesce(node.phrase, []) + entity['name']
   SET node.salience = coalesce(node.salience, []) + entity['salience']",
  {{batchMode: "BATCH_SINGLE", batchSize: 10}})
YIELD batches, total, timeTaken, committedOperations
RETURN batches, total, timeTaken, committedOperations
"""
gds.run_cypher(query)

Unnamed: 0,batches,total,timeTaken,committedOperations
0,10,100,94,100


## create noun-url relationships

In [90]:
query = """
MATCH (a:Article)
WHERE a.processed IS NOT NULL
FOREACH (word IN a.phrase[0..$nphrase] |
  MERGE (n:Noun {name: word})
  MERGE (a)-[r:CONTAINS]-(n)
  SET r.rank = apoc.coll.indexOf(a.phrase, word) + 1
  SET r.score = a.salience[apoc.coll.indexOf(a.phrase, word)]
  SET r.weight = $nphrase - apoc.coll.indexOf(a.phrase, word)
)
"""
gds.run_cypher(query, {'nphrase': nphrase})

## query

In [92]:
if DATA_CLASS == "DNP" and DATA_TYPE == "TXT":
  for QUERY_NAME, QUERY_URL in QUERY_DICT.items():
    content = ""
    with open(QUERY_URL, 'r') as f:
      content = f.read()
      content = re.sub('\n+', ' ', content)
    query = f"""
    MERGE (q:Query {{ name: "{QUERY_NAME}", url: "{QUERY_URL}", body: "{content}" }})
    """
    gds.run_cypher(query)
else:
  for QUERY_NAME, QUERY_URL in QUERY_DICT.items():
    query = """
    MERGE (q:Query {name: $name, url: $url})
    WITH q
    CALL apoc.load.html(i.url, {
    title: "title",
    h2: "h2",
    body: "body p"
    })
    YIELD value
    WITH q,
        reduce(texts = "", n IN range(0, size(value.body)-1) | texts + " " + coalesce(value.body[n].text, "")) AS body,
        value.title[0].text AS title
    SET q.body = body, q.title = title
    RETURN q.title, q.body
    """
    gds.run_cypher(query, {"name": QUERY_NAME, "url": QUERY_URL})
    
# set phrase and salience properties (Query)
query = f"""
MATCH (q:Query)
CALL apoc.nlp.gcp.entities.stream(q, {{
 nodeProperty: 'body',
 key: '{KEY}'
}})
YIELD node, value
SET node.processed = true
WITH node, value
UNWIND value.entities AS entity
SET node.phrase = coalesce(node.phrase, []) + entity['name']
SET node.salience = coalesce(node.salience, []) + entity['salience']
"""
gds.run_cypher(query)

# create noun-article relationships (Query)
query = """
MATCH (q:Query)
WHERE q.processed IS NOT NULL
FOREACH (word IN q.phrase[0..$nphrase] |
  MERGE (n:Noun {name: word})
  MERGE (q)-[r:CONTAINS]-(n)
  SET r.rank = apoc.coll.indexOf(q.phrase, word) + 1
  SET r.score = q.salience[apoc.coll.indexOf(q.phrase, word)]
  SET r.weight = $nphrase - apoc.coll.indexOf(q.phrase, word)
)
"""
gds.run_cypher(query, {'nphrase': nphrase})

## evaluate (naive by rank)

In [93]:
query = """
MATCH (q:Query)-[r:CONTAINS]-(n:Noun)-[c:CONTAINS]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, collect(n.name) AS Common, SUM((1.0/r.rank)*(1.0/c.rank)) AS Similarity 
ORDER BY Similarity DESC
LIMIT 10
"""
gds.run_cypher(query)

Unnamed: 0,Article,URL,Group,Group1,Common,Similarity
0,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,"[以下, CO2排出量, 本社, 削減, 内容物, パッケージ, 製品]",0.097526
1,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,"[以下, CO2排出量, 本社, 製造, 削減, CO2, 実現, 社会, パッケージ, 製...",0.03508
2,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,"[CO2排出量, 製造, 削減, CO2, 同等, 製造工程, 実現, 環境配慮型, 製品,...",0.032132
3,B-50,data/newsrelease_B-1-100_C-1-4/B-50.txt,,,"[CO2排出量, 削減, 実現, 社会, 製品, こと]",0.022266
4,B-5,data/newsrelease_B-1-100_C-1-4/B-5.txt,,,"[多く, 以下, 本社, 東京都, こと]",0.019166
5,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,"[以下, CO2排出量, 本社, 削減, CO2, 実現]",0.017291
6,B-49,data/newsrelease_B-1-100_C-1-4/B-49.txt,,,"[製造, 実現, 社会, ブース, 製品, 開発]",0.016868
7,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,"[以下, 本社, 製造, ニュースリリース, 製品, こと, 開発]",0.014972
8,B-97,data/newsrelease_B-1-100_C-1-4/B-97.txt,,,"[製造, 社会, 製品, こと, 開発]",0.014649
9,B-19,data/newsrelease_B-1-100_C-1-4/B-19.txt,,,"[多く, ステークホルダー, CO2排出量, 削減, 実現, 環境配慮型, 社会, 幕張メッ...",0.014466


## create article-article relationships

In [94]:
query = """
MATCH (a1:Article), (a2:Article)
WHERE a1 <> a2 AND any(x IN a1.phrase[0..$nphrase] WHERE x IN a2.phrase[0..$nphrase])
MERGE (a1)-[r:CORRELATES]-(a2)
SET r.common = [x IN a1.phrase[0..$nphrase] WHERE x IN a2.phrase[0..$nphrase]]
"""
gds.run_cypher(query, {'nphrase': nphrase})

#query
query = """
MATCH (q:Query), (a:Article)
WHERE any(x IN q.phrase[0..$nphrase] WHERE x IN a.phrase[0..$nphrase])
MERGE (q)-[r:CORRELATES]-(a)
SET r.common = [x IN q.phrase[0..$nphrase] WHERE x IN a.phrase[0..$nphrase]]
"""
gds.run_cypher(query, {'nphrase': nphrase})

## evaluate (still naive by salience)

In [96]:
query = """
MATCH (q:Query)-[r:CORRELATES]-(a:Article)
WITH r, a, reduce(s = 0.0, word IN r.common | 
s + q.salience[apoc.coll.indexOf(q.phrase, word)] + a.salience[apoc.coll.indexOf(a.phrase, word)]) AS Similarity
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.common, Similarity 
ORDER BY Similarity DESC
LIMIT 10
"""
gds.run_cypher(query)

Unnamed: 0,Article,URL,Group,Group1,r.common,Similarity
0,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,"[同等, CO2排出量, 削減, CO2排出量, 環境配慮型, 製造, 開発, CO2排出量...",0.791611
1,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,"[CO2排出量, 削減, CO2排出量, 本社, 以下, パッケージ, CO2排出量, 削減...",0.657581
2,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,"[CO2排出量, 削減, CO2排出量, 本社, 以下, パッケージ, 製造, 開発, CO...",0.603284
3,B-50,data/newsrelease_B-1-100_C-1-4/B-50.txt,,,"[CO2排出量, 削減, CO2排出量, CO2排出量, 社会, 削減, 実現, 製品, こ...",0.498526
4,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,"[本社, 以下, 製造, 開発, ニュースリリース, 製品, 開発, こと, こと, 製造,...",0.352651
5,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,"[CO2排出量, 削減, CO2排出量, 本社, 以下, CO2排出量, CO2, 削減, ...",0.311805
6,B-19,data/newsrelease_B-1-100_C-1-4/B-19.txt,,,"[CO2排出量, 削減, CO2排出量, 環境配慮型, パッケージ, ブース, CO2排出量...",0.28529
7,B-49,data/newsrelease_B-1-100_C-1-4/B-49.txt,,,"[製造, 開発, ブース, 社会, 実現, 製品, 開発, 製造, 製造]",0.267355
8,B-36,data/newsrelease_B-1-100_C-1-4/B-36.txt,,,"[CO2排出量, CO2排出量, パッケージ, 製造, 開発, CO2排出量, 実現, パッ...",0.255479
9,B-27,data/newsrelease_B-1-100_C-1-4/B-27.txt,,,"[削減, グループ会社, 本社, 以下, 削減, 実現, 削減, 削減, 多く]",0.224865


## project graph to memory

In [97]:
node_projection = ["Query", "Article", "Noun"]
# # why raising error "java.lang.UnsupportedOperationException: Loading of values of type StringArray is currently not supported" ???
# node_projection = {"Query": {"properties": 'phrase'}, "Article": {"properties": 'phrase'}, "Noun": {}}
relationship_projection = {
    "CONTAINS": {"orientation": "UNDIRECTED", "properties": ["rank", "score", "weight"]},
    # "CORRELATES": {"orientation": "UNDIRECTED", "properties": ["common"]} # Unsupported type [TEXT_ARRAY] of value StringArray[DNP]. Please use a numeric property.
    }
# # how to project node properties???
# node_properties = { 
#     "nodeProperties": {
#         "phrase": {"defaultValue": []},
#         "salience": {"defaultValue": []}
#     }
# }
G, result = gds.graph.project("testgraph", node_projection, relationship_projection)
print(f"The projection took {result['projectMillis']} ms")
print(f"Graph '{G.name()}' node count: {G.node_count()}")
print(f"Graph '{G.name()}' node labels: {G.node_labels()}")
print(f"Graph '{G.name()}' relationship count: {G.relationship_count()}")
print(f"Graph '{G.name()}' degree distribution: {G.degree_distribution()}")
print(f"Graph '{G.name()}' density: {G.density()}")
print(f"Graph '{G.name()}' size in bytes: {G.size_in_bytes()}")
print(f"Graph '{G.name()}' memory_usage: {G.memory_usage()}")

The projection took 64 ms
Graph 'testgraph' node count: 4860
Graph 'testgraph' node labels: ['Query', 'Article', 'Noun']
Graph 'testgraph' relationship count: 16870
Graph 'testgraph' degree distribution: p99     255.00000
min       3.00000
max     294.00000
p90       9.00000
mean     10.41358
p999    282.00000
p50       3.00000
p95      21.00000
p75       3.00000
dtype: float64
Graph 'testgraph' density: 0.000714384320979185
Graph 'testgraph' size in bytes: 32977499
Graph 'testgraph' memory_usage: 31 MiB


## node similarity (JACCARD)

In [98]:
result = gds.nodeSimilarity.filtered.write(
    G,
    similarityMetric='JACCARD', # default
    writeRelationshipType='SIMILAR_J',
    writeProperty='score',
    relationshipWeightProperty="weight",
    sourceNodeFilter="Query",
    targetNodeFilter="Article",
)

print(f"Relationships produced: {result['relationshipsWritten']}")
print(f"Nodes compared: {result['nodesCompared']}")
print(f"Mean similarity: {result['similarityDistribution']['mean']}")

Relationships produced: 10
Nodes compared: 1
Mean similarity: 0.05262758731842041


## evaluate (jaccard similarity)

In [99]:
query = """
MATCH (q:Query)-[r:SIMILAR_J]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Similarity DESC
"""
gds.run_cypher(query)

Unnamed: 0,Article,URL,Group,Group1,Similarity
0,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,0.0782
1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,0.068744
2,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,0.056868
3,B-17,data/newsrelease_B-1-100_C-1-4/B-17.txt,,,0.05422
4,B-19,data/newsrelease_B-1-100_C-1-4/B-19.txt,,,0.049231
5,B-71,data/newsrelease_B-1-100_C-1-4/B-71.txt,,,0.048316
6,B-27,data/newsrelease_B-1-100_C-1-4/B-27.txt,,,0.044348
7,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,0.043223
8,B-36,data/newsrelease_B-1-100_C-1-4/B-36.txt,,,0.042806
9,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,0.04032


## node similarity (OVERLAP)

In [100]:
result = gds.nodeSimilarity.filtered.write(
    G,
    similarityMetric='OVERLAP',
    writeRelationshipType='SIMILAR_O',
    writeProperty='score',
    relationshipWeightProperty="weight",
    sourceNodeFilter="Query",
    targetNodeFilter="Article",
)

print(f"Relationships produced: {result['relationshipsWritten']}")
print(f"Nodes compared: {result['nodesCompared']}")
print(f"Mean similarity: {result['similarityDistribution']['mean']}")

Relationships produced: 10
Nodes compared: 1
Mean similarity: 0.10369725227355957


## evaluate (overlap similarity)

In [101]:
query = """
MATCH (q:Query)-[r:SIMILAR_O]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Similarity DESC
"""
gds.run_cypher(query)

Unnamed: 0,Article,URL,Group,Group1,Similarity
0,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,0.154895
1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,0.130541
2,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,0.108577
3,B-17,data/newsrelease_B-1-100_C-1-4/B-17.txt,,,0.105455
4,B-19,data/newsrelease_B-1-100_C-1-4/B-19.txt,,,0.101315
5,B-71,data/newsrelease_B-1-100_C-1-4/B-71.txt,,,0.093278
6,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,0.08792
7,B-36,data/newsrelease_B-1-100_C-1-4/B-36.txt,,,0.087677
8,B-27,data/newsrelease_B-1-100_C-1-4/B-27.txt,,,0.087433
9,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,0.079883


## node similarity (COSINE)

In [102]:
result = gds.nodeSimilarity.filtered.write(
    G,
    similarityMetric='COSINE',
    writeRelationshipType='SIMILAR_C',
    writeProperty='score',
    relationshipWeightProperty="weight",
    sourceNodeFilter="Query",
    targetNodeFilter="Article",
)

print(f"Relationships produced: {result['relationshipsWritten']}")
print(f"Nodes compared: {result['nodesCompared']}")
print(f"Mean similarity: {result['similarityDistribution']['mean']}")

Relationships produced: 10
Nodes compared: 1
Mean similarity: 0.11475157737731934


## evaluate (cosine similarity)

In [103]:
query = """
MATCH (q:Query)-[r:SIMILAR_C]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Similarity DESC
"""
gds.run_cypher(query)

Unnamed: 0,Article,URL,Group,Group1,Similarity
0,B-84,data/newsrelease_B-1-100_C-1-4/B-84.txt,,,0.158508
1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,0.156205
2,B-75,data/newsrelease_B-1-100_C-1-4/B-75.txt,,,0.132661
3,B-19,data/newsrelease_B-1-100_C-1-4/B-19.txt,,,0.112778
4,B-17,data/newsrelease_B-1-100_C-1-4/B-17.txt,,,0.111239
5,B-27,data/newsrelease_B-1-100_C-1-4/B-27.txt,,,0.098992
6,B-71,data/newsrelease_B-1-100_C-1-4/B-71.txt,,,0.098955
7,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,0.093783
8,B-36,data/newsrelease_B-1-100_C-1-4/B-36.txt,,,0.093086
9,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,0.09131


## 1. node embedding

In [104]:
# fastrp
result = gds.fastRP.stream(
    G,
    randomSeed=42,
    embeddingDimension=16,
    relationshipWeightProperty="weight",
    iterationWeights=[1, 1, 1],
)

# node2vec
result = gds.node2vec.stream(
    G,
    randomSeed=42,
    embeddingDimension=16,
    relationshipWeightProperty="weight",
    iterations=3,
)

# hashgnn
result = gds.beta.hashgnn.stream(
    G,
    iterations = 3,
    embeddingDensity = 8,
    generateFeatures = {"dimension": 16, "densityLevel": 1},
    randomSeed = 42,
)

print(f"Embedding vectors: {result['embedding']}")

Embedding vectors: 0       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...
1       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
2       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
                              ...                        
4855    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
4856    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4857    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
4858    [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
4859    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
Name: embedding, Length: 4860, dtype: object


In [105]:
# fastrp
result = gds.fastRP.mutate(
    G,
    mutateProperty="embedding_fastrp",
    randomSeed=42,
    embeddingDimension=16,
    relationshipWeightProperty="weight", # each relationship should have
    iterationWeights=[1, 1, 1],
)

# node2vec
result = gds.node2vec.mutate(
    G,
    mutateProperty="embedding_node2vec",
    randomSeed=42,
    embeddingDimension=16,
    relationshipWeightProperty="weight",
    iterations=3,
)

# hashgnn
result = gds.beta.hashgnn.mutate(
    G,
    mutateProperty="embedding_hashgnn",
    randomSeed=42,
    heterogeneous=True,
    iterations=3,
    embeddingDensity=8,
    # opt1
    generateFeatures={"dimension": 16, "densityLevel": 1},
    # # opt2 not work
    # binarizeFeatures={"dimension": 16, "threshold": 0},
    # featureProperties=['phrase', 'salience'], # each node should have
)

print(f"Number of embedding vectors produced: {result['nodePropertiesWritten']}")

Number of embedding vectors produced: 4860


## 2. kNN

In [106]:
# fastrp
result = gds.knn.filtered.write(
    G,
    topK=10,
    nodeProperties=["embedding_fastrp"],
    randomSeed=42, # Note that concurrency must be set to 1 when setting this parameter.
    concurrency=1,
    sampleRate=1.0,
    deltaThreshold=0.0,
    writeRelationshipType="SIMILAR_F",
    writeProperty="score",
    sourceNodeFilter="Query",
    targetNodeFilter="Article",
)

# node2vec
result = gds.knn.filtered.write(
    G,
    topK=10,
    nodeProperties=["embedding_node2vec"],
    randomSeed=42, # Note that concurrency must be set to 1 when setting this parameter.
    concurrency=1,
    sampleRate=1.0,
    deltaThreshold=0.0,
    writeRelationshipType="SIMILAR_N",
    writeProperty="score",
    sourceNodeFilter="Query",
    targetNodeFilter="Article",
)

# hashgnn
result = gds.knn.filtered.write(
    G,
    topK=10,
    nodeProperties=["embedding_hashgnn"],
    randomSeed=42, # Note that concurrency must be set to 1 when setting this parameter.
    concurrency=1,
    sampleRate=1.0,
    deltaThreshold=0.0,
    writeRelationshipType="SIMILAR_H",
    writeProperty="score",
    sourceNodeFilter="Query",
    targetNodeFilter="Article",
)

print(f"Relationships produced: {result['relationshipsWritten']}")
print(f"Nodes compared: {result['nodesCompared']}")
print(f"Mean similarity: {result['similarityDistribution']['mean']}")

Relationships produced: 10
Nodes compared: 4860
Mean similarity: 0.9400001525878906


## evaluate (node embedding + knn)

In [107]:
# fastrp
query = """
MATCH (q:Query)-[r:SIMILAR_F]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Similarity DESC
"""
gds.run_cypher(query)

Unnamed: 0,Article,URL,Group,Group1,Similarity
0,B-71,data/newsrelease_B-1-100_C-1-4/B-71.txt,,,0.920148
1,B-7,data/newsrelease_B-1-100_C-1-4/B-7.txt,,,0.77221
2,B-76,data/newsrelease_B-1-100_C-1-4/B-76.txt,,,0.737279
3,B-15,data/newsrelease_B-1-100_C-1-4/B-15.txt,,,0.731383
4,B-10,data/newsrelease_B-1-100_C-1-4/B-10.txt,,,0.700913
5,B-100,data/newsrelease_B-1-100_C-1-4/B-100.txt,,,0.691759
6,B-4,data/newsrelease_B-1-100_C-1-4/B-4.txt,,,0.685134
7,B-38,data/newsrelease_B-1-100_C-1-4/B-38.txt,,,0.642118
8,B-74,data/newsrelease_B-1-100_C-1-4/B-74.txt,,,0.529116
9,B-43,data/newsrelease_B-1-100_C-1-4/B-43.txt,,,0.466307


In [108]:
# node2vec
query = """
MATCH (q:Query)-[r:SIMILAR_N]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Similarity DESC
"""
gds.run_cypher(query)

Unnamed: 0,Article,URL,Group,Group1,Similarity
0,B-19,data/newsrelease_B-1-100_C-1-4/B-19.txt,,,0.877493
1,B-52,data/newsrelease_B-1-100_C-1-4/B-52.txt,,,0.872907
2,B-25,data/newsrelease_B-1-100_C-1-4/B-25.txt,,,0.823174
3,B-98,data/newsrelease_B-1-100_C-1-4/B-98.txt,,,0.808106
4,B-9,data/newsrelease_B-1-100_C-1-4/B-9.txt,,,0.770281
5,B-77,data/newsrelease_B-1-100_C-1-4/B-77.txt,,,0.769365
6,B-100,data/newsrelease_B-1-100_C-1-4/B-100.txt,,,0.699163
7,B-96,data/newsrelease_B-1-100_C-1-4/B-96.txt,,,0.692028
8,B-69,data/newsrelease_B-1-100_C-1-4/B-69.txt,,,0.68398
9,B-40,data/newsrelease_B-1-100_C-1-4/B-40.txt,,,0.674275


In [109]:
# hashgnn
query = """
MATCH (q:Query)-[r:SIMILAR_H]-(a:Article)
RETURN q.name AS Query, a.name AS Article, a.url AS URL, a.grp AS Group, a.grp1 AS Group1, r.score AS Similarity
ORDER BY Similarity DESC
"""
gds.run_cypher(query)

Unnamed: 0,Article,URL,Group,Group1,Similarity
0,B-44,data/newsrelease_B-1-100_C-1-4/B-44.txt,,,1.0
1,B-72,data/newsrelease_B-1-100_C-1-4/B-72.txt,,,1.0
2,B-6,data/newsrelease_B-1-100_C-1-4/B-6.txt,,,1.0
3,B-77,data/newsrelease_B-1-100_C-1-4/B-77.txt,,,1.0
4,B-94,data/newsrelease_B-1-100_C-1-4/B-94.txt,,,0.9
5,B-86,data/newsrelease_B-1-100_C-1-4/B-86.txt,,,0.9
6,B-82,data/newsrelease_B-1-100_C-1-4/B-82.txt,,,0.9
7,B-3,data/newsrelease_B-1-100_C-1-4/B-3.txt,,,0.9
8,B-1,data/newsrelease_B-1-100_C-1-4/B-1.txt,,,0.9
9,B-91,data/newsrelease_B-1-100_C-1-4/B-91.txt,,,0.9


# (postprocessing) free up memory and delete all nodes from the database

In [110]:
G.drop()
query = """
MATCH (n) DETACH DELETE n
"""
gds.run_cypher(query)
gds.close()