In [26]:
# Import Neo4J Python Library

from neo4j import GraphDatabase

In [27]:
#Establish DB Connection to Neo4J

bolt_uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(bolt_uri, auth=("neo4j", "123456"))

In [28]:
# Import Pandas and Numpy Library

import pandas as pd
import numpy as np

In [29]:
#DB Query For Trained Data (Virus - Child Of - Virus)

with driver.session(database="neo4j") as session:
    result = session.run("""
             MATCH (virus:Virus)-[:CHILD_OF]->(other:Virus)
             RETURN id(virus) as node1, id(other) as node2, virus.label AS virus1, other.label AS virus2, 1 AS label
             """)
    train_existing_links = pd.DataFrame([dict(record) for record in result])

    result = session.run("""
             MATCH (virus:Virus)
             WHERE (virus)-[:CHILD_OF]-()
             MATCH (virus)-[:CHILD_OF*2..3]-(other)
             WHERE not((virus)-[:CHILD_OF]-(other))
             RETURN id(virus) as node1, id(other) as node2, virus.label AS virus1, other.label AS virus2, 0 AS label
             """)
    train_missing_links = pd.DataFrame([dict(record) for record in result])
    train_missing_links = train_missing_links.drop_duplicates()

In [30]:
# Data Training Labeling - With Total Data Samples Count

training_df = train_missing_links.append(train_existing_links, ignore_index=True)
training_df['label'] = training_df['label'].astype('category')

count_class_0, count_class_1 = training_df.label.value_counts()
print(f"Negative examples: {count_class_0}")
print(f"Positive examples: {count_class_1}")

Negative examples: 4792
Positive examples: 303


In [31]:
# Random Downslamping of Trained Data

df_class_0 = training_df[training_df['label'] == 0]
df_class_1 = training_df[training_df['label'] == 1]

df_class_0_under = df_class_0.sample(count_class_1)
df_train_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random downsampling:')
print(df_train_under.label.value_counts())

Random downsampling:
0    303
1    303
Name: label, dtype: int64


In [32]:
# Detailed Random Downsampling with Positive and Negative Examples Count

with driver.session(database="neo4j") as session:
    result = session.run("""
             MATCH (virus:Virus)-[:CHILD_OF]->(other:Virus)
             RETURN id(virus) as node1, id(other) as node2, virus.label AS virus1, other.label AS virus2, 1 AS label
             """)
    test_existing_links = pd.DataFrame([dict(record) for record in result])

    result = session.run("""
             MATCH (virus:Virus)
             WHERE (virus)-[:CHILD_OF]-()
             MATCH (virus)-[:CHILD_OF*2..3]-(other)
             WHERE not((virus)-[:CHILD_OF]-(other))
             RETURN id(virus) as node1, id(other) as node2, virus.label AS virus1, other.label AS virus2, 0 AS label
             """)
    test_missing_links = pd.DataFrame([dict(record) for record in result])
    test_missing_links = test_missing_links.drop_duplicates()

test_df = test_missing_links.append(test_existing_links, ignore_index=True)
test_df['label'] = test_df['label'].astype('category')

count_class_0, count_class_1 = test_df.label.value_counts()
print(f"Negative examples: {count_class_0}")
print(f"Positive examples: {count_class_1}")

df_class_0 = test_df[test_df['label'] == 0]
df_class_1 = test_df[test_df['label'] == 1]

df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random downsampling:')
print(df_test_under.label.value_counts())

Negative examples: 4792
Positive examples: 303
Random downsampling:
0    303
1    303
Name: label, dtype: int64


In [33]:
# Call Data Sample

df_train_under

Unnamed: 0,node1,node2,virus1,virus2,label
2670,158,112,Abnidovirineae,Heroarterivirinae,0
1427,86,117,Peiartevirus,Kigiartevirus,0
2328,139,272,BtKY72/Rhinolophus sp./Kenya/2007,Hibecovirus,0
2900,172,111,Betacoronavirus,SARS coronavirus B024,0
975,58,117,Epsilonarterivirus,Kigiartevirus,0
...,...,...,...,...,...
5090,279,27,Tipravirus,Okavirus,1
5091,280,72,Bovine torovirus,Torovirus,1
5092,280,100,Bovine torovirus,Renitovirus,1
5093,281,158,Abyssoviridae,Abnidovirineae,1


In [34]:
# Path Finding Algorithms - CN,PA,TN

def apply_graphy_features(data, rel_type):
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           gds.alpha.linkprediction.commonNeighbors(p1, p2, {
             relationshipQuery: $relType}) AS cn,
           gds.alpha.linkprediction.preferentialAttachment(p1, p2, {
             relationshipQuery: $relType}) AS pa,
           gds.alpha.linkprediction.totalNeighbors(p1, p2, {
             relationshipQuery: $relType}) AS tn
    """
    pairs = [{"node1": node1, "node2": node2}  for node1,node2 in data[["node1", "node2"]].values.tolist()]

    with driver.session(database="neo4j") as session:
        result = session.run(query, {"pairs": pairs, "relType": rel_type})
        features = pd.DataFrame([dict(record) for record in result])
    return pd.merge(data, features, on = ["node1", "node2"])

df_train_under = apply_graphy_features(df_train_under, "CHILD_OF")
df_test_under = apply_graphy_features(df_test_under, "NOT_CHILD_OF")

In [35]:
# Takes 5 Samples of Trained Data

df_train_under.sample(5)

Unnamed: 0,node1,node2,virus1,virus2,label,cn,pa,tn
55,227,255,Bat SARS coronavirus Rp1,Embecovirus,0,0.0,7.0,8.0
113,172,227,Betacoronavirus,Bat SARS coronavirus Rp1,0,1.0,9.0,9.0
232,96,21,Monidovirineae,Zealarterivirinae,0,0.0,4.0,4.0
425,113,39,Zetaarterivirus,Simarterivirinae,1,0.0,14.0,9.0
20,51,212,Arteriviridae,Muarterivirus,0,1.0,16.0,9.0


In [36]:
# Triangle Count Algorithms

query = """
CALL gds.triangleCount.write({
  nodeProjection: 'Virus',
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  writeProperty: 'trianglesTrain'
});
"""

with driver.session(database="neo4j") as session:
    result = session.run(query)

query = """
CALL gds.triangleCount.write({
  nodeProjection: 'Virus',
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  writeProperty: 'trianglesTest'
});
"""

with driver.session(database="neo4j") as session:
    result = session.run(query)

In [37]:
# Local Clustering Coefficient Algorithms

query = """
CALL gds.localClusteringCoefficient.write({
  nodeProjection: 'Virus',
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  writeProperty: 'coefficientTrain'
});
"""

with driver.session(database="neo4j") as session:
    result = session.run(query)

query = """
CALL gds.localClusteringCoefficient.write({
  nodeProjection: 'Author',
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  writeProperty: 'coefficientTest'
});
"""

with driver.session(database="neo4j") as session:
    result = session.run(query)

In [38]:
# Minimun and Maximum Value of Triangle Count and Coefficient Clustering Processing

def apply_triangles_features(data, triangles_prop, coefficient_prop):
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    apoc.coll.min([p1[$trianglesProp], p2[$trianglesProp]]) AS minTriangles,
    apoc.coll.max([p1[$trianglesProp], p2[$trianglesProp]]) AS maxTriangles,
    apoc.coll.min([p1[$coefficientProp], p2[$coefficientProp]]) AS minCoefficient,
    apoc.coll.max([p1[$coefficientProp], p2[$coefficientProp]]) AS maxCoefficient
    """
    pairs = [{"node1": node1, "node2": node2}  for node1,node2 in data[["node1", "node2"]].values.tolist()]
    params = {
    "pairs": pairs,
    "trianglesProp": triangles_prop,
    "coefficientProp": coefficient_prop
    }

    with driver.session(database="neo4j") as session:
        result = session.run(query, params)
        features = pd.DataFrame([dict(record) for record in result])

    return pd.merge(data, features, on = ["node1", "node2"])

df_train_under = apply_triangles_features(df_train_under, "trianglesTrain", "coefficientTrain")
df_test_under = apply_triangles_features(df_test_under, "trianglesTest", "coefficientTest")

In [39]:
df_train_under.sample(5)

Unnamed: 0,node1,node2,virus1,virus2,label,cn,pa,tn,minTriangles,maxTriangles,minCoefficient,maxCoefficient
545,227,137,Bat SARS coronavirus Rp1,SARSr-CoV,1,0.0,18.0,19.0,0,1,0.0,0.006536
497,179,137,RaTG13,SARSr-CoV,1,0.0,18.0,19.0,0,1,0.0,0.006536
439,125,54,Roniviridae,니도바이러스목,1,1.0,44.0,14.0,2,3,0.054545,0.333333
290,210,29,Kadilivirus,Alphamesonivirus 2,0,1.0,4.0,3.0,0,1,0.0,1.0
353,45,183,Alphamononivirus,Mononivirinae,1,0.0,4.0,4.0,0,0,0.0,0.0


In [40]:
# Community Detection - Label Propagation

query = """
CALL gds.labelPropagation.write({
  nodeProjection: "Virus",
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  writeProperty: "partitionTrain"
});
"""

with driver.session(database="neo4j") as session:
    result = session.run(query)

query = """
CALL gds.labelPropagation.write({
  nodeProjection: "Virus",
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  writeProperty: "partitionTest"
});
"""

with driver.session(database="neo4j") as session:
    result = session.run(query)

In [41]:
# Community Detection - louvain

query = """
CALL gds.louvain.stream({
  nodeProjection: 'Virus',
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  includeIntermediateCommunities: true
})
YIELD nodeId, communityId, intermediateCommunityIds
WITH gds.util.asNode(nodeId) AS node, intermediateCommunityIds[0] AS smallestCommunity
SET node.louvainTrain = smallestCommunity;
"""

with driver.session(database="neo4j") as session:
    display(session.run(query).consume().counters)

query = """
CALL gds.louvain.stream({
  nodeProjection: 'Virus',
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  includeIntermediateCommunities: true
})
YIELD nodeId, communityId, intermediateCommunityIds
WITH gds.util.asNode(nodeId) AS node, intermediateCommunityIds[0] AS smallestCommunity
SET node.louvainTest = smallestCommunity;
"""

with driver.session(database="neo4j") as session:
    display(session.run(query).consume().counters)

{'properties_set': 277}

{'properties_set': 277}

In [42]:
# Community Detection - Same Community

def apply_community_features(data, partition_prop, louvain_prop):
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    gds.alpha.linkprediction.sameCommunity(p1, p2, $partitionProp) AS sp,
    gds.alpha.linkprediction.sameCommunity(p1, p2, $louvainProp) AS sl
    """
    pairs = [{"node1": node1, "node2": node2}  for node1,node2 in data[["node1", "node2"]].values.tolist()]
    params = {
    "pairs": pairs,
    "partitionProp": partition_prop,
    "louvainProp": louvain_prop
    }

    with driver.session(database="neo4j") as session:
        result = session.run(query, params)
        features = pd.DataFrame([dict(record) for record in result])

    return pd.merge(data, features, on = ["node1", "node2"])

df_train_under = apply_community_features(df_train_under, "partitionTrain", "louvainTrain")
df_test_under = apply_community_features(df_test_under, "partitionTest", "louvainTest")

In [43]:
df_test_under.head(5)

Unnamed: 0,node1,node2,virus1,virus2,label,cn,pa,tn,minTriangles,maxTriangles,minCoefficient,maxCoefficient,sp,sl
0,250,57,Orthocoronavirinae,FalCoV UAE-HKU27,0,0.0,0.0,0.0,0,0,,,0.0,0.0
1,114,27,Mesoniviridae,Okavirus,0,0.0,0.0,0.0,2,2,,,0.0,0.0
2,64,21,Tornidovirineae,Zealarterivirinae,0,0.0,0.0,0.0,0,0,,,1.0,0.0
3,170,250,SARS-CoV-1,Orthocoronavirinae,0,0.0,0.0,0.0,0,0,,,0.0,0.0
4,36,62,Variarterivirinae,Betaarterivirus suid 1,0,0.0,0.0,0.0,0,0,,,0.0,0.0


In [44]:
df_train_under.head(5)

Unnamed: 0,node1,node2,virus1,virus2,label,cn,pa,tn,minTriangles,maxTriangles,minCoefficient,maxCoefficient,sp,sl
0,158,112,Abnidovirineae,Heroarterivirinae,0,0.0,4.0,4.0,0,0,0.0,0.0,1.0,0.0
1,86,117,Peiartevirus,Kigiartevirus,0,1.0,4.0,3.0,0,0,0.0,0.0,1.0,0.0
2,139,272,BtKY72/Rhinolophus sp./Kenya/2007,Hibecovirus,0,0.0,2.0,3.0,0,0,0.0,0.0,1.0,0.0
3,172,111,Betacoronavirus,SARS coronavirus B024,0,1.0,9.0,9.0,0,3,0.0,0.083333,1.0,0.0
4,58,117,Epsilonarterivirus,Kigiartevirus,0,0.0,4.0,4.0,0,0,0.0,0.0,1.0,0.0


In [45]:
%%time
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

columns = [
        "cn", "pa", "tn", # graph features
        "minTriangles", "maxTriangles", "minCoefficient", "maxCoefficient", # triangle features
        "sp", "sl" # community features
    ]

X = df_train_under[columns]
y = df_train_under["label"]

kf =KFold(n_splits=5, shuffle=True, random_state=42)

cnt = 1
for train_index, test_index in kf.split(X, y):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
    cnt += 1

Fold:1, Train set: 484, Test set:122
Fold:2, Train set: 485, Test set:121
Fold:3, Train set: 485, Test set:121
Fold:4, Train set: 485, Test set:121
Fold:5, Train set: 485, Test set:121
CPU times: total: 0 ns
Wall time: 2 ms


In [46]:
%%time
def rmse(score):
    rmse = np.sqrt(-score)
    print(f'rmse= {"{:.2f}".format(rmse)}')

score = cross_val_score(ensemble.RandomForestClassifier(random_state= 42), X, y, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')


Scores for each fold are: [0.90983607 0.90082645 0.88429752 0.90082645 0.90909091]
CPU times: total: 688 ms
Wall time: 696 ms


In [47]:
print('Complete !')

Complete !
