In [1]:
# Import Neo4J Python Library

from neo4j import GraphDatabase

In [2]:
# Establish the Bolt connection to Neo4j.
#
# Connection settings are read from the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so credentials do not have to be stored in the notebook.
# The fallbacks reproduce the original local development setup.
import os

bolt_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(
    bolt_uri,
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "123456"),
    ),
)

In [3]:
# Import Pandas and Numpy Library

import pandas as pd
import numpy as np

In [4]:
# Training pairs from the Virus graph (label 1 = existing link, label 0 = missing link)

def _records_to_frame(records):
    """Materialise Neo4j records as a DataFrame with one row per record."""
    return pd.DataFrame([dict(record) for record in records])


with driver.session(database="neo4j") as session:
    # Positive examples: every directed (Virus)-[:CHILD_OF]->(Virus) relationship.
    positive_records = session.run("""
             MATCH (virus:Virus)-[:CHILD_OF]->(other:Virus)
             RETURN id(virus) as node1, id(other) as node2, virus.label AS virus1, other.label AS virus2, 1 AS label
             """)
    train_existing_links = _records_to_frame(positive_records)

    # Negative examples: a Virus node that has at least one CHILD_OF
    # relationship, paired with any node 2 to 3 CHILD_OF hops away (direction
    # ignored) that it is not directly connected to. `other` carries no label
    # constraint in this query. Several paths can reach the same pair, hence
    # the de-duplication afterwards.
    negative_records = session.run("""
             MATCH (virus:Virus)
             WHERE (virus)-[:CHILD_OF]-()
             MATCH (virus)-[:CHILD_OF*2..3]-(other)
             WHERE not((virus)-[:CHILD_OF]-(other))
             RETURN id(virus) as node1, id(other) as node2, virus.label AS virus1, other.label AS virus2, 0 AS label
             """)
    train_missing_links = _records_to_frame(negative_records).drop_duplicates()

In [5]:
# Data Training Labeling - With Total Data Samples Count
#
# Stack the negative and positive pairs into one labelled frame and report the
# class balance.

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
training_df = pd.concat([train_missing_links, train_existing_links], ignore_index=True)
training_df['label'] = training_df['label'].astype('category')

# Count each class by its label value. value_counts() orders by frequency, so
# unpacking it positionally would swap the two numbers whenever the positive
# class happens to be the larger one.
count_class_0 = int((training_df['label'] == 0).sum())
count_class_1 = int((training_df['label'] == 1).sum())
print(f"Negative examples: {count_class_0}")
print(f"Positive examples: {count_class_1}")

Negative examples: 4792
Positive examples: 303


In [6]:
# Random Downsampling of Training Data
#
# Balance the classes by keeping every positive pair and drawing an equally
# sized random subset of the negative pairs.

df_class_0 = training_df[training_df['label'] == 0]
df_class_1 = training_df[training_df['label'] == 1]

# The sample size is taken from the positive frame itself, and the seed is
# fixed so that re-running the notebook selects the same negative pairs.
df_class_0_under = df_class_0.sample(len(df_class_1), random_state=42)
df_train_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random downsampling:')
print(df_train_under.label.value_counts())

Random downsampling:
0    303
1    303
Name: label, dtype: int64


In [7]:
# Test Data - Query, Labeling and Random Downsampling
#
# NOTE(review): both queries are textually the same as the ones used for the
# training pairs, so the test pairs come from the same CHILD_OF graph —
# confirm that a separate held-out set of relationships was not intended.

with driver.session(database="neo4j") as session:
    # Positive examples: existing directed CHILD_OF relationships.
    result = session.run("""
             MATCH (virus:Virus)-[:CHILD_OF]->(other:Virus)
             RETURN id(virus) as node1, id(other) as node2, virus.label AS virus1, other.label AS virus2, 1 AS label
             """)
    test_existing_links = pd.DataFrame([dict(record) for record in result])

    # Negative examples: pairs 2 to 3 CHILD_OF hops apart with no direct link.
    result = session.run("""
             MATCH (virus:Virus)
             WHERE (virus)-[:CHILD_OF]-()
             MATCH (virus)-[:CHILD_OF*2..3]-(other)
             WHERE not((virus)-[:CHILD_OF]-(other))
             RETURN id(virus) as node1, id(other) as node2, virus.label AS virus1, other.label AS virus2, 0 AS label
             """)
    test_missing_links = pd.DataFrame([dict(record) for record in result])
    test_missing_links = test_missing_links.drop_duplicates()

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
test_df = pd.concat([test_missing_links, test_existing_links], ignore_index=True)
test_df['label'] = test_df['label'].astype('category')

df_class_0 = test_df[test_df['label'] == 0]
df_class_1 = test_df[test_df['label'] == 1]

# Count each class from its own frame; unpacking value_counts() positionally
# depends on which class is more frequent rather than on the label.
count_class_0 = len(df_class_0)
count_class_1 = len(df_class_1)
print(f"Negative examples: {count_class_0}")
print(f"Positive examples: {count_class_1}")

# Fixed seed so re-running the notebook selects the same negative test pairs.
df_class_0_under = df_class_0.sample(count_class_1, random_state=0)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random downsampling:')
print(df_test_under.label.value_counts())

Negative examples: 4792
Positive examples: 303
Random downsampling:
0    303
1    303
Name: label, dtype: int64


In [8]:
# Call Data Sample
# Display the balanced (downsampled) training frame: node ids, virus labels
# and the 0/1 link label for each pair.

df_train_under

Unnamed: 0,node1,node2,virus1,virus2,label
572,39,112,Simarterivirinae,Heroarterivirinae,0
3658,216,204,Rhinacovirus,Pedacovirus,0
2403,142,255,Rousettus bat coronavirus HKU9,Embecovirus,0
4456,260,191,Porcine torovirus,Coronaviridae,0
1952,114,245,Mesoniviridae,Olifoviridae,0
...,...,...,...,...,...
5090,279,27,Tipravirus,Okavirus,1
5091,280,72,Bovine torovirus,Torovirus,1
5092,280,100,Bovine torovirus,Renitovirus,1
5093,281,158,Abyssoviridae,Abnidovirineae,1


In [9]:
# Path Finding Algorithms - CN,PA,TN

def apply_graphy_features(data, rel_type):
    """Add neighbourhood-based link prediction scores to every node pair.

    For each (node1, node2) pair in ``data`` the GDS alpha link prediction
    functions are evaluated over relationships of type ``rel_type``:

    * ``cn`` - commonNeighbors
    * ``pa`` - preferentialAttachment
    * ``tn`` - totalNeighbors

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``node1`` and ``node2`` holding Neo4j
        internal node ids.
    rel_type : str
        Passed to the functions as their ``relationshipQuery`` setting.

    Returns
    -------
    pandas.DataFrame
        ``data`` merged with the score columns on (node1, node2). The merge
        uses pandas' default inner join, so a pair for which the query
        returns no row is absent from the result.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           gds.alpha.linkprediction.commonNeighbors(p1, p2, {
             relationshipQuery: $relType}) AS cn,
           gds.alpha.linkprediction.preferentialAttachment(p1, p2, {
             relationshipQuery: $relType}) AS pa,
           gds.alpha.linkprediction.totalNeighbors(p1, p2, {
             relationshipQuery: $relType}) AS tn
    """
    # Query each distinct pair once. Sending a repeated pair would return
    # repeated feature rows, and the merge below would then multiply the
    # matching rows of `data` instead of keeping them one-to-one.
    unique_pairs = data[["node1", "node2"]].drop_duplicates().values.tolist()
    pairs = [{"node1": node1, "node2": node2} for node1, node2 in unique_pairs]

    with driver.session(database="neo4j") as session:
        result = session.run(query, {"pairs": pairs, "relType": rel_type})
        features = pd.DataFrame([dict(record) for record in result])
    return pd.merge(data, features, on=["node1", "node2"])

# Attach the cn / pa / tn columns to both frames. Each frame is replaced by its
# merged version, so re-running this cell merges the features a second time.
# NOTE(review): the test frame is scored over "NOT_CHILD_OF", while every other
# query visible in this notebook uses CHILD_OF — confirm that relationship type
# exists in the graph; otherwise the test features are presumably computed over
# no relationships at all.
df_train_under = apply_graphy_features(df_train_under, "CHILD_OF")
df_test_under = apply_graphy_features(df_test_under, "NOT_CHILD_OF")

In [10]:
# Triangle Count Algorithms
#
# Runs gds.triangleCount.write over the undirected CHILD_OF projection of the
# Virus nodes, once per target property. The procedure call is identical for
# both runs, so the property name is supplied as a query parameter.

query = """
CALL gds.triangleCount.write({
  nodeProjection: 'Virus',
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  writeProperty: $writeProperty
});
"""

with driver.session(database="neo4j") as session:
    for write_property in ("trianglesTrain", "trianglesTest"):
        # consume() waits for the procedure to finish, so a failure is raised
        # here rather than depending on how the session treats an unconsumed
        # result when it closes.
        session.run(query, {"writeProperty": write_property}).consume()

In [11]:
# Local Clustering Coefficient Algorithms
#
# Runs gds.localClusteringCoefficient.write over the undirected CHILD_OF
# projection of the Virus nodes, once per target property.
#
# Both runs project 'Virus' nodes. The test run used to project 'Author', a
# label that no other query in this notebook refers to, while
# 'coefficientTest' is later read from the Virus pairs.

query = """
CALL gds.localClusteringCoefficient.write({
  nodeProjection: 'Virus',
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  writeProperty: $writeProperty
});
"""

with driver.session(database="neo4j") as session:
    for write_property in ("coefficientTrain", "coefficientTest"):
        # consume() waits for the procedure to finish, so a failure is raised
        # here rather than depending on how the session treats an unconsumed
        # result when it closes.
        session.run(query, {"writeProperty": write_property}).consume()

In [12]:
# Minimum and Maximum Value of Triangle Count and Clustering Coefficient Processing

def apply_triangles_features(data, triangles_prop, coefficient_prop):
    """Add per-pair min/max of two node properties to ``data``.

    For every (node1, node2) pair the two nodes are looked up by internal id,
    and the smaller and larger value of each property across the two nodes is
    returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``node1`` and ``node2`` holding Neo4j
        internal node ids.
    triangles_prop : str
        Node property read for ``minTriangles`` / ``maxTriangles``.
    coefficient_prop : str
        Node property read for ``minCoefficient`` / ``maxCoefficient``.

    Returns
    -------
    pandas.DataFrame
        ``data`` merged with the four new columns on (node1, node2). The merge
        uses pandas' default inner join, so a pair for which the query
        returns no row is absent from the result.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    apoc.coll.min([p1[$trianglesProp], p2[$trianglesProp]]) AS minTriangles,
    apoc.coll.max([p1[$trianglesProp], p2[$trianglesProp]]) AS maxTriangles,
    apoc.coll.min([p1[$coefficientProp], p2[$coefficientProp]]) AS minCoefficient,
    apoc.coll.max([p1[$coefficientProp], p2[$coefficientProp]]) AS maxCoefficient
    """
    # Query each distinct pair once. Sending a repeated pair would return
    # repeated feature rows, and the merge below would then multiply the
    # matching rows of `data` instead of keeping them one-to-one.
    unique_pairs = data[["node1", "node2"]].drop_duplicates().values.tolist()
    pairs = [{"node1": node1, "node2": node2} for node1, node2 in unique_pairs]
    params = {
        "pairs": pairs,
        "trianglesProp": triangles_prop,
        "coefficientProp": coefficient_prop,
    }

    with driver.session(database="neo4j") as session:
        result = session.run(query, params)
        features = pd.DataFrame([dict(record) for record in result])

    return pd.merge(data, features, on=["node1", "node2"])

# Attach min/max triangle-count and clustering-coefficient columns, reading the
# "...Train" node properties for the training frame and the "...Test" ones for
# the test frame. Each frame is replaced by its merged version, so re-running
# this cell merges the same columns a second time.
df_train_under = apply_triangles_features(df_train_under, "trianglesTrain", "coefficientTrain")
df_test_under = apply_triangles_features(df_test_under, "trianglesTest", "coefficientTest")

In [13]:
# Spot-check five random training rows now that the cn/pa/tn and triangle /
# coefficient columns have been merged in.
df_train_under.sample(5)

Unnamed: 0,node1,node2,virus1,virus2,label,cn,pa,tn,minTriangles,maxTriangles,minCoefficient,maxCoefficient
284,144,97,Middle East respiratory syndrome coronavirus,Murine coronavirus,0,0.0,3.0,4.0,0,1,0.0,0.333333
140,234,269,Alphacoronavirus 1,Sunacovirus,0,0.0,2.0,3.0,0,0,0.0,0.0
402,91,140,Gresnaviridae,Arnidovirineae,1,0.0,10.0,7.0,0,1,0.0,0.1
552,234,249,Alphacoronavirus 1,Tegacovirus,1,0.0,3.0,4.0,0,0,0.0,0.0
238,88,36,Mibartevirus,Variarterivirinae,0,1.0,8.0,5.0,0,0,0.0,0.0


In [14]:
# Community Detection - Label Propagation
#
# Runs gds.labelPropagation.write over the undirected CHILD_OF projection of
# the Virus nodes, once per target property. The procedure call is identical
# for both runs, so the property name is supplied as a query parameter.

query = """
CALL gds.labelPropagation.write({
  nodeProjection: "Virus",
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  writeProperty: $writeProperty
});
"""

with driver.session(database="neo4j") as session:
    for write_property in ("partitionTrain", "partitionTest"):
        # consume() waits for the procedure to finish, so a failure is raised
        # here rather than depending on how the session treats an unconsumed
        # result when it closes.
        session.run(query, {"writeProperty": write_property}).consume()

In [15]:
# Community Detection - louvain
#
# Streams Louvain communities over the undirected CHILD_OF projection of the
# Virus nodes and stores the first intermediate community id of each node in
# a node property. The query is run once per target property.

# The property key in `SET node.<key>` is part of the query text, so it is
# interpolated with %s. Only the fixed names listed below are ever inserted.
louvain_query_template = """
CALL gds.louvain.stream({
  nodeProjection: 'Virus',
  relationshipProjection: {
    CHILD_OF: {
      type: 'CHILD_OF',
      orientation: 'UNDIRECTED'
    }
  },
  includeIntermediateCommunities: true
})
YIELD nodeId, communityId, intermediateCommunityIds
WITH gds.util.asNode(nodeId) AS node, intermediateCommunityIds[0] AS smallestCommunity
SET node.%s = smallestCommunity;
"""

with driver.session(database="neo4j") as session:
    for louvain_property in ("louvainTrain", "louvainTest"):
        query = louvain_query_template % louvain_property
        # Show the update counters (number of properties set) for each run.
        display(session.run(query).consume().counters)

{'properties_set': 277}

{'properties_set': 277}

In [16]:
# Community Detection - Same Community

def apply_community_features(data, partition_prop, louvain_prop):
    """Add same-community indicators to every node pair in ``data``.

    For every (node1, node2) pair the GDS alpha ``sameCommunity`` function is
    evaluated twice: once on ``partition_prop`` (column ``sp``) and once on
    ``louvain_prop`` (column ``sl``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``node1`` and ``node2`` holding Neo4j
        internal node ids.
    partition_prop : str
        Node property compared for the ``sp`` column.
    louvain_prop : str
        Node property compared for the ``sl`` column.

    Returns
    -------
    pandas.DataFrame
        ``data`` merged with ``sp`` and ``sl`` on (node1, node2). The merge
        uses pandas' default inner join, so a pair for which the query
        returns no row is absent from the result.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    gds.alpha.linkprediction.sameCommunity(p1, p2, $partitionProp) AS sp,
    gds.alpha.linkprediction.sameCommunity(p1, p2, $louvainProp) AS sl
    """
    # Query each distinct pair once. Sending a repeated pair would return
    # repeated feature rows, and the merge below would then multiply the
    # matching rows of `data` instead of keeping them one-to-one.
    unique_pairs = data[["node1", "node2"]].drop_duplicates().values.tolist()
    pairs = [{"node1": node1, "node2": node2} for node1, node2 in unique_pairs]
    params = {
        "pairs": pairs,
        "partitionProp": partition_prop,
        "louvainProp": louvain_prop,
    }

    with driver.session(database="neo4j") as session:
        result = session.run(query, params)
        features = pd.DataFrame([dict(record) for record in result])

    return pd.merge(data, features, on=["node1", "node2"])

# Attach the sp / sl same-community columns, reading the "...Train" node
# properties for the training frame and the "...Test" ones for the test frame.
# Each frame is replaced by its merged version, so re-running this cell merges
# the same columns a second time.
df_train_under = apply_community_features(df_train_under, "partitionTrain", "louvainTrain")
df_test_under = apply_community_features(df_test_under, "partitionTest", "louvainTest")

In [17]:
# Preview the first rows of the training frame, now including the community
# columns sp and sl.
df_train_under.head(5)

Unnamed: 0,node1,node2,virus1,virus2,label,cn,pa,tn,minTriangles,maxTriangles,minCoefficient,maxCoefficient,sp,sl
0,39,112,Simarterivirinae,Heroarterivirinae,0,1.0,14.0,8.0,0,0,0.0,0.0,1.0,0.0
1,216,204,Rhinacovirus,Pedacovirus,0,1.0,6.0,4.0,0,0,0.0,0.0,0.0,0.0
2,142,255,Rousettus bat coronavirus HKU9,Embecovirus,0,0.0,7.0,8.0,0,1,0.0,0.047619,0.0,0.0
3,260,191,Porcine torovirus,Coronaviridae,0,0.0,8.0,6.0,0,1,0.0,1.0,0.0,0.0
4,114,245,Mesoniviridae,Olifoviridae,0,0.0,8.0,6.0,0,2,0.0,0.333333,0.0,0.0


In [18]:
# # Machine Learning Evaluation - Random Forest Classifier

# # Global Variable Declaration
# accuracy = float(0)
# precision = float(0)
# recall = float(0)

# # Data Training Iteration - 10 Times
# for i in range(10):
#     temp_acc = 0
#     temp_prec = 0
#     temp_recall = 0
    
#     df_train_under.sample(10)

#     from sklearn.ensemble import RandomForestClassifier
#     classifier = RandomForestClassifier(n_estimators=30, max_depth=10, random_state=0)

#     columns = [
#         "cn", "pa", "tn", # graph features
#         "minTriangles", "maxTriangles", "minCoefficient", "maxCoefficient", # triangle features
#         "sp", "sl" # community features
#     ]

#     X = df_train_under[columns]
#     y = df_train_under["label"]
#     classifier.fit(X, y)

#     from sklearn.metrics import recall_score
#     from sklearn.metrics import precision_score
#     from sklearn.metrics import accuracy_score

#     def evaluate_model(predictions, actual):
#         temp_acc = float(accuracy_score(actual, predictions))
#         temp_prec = float(precision_score(actual, predictions))
#         temp_recall = float(recall_score(actual, predictions))
        
#         global accuracy
#         global precision
#         global recall

#         accuracy += temp_acc
#         precision += temp_prec
#         recall += temp_recall
        
#         return pd.DataFrame({
#             "Measure": ["Accuracy", "Precision", "Recall"],
#             "Score": [accuracy_score(actual, predictions),
#                       precision_score(actual, predictions),
#                       recall_score(actual, predictions)]
#         })
    
#     predictions = classifier.predict(df_train_under[columns])
#     y_test = df_test_under["label"]

#     evaluate_model(predictions, y_test)
    

In [19]:
# # Model Evaluation Result - Cont'd

# # Average of Evaluation Results
# avg_accuracy = accuracy / 10
# avg_precision = precision / 10
# avg_recall = recall / 10

# # Finalized Evaluation Result Data Frame
# pd_final = pd.DataFrame({
#     "Measure": ["Accuracy", "Precision", "Recall"],
#     "Score": [avg_accuracy,avg_precision,avg_recall]
# })

# # Call Final Finalized Evaluation Result Data Frame
# pd_final

In [20]:
# Select the feature columns and preview the sizes of the 10 shuffled folds.
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

columns = [
        "cn", "pa", "tn", # graph features
        "minTriangles", "maxTriangles", "minCoefficient", "maxCoefficient", # triangle features
        "sp", "sl" # community features
    ]

X = df_train_under[columns]
y = df_train_under["label"]

# Fixed random_state so the same split is produced on every run.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# enumerate() numbers the folds from 1 without a hand-maintained counter.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:1, Train set: 545, Test set:61
Fold:2, Train set: 545, Test set:61
Fold:3, Train set: 545, Test set:61
Fold:4, Train set: 545, Test set:61
Fold:5, Train set: 545, Test set:61
Fold:6, Train set: 545, Test set:61
Fold:7, Train set: 546, Test set:60
Fold:8, Train set: 546, Test set:60
Fold:9, Train set: 546, Test set:60
Fold:10, Train set: 546, Test set:60


In [21]:
# 10-fold cross-validation of a random forest on the training pairs.
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_validate
from sklearn import linear_model, tree, ensemble

columns = [
        "cn", "pa", "tn", # graph features
        "minTriangles", "maxTriangles", "minCoefficient", "maxCoefficient", # triangle features
        "sp", "sl" # community features
    ]

X = df_train_under[columns]
y = df_train_under["label"]

# Fixed random_state so the same split is produced on every run.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

# A single cross_validate call fits the model once per fold and scores all
# three metrics on that fit, instead of refitting for every metric.
cv_scores = cross_validate(
    ensemble.RandomForestClassifier(random_state=42),
    X,
    y,
    cv=kf,
    scoring=["accuracy", "precision", "recall"],
)
accuracy = cv_scores["test_accuracy"]
precision = cv_scores["test_precision"]
recall = cv_scores["test_recall"]

# Per-fold score table. The CSV export further down reads `pd_final`, which
# no other active cell defines.
pd_final = pd.DataFrame({
    "Test Number": list(range(1, len(accuracy) + 1)),
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
})

pd_final


Unnamed: 0,Test Number,Accuracy,Precision,Recall
0,1,0.901639,0.870968,0.931034
1,2,0.918033,0.846154,0.956522
2,3,0.95082,0.903226,1.0
3,4,0.868852,0.846154,0.942857
4,5,0.852459,0.794118,0.931034
5,6,0.901639,1.0,0.837838
6,7,0.883333,0.967742,0.833333
7,8,0.95,1.0,0.884615
8,9,0.916667,0.925926,0.892857
9,10,0.916667,0.935484,0.90625


In [22]:
# Export the per-fold score table to CSV. Requires `pd_final` to exist in the
# kernel.
# The target directory can be overridden with the COVTAX_OUTPUT_DIR environment
# variable; the fallback keeps the original location.
import os

output_dir = os.environ.get("COVTAX_OUTPUT_DIR", r'C:\Users\fikri\Desktop')
pd_final.to_csv(os.path.join(output_dir, 'pd_final.csv'), index = False, header=True)

In [23]:
# Average each metric over the cross-validation folds.
# np.mean replaces the hand-written running sums. The former `global`
# statements are dropped: this code already runs at module level, and naming
# a variable in `global` after assigning it in the same scope is rejected by
# the Python compiler.
avg_accuracy = np.mean(accuracy)
avg_precision = np.mean(precision)
avg_recall = np.mean(recall)

pd_final_average = pd.DataFrame({
    "Average Measure Result": ["Accuracy", "Precision", "Recall"],
    "Score": [avg_accuracy, avg_precision, avg_recall]
})

pd_final_average

Unnamed: 0,Average Measure Result,Score
0,Accuracy,0.906011
1,Precision,0.908977
2,Recall,0.911634


In [24]:
# Export the averaged scores to CSV.
# The target directory can be overridden with the COVTAX_OUTPUT_DIR environment
# variable; the fallback keeps the original location.
import os

output_dir = os.environ.get("COVTAX_OUTPUT_DIR", r'C:\Users\fikri\Desktop')
pd_final_average.to_csv(os.path.join(output_dir, 'pd_final_avg.csv'), index = False, header=True)

In [25]:
# Marker output showing that the evaluation cells above have been executed.
print("Model Evaluation Completed")

Model Evaluation Completed


In [None]:
# Export to CSV

# df_train_under.to_csv(r'C:\Users\fikri\Desktop\dataset_FP.csv', index = False, header=True)

In [None]:
# Nodes Visualization
#
# Draws every (virus1, virus2) pair of the exported dataset as an edge in an
# interactive pyvis network and writes it to covid_taxonomy.html.

from pyvis.network import Network
import pandas as pd

covtax_net = Network(height='auto', width='100%', bgcolor='#222222', font_color='white')

# set the physics layout of the network
covtax_net.force_atlas_2based()
# NOTE(review): the only cell that writes this file is the "Export to CSV"
# cell above, which is currently commented out.
covtax_data = pd.read_csv(r'C:\Users\fikri\Desktop\dataset_FP.csv')

for vir1, vir2 in zip(covtax_data['virus1'], covtax_data['virus2']):
    covtax_net.add_node(vir1, vir1, title=vir1)
    covtax_net.add_node(vir2, vir2, title=vir2)
    covtax_net.add_edge(vir1, vir2)

# Adjacency list (node id -> neighbouring node ids) of the finished network.
# This name was previously used below without ever being defined.
neighbor_map = covtax_net.get_adj_list()

# add neighbor data to node hover data
for node in covtax_net.nodes:
    # Sorted so the hover text lists neighbours in a stable order.
    neighbors = sorted(neighbor_map[node['id']])
    node['title'] += ' Neighbors:<br><br>' + '<br>'.join(neighbors)
    node['value'] = len(neighbors)

covtax_net.show_buttons(filter_=['physics'])
covtax_net.show('covid_taxonomy.html')