In [25]:
### Loading Credentials from local file; 
### this cell is meant to be deleted before publishing
import yaml

with open("../creds.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

uri = cfg["sonar_creds"]["uri"]
user = cfg["sonar_creds"]["user"]
password = cfg["sonar_creds"]["pass"]

<font size = "20"> SoNAR (IDH) - HNA Curriculum </font>

<font size = "5">Notebook 4:  Example Case - History of Physiology</font>

# Defining the Physiology Graph

Search for "hysiolog" as substring for Physiology to retrieve every possible string containing.


In [26]:
from neo4j import GraphDatabase

driver = GraphDatabase.driver(uri, auth=(user, password))

query = """
MATCH (t:TopicTerm)
WHERE t.Name CONTAINS "hysiolog"
RETURN DISTINCT(t.Name)
"""

with driver.session() as session:
    all_physiology_terms = session.run(query).data()
    
all_physiology_terms

[{'(t.Name)': 'Arbeitsphysiologie'},
 {'(t.Name)': 'Neurophysiologie'},
 {'(t.Name)': 'Pathophysiologie'},
 {'(t.Name)': 'Pflanzenphysiologie'},
 {'(t.Name)': 'Physiologie'},
 {'(t.Name)': 'Sinnesphysiologie'},
 {'(t.Name)': 'Tierphysiologie'},
 {'(t.Name)': 'Physiologische Chemie'},
 {'(t.Name)': 'Physiologische Psychologie'},
 {'(t.Name)': 'Sprachphysiologie'},
 {'(t.Name)': 'Sportphysiologie'},
 {'(t.Name)': 'Leistungsphysiologie'},
 {'(t.Name)': 'Physiologische Psychiatrie'},
 {'(t.Name)': 'Elektrophysiologie'},
 {'(t.Name)': 'Altersphysiologie'},
 {'(t.Name)': 'Bewegungsphysiologie'},
 {'(t.Name)': 'Entwicklungsphysiologie'},
 {'(t.Name)': 'Ernährungsphysiologie'},
 {'(t.Name)': 'Ertragsphysiologie'},
 {'(t.Name)': 'Histophysiologie'},
 {'(t.Name)': 'Höhenphysiologie'},
 {'(t.Name)': 'Nacherntephysiologie'},
 {'(t.Name)': 'Physiologische Optik'},
 {'(t.Name)': 'Physiologische Uhr'},
 {'(t.Name)': 'Psychophysiologische Diagnostik'},
 {'(t.Name)': 'Stoffwechselphysiologie'},
 {'(t

Let's see whether every topic term also has people connected to them. 

In [3]:
from helper_functions.helper_fun import to_nx_graph

query = """
MATCH (t:TopicTerm)-[r]-(n:PerName)
WHERE t.Name CONTAINS "hysiolog"
RETURN *
"""

driver = GraphDatabase.driver(uri, auth=(user, password))

G = to_nx_graph(neo4j_driver = driver, 
                query = query)

Check which topic terms aren't present in query result:

In [124]:
import numpy as np

relevant_topics = []
for node in list(G.nodes):
    if G.nodes[node]["type"] == "TopicTerm":
        relevant_topics.append((G.nodes[node]["label"]))

np.setdiff1d([d["(t.Name)"] for d in all_physiology_terms], relevant_topics)

array(['Muskelphysiologie', 'Tauchphysiologie', 'Umweltphysiologie',
       'Zellphysiologie'], dtype='<U57')

@todo add colors to network below by node type

In [6]:
from helper_functions.helper_fun import to_nx_graph
from pyvis.network import Network

nt = Network('750px', '100%', notebook=True, directed = True)
nt.from_nx(G)
nt.set_edge_smooth("dynamic")
#nt.show('./html_networks/physiological_net.html')

## Retrieving the Network

In [128]:
%%time

from helper_functions.helper_fun import to_nx_graph

query = """
MATCH (t:TopicTerm)--(n)
WHERE t.Name CONTAINS "hysiolog" AND (n:PerName)
WITH DISTINCT [x in collect(t)+collect(n)|id(x)] as collectedIds 
MATCH (n)-[rel1:RelationToPerName|RelationToResource|RelationCorpName|RelationToUniTitle*0..1]-(n2)
WHERE id(n) in collectedIds 
RETURN n, n2, rel1
"""


driver = GraphDatabase.driver(uri, auth=(user, password))

G = to_nx_graph(neo4j_driver = driver, 
                query = query)

CPU times: user 12.5 s, sys: 669 ms, total: 13.1 s
Wall time: 13.6 s


## Descriptive Metrics

In [129]:
print("Number of Nodes: ", G.number_of_nodes())
print("Number of Edges: ", G.number_of_edges())

Number of Nodes:  36188
Number of Edges:  36691


In [131]:
person_nodes = [x for x,y in G.nodes(data=True) if y['type']=="PerName"]
resources_and_works_nodes = [x for x,y in G.nodes(data=True) if y['type'] in ["Resource", "UniTitle"]]
corps_nodes = [x for x,y in G.nodes(data=True) if y['type']=="CorpName"]
geo_nodes = [x for x,y in G.nodes(data=True) if y['type']=="GeoName"]
topicterm_nodes = [x for x,y in G.nodes(data=True) if y['type']=="TopicTerm"]


print("Number of Persons in Graph: ", len(person_nodes))
print("Number of Resources and Works in Graph: ", len(resources_and_works_nodes))
print("Number of Corporations in Graph: ", len(corps_nodes))
print("Number of GeoNames in Graph:", len(geo_nodes))
print("")
print("## Topic Terms in Graph: ## ")
for node in topicterm_nodes:
    print(G.nodes(data=True)[node]["label"])

Number of Persons in Graph:  1856
Number of Resources and Works in Graph:  34300
Number of Corporations in Graph:  5
Number of GeoNames in Graph: 0

## Topic Terms in Graph: ## 
Arbeitsphysiologie
Neurophysiologie
Pathophysiologie
Pflanzenphysiologie
Physiologie
Sinnesphysiologie
Tierphysiologie
Physiologische Chemie
Physiologische Psychologie
Sportphysiologie
Leistungsphysiologie
Physiologische Psychiatrie
Elektrophysiologie
Entwicklungsphysiologie
Ernährungsphysiologie
Höhenphysiologie
Nacherntephysiologie
Stoffwechselphysiologie
Vegetative Physiologie
Verhaltensphysiologie
Physiologe
Physiologin
Physiologische Akustik
Pflanzenphysiologe
Pflanzenphysiologin
Bernoulli-Balken
Bathybius haeckelii


### Check Density and Connectedness of Network

In [132]:
import networkx as nx

nx.density(G)

5.60366781019814e-05

In [133]:
nx.is_connected(G)

False

In [134]:
components = nx.connected_components(G)

sorted_components = sorted([component for component in components], key = len, reverse=True)

[{37355546,
  40632352,
  40894497,
  41418792,
  6291504,
  41549877,
  41549878,
  41549879,
  41549881,
  41549882,
  41549883,
  41549884,
  41549885,
  41549886,
  41549887,
  41549888,
  41549889,
  41549890,
  11010115,
  41549891,
  41549893,
  41549894,
  41549896,
  10879062,
  41549913,
  34734171,
  40894560,
  40894561,
  41549924,
  16908389,
  41549925,
  41549927,
  40894567,
  3014767,
  3014770,
  3014771,
  14942330,
  17432726,
  17432727,
  17432728,
  20054181,
  39059631,
  3539129,
  11010234,
  14680258,
  45744323,
  10485955,
  12320969,
  4194524,
  10879208,
  39190764,
  14693138,
  14693139,
  14693140,
  14693141,
  14693142,
  14693143,
  14693144,
  14693145,
  14693146,
  19398944,
  14693147,
  14693148,
  13107495,
  40108338,
  31457594,
  10355004,
  40632641,
  14680424,
  38666601,
  45744505,
  39059858,
  40894873,
  7471518,
  39322018,
  39322019,
  39322020,
  39322021,
  11010467,
  11010468,
  39322022,
  17564081,
  13107659,
  11010519,

In [135]:
import matplotlib.pyplot as plt

component_length_list = [len(component) for component in sorted_components]

print("number of unique component lengths:", len(set(component_length_list)))
print("maximum component length:", max(set(component_length_list)))
print("minimum component length:", min(set(component_length_list)))

number of unique component lengths: 57
maximum component length: 30045
minimum component length: 1


In [41]:

largest_component = max(components, key = len)
print(len(largest_component))

largest_component_graph = G.subgraph(largest_component)

34344


### Check the Eccentricity

In [74]:
%%time

ecc = nx.eccentricity(largest_component_graph)

KeyboardInterrupt: 

# Investigating the Persons

## Centrality Investigation

At first we are going to investigate the centrality of the physiology network. By assessing the centrality we actually calculate the importance of the nodes. 

 

### Eigenvector Centrality

In [43]:
eigenvectors = nx.eigenvector_centrality_numpy(largest_component_graph)

In [44]:
from operator import itemgetter

eigenvectors_sorted = sorted(eigenvectors.items(), key = itemgetter(1), reverse = True)
eigenvectors_filtered = [item for item in eigenvectors_sorted if item[0] in person_nodes]
top_eigenvectors = eigenvectors_filtered[:10]

In [49]:
for i in top_eigenvectors: 
    degree = eigenvectors[i[0]] 
    print("Name:", largest_component_graph.nodes(data = True)[i[0]]["label"], "| Eigenvector Centrality:", i[1])

Name: Wundt, Wilhelm | Eigenvector Centrality: 0.7070485601159595
Name: Arnold, Ida Eberhardina | Eigenvector Centrality: 0.009072499015222474
Name: Wundt, Marie Friederike | Eigenvector Centrality: 0.009072499015222472
Name: Wundt, Max | Eigenvector Centrality: 0.009060325336248008
Name: Wundt, Wilhelm | Eigenvector Centrality: 0.009060325336248006
Name: Wundt, Eleonore | Eigenvector Centrality: 0.009060325336248003
Name: Wundt, Maximilian | Eigenvector Centrality: 0.009060325336248003
Name: Wundt, Magdalena | Eigenvector Centrality: 0.009060325336248003
Name: Wundt, Ludwig | Eigenvector Centrality: 0.009060325336248001
Name: Wundt, Sophie | Eigenvector Centrality: 0.009060325336248001


### PageRank

In [46]:
pageranks = nx.pagerank(largest_component_graph)

In [47]:
from operator import itemgetter

pageranks_sorted = sorted(pageranks.items(), key = itemgetter(1), reverse = True)
pageranks_filtered = [item for item in pageranks_sorted if item[0] in person_nodes]
top_pageranks = pageranks_filtered[:10]

In [65]:
for i in top_pageranks: 
    degree = pageranks[i[0]] 
    print("id: ", i[0], "Name:", largest_component_graph.nodes(data = True)[i[0]]["label"], "| PageRank Centrality:", i[1])

id:  2925003 Name: Wundt, Wilhelm | PageRank Centrality: 0.08057700948597157
id:  2920435 Name: Oken, Lorenz | PageRank Centrality: 0.029295525735891337
id:  2912081 Name: Baer, Karl Ernst von | PageRank Centrality: 0.01979145213303186
id:  2928379 Name: Cohn, Jonas | PageRank Centrality: 0.015588480544858454
id:  2895349 Name: Pirson, André | PageRank Centrality: 0.014669088728914124
id:  2946338 Name: Eccles, John C. | PageRank Centrality: 0.014487548400826014
id:  2915992 Name: Haller, Albrecht von | PageRank Centrality: 0.013664474789929586
id:  2914257 Name: Du Bois- Reymond, Emil Heinrich | PageRank Centrality: 0.01049896411425947
id:  2941800 Name: Sömmerring, Samuel Thomas von | PageRank Centrality: 0.010328603262062114
id:  2914627 Name: Euler, Leonhard | PageRank Centrality: 0.009142718996798398


### Centrality of Women

We can apply another filter to our 

## Investigating Cliques

In [52]:
cliques = nx.find_cliques(largest_component_graph)
sorted(list(cliques), key = len, reverse=True)

[[2622036, 2615412, 6225315],
 [2622036, 2615412, 7120996],
 [2622036, 2615412, 7120997],
 [2622036, 2615412, 7120998],
 [2622036, 2615412, 1498406],
 [2622036, 2615412, 45698022],
 [2622036, 2615412, 45697765],
 [2622036, 2615412, 45697764],
 [2622036, 2615412, 6428237],
 [2622036, 2615412, 45697748],
 [2622036, 2615412, 1504503],
 [2622036, 2615412, 1499416],
 [26084015, 3752347, 22887645],
 [24117985, 26519786, 1498208],
 [24117985, 26519786, 1498211],
 [24117985, 26519786, 1526117],
 [24117985, 26519786, 5318631],
 [24117985, 26519786, 3078825],
 [24117985, 26519786, 1501066],
 [21497207, 4170218, 20597028],
 [2885841, 2849828, 1502164],
 [2885841, 2849828, 1497541],
 [22547110, 22587509, 1497541],
 [21368152, 21806032, 2675697],
 [28577971, 22956784, 1498208],
 [28577971, 22956784, 1505287],
 [3674325, 20942632, 23741819],
 [24516406, 25606934, 1498208],
 [24516406, 25606934, 1498211],
 [24516406, 25606934, 1526117],
 [24516406, 25606934, 1505287],
 [24516406, 25606934, 1501066],


In [53]:
cliques = nx.find_cliques(largest_component_graph)
cliques_of_three = [clique for clique in cliques if len(clique) >= 3]
clique_nodes = set(n for clique in cliques_of_three for n in clique)
cliques_subgraph = G.subgraph(clique_nodes)

In [54]:
from helper_functions.helper_fun import to_nx_graph
from pyvis.network import Network

nt = Network('750px', '100%', notebook=True)
nt.from_nx(cliques_subgraph)
nt.set_edge_smooth("dynamic")
nt.show('./html_networks/cliques.html')

## Investigating Clustering & Triads

In [73]:
nx.clustering(largest_component_graph)

{33544416: 0.6666666666666666,
 32673333: 0.6666666666666666,
 32673332: 0.3333333333333333,
 32718428: 1.0,
 32734415: 0,
 26519786: 0.2857142857142857,
 24117985: 0.2857142857142857,
 1498208: 0.015126050420168067,
 1501066: 0.03162055335968379,
 1498211: 0.03162055335968379,
 5318631: 1.0,
 3078825: 1.0,
 1526117: 0.03463203463203463,
 22185405: 0,
 2677233: 0,
 1498304: 0,
 5046970: 0,
 1498320: 0,
 1497822: 0,
 14948439: 0,
 22459300: 0,
 23581357: 0,
 23654879: 0,
 23807140: 0,
 26106203: 0,
 26582559: 0,
 27648652: 0,
 29488170: 0,
 42308821: 0,
 1025526: 0,
 1501581: 0,
 1034986: 0,
 1497541: 0.0006172839506172839,
 40414496: 0,
 1499311: 0,
 1505348: 0.16666666666666666,
 30155745: 0,
 3752347: 0.3333333333333333,
 28974110: 0,
 3935879: 0,
 1504395: 0,
 28533997: 0,
 4563096: 0,
 5253979: 0,
 28069637: 0,
 3067449: 0.017543859649122806,
 27836199: 0.6666666666666666,
 21261716: 0.6666666666666666,
 2920692: 2.816028836135282e-05,
 26084015: 1.0,
 22887645: 1.0,
 26039994: 0,


In [103]:
from helper_functions.helper_fun import to_nx_graph
from pyvis.network import Network

nt = Network('750px', '100%', notebook=True)
nt.from_nx(neighbor_subgraph)
nt.set_edge_smooth("dynamic")
#nt.show('./html_networks/neighbors.html')

## Community Detection

# Investigating Corporations

# Investigating Resources and Works

# Investigating predefined set of Physiologists

* Gustav Fritsch (DE-588)115568808
* Eduard Hitzig (DE-588)116917423
* Hermann Munk (DE-588)117185930
* Nathan Zuntz (DE-588)118896202
* Friedrich Goltz (DE-588)116764694
* Adolf Fick (DE-588)118800000
* Jacques Loeb (DE-588)119133628

In [None]:
physiologists = ["(DE-588)115568808", "(DE-588)116917423", "(DE-588)117185930", "(DE-588)118896202", 
                 "(DE-588)116764694", "(DE-588)118800000", "(DE-588)119133628"]

# Bibliography

Scifo, E. (2020). Hands-On Graph Analytics with Neo4j: Perform graph processing and visualization techniques using connected data across your enterprise. Birmingham, England: Packt Publishing.