In [1]:
### Loading Credentials from local file; 
### this cell is meant to be deleted before publishing
import yaml

# Read the connection settings from the local (unpublished) credentials file.
with open("../creds.yml", 'r') as creds_file:
    cfg = yaml.safe_load(creds_file)

# Unpack the three values that the database cells hand to the Neo4j driver.
sonar_creds = cfg["sonar_creds"]
uri, user, password = sonar_creds["uri"], sonar_creds["user"], sonar_creds["pass"]

<font size = "20"> SoNAR (IDH) - HNA Curriculum </font>

<font size = "5">Notebook 4:  Example Case - History of Physiology</font>

# Defining the Physiology Graph

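We search for the substring "hysiolog" rather than the full word "Physiologie": leaving out the first letter lets the query match both capitalised terms (e.g. "Physiologie") and compounds (e.g. "Neurophysiologie"), so we retrieve every topic term whose name contains it.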


In [3]:
from neo4j import GraphDatabase

# Distinct names of all topic terms whose name contains the substring "hysiolog".
query = """
MATCH (t:TopicTerm)
WHERE t.Name CONTAINS "hysiolog"
RETURN DISTINCT(t.Name)
"""

# Use the driver as a context manager so that it is closed as soon as the
# query has finished; previously the driver was created and never closed.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    with driver.session() as session:
        # Materialise the records while the session is still open.
        # Each record is a dict keyed by '(t.Name)'.
        all_physiology_terms = session.run(query).data()

all_physiology_terms

[{'(t.Name)': 'Arbeitsphysiologie'},
 {'(t.Name)': 'Neurophysiologie'},
 {'(t.Name)': 'Pathophysiologie'},
 {'(t.Name)': 'Pflanzenphysiologie'},
 {'(t.Name)': 'Physiologie'},
 {'(t.Name)': 'Sinnesphysiologie'},
 {'(t.Name)': 'Tierphysiologie'},
 {'(t.Name)': 'Physiologische Chemie'},
 {'(t.Name)': 'Physiologische Psychologie'},
 {'(t.Name)': 'Sprachphysiologie'},
 {'(t.Name)': 'Sportphysiologie'},
 {'(t.Name)': 'Leistungsphysiologie'},
 {'(t.Name)': 'Physiologische Psychiatrie'},
 {'(t.Name)': 'Elektrophysiologie'},
 {'(t.Name)': 'Altersphysiologie'},
 {'(t.Name)': 'Bewegungsphysiologie'},
 {'(t.Name)': 'Entwicklungsphysiologie'},
 {'(t.Name)': 'Ernährungsphysiologie'},
 {'(t.Name)': 'Ertragsphysiologie'},
 {'(t.Name)': 'Histophysiologie'},
 {'(t.Name)': 'Höhenphysiologie'},
 {'(t.Name)': 'Nacherntephysiologie'},
 {'(t.Name)': 'Physiologische Optik'},
 {'(t.Name)': 'Physiologische Uhr'},
 {'(t.Name)': 'Psychophysiologische Diagnostik'},
 {'(t.Name)': 'Stoffwechselphysiologie'},
 {'(t

Let's see whether every topic term also has people connected to it.

In [4]:
from helper_functions.helper_fun import to_nx_graph

# A driver created for this cell and handed to the project helper below.
driver = GraphDatabase.driver(uri, auth=(user, password))

# Topic terms containing "hysiolog" that are linked to a person node by a
# relationship of any type, in either direction; everything matched is returned.
query = """
MATCH (t:TopicTerm)-[r]-(n:PerName)
WHERE t.Name CONTAINS "hysiolog"
RETURN *
"""

# NOTE(review): to_nx_graph is a project helper; the cells below treat its
# result as a networkx graph whose nodes carry "type" and "label" attributes.
G = to_nx_graph(query=query, neo4j_driver=driver)

Check which topic terms aren't present in the query result:

In [5]:
import numpy as np

# Labels of every TopicTerm node that made it into the graph G.
relevant_topics = [
    attrs["label"]
    for node_id, attrs in G.nodes(data=True)
    if attrs["type"] == "TopicTerm"
]

# Topic-term names returned by the plain substring query that have no node in G.
all_term_names = [record["(t.Name)"] for record in all_physiology_terms]
np.setdiff1d(all_term_names, relevant_topics)

array(['Altersphysiologie', 'Bewegungsphysiologie',
       'Elektrophysiologische Untersuchung', 'Ertragsphysiologie',
       'Experimentelle Physiologie', 'Histophysiologie',
       'Ignaz-L.-Lieben-Preis für Physik, Chemie und Physiologie',
       'Muskelphysiologie', 'Physiologische Optik', 'Physiologische Uhr',
       'Psychophysiologische Diagnostik', 'Reizphysiologie',
       'Sprachphysiologie', 'Tauchphysiologie', 'Umweltphysiologie',
       'Vergleichende Neurophysiologie', 'Vergleichende Physiologie',
       'Zellphysiologie'], dtype='<U57')

**TODO:** Add colors to the network below, by node type.

In [6]:
from helper_functions.helper_fun import to_nx_graph
from pyvis.network import Network

# Directed pyvis network, 750px high and full width, configured for notebook use.
nt = Network(height='750px', width='100%', directed=True, notebook=True)
nt.from_nx(G)
nt.set_edge_smooth("dynamic")
# Rendering is switched off in this cell; re-enable the line below to write the HTML file.
#nt.show('./html_networks/physiological_net.html')

## Retrieving the Network

In [80]:
# Sample the label lists of up to ten nodes that carry one of the listed labels.
query = """
MATCH (n2)
WHERE n2:PerName OR n2:Resource OR n2:UniTitle OR n2:CorpName OR n2:TopicTerm
RETURN LABELS(n2)
LIMIT 10
"""

# Use the driver as a context manager so that it is closed once the query has
# run; previously each execution of this cell left another open driver behind.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    with driver.session() as session:
        # Materialise the records while the session is still open.
        result = session.run(query).data()

result

[{'LABELS(n2)': ['TopicTerm']},
 {'LABELS(n2)': ['TopicTerm']},
 {'LABELS(n2)': ['TopicTerm']},
 {'LABELS(n2)': ['TopicTerm']},
 {'LABELS(n2)': ['TopicTerm']},
 {'LABELS(n2)': ['TopicTerm']},
 {'LABELS(n2)': ['TopicTerm']},
 {'LABELS(n2)': ['TopicTerm']},
 {'LABELS(n2)': ['TopicTerm']},
 {'LABELS(n2)': ['TopicTerm']}]

In [184]:
%%time

from helper_functions.helper_fun import to_nx_graph

# A driver created for this cell and handed to the project helper below.
driver = GraphDatabase.driver(uri, auth=(user, password))

# Step 1: collect the ids of all "hysiolog" topic terms and the persons linked to them.
# Step 2: for each collected node, return the node itself (zero hops) and every
#         neighbour reachable over one of the listed relationship types.
# NOTE(review): `RelationCorpName` lacks the `To` that the other two relationship
# types carry - confirm against the database schema that this name is intended.
query = """
MATCH (t:TopicTerm)--(n:PerName)
WHERE t.Name CONTAINS "hysiolog"
WITH DISTINCT [x in collect(t)+collect(n)|id(x)] as collectedIds 
MATCH (n)-[rel1:RelationToPerName|RelationToResource|RelationCorpName*0..1]-(n2)
WHERE id(n) in collectedIds 
RETURN n, n2, rel1
"""

G = to_nx_graph(query=query, neo4j_driver=driver)

CPU times: user 14.9 s, sys: 495 ms, total: 15.4 s
Wall time: 15.8 s


## Descriptive Metrics

In [185]:
# Overall size of the retrieved graph.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()

print("Number of Nodes: ", node_count)
print("Number of Edges: ", edge_count)

Number of Nodes:  36187
Number of Edges:  36690


In [118]:
def _nodes_of_type(graph, *node_types):
    """Return the nodes of `graph` whose 'type' attribute is one of `node_types`."""
    return [node for node, attrs in graph.nodes(data=True) if attrs['type'] in node_types]


# Split the graph's nodes by their 'type' attribute.
person_nodes = _nodes_of_type(G, "PerName")
resources_and_works_nodes = _nodes_of_type(G, "Resource", "UniTitle")
corps_nodes = _nodes_of_type(G, "CorpName")
topicterm_nodes = _nodes_of_type(G, "TopicTerm")

# Report the group sizes, then list the label of every topic term in the graph.
print("Number of Persons in Graph: ", len(person_nodes))
print("Number of Resources and Works in Graph: ", len(resources_and_works_nodes))
print("Number of Corporations in Graph: ", len(corps_nodes))
print("")
print("## Topic Terms in Graph: ## ")
for node in topicterm_nodes:
    print(G.nodes[node]["label"])

Number of Persons in Graph:  1856
Number of Resources and Works in Graph:  34299
Number of Corporations in Graph:  5

## Topic Terms in Graph: ## 
Arbeitsphysiologie
Neurophysiologie
Pathophysiologie
Pflanzenphysiologie
Physiologie
Sinnesphysiologie
Tierphysiologie
Physiologische Chemie
Physiologische Psychologie
Sportphysiologie
Leistungsphysiologie
Physiologische Psychiatrie
Elektrophysiologie
Entwicklungsphysiologie
Ernährungsphysiologie
Höhenphysiologie
Nacherntephysiologie
Stoffwechselphysiologie
Vegetative Physiologie
Verhaltensphysiologie
Physiologe
Physiologin
Physiologische Akustik
Pflanzenphysiologe
Pflanzenphysiologin
Bernoulli-Balken
Bathybius haeckelii


In [108]:
import networkx as nx

# Density of G as computed by networkx; the bare expression is displayed as the cell's output.
nx.density(G)

3.64757734467894e-05

# Investigating the Persons

## Centrality Investigation

First, we investigate the centrality of the physiology network. Centrality measures estimate how important each node is within the network.

 

### Eigenvector Centrality

In [119]:
# Eigenvector centrality of G via networkx's NumPy-based routine.
# The following cells use the result as a mapping from node to score.
eigenvectors = nx.eigenvector_centrality_numpy(G)

In [121]:
from operator import itemgetter

# Rank all (node, score) pairs by eigenvector centrality, highest first.
eigenvectors_sorted = sorted(eigenvectors.items(), key = itemgetter(1), reverse = True)

# Keep only person nodes. `person_nodes` is a list, so membership is tested
# against a set built from it; otherwise the whole list is scanned once for
# every node in the ranking.
person_node_set = set(person_nodes)
eigenvectors_filtered = [item for item in eigenvectors_sorted if item[0] in person_node_set]

# The ten persons with the highest eigenvector centrality.
top_eigenvectors = eigenvectors_filtered[:10]

In [122]:
# Print label and score for each top-ranked person. The former `degree`
# assignment was never read (and did not hold a degree), so it is removed.
for node_id, centrality in top_eigenvectors:
    print("Name:", G.nodes[node_id]["label"], "| Eigenvector Centrality:", centrality)

Name: Wundt, Wilhelm | Eigenvector Centrality: 0.7070494505966636
Name: Wundt, Marie Friederike | Eigenvector Centrality: 0.009074726145276537
Name: Arnold, Ida Eberhardina | Eigenvector Centrality: 0.009074726145276532
Name: Wundt, Wilhelm | Eigenvector Centrality: 0.009062579963126097
Name: Wundt, Eleonore | Eigenvector Centrality: 0.009062579963126092
Name: Wundt, Magdalena | Eigenvector Centrality: 0.00906257996312609
Name: Wundt, Ludwig | Eigenvector Centrality: 0.009062579963126089
Name: Wundt, Sophie | Eigenvector Centrality: 0.009062579963126087
Name: Wundt, Max | Eigenvector Centrality: 0.009062579963126087
Name: Wundt, Maximilian | Eigenvector Centrality: 0.009062579963126087


### PageRank

In [123]:
# PageRank of G via networkx with its default parameters.
# The following cells use the result as a mapping from node to score.
pageranks = nx.pagerank(G)

In [124]:
from operator import itemgetter

# Rank all (node, score) pairs by PageRank, highest first.
pageranks_sorted = sorted(pageranks.items(), key = itemgetter(1), reverse = True)

# Keep only person nodes. `person_nodes` is a list, so membership is tested
# against a set built from it; otherwise the whole list is scanned once for
# every node in the ranking.
person_node_set = set(person_nodes)
pageranks_filtered = [item for item in pageranks_sorted if item[0] in person_node_set]

# The ten persons with the highest PageRank.
top_pageranks = pageranks_filtered[:10]

In [125]:
# Print label and score for each top-ranked person. The former `degree`
# assignment was never read (and did not hold a degree), so it is removed.
for node_id, rank in top_pageranks:
    print("Name:", G.nodes[node_id]["label"], "| PageRank Centrality:", rank)

Name: Wundt, Wilhelm | PageRank Centrality: 0.07749802724851465
Name: Oken, Lorenz | PageRank Centrality: 0.028146978668306927
Name: Baer, Karl Ernst von | PageRank Centrality: 0.019040751791885017
Name: Cohn, Jonas | PageRank Centrality: 0.01496615179743833
Name: Pirson, André | PageRank Centrality: 0.014152577093836416
Name: Eccles, John C. | PageRank Centrality: 0.013947577661537536
Name: Haller, Albrecht von | PageRank Centrality: 0.013138540797033306
Name: Du Bois- Reymond, Emil Heinrich | PageRank Centrality: 0.010103140633417208
Name: Sömmerring, Samuel Thomas von | PageRank Centrality: 0.009926079219744307
Name: Euler, Leonhard | PageRank Centrality: 0.008790405208168936


### Centrality of Women

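We can apply another filter to our centrality results to restrict them to the women in the network. *(TODO: this section has not been written yet.)*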

## Investigating Cliques

In [173]:
# nx.find_cliques returns a one-shot iterator. Materialise it so that cells
# reading `cliques` can be re-run (and the list inspected) without silently
# getting an empty result on the second pass.
cliques = list(nx.find_cliques(G))
# Optional inspection, largest cliques first:
#sorted(cliques, key = len, reverse=True)

In [180]:
# Keep the cliques that have three or more members.
cliques_of_three = list(filter(lambda members: len(members) >= 3, cliques))

# Union of all nodes appearing in any of those cliques.
clique_nodes = set().union(*cliques_of_three)

# Subgraph of G restricted to the clique members.
h = G.subgraph(clique_nodes)

In [181]:
from helper_functions.helper_fun import to_nx_graph
from pyvis.network import Network

# pyvis network of the clique subgraph, 750px high and full width, for notebook use.
nt = Network(height='750px', width='100%', notebook=True)
nt.from_nx(h)
nt.set_edge_smooth("dynamic")
# Write the HTML file; the call stays last so its return value is the cell output.
nt.show('./html_networks/cliques.html')

# Investigating a Predefined Set of Physiologists

* Gustav Fritsch (DE-588)115568808
* Eduard Hitzig (DE-588)116917423
* Hermann Munk (DE-588)117185930
* Nathan Zuntz (DE-588)118896202
* Friedrich Goltz (DE-588)116764694
* Adolf Fick (DE-588)118800000
* Jacques Loeb (DE-588)119133628

In [None]:
# Identifiers of the predefined physiologists, one per line.
physiologists = [
    "(DE-588)115568808",  # Gustav Fritsch
    "(DE-588)116917423",  # Eduard Hitzig
    "(DE-588)117185930",  # Hermann Munk
    "(DE-588)118896202",  # Nathan Zuntz
    "(DE-588)116764694",  # Friedrich Goltz
    "(DE-588)118800000",  # Adolf Fick
    "(DE-588)119133628",  # Jacques Loeb
]

# Bibliography

Scifo, E. (2020). Hands-On Graph Analytics with Neo4j: Perform graph processing and visualization techniques using connected data across your enterprise. Birmingham, England: Packt Publishing.