In [12]:
import collections
import dataclasses

import networkx as nx
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns

from scripture_graph import graph_lib
from scripture_graph import notebook_lib

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [28]:
graph = nx.read_graphml('../../scripture_graph.graphml')
print(nx.info(graph))

Name: 
Type: DiGraph
Number of nodes: 48566
Number of edges: 166449
Average in degree:   3.4273
Average out degree:   3.4273


In [14]:
rows = []
for node in graph.nodes:
    if graph.nodes[node]['kind'] == 'topic':
        row = graph.nodes[node]
        row['key'] = node
        rows.append(row)
topics = pd.DataFrame(rows)
print(topics.shape)
topics.head()

(6571, 5)


Unnamed: 0,kind,volume,source,title,key
0,topic,Study Helps,TG,Aaron,TG Aaron
1,topic,Study Helps,TG,"Aaron, Descendants of","TG Aaron, Descendants of"
2,topic,Study Helps,TG,Aaronic Priesthood,TG Aaronic Priesthood
3,topic,Study Helps,TG,Abarim,TG Abarim
4,topic,Study Helps,TG,Abase,TG Abase


In [25]:
topics.source.value_counts()

TG       3512
IttTC    3059
Name: source, dtype: int64

In [26]:
graph.number_of_nodes() - len(topics)

41995

In [15]:
rows = []
for source, target in graph.edges:
    row = {'source': source, 'target': target}
    row.update({f'source_{key}': value for key, value in graph.nodes[source].items()})
    row.update({f'target_{key}': value for key, value in graph.nodes[target].items()})
    rows.append(row)
edges = pd.DataFrame(rows)
print(edges.shape)
edges.head()

(166449, 18)


Unnamed: 0,source,target,source_kind,source_volume,source_book,source_chapter,source_verse,target_kind,target_volume,target_source,target_title,target_key,target_book,target_chapter,target_verse,source_source,source_title,source_key
0,1 Chr. 1:1,TG Adam,verse,Old Testament,1 Chr.,1.0,1.0,topic,Study Helps,TG,Adam,TG Adam,,,,,,
1,1 Chr. 1:3,Moses 8:2,verse,Old Testament,1 Chr.,1.0,3.0,verse,Pearl of Great Price,,,,Moses,8.0,2.0,,,
2,1 Chr. 1:4,Moses 8:12,verse,Old Testament,1 Chr.,1.0,4.0,verse,Pearl of Great Price,,,,Moses,8.0,12.0,,,
3,1 Chr. 1:7,Gen. 10:4,verse,Old Testament,1 Chr.,1.0,7.0,verse,Old Testament,,,,Gen.,10.0,4.0,,,
4,1 Chr. 1:8,Gen. 10:6,verse,Old Testament,1 Chr.,1.0,8.0,verse,Old Testament,,,,Gen.,10.0,6.0,,,


In [16]:
topic_topic = (edges.source_kind == 'topic') & (edges.target_kind == 'topic')
topic_verse = (edges.source_kind == 'topic')
verse_topic = (edges.target_kind == 'topic')
verse_verse = (edges.source_kind == 'verse') & (edges.target_kind == 'verse')

assert topic_topic.sum() == 0
assert topic_verse.sum() + verse_topic.sum() + verse_verse.sum() == graph.number_of_edges()
print('topic->verse', topic_verse.sum())
print('verse->topic', verse_topic.sum())
print('verse->verse', verse_verse.sum())

topic->verse 98241
verse->topic 22223
verse->verse 45985


In [17]:
sizes = []
for component in nx.weakly_connected_components(graph):
    sizes.append(len(component))
sizes = np.asarray(sizes)

x = np.unique(sizes)
y = np.asarray([np.count_nonzero(sizes == size) for size in x])
df = pd.DataFrame({'size': x, 'count': y})
df

Unnamed: 0,size,count
0,1,13598
1,2,293
2,3,61
3,4,14
4,5,7
5,6,1
6,7,2
7,11,2
8,34066,1


In [18]:
# Prune the graph to the largest weakly connected component.
nodes = None
for component in nx.weakly_connected_components(graph):
    if len(component) > 1000:
        nodes = component
        break
subgraph = nx.subgraph(graph, nodes)
print(graph.number_of_nodes())
print(graph.number_of_edges())
print(subgraph.number_of_nodes())
print(subgraph.number_of_edges())

48566
166449
34066
165456


In [20]:
degree_df = notebook_lib.assign_ranks(nx.in_degree_centrality(subgraph))
degree_df.head(10)

Unnamed: 0,key,score,rank
0,TG Faith,0.003699,1
1,TG Righteousness,0.003581,2
2,"TG Jesus Christ, Prophecies about",0.003288,3
3,"TG Prayer, Pray",0.00317,4
4,"TG Repent, Repentance",0.003082,5
5,TG Treasure,0.002818,6
6,"TG God, Spirit of",0.002701,7
7,TG Missionary Work,0.002671,8
8,TG Walking with God,0.002671,8
9,TG Grace,0.002642,10


In [21]:
degree_df = notebook_lib.assign_ranks(nx.pagerank(subgraph))
degree_df.head(10)

Unnamed: 0,key,score,rank
0,TG Faith,0.000925,1
1,"TG Idolatry, Idol",0.000895,2
2,TG Righteousness,0.000889,3
3,"TG Jesus Christ, Prophecies about",0.000867,4
4,"TG Disobedience, Disobey",0.000849,5
5,TG Angels,0.000836,6
6,TG Treasure,0.000819,7
7,"TG Prayer, Pray",0.000796,8
8,TG Grace,0.000776,9
9,TG Missionary Work,0.000759,10


In [22]:
hubs, authorities = nx.hits(subgraph, max_iter=1000)

In [23]:
notebook_lib.assign_ranks(hubs).head(10)

Unnamed: 0,key,score,rank
0,IttTC God,0.004928,1
1,IttTC Name of the Lord,0.004036,2
2,TG Spirit,0.003881,3
3,"TG Know, Knew, Known",0.003669,4
4,IttTC Jesus Christ—Son of God,0.003669,5
5,IttTC Jesus Christ,0.003243,6
6,TG Name of the Lord,0.00289,7
7,"TG Jesus Christ, Divine Sonship",0.002863,8
8,"IttTC Spirit, Holy/Spirit of the Lord",0.002852,9
9,"IttTC Repentance, Repent",0.002696,10


In [24]:
notebook_lib.assign_ranks(authorities).head(10)

Unnamed: 0,key,score,rank
0,2 Ne. 31:21,0.000872,1
1,Isa. 9:6,0.000855,2
2,Jacob 4:5,0.000841,3
3,2 Ne. 19:6,0.000833,4
4,Moro. 10:4,0.000804,5
5,Moses 6:57,0.000795,6
6,D&C 68:25,0.000761,7
7,D&C 13:1,0.000729,8
8,JS—H 1:17,0.000688,9
9,D&C 20:37,0.000688,10
