<a href="https://colab.research.google.com/github/thomouvic/txtanalytics/blob/main/dhwikinet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Build a network of Wikipedia pages by starting from a given seed page (e.g. Digital Humanities) and collecting its ego-net: the pages it links to and, in turn, the pages those link to.

Adapted from: Complex Network Analysis in Python
Recognize → Construct → Visualize → Analyze → Interpret
by Dmitry Zinoviev.

In [2]:
!pip install wikipedia

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11695 sha256=c19d35d77709db8014d9375ef1a1e8783aaff5b35add15007b1043bc32d6136a
  Stored in directory: /root/.cache/pip/wheels/07/93/05/72c05349177dca2e0ba31a33ba4f7907606f7ddef303517c6a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [3]:
from operator import itemgetter
import networkx as nx
import wikipedia

In [None]:
# Crawl configuration and bookkeeping for the breadth-first crawl.

# Title of the page the crawl starts from. Every title is normalised with
# str.title() so that links and stop-words compare in the same casing.
# Alternative seed: "Complex network".title()
SEED = "Digital humanities".title()

# Page titles that are never added to the network (bibliographic identifier
# pages that almost every article links to). Entries must be written in
# str.title() casing because links are title-cased before the lookup;
# an all-caps entry such as "ISBN" would never match.
STOPS = ("International Standard Serial Number",
         "International Standard Book Number",
         "National Diet Library",
         "International Standard Name Identifier",
         "International Standard Book Number (Identifier)",
         "Pubmed Identifier", "Pubmed Central",
         "Digital Object Identifier", "Arxiv",
         "Proc Natl Acad Sci Usa", "Bibcode",
         "Library Of Congress Control Number", "Jstor",
         "Isbn", "Doi", "Issn")

todo_lst = [(0, SEED)] # FIFO queue of (layer, title); the SEED is in layer 0
# Titles that have ever been queued. Note: {SEED}, not set(SEED) -- the latter
# would build a set of the individual characters of the title.
todo_set = {SEED}
done_set = set()       # Titles already taken off the queue; nothing is done yet



# Breadth-first crawl of Wikipedia links starting from the queued seed page.
# Builds the directed graph F with one edge page -> link for every outgoing
# link that is not filtered out.

F = nx.DiGraph()

# Only pages in layers 0 .. MAX_LAYER-1 are downloaded and expanded; pages
# discovered in layer MAX_LAYER stay in the graph as unexpanded leaves.
MAX_LAYER = 2

# The queue is FIFO and layers are appended in increasing order, so the
# crawl is finished as soon as the head of the queue reaches MAX_LAYER
# (or the queue runs dry, e.g. when the seed page has no usable links).
while todo_lst and todo_lst[0][0] < MAX_LAYER:
    layer, page = todo_lst.pop(0) #(1) take the next page off the queue
    done_set.add(page)
    print(layer, page) # Show progress

    try: #(2)
        # .links is fetched lazily over the network, so it belongs inside
        # the try block together with the page lookup itself.
        links = wikipedia.page(page).links
    except Exception as err:
        # Best effort: skip pages that cannot be loaded and move on.
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupt able to stop a long crawl.
        print("Could not load {}: {}".format(page, type(err).__name__))
        continue

    for link in links: #(3)
        link = link.title()
        if link not in STOPS and not link.startswith("List Of"):
            if link not in todo_set and link not in done_set:
                todo_lst.append((layer + 1, link))
                todo_set.add(link)
            F.add_edge(page, link)

print("{} nodes, {} edges".format(len(F), nx.number_of_edges(F)))
# 11597 nodes, 21331 edges

# Alex: 14300 nodes, 28192 edges (for Complex Network)
# Alex: 34385 nodes, 63182 edges (for Digital Humanities)

In [None]:
# Clean-up pass over the crawled graph F: drop self-loops, then merge nodes
# that are spelling variants of the same page title.

def _contract_pairs(graph, pairs):
    """Merge each (keep, drop) pair of nodes and return the resulting graph.

    The pairs are computed before any merging happens, so an earlier
    contraction may already have removed a node that a later pair refers to.
    Such stale pairs are skipped instead of letting contracted_nodes() fail
    on a missing node.
    """
    for keep, drop in pairs:
        if keep in graph and drop in graph:
            graph = nx.contracted_nodes(graph, keep, drop, self_loops=False)
    return graph


# Remove edges from a page to itself. The edge list is materialised first so
# the graph is not modified while the self-loop generator is being consumed.
# (nx.selfloop_edges(F) replaces the older F.selfloop_edges() method.)
F.remove_edges_from(list(nx.selfloop_edges(F)))

# Singular/plural variants: merge "<title>s" into "<title>".
duplicates = [(node, node + "s") for node in F if node + "s" in F]
F = _contract_pairs(F, duplicates)

# Hyphenated/spaced variants: merge "<a> <b>" into "<a>-<b>".
duplicates = [(x, y) for x, y
              in [(node, node.replace("-", " ")) for node in F]
              if x != y and y in F]
F = _contract_pairs(F, duplicates)

# contracted_nodes() records the merged node's data under a "contraction"
# node attribute; overwrite it with a plain 0 on every node.
# NOTE(review): presumably done so the graph can be exported later
# (dict-valued attributes are awkward for graph file formats) -- confirm.
nx.set_node_attributes(F, 0, "contraction")

In [56]:
# A: drop a handful of high-degree pages that carry no information about the
# topic. remove_nodes_from() silently ignores names that are not in the graph.
useless = ['Doi (Identifier)', 'Isbn (Identifier)', 'Issn (Identifier)', 'Internet', 'Html', 'Google']
F.remove_nodes_from(useless)

# Keep only the "core": nodes whose total degree (in + out) is at least 2.
core = [node for node, deg in F.degree() if deg >= 2]
G = F.subgraph(core)

print("{} nodes, {} edges".format(G.number_of_nodes(), G.number_of_edges()))

# Optional export of the core graph:
# nx.write_graphml(G, "cna.graphml")

# The 200 pages of the core with the most incoming links, highest first.
ranked_by_indegree = sorted(G.in_degree(), key=itemgetter(1), reverse=True)
top_indegree = ranked_by_indegree[:200]

# One line per page: in-degree followed by the page title.
for title, indeg in top_indegree:
    print(indeg, title)

8076 nodes, 34378 edges
119 Digital Humanities
110 S2Cid (Identifier)
101 Digital Library
96 Semantic Web
87 Database
84 World Wide Web
84 Metadata
80 Semantic Network
79 Ontology (Information Science)
77 Knowledge Representation And Reasoning
76 Hypertext
76 Knowledge Management
75 Knowledge Extraction
75 Semantics (Computer Science)
75 Xml
75 Information Architecture
75 Web 2.0
75 Simple Knowledge Organization System
74 Collective Intelligence
73 Semantic Reasoner
73 Semantic Publishing
73 Hypertext Transfer Protocol
73 Dublin Core
72 Shacl
72 Semantic Computing
72 Doap
72 Resource Description Framework
72 Json-Ld
72 Rdfa
72 Schema.Org
72 Internationalized Resource Identifier
72 Web Ontology Language
72 Linked Data
72 Notation3
72 Sparql
72 Topic Map
72 Uniform Resource Identifier
72 Foaf
71 N-Triples
71 Semantic Matching
71 Hyperdata
71 Common Logic
71 Semantic Analytics
71 Semantic Html
71 Rule Interchange Format
71 Trig (Syntax)
71 Grddl
71 Microdata (Html)
71 Hreview
71 Rdf Schem

In [52]:
# Save the core graph G as a GEXF file in the current working directory.
nx.write_gexf(G, path="dh_wikipedia.gexf")

In [31]:
# wiki = wikipedia.page('Isbn (Identifier)')
# print(wiki)
# print(wiki.title)

<WikipediaPage 'ISBN'>
ISBN


In [50]:
# for node in G.nodes:
#   for neighbor in G.successors(node):
#     print(node, ',', neighbor)

In [38]:
# Print the one-line summary of G (graph type, node count, edge count).
print(G)

DiGraph with 8394 nodes and 37263 edges
