In [1]:
import os
import bz2
import xml.etree.ElementTree as ET
import mwparserfromhell as mph

class Dump:
    """``Dump`` loads and parses dumps from wikipedia from
    ``path_xml`` with index ``path_idx``.
    
    Attributes
    ----------
    idx: dictionary
        ``{'page_name': (byte offset, page id, block size)}``
        Cached. Lazy.
    links: list of strings
        All links.
    article_links: list of strings
        Article links (not files, categories, etc.)
    years: list of int
        Years in the History section of a wikipedia page
        BC denoted as negative values
    page: mwparserfromhell.wikicode
        Current loaded wiki page
    path_xml: string
        Path to the zipped XML dump file.
    path_idx: string
        Path to the zipped index file.
    offset_max: int
        Maximum offset. Set as the size of the zipped dump.
    cache: xml.etree.ElementTree.Node
        Cache of the XML tree in current block
    """
    MAX_YEAR = 2020
    
    def __init__(self, path_xml, path_idx):
        self._idx = {}
        self._links = []
        self._article_links = []
        self._years = []
        self._page = None
        self.path_xml = path_xml
        self.path_idx = path_idx
        self.offset_max = 0
        self.cache = (0, None) # offset, cache
        
    @property
    def idx(self):
        if self._idx:
            return self._idx
        else:
            print('Dump: Loading index...')
            with bz2.BZ2File(self.path_idx, 'rb') as file:
                lines = [line for line in file]
            block_end = os.path.getsize(self.path_xml)
            offset_prev = block_end
            for line in reversed(lines):
                offset, pid, name = line.strip().split(b':', 2)
                offset, pid, name = (int(offset), int(pid), name.decode('utf8'))
                block_end = offset_prev if offset < offset_prev else block_end
                self._idx[name] = (offset, pid, block_end-offset)
                offset_prev = offset
            self.offset_max = max([x[0] for x in self._idx.values()])
            print('Dump: Loaded.')
            return self._idx
    
    @property
    def links(self):
        if self._links:
            return self._links
        elif self.page:
            self._links = [str(x.title) for x in self.page.filter_wikilinks()]
            self._links = [link.split('#')[0] for link in self._links]
            self._links = [link.split(' ') for link in self._links]
            self._links = [[words[0].capitalize()] + words[1:] for words in self._links]
            self._links = [' '.join(words) for words in self._links]
            return self._links
        else:
            return self._links
    
    @property
    def article_links(self):
        if self._article_links:
            return self._article_links
        elif self.links:
            self._article_links = [x for x in self.links if ':' not in x]
            return self._article_links
        else:
            return self._article_links
    
    @property
    def years(self):
        if self._years:
            return self._years
        elif self.page:
            history = Dump.get_history(self.page)
            top = self.page.get_sections()[0].strip_code()
            self._years = Dump.filter_years(top + history)
            return self._years
        else:
            return self._years
    
    @property
    def page(self):
        return self._page
    
    @page.setter
    def page(self, page):
        self._page = page
        self._links = []
        self._article_links = []
        self._years = []
    
    def load_page(self, page_name, filter_top=False):
        """Loads & returs page (``mwparserfromhell.wikicode``)
        named ``page_name`` from dump file. Returns only the
        top section if ``filter_top``.
        """
        if page_name not in self.idx.keys():
            self.page = None
            return
        offset, pid, block_size = self.idx[page_name]
        if offset == self.cache[0]:
            root = self.cache[1]
        else:
            xml = Dump.fetch_block(self.path_xml, offset, block_size)
            xml = b'<mediawiki>' + xml + b'</mediawiki>'*(offset != self.offset_max)
            root = ET.fromstring(xml)
            self.cache = (offset, root)
        text = Dump.search_id(root, pid)
        text = Dump.filter_top_section(text) if filter_top else text
        self.page = mph.parse(text, skip_style_tags = True)
        if self.page and 'REDIRECT' in self.page.strip_code():
            redirect = self.page.filter_wikilinks()[0].title
            return self.load_page(str(redirect))
        else:
            return self.page
    
    @staticmethod
    def fetch_block(path, offset, block_size):
        """ Fetches block of ``block_size`` (``int``) bytes
        at ``offset`` (``int``) in the zipped dump at 
        ``path`` (``string``) and returns the uncompressed
        text (``string``).
        """
        with open(path, 'rb') as file:
            file.seek(offset)
            return bz2.decompress(file.read(block_size))
    
    @staticmethod
    def search_id(root, pid):
        """Returns the text of the page with id ``pid``"""
        for page in root.iter('page'):
            if pid == int(page.find('id').text):
                return page.find('revision').find('text').text
    
    @staticmethod
    def filter_top_section(text):
        """Returns the top section of text,
        where the first header has the form ``==Heading==``
        """
        head = re.search(r'==.*?==', text)
        idx = head.span(0)[0] if head else len(text)
        return text[:idx] #(text[:idx], text[idx:])
    
    @staticmethod
    def get_history(page):
        """Returns the text of the history section.
        Returns ``""`` if not found.
        """
        headings = page.filter_headings()
        idx = [i for i, head in enumerate(headings) 
                       if 'History' in head or 'history' in head]
        if not idx:
            return ""
        sections = page.get_sections(include_headings=True)
        history = str(sections[idx[0]+1].strip_code())
        return history
    
    @staticmethod
    def filter_years(text):
        """Filters the years from text."""
        months = ['january', 'february', 'march', 'april', 'may', 'june',
                  'july', 'august', 'september', 'october', 'november', 'december']
        prepositions = ['around', 'after', 'at', 'as',
                        'approximately', 'before', 'between', 'by',
                        'during', 'from', 'in', 'near', 'past',
                        'since', 'until', 'within'] # removed: about, on
        conjugations = ['and']
        articles = ['the']
        times = ['early', 'mid', 'late']
        patterns = months + prepositions + conjugations + articles + times
        re_string = r'\b(' + '|'.join(patterns) + r')\b(\s|-)\b([0-9]{3,4})s?\b(?i)(?!\sMYA)\s?(BCE|BC)?'
        years = [int(match.group(3)) * (-2*bool(match.group(4))+1)
                for match in re.finditer(re_string, text, re.IGNORECASE)]
        re_string = r'([0-9]{1,2})(st|nd|rd|th) century\s?(BCE|BC)?'
        centuries = [(int(match.group(1)) * 100 - 100) * (-2*bool(match.group(2))+1)
                     for match in re.finditer(re_string, text, re.IGNORECASE)]
        years += centuries
        years = [y for y in years if y<Dump.MAX_YEAR]
        return sorted(years + centuries)
    
# import sys
# !{sys.executable} -m pip install networkx

In [2]:
path_base = '/Users/sppatankar/Developer/My Passport/Curiosity/Data/'
name_xml = 'enwiki-20190801-pages-articles-multistream.xml.bz2'
name_index = 'enwiki-20190801-pages-articles-multistream-index.txt.bz2'
path_xml = path_base + name_xml
path_index = path_base + name_index
dump = Dump(path_xml, path_index)

In [68]:
import pandas as pd
import codecs
import copy
import re
import numpy as np
from scipy.spatial.distance import cosine
from scipy.io import savemat
import networkx as nx

def clean_entity_name(name):
    
    name = name.replace('/wiki/', '') # remove leading address sub-string
    name = name.split("_")
    processed_string = []
    for token in name:
        try:
            token_copy = copy.deepcopy(token)
            # Find and replace six characters long encodings (they are divided in pairs by % signs)
            raw_decoding_tags_1 = re.findall('%[a-zA-Z\d]{2}%[a-zA-Z\d]{2}%[a-zA-Z\d]{2}', token_copy)
            processed_decoding_tags_1 = {}
            for idx, tag in enumerate(raw_decoding_tags_1):
                tag = tag.replace('%', '')
                tag = tag.lower()
                decoding = codecs.decode(tag, "hex").decode('utf-8')
                processed_decoding_tags_1[raw_decoding_tags_1[idx]] = decoding
            for key, value in processed_decoding_tags_1.items():
                token_copy = token_copy.replace(key, value) 
            # Find and replace four characters long encodings
            raw_decoding_tags_2 = re.findall('%[a-zA-Z\d]{2}%[a-zA-Z\d]{2}', token_copy)
            processed_decoding_tags_2 = {}
            for idx, tag in enumerate(raw_decoding_tags_2):
                tag = tag.replace('%', '')
                tag = tag.lower()
                decoding = codecs.decode(tag, "hex").decode('utf-8')
                processed_decoding_tags_2[raw_decoding_tags_2[idx]] = decoding
            for key, value in processed_decoding_tags_2.items():
                token_copy = token_copy.replace(key, value)
            # Find and replace two characters long encodings
            raw_decoding_tags_3 = re.findall('%[a-zA-Z\d]{2}', token_copy)
            processed_decoding_tags_3 = {}
            for idx, tag in enumerate(raw_decoding_tags_3):
                tag = tag.replace('%', '')
                tag = tag.lower()
                decoding = codecs.decode(tag, "hex").decode('utf-8')
                processed_decoding_tags_3[raw_decoding_tags_3[idx]] = decoding
            for key, value in processed_decoding_tags_3.items():
                token_copy = token_copy.replace(key, value)
            processed_string.append(token_copy)
        except UnicodeDecodeError:
            print('Could not process %s.' % token)
            processed_string.append(token_copy)

    # print(' '.join(processed_string))
    return ' '.join(processed_string)

test_string = 'Mosque%E2%80%93Cathedral_of_C%C3%B3rdoba'
clean_entity_name(test_string)

test_string = 'Bah%C3%A1%27%C3%AD_Faith'
clean_entity_name(test_string)

test_string = '/wiki/99_and_44/100%25_Dead'
clean_entity_name(test_string)

test_string = 'Camden%2C_New_Jersey'
clean_entity_name(test_string)

test_string = 'Palu,_Elaz%C4%B1%C4%9F'
clean_entity_name(test_string)

Could not process Elaz%C4%B1%C4%9F.


'Palu, Elaz%C4%B1%C4%9F'

In [69]:
wiki_df = pd.read_csv(path_base + 'KNOT_data_raw.csv')
wiki_df.head(3)

Unnamed: 0,ID,SourceName,TargetName,Day,TimeOrder,Hyperlink,DistanceWeights,AgeYears,SexOrient,Race,GenderFactor,EducDeg,Income,JE_5D,DS_5D,ST_5D,SC_5D,TS_5D,Count,Weight
0,101,/wiki/Jeff_Bezos,/wiki/Cloud_infrastructure,1,1,no,1.0,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,1,0.0
1,101,/wiki/Cloud_infrastructure,/wiki/Cloud_computing_security,1,2,yes,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2,0.8
2,101,/wiki/Cloud_computing_security,/wiki/Cloud_infrastructure,1,3,no,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,3,0.8


In [70]:
# create UID for each page
source_nodes = set(wiki_df['SourceName'].tolist())
target_nodes = set(wiki_df['TargetName'].tolist())
source_nodes.update(target_nodes)
node_set = {entity: name for name, entity in enumerate(source_nodes)}
wiki_df['SourceUID'] = wiki_df['SourceName'].apply(lambda x: node_set[x])
wiki_df['SrcNameClean'] = wiki_df['SourceName'].apply(lambda x: clean_entity_name(x))
wiki_df['TargetUID'] = wiki_df['TargetName'].apply(lambda x: node_set[x])
wiki_df['TgtNameClean'] = wiki_df['TargetName'].apply(lambda x: clean_entity_name(x))
wiki_df.head(3)

Could not process Elaz%C4%B1%C4%9F.
Could not process Dav%C3%AD%C3%B0sd%C3%B3ttir.
Could not process %C3%9E%C3%B3risd%C3%B3ttir.
Could not process %C4%90%C3%ACnh.
Could not process Dvo%C5%99%C3%A1k.
Could not process Elaz%C4%B1%C4%9F.
Could not process Dav%C3%AD%C3%B0sd%C3%B3ttir.
Could not process %C3%9E%C3%B3risd%C3%B3ttir.
Could not process %C4%90%C3%ACnh.
Could not process Dvo%C5%99%C3%A1k.


Unnamed: 0,ID,SourceName,TargetName,Day,TimeOrder,Hyperlink,DistanceWeights,AgeYears,SexOrient,Race,...,DS_5D,ST_5D,SC_5D,TS_5D,Count,Weight,SourceUID,SrcNameClean,TargetUID,TgtNameClean
0,101,/wiki/Jeff_Bezos,/wiki/Cloud_infrastructure,1,1,no,1.0,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,1,0.0,10617,Jeff Bezos,8349,Cloud infrastructure
1,101,/wiki/Cloud_infrastructure,/wiki/Cloud_computing_security,1,2,yes,0.2,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,2,0.8,8349,Cloud infrastructure,412,Cloud computing security
2,101,/wiki/Cloud_computing_security,/wiki/Cloud_infrastructure,1,3,no,0.2,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,3,0.8,412,Cloud computing security,8349,Cloud infrastructure


In [73]:
# split the data by individual
ID_groups = wiki_df.groupby('ID')
for ID, group in ID_groups:
    count = 0
    # enforce time ordering
    group.sort_values(by = ['TimeOrder'], inplace = True)
    network_df = group[['TimeOrder', 'SourceUID', 'SrcNameClean', 'TargetUID', 'TgtNameClean']].reset_index(drop = True)
    G = nx.Graph()
    links = {}
    for index, row in network_df.iterrows():
        from_node = row.get('SrcNameClean')
        to_node = row.get('TgtNameClean')
        tag = 0
        if from_node in dump.idx.keys():
            if to_node in dump.idx.keys():
                tag = 1
                count += 1
        # print(from_node, ', ',to_node, tag)
        # add nodes to the network
        G.add_node(from_node)
        G.add_node(to_node)
    print("Network built for subject %d" % ID)
    print(len(network_df), count)
        
    
#     # create an empty network
#     G = nx.Graph()
#     edge_info = []
#     for index, row in network_df.iterrows():
#         from_node = row.get('SrcNameClean')
#         to_node = row.get('TgtNameClean')
#         edge_info_dict = {'from': from_node, 'to': to_node}
#         edge_info.append(edge_info_dict)
#         # add nodes to the network
#         G.add_node(from_node)
#         G.add_node(to_node)
#         # add edge to the network
#         G.add_edge(from_node, to_node)
#     adj_G = nx.linalg.graphmatrix.adjacency_matrix(G, weight = 'weight')
#     break

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Network built for subject 101
311 309
Network built for subject 104
148 148
Network built for subject 105
100 100
Network built for subject 106
476 476
Network built for subject 107
202 198
Network built for subject 108
134 134
Network built for subject 109
135 135
Network built for subject 112
89 89
Network built for subject 114
127 127
Network built for subject 115
398 393
Network built for subject 117
165 165
Network built for subject 119
255 255
Network built for subject 120
111 111
Network built for subject 121
321 315
Network built for subject 122
346 344
Network built for subject 126
246 246
Network built for subject 127
179 179
Network built for subject 128
151 151
Network built for subject 130
337 335
Network built for subject 131
287 287
Network built for subject 132
124 124
Network built for subject 135
215 211
Network built for subject 138
199 195
Network built for subject 139
261 259
Network built for subject 140
185 185
Network built for subject 141
560 556
Network built 

In [None]:
topics = ['optics']
links = {}
for topic in topics:
    dump.load_page('Index of %s articles' % topic)
    links[topic] = [str(l) for l in dump.article_links]
    print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

In [None]:
topic

In [None]:
dump.idx.keys()

In [None]:
links

# Get index of articles

* [all indices on Wikipedia](https://en.wikipedia.org/wiki/Portal:Contents/Indices)
* topics not searched
* international trade ("topics"), theory of constraints (small)
* too big: mathematics, neuroscience

In [None]:
path_base = '/Users/harangju/Developer/data/wiki/dumps/'
name_xml = 'enwiki-20190801-pages-articles-multistream.xml.bz2'
name_index = 'enwiki-20190801-pages-articles-multistream-index.txt.bz2'
path_xml = path_base + name_xml
path_index = path_base + name_index
dump = wiki.Dump(path_xml, path_index)

In [None]:
# natural & physical sciences
topics = ['anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
          'genetics', 'immunology', 'molecular biology']
topics += ['chemistry', 'biophysics', 'energy', 'optics', 
           'earth science', 'geology', 'meteorology']
# philosophy
# topics += []
topics += ['philosophy of language', 'philosophy of law', 
           'philosophy of mind', 'philosophy of science']
# social sciences
topics += ['economics', 'accounting', 'education', 'linguistics', 'law', 'psychology',
           'sociology']
# technology & applied sciences
topics += ['electronics', 'software engineering', 'robotics']

In [None]:
links = {}
for topic in topics:
    dump.load_page('Index of %s articles' % topic)
    links[topic] = [str(l) for l in dump.article_links]
    print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

In [None]:
# https://en.wikipedia.org/wiki/Lists_of_mathematics_topics
# algebra
math_topics = ['calculus', 'geometry', 'abstract algebra',
               'Boolean algebra', 'commutative algebra',# 'homological algebra',
               'group theory',# 'representation theory', 
               'linear algebra']
# calculus & analysis
# math_topics += ['complex analysis', 'functional analysis',
#                 'integration and measure theory', 'harmonic analysis',
#                 'Fourier analysis', 'multivariable calculus', 'real analysis',
#                 'variational']
# geometry
# math_topics += ['geometry', 'curves', 'triangle', 'circle', 'general topology',
#                 'differential geometry', 'algebraic geometry', 'algebraic topology',
#                 'geometric topology', 'know theory', 'Lie groups']
# number theory
math_topics += [#'algebraic number theory',
                'number theory']
# applied math
math_topics += ['dynamical systems and differential equations']
#                 'partial differential equation']
topics += math_topics

In [None]:
links = {}
for topic in math_topics:
    dump.load_page(f"List of {topic} topics")
    links[topic] = [str(l) for l in dump.article_links]
    print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

In [None]:
topics += ['physics']

In [None]:
import string

topic = 'physics'
links[topic] = []
for letter in ['!$@', '0–9'] + list(string.ascii_uppercase):
    dump.load_page('Index of physics articles (%s)' % letter)
    links[topic].extend([str(l) for l in dump.article_links])
print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

In [None]:
topics += ['mathematics']

In [None]:
topic = 'mathematics'
links[topic] = []
for letter in ['0–9'] + list(string.ascii_uppercase):
    dump.load_page(
        f"Wikipedia:WikiProject Mathematics/List of mathematics articles ({letter})'
    )
    links[topic].extend([str(l) for l in dump.article_links])
print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

# Build graphs of topics

In [None]:
import pickle
import gensim.utils as gu

path_models = '/Users/harangju/Developer/data/wiki/models/'
tfidf = gu.SaveLoad.load(path_models + 'tfidf.model')
dct = pickle.load(open(path_models + 'dict.model','rb'))

## One network per topic

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/dated/'

networks = {}
for topic in topics:
    ls = links[topic]
    print('\nTopic: ' + topic)
    networks[topic] = wiki.Net()
    networks[topic].build_graph(
        name=topic, dump=dump, nodes=ls, model=tfidf, dct=dct
    )
    networks[topic].save_graph(path + topic + '.pickle')
    networks[topic].save_graph(path + topic + '.gexf')
    networks[topic].save_barcodes(path + topic + '.barcode')

## Redo barcodes

In [None]:
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    networks[topic] = wiki.Net(
        path_graph=os.path.join(path_saved, topic+'.pickle'),
    )

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/barcode/'

for topic in topics:
    print('\nTopic: ' + topic)
    networks[topic].save_barcodes(os.path.join(path, topic+'.barcode'))

## Subnetworks

## Big network

In [None]:
all_links = list(set([v for l in links.values() for v in l]))
len(all_links)

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/dated/'

big_network = wiki.Net()
big_network.build_graph(
    name='big_network',
    dump=dump, 
    nodes=all_links, 
    model=tfidf, 
    dct=dct,
    compute_core_periphery=False, 
    compute_communities=False, 
    compute_community_cores=False
)
big_network.save_graph(os.path.join(path, 'big_network_physics_math.pickle'))
big_network.save_graph(os.path.join(path, 'big_network_physics_math.gexf'))
# big_network.save_barcodes(os.path.join(path, 'big_network.barcode'))

## Nodes without years

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/dated-noyear/'

if not os.path.isdir(path):
    os.mkdir(path)

networks_noyear = {}
for topic in topics:
    print('\nTopic: ' + topic)
    networks_noyear[topic] = wiki.Net()
    networks_noyear[topic].build_graph(
        name=topic, dump=dump, nodes=links[topic],
        fill_empty_years=False,
        compute_core_periphery=False,
        compute_communities=False,
        compute_community_cores=False
    )
    networks_noyear[topic].save_graph(path + topic + '.pickle')
    networks_noyear[topic].save_graph(path + topic + '.gexf')

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/dated-noyear/'

networks_noyear = {}
for topic in topics:
    print(topic, end=' ')
    networks_noyear[topic] = wiki.Net()
    networks_noyear[topic].load_graph(path + topic + '.pickle')

In [None]:
import pandas as pd
import networkx as nx
import plotly.express as px

fraction_years = pd.DataFrame(
    [
        [
            topic,
            len([
                y
                for n, y in nx.get_node_attributes(
                    networks_noyear[topic].graph, 'year'
                ).items()
                if y
            ]) / len(networks_noyear[topic].graph.nodes)
        ]
        for topic in topics
    ],
    columns=['topics', 'fraction']
)
fraction_years

In [None]:
# for topic in topics:
#     print(topic, end='\t')
#     print( 
#         len([
#             y
#             for n, y in nx.get_node_attributes(
#                 networks_noyear[topic].graph, 'year'
#             ).items()
#             if y
#         ]) / len(networks_noyear[topic].graph.nodes)
#     )

In [None]:
path_fig = '/Users/harangju/Library/Mobile Documents/com~apple~CloudDocs/' +\
    'Documents/research/wikipedia/results'
path_plot = '0 graphs'

fig = px.histogram(fraction_years.fraction)
fig.update_layout(
    width=500, height=360,
    template='plotly_white',
    xaxis={'range': [0, 1]},
    showlegend=False
)
fig.show()
fig.write_image(os.path.join(path_fig, path_plot, 'fraction_years_with_math.pdf'))

# Generate null networks

## Load networks

In [None]:
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    networks[topic] = wiki.Net()
    networks[topic].load_graph(path_saved + topic + '.pickle')

## Randomized target & year

Just randomizing year -> you get the same structures, it's just a matter of when you get those structures.
If you randomize year & target, then you're randomizing the structure & how they come about without changing any basic network statistics.

In [None]:
null_models = ['target', 'year']
num_nulls = 10
nulls = {}
for null_model in null_models:
    print('Null model: ' + null_model)
    path_to_save_null = '/Users/harangju/Developer/data/wiki/graphs/null-'\
                        +null_model+'/'
    nulls[null_model] = {}
    for topic, network in networks.items():
        print('Topic: ' + topic)
        nulls[null_model][topic] = []
        for i in range(num_nulls):
            print('Null: ' + str(i))
            null = network.randomize(null_model)
            null.graph.name = topic+'-null-'+str(i)
            null.save_graph(path_to_save_null + null.graph.name + '.pickle')
            null.save_barcodes(path_to_save_null + null.graph.name + '.barcode')
            nulls[null_model][topic].append(null)

## Jittered years

In [None]:
num_jitters = 1
max_jitter = 1
null_model = 'jitter'
path_to_save_null = '/Users/harangju/Developer/data/wiki/graphs/null-'+null_model+'/'
if not os.path.isdir(path_to_save_null):
    os.mkdir(path_to_save_null)
jittered = {}

In [None]:
import copy
import numpy as np
import numpy.random

for topic, network in networks.items():
    print('Topic: ' + topic)
    jittered[topic] = []
    print('Null: ', end='')
    for i in range(num_jitters):
        print(str(i), end=' ')
        null = wiki.Net()
        null.graph = copy.deepcopy(network.graph)
        for node in null.graph.nodes:
            null.graph.nodes[node]['year'] = null.graph.nodes[node]['year'] +\
                np.random.randint(-max_jitter, max_jitter+1)
        null.graph.name = topic+'-null-'+str(i)
        null.save_graph(path_to_save_null + null.graph.name + '.pickle')
        null.save_barcodes(path_to_save_null + null.graph.name + '.barcode')
        jittered[topic].append(null)
    print()

Gephi notes
* node size, fruchterman reingold = [10, 40], force atlas 2 = [4 16]
* text size = [1 1.4]
* preview font size = 5

## Generative networks
* random geometric graph (modularity)
* stochastic block model (modularity)
* caveman graph (modularity, cliques, most clustered & sparse)
* random clustered graph (clustering)

In [None]:
num_nulls = 10
gen_functions = {
    'rgg': lambda g: nx.random_geometric_graph(
        g.number_of_nodes(), 
    ),
    'sbm': lambda g: nx.stochastic_block_model(
        
    ),
    'cg': lambda g: nx.caveman_graph(
        
    ),
    'rcg': lambda g: nx.random_clustered_graph(
        
    )
}

In [None]:
num_nulls = 10
gen_nulls = {}
for name, function in gen_functions.items():
    print('Null model: ' + null_model)
    path_to_save_null = '/Users/harangju/Developer/data/wiki/graphs/null-'+\
        null_model+'/'
    nulls[null_model] = {}
    for topic, network in networks.items():
        print('Topic: ' + topic)
        nulls[null_model][topic] = []
        for i in range(num_nulls):
            print('Null: ' + str(i))
            null = network.randomize(null_model)
            null.graph.name = topic+'-null-'+str(i)
            null.save_graph(path_to_save_null + null.graph.name + '.pickle')
            null.save_barcodes(path_to_save_null + null.graph.name + '.barcode')
            nulls[null_model][topic].append(null)

# Generate networks for D3

In [None]:
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    networks[topic] = wiki.Net()
    networks[topic].load_graph(path_saved + topic + '.pickle')

In [None]:
[(topic, len(networks[topic].graph.nodes)) for topic in topics]

In [None]:
import json as js
import networkx as nx

path = '/Users/harangju/Developer/data/wiki/graphs/json/'

for topic, network in networks.items():
    nodes = sorted(network.graph.nodes, key=lambda n: network.graph.nodes[n]['year'])
    json = js.dumps({
        'nodes': [
            {
                'id': node,
                'year': int(network.graph.nodes[node]['year']),
#                 'core_be': int(network.graph.nodes[node]['core_be']),
#                 'core_rb': network.graph.nodes[node]['core_rb'],
                'community': int(network.graph.nodes[node]['community']),
                'degree': network.graph.degree(node)
            }
            for i, node in enumerate(nodes)
        ],
        'links': [
            {
                'source': i,
                'target': nodes.index(target),
                'weight': network.graph.edges[node, target]['weight']
            }
            for i, node in enumerate(nodes)
            for target in network.graph.successors(node)
        ]
    })
    with open(os.path.join(path, topic+'.json'), 'w') as file:
        file.write(json)

# Generate barcodes for D3

In [None]:
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    networks[topic] = wiki.Net(
        path_graph=os.path.join(path_saved, topic+'.pickle'),
#         path_barcodes=os.path.join(path_saved, topic+'.barcode')
    )

In [None]:
f = networks['cognitive science'].filtration
m = networks['cognitive science'].persistence
for i, c in enumerate(m):
    if m.pair(i) < i: continue      # skip negative simplices
    dim = f[i].dimension()
    if m.pair(i) != m.unpaired:
        print(f"{i}, {dim}, {c}, {m[i]}, {m.pair(i)}, {m[m.pair(i)]}")
    else:
        print(f"{i}, {dim}, {c}, {m[i]}")

In [None]:
m[m.pair(13)]

In [None]:
import dionysus as d

dgms = d.init_diagrams(
    networks['earth science'].persistence,
    networks['earth science'].filtration
)
dgms

In [None]:
for i, dgm in enumerate(dgms):
    print(f"dim {i}", end=' ')
    for p in dgm:
        print(p, end='; ')
    print()

In [None]:
dgms[2][0], dgms[2][0].data

In [None]:
import numpy as np
barcodes = networks['cognitive science'].barcodes.copy()
barcodes = barcodes\
    .drop(index=barcodes[barcodes.lifetime==0].index)\
    .reset_index(drop=True)
barcodes

In [None]:
barcodes.iloc[27]['death simplex'], barcodes.iloc[27]['homology nodes']

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/barcode csv/'

for topic, network in {'cognitive science': networks['cognitive science']}.items(): #networks.items():
    barcodes = network.barcodes.copy(deep=True)
    barcodes = barcodes\
        .drop(index=barcodes[barcodes.lifetime==0].index)\
        .reset_index(drop=True)
    barcodes.death = barcodes.death\
        .replace(np.inf, 2100)\
        .astype(int)
    csv = 'i,birth,death,dim,cavity,death_nodes\n'
    for i, row in barcodes.iterrows():
        if row.lifetime==np.inf:
            cavity = row['birth simplex']
        else:
            cavity = row['homology nodes']
        csv += f"{i},{row.birth},{row.death},{row.dim},{';'.join(cavity)}," + \
            f"{';'.join(row['death nodes'])}\n"
    with open(os.path.join(path, topic+'.csv'), 'w') as file:
        file.write(csv)
# print(csv)