In [1]:
import os
import re
import gc
import json
import pickle
import xml.sax
import requests
import pandas as pd

import bz2
import subprocess
import numpy as np
from IPython import display
from matplotlib import pyplot as plt
from timeit import default_timer as timer
from scipy.sparse import csr_matrix, coo_matrix, csc_matrix, lil_matrix, coo_matrix

In [2]:
import tqdm
from functools import partial
from multiprocessing import Pool
from timeit import default_timer as timer

In [3]:
import mwparserfromhell

In [4]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

## arguements

In [5]:
project = 'enwiki'
dump_date = "20220420"
dataset_home = '/home/cse/phd/anz198717/scratch/suchith_data/wikipedia/wikipedia-data-science'

In [7]:
prog = re.compile(f'({project}-{dump_date})-'+r'pages-articles-multistream([0-9]{1,2}).xml-(p[0-9]+p[0-9]+).bz2')

partitions = sorted([f'{dataset_home}/datasets/{file}' for file in os.listdir(f'{dataset_home}/datasets') 
                     if prog.match(file)])

print(f'Total number of partitions of the wikipedia dump : {len(partitions)}')

Total number of partitions of the wikipedia dump : 62


## helper functions

In [8]:
def extract_filetag(data_path, tag_extractor):
    file_tag = ''
    if isinstance(tag_extractor, re.Pattern):
        parts_tag = tag_extractor.match(os.path.basename(data_path))
        if parts_tag:
            try:
                file_tag = f'-{parts_tag.group(1)}-{parts_tag.group(2)}-{parts_tag.group(3)}'
            except:
                file_tag = ''
    elif isinstance(tag_extractor, str):
        file_tag = tag_extractor
    return file_tag

In [9]:
def dict_head_random(dictionary, n=10):
    keys = np.random.choice(list(dictionary.keys()), size=n)
    for k in keys:
        print(f'{k} : {dictionary[k]}')

## Page handler

In [10]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    
    def __init__(self, matches=None):
        xml.sax.handler.ContentHandler.__init__(self)
        
        """
        PARSING VARIABLES: these variables will be useful
        while parsing the wikipedia dumps.
        """
        #regrex to select wikipedia section
        #TODO: remove this from here.
        self.matches = matches
        
        #basic storage for on-the-fly processing
        self._buffer = None
        self._values = {}
        self._current_tag = None
        
        #flags for handling special cases.
        self._add_page = True
        self._is_pageid = True
        
        
        
        
        
        #from here improvements can be made
        self.pages = []
        self._article_count = 0
        self._total_edges = 0
        
        """
        output information
        """
        #self.article_mat = WikilinkGraph()
        #self.seealso_mat = WikilinkGraph()
        
        self.redirects = {}
        self.id_to_title = {}
        self.page_content = {}
        
        
    def characters(self, content):
        if self._current_tag:
            self._buffer.append(content)
            
    def startElement(self, name, attrs):
        if name in ('title', 'text', 'ns'):
            self._current_tag = name
            self._buffer = []
        elif name == 'id' and self._is_pageid:
            self._is_pageid = False
            self._current_tag = name
            self._buffer = []    
        elif name == 'redirect':
            self.redirects[self._values['article_title'].strip()] = attrs.getValue('title').strip()
            self._add_page = False
                   
    def endElement(self, name):
        if name == self._current_tag:
            self._values[f'article_{name}'] = ' '.join(self._buffer)
            self._current_tag = None
        elif name == 'page':
            if int(self._values['article_ns']):
                self._add_page = False
                
            if self._add_page:
                self._article_count += 1
                self.pages.append(self._values.copy())
                
            self._add_page = True
            self._is_pageid = True

In [11]:
data_path = sorted(partitions)[0]; data_path

'/home/cse/phd/anz198717/scratch/suchith_data/wikipedia/wikipedia-data-science/datasets/enwiki-20220420-pages-articles-multistream1.xml-p1p41242.bz2'

In [12]:
matches = r'^([Ss]ee[ ]*|[Ss]ee[ ]*([Aa]lso|[Mm]ore|[Aa]ll)|[Ss]ee[ ]*[Aa]lso[ ]*\(.+\))$'

In [13]:
start = timer()

handler = WikiXmlHandler(matches)

parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(subprocess.Popen(['bzcat'], 
                         stdin = open(data_path), 
                         stdout = subprocess.PIPE).stdout):
    parser.feed(line)
    
    if handler._article_count > 10:
        break
        
end = timer()
        
print(f'Searched through {handler._article_count} articles.')
print(f'Processing time is {round(end - start)} secs.')

Searched through 11 articles.
Processing time is 0 secs.


In [16]:
handler._article_count, len(handler.pages)

(11, 11)

## Graph Data Structure

In [10]:
def prune_map(mapping, idxs):
    rev_mapping = {value:key for key, value in mapping.items()}
    pruned_mapping = {}
    for i, idx in enumerate(idxs):
        pruned_mapping[rev_mapping[idx]] = i
    return pruned_mapping

In [20]:
class WikilinkGraph:

    def __init__(self):
        self.num_docs = 0
        self.doc_to_rowindex = {}
        
        """
        document link frequency
        """
        self.lf_indptr = [0]
        self.lf_indices = []
        self.lf = []
        
        """
        link document frequency
        """
        self.link_doc_cnt = {}
        
        self.wikilinks = {}
        
    def update_lf_df(self, page_title, links):
        if page_title in self.doc_to_rowindex:
            return
        
        self.doc_to_rowindex[page_title] = self.num_docs
        self.num_docs += 1
        
        """
        number of links of a particular type present in a document.
        """
        doc_link_cnt = {}
        for link in links:
            #please check this again, index here is strange.
            index = self.wikilinks.setdefault(link, len(self.wikilinks))
            doc_link_cnt[link] = doc_link_cnt.get(link, 0) + 1
        
        """
        document link frequecy matrix
        """
        for link, cnt in doc_link_cnt.items():
            self.lf_indices.append(self.wikilinks[link])
            self.lf.append(cnt)
            self.link_doc_cnt[link] = self.link_doc_cnt.get(link, 0) + 1
        self.lf_indptr.append(len(self.lf_indices))
        
    def get_doc_link_freq(self):
        """
        document link frequency
        """
        doc_lf = csr_matrix((self.lf, self.lf_indices, self.lf_indptr), 
                            shape=(self.num_docs, len(self.wikilinks)),dtype=int)
        return doc_lf
    
    def get_link_doc_freq(self):
        """
        link document frequency
        """
        df_indptr = [0]
        df_indices = []
        df = []

        for link, cnt in self.link_doc_cnt.items():
            df_indices.append(self.wikilinks[link])
            df.append(cnt)
        df_indptr.append(len(df_indices))

        link_df = csr_matrix((df, df_indices, df_indptr),
                             shape=(1, len(self.wikilinks)),
                             dtype=int)
        return link_df
    
    def save_data(self, save_dir, tag=''):
        os.makedirs(save_dir, exist_ok=True)
        
        doc_lf = self.get_doc_link_freq()
        link_df = self.get_link_doc_freq()
        
        stat = (self.doc_to_rowindex, doc_lf, link_df,
                self.num_docs, self.wikilinks)

        with open(f'{save_dir}/link_graph{tag}.pickle', 'wb') as f:
            pickle.dump(stat, f)
            
    def load_data(self, save_dir, tag=''):
        filename = f'{save_dir}/link_graph{tag}.pickle'
        
        if os.path.exists(filename):
            
            with open(filename, 'rb') as f:
                stat = pickle.load(f)
                
            self.doc_to_rowindex, doc_lf, link_df, self.num_docs, self.wikilinks = stat
            
            self.lf = list(doc_lf.data)
            self.lf_indptr = list(doc_lf.indptr)
            self.lf_indices = list(doc_lf.indices)
            
            rev_wikilinks = {v:k for k, v in self.wikilinks.items()}
            self.link_doc_cnt = {rev_wikilinks[idx]:cnt for idx, cnt in zip(link_df.indices, link_df.data)}
            
            return True
        return False
    
    def convert_mat2list(self, doc_lf, link_df):
        self.lf = list(doc_lf.data)
        self.lf_indptr = list(doc_lf.indptr)
        self.lf_indices = list(doc_lf.indices)

        rev_wikilinks = {v:k for k, v in self.wikilinks.items()}
        self.link_doc_cnt = {rev_wikilinks[idx]:cnt for idx, cnt in zip(link_df.indices, link_df.data)}
    
    def filter_graph(self, filter_func):
        doc_lf = self.get_doc_link_freq()
        link_df = self.get_link_doc_freq()
        
        doc_lf, link_df, self.wikilinks = self.filter_graph_columns(doc_lf, link_df, self.wikilinks, filter_func)
        doc_lf, self.doc_to_rowindex = self.filter_graph_rows(doc_lf, self.doc_to_rowindex)
        
        self.convert_mat2list(doc_lf, link_df)
        self.num_docs = len(self.doc_to_rowindex)
        
    def filter_graph_columns(self, doc_lf, link_df, wikilinks, filter_func):
        filtered_wikilinks = {}
        for link in wikilinks:
            processed_link = filter_func(link)
            if processed_link:
                if processed_link in filtered_wikilinks:
                    #doc_lf[:, filtered_wikilinks[processed_link]] += doc_lf[:, wikilinks[link]]
                    #link_df[:, filtered_wikilinks[processed_link]] += link_df[:, wikilinks[link]]
                    pass
                else:
                    filtered_wikilinks[processed_link] = wikilinks[link]
        doc_lf = doc_lf[:, list(filtered_wikilinks.values())]
        link_df = link_df[:, list(filtered_wikilinks.values())]
        wikilinks = {article_title: i for i, article_title in enumerate(filtered_wikilinks)}
        
        return doc_lf, link_df, wikilinks
    
    def filter_graph_rows(self, doc_lf, doc_to_rowindex):
        pruned_row_idx = np.where( np.array(doc_lf.sum(axis=1)).reshape(-1) > 0 )[0]
        doc_lf = doc_lf[pruned_row_idx, :]
        doc_to_rowindex = prune_map(doc_to_rowindex, pruned_row_idx)
        return doc_lf, doc_to_rowindex
    

In [21]:
class WikiGraphDataset:
    
    def __init__(self, matches=None):
        self.matches = matches
        
        self.seealso_graph = WikilinkGraph()
        self.article_graph = WikilinkGraph()
        
        self.id_to_title = {}
        self.wiki_content = {}
        self.redirects = {}
        
    def extract_article_info(self, article_title, article_text):
        wikicode = mwparserfromhell.parse(article_text, skip_style_tags=True)
        wikicode.remove_nodetype(inplace=True)

        match_sections, rest_sections = wikicode.split_sections(matches=self.matches, include_lead=True, flat=True)

        match_wikilinks = self.extract_section_wikilinks(match_sections, article_title)
        rest_wikilinks = self.extract_section_wikilinks(rest_sections, article_title)

        article_content = wikicode.strip_code().strip()

        return match_wikilinks, rest_wikilinks, article_content
    
    def extract_section_wikilinks(self, sections, article_title):
        wikilinks = list()

        for section in sections:
            links = list( map(lambda link: (link.title).strip_code().strip(), section.filter_wikilinks()) )
            wikilinks.extend(links)

        return wikilinks
    
    def add_article(self, article_title, article_text, article_id, article_ns):
        
        article_title, article_id = article_title.strip(), int(article_id.strip())
        if article_title in self.article_graph.doc_to_rowindex or article_title in self.seealso_graph.doc_to_rowindex:
            return
        seealso_wikilinks, article_wikilinks, article_content = self.extract_article_info(article_title, 
                                                                                          article_text)
        
        if len(seealso_wikilinks) or len(article_wikilinks):
            self.id_to_title[article_id] = article_title
            
            if len(seealso_wikilinks):
                self.seealso_graph.update_lf_df(article_title, seealso_wikilinks)
            
            if len(article_wikilinks):
                self.article_graph.update_lf_df(article_title, article_wikilinks)
            
            if article_content:
                self.wiki_content[article_id] = article_content
                
    def add_redirect(self, article_title, target_title):
        self.redirects[article_title] = target_title
        
    def save_graph(self, save_dir, tag=''):
        self.seealso_graph.save_data(save_dir, tag=f'_seealso{tag}')
        self.article_graph.save_data(save_dir, tag=f'_articles{tag}')
                
    def save_data(self, save_dir, tag=''):
        os.makedirs(save_dir, exist_ok=True)
        
        self.save_graph(save_dir, tag)
        self.save_idtotitle(save_dir, tag)
        self.save_wikicontent(save_dir, tag)
        self.save_redirects(save_dir, tag)
        
    def save_idtotitle(self, save_dir, tag=''):
        map_file = f'{save_dir}/id_to_title{tag}.pickle'
        with open(map_file, 'wb') as f:
            pickle.dump(self.id_to_title, f)
            
    def save_wikicontent(self, save_dir, tag=''):
        content_file = f'{save_dir}/wiki_content{tag}.pickle'
        with open(content_file, 'wb') as f:
            pickle.dump(self.wiki_content, f)
            
    def save_redirects(self, save_dir, tag=''):
        redirect_file = f'{save_dir}/redirects{tag}.pickle'
        with open(redirect_file, 'wb') as f:
            pickle.dump(self.redirects, f)
            
    def load_graph(self, save_dir, tag=''):
        if not self.seealso_graph.load_data(save_dir, tag=f'_seealso{tag}'):
            raise Exception("Unable to load 'seealso graph'.")
            
        if not self.article_graph.load_data(save_dir, tag=f'_articles{tag}'):
            raise Exception("Unable to load 'article graph'.")
            
    def load_data(self, save_dir, tag=''):
        self.load_graph(save_dir, tag)
        self.load_idtotitle(save_dir, tag)
        self.load_wikicontent(save_dir, tag)
        self.load_redirects(save_dir, tag)
        
    def load_idtotitle(self, save_dir, tag=''):
        map_file = f'{save_dir}/id_to_title{tag}.pickle'
        if os.path.exists(map_file):
            with open(map_file, 'rb') as f:
                self.id_to_title = pickle.load(f)
        else:
            raise Exception("Unable to load 'id_to_title'.")
            
    def load_wikicontent(self, save_dir, tag=''):
        content_file = f'{save_dir}/wiki_content{tag}.pickle'
        if os.path.exists(content_file):
            with open(content_file, 'rb') as f:
                self.wiki_content = pickle.load(f)
        else:
            raise Exception("Unable to load 'wiki_content'.")
            
    def load_redirects(self, save_dir, tag=''):
        redirect_file = f'{save_dir}/redirects{tag}.pickle'
        if os.path.exists(redirect_file):
            with open(redirect_file, 'rb') as f:
                self.redirects = pickle.load(f)
        else:
            raise Exception("Unable to load 'redirects'.")
            
    def lower_id_title(self):
        for article_num, article_title in self.id_to_title.items():
            article_title = article_title[0].lower() + article_title[1:]
            self.id_to_title[article_num] = article_title
            
    def lower_redirects(self):
        redirects, article_titles = zip(*self.redirects.items())
        for redirect, article_title in zip(redirects, article_titles):
            del self.redirects[redirect]
            
            redirect = redirect[0].lower() + redirect[1:]
            article_title = article_title[0].lower() + article_title[1:]
            self.redirects[redirect] = article_title
            

### Base testing

In [140]:
#test this before changing the WikiXmlHandler

In [19]:
wikidataset = WikiGraphDataset(matches=matches)

#### Single

In [247]:
handler.pages[2]['article_title']

'Albedo'

In [248]:
wikidataset.add_article(**handler.pages[2])

In [253]:
wikidataset.article_graph.wikilinks

{'File:Albedo-e hg.svg': 0,
 'diffuse reflection': 1,
 'sunlight': 2,
 'solar radiation': 3,
 'black body': 4,
 'Radiosity (radiometry)': 5,
 'irradiance': 6,
 'position of the Sun': 7,
 'reflectance': 8,
 'visible spectrum': 9,
 'climatology': 10,
 'astronomy': 11,
 'Leadership in Energy and Environmental Design': 12,
 'cloud cover': 13,
 'Johann Heinrich Lambert': 14,
 'Photometria': 15,
 'Deciduous forest': 16,
 'Earth': 17,
 'File:Ceres 2003 2004 clear sky total sky albedo.png': 18,
 'Earth observation': 19,
 'NASA': 20,
 'MODIS': 21,
 'Terra (satellite)': 22,
 'Aqua (satellite)': 23,
 'Suomi NPP': 24,
 'Joint Polar Satellite System': 25,
 'directional-hemispherical reflectance': 26,
 'bidirectional reflectance distribution function': 27,
 'greenhouse effect': 28,
 'ocean planet': 29,
 'climate change': 30,
 'solar zenith angle': 31,
 'bi-hemispherical reflectance': 32,
 'insolation': 33,
 'Arctic': 34,
 'Antarctic': 35,
 'Sahara Desert': 36,
 'Tropical': 37,
 'sub-tropical': 38,
 

In [257]:
print(handler.pages[2]['article_text'])

{{Short description|Ratio of how much light is reflected back from a body}} 
 {{Other uses}} 
 {{Use dmy dates|date=September 2019}} 
 [[File:Albedo-e hg.svg|thumb|upright=1.3|The percentage of [[diffuse reflection|diffusely reflected]] [[sunlight]] relative to various surface conditions]] 
 
 '''Albedo''' ({{IPAc-en|æ|l|ˈ|b|iː|d|oʊ}}; {{etymology|la|albedo|whiteness}}) is the measure of the [[diffuse reflection]] of [[sunlight|solar radiation]] out of the total [[solar radiation]] and measured on a scale from 0, corresponding to a [[black body]] that absorbs all incident radiation, to 1, corresponding to a body that reflects all incident radiation. 
 
 Surface albedo is defined as the ratio of [[Radiosity (radiometry)|radiosity]] ''J'' < sub > e < /sub >  to the [[irradiance]] ''E'' < sub > e < /sub >  (flux per unit area) received by a surface. < ref > {{cite web|url=http://web.cse.ohio-state.edu/~parent.1/classes/782/Lectures/03_Radiometry.pdf |format=PDF|title=Fundamentals of Rende

In [252]:
print(handler.wiki_co[39])

thumb|upright=1.3|The percentage of diffusely reflected sunlight relative to various surface conditions 
 
 '''Albedo''' (; ) is the measure of the diffuse reflection of solar radiation out of the total solar radiation and measured on a scale from 0, corresponding to a black body that absorbs all incident radiation, to 1, corresponding to a body that reflects all incident radiation. 
 
 Surface albedo is defined as the ratio of radiosity ''J'' < sub > e < /sub >  to the irradiance ''E'' < sub > e < /sub >  (flux per unit area) received by a surface. < ref >  < /ref >  The proportion reflected is not only determined by properties of the surface itself, but also by the spectral and angular distribution of solar radiation reaching the Earth's surface. < ref >  < /ref >  These factors vary with atmospheric composition, geographic location, and time (see position of the Sun). While bi-hemispherical reflectance is calculated for a single angle of incidence (i.e., for a given position of the 

#### Multiple

In [191]:
for i in range(len(handler.pages)):
    wikidataset.add_article(**handler.pages[i])

In [209]:
len(wikidataset.article_graph.wikilinks), len(wikidataset.seealso_graph.wikilinks)

(4906, 29)

In [192]:
handler.pages[10]

{'article_title': 'Academy Awards',
 'article_ns': '0',
 'article_id': '324',
 'article_text': '{{short description|Annual awards for cinematic achievements}} \n {{Redirect2|Oscars|The Oscar|other uses|Oscar{{!}}Oscar (disambiguation)}} \n {{pp-move-indef}} \n {{Use American English|date=December 2019}} \n {{Use mdy dates|date=March 2020}} \n {{Infobox award \n | name           = Academy Awards \n | current_awards = 94th Academy Awards \n | image          = Academy Award trophy.png \n | alt            =  \n | caption        = The Academy Award statuette (the  " Oscar " ) \n | awarded_for    = Excellence in the American and International [[film industry]] \n | presenter      = [[Academy of Motion Picture Arts and Sciences]] \n | country        = United States \n | year           = {{Start date and age|1929|5|16}} \n | network        = [[List of Academy Awards broadcasters|List of broadcasters]] \n | website        = {{URL|https://abc.com/shows/oscars}} \n }} \n \n The \'\'\'Academy Awar

In [193]:
wikidataset.article_graph.wikilinks

{'File:Albedo-e hg.svg': 0,
 'diffuse reflection': 1,
 'sunlight': 2,
 'solar radiation': 3,
 'black body': 4,
 'Radiosity (radiometry)': 5,
 'irradiance': 6,
 'position of the Sun': 7,
 'reflectance': 8,
 'visible spectrum': 9,
 'climatology': 10,
 'astronomy': 11,
 'Leadership in Energy and Environmental Design': 12,
 'cloud cover': 13,
 'Johann Heinrich Lambert': 14,
 'Photometria': 15,
 'Deciduous forest': 16,
 'Earth': 17,
 'File:Ceres 2003 2004 clear sky total sky albedo.png': 18,
 'Earth observation': 19,
 'NASA': 20,
 'MODIS': 21,
 'Terra (satellite)': 22,
 'Aqua (satellite)': 23,
 'Suomi NPP': 24,
 'Joint Polar Satellite System': 25,
 'directional-hemispherical reflectance': 26,
 'bidirectional reflectance distribution function': 27,
 'greenhouse effect': 28,
 'ocean planet': 29,
 'climate change': 30,
 'solar zenith angle': 31,
 'bi-hemispherical reflectance': 32,
 'insolation': 33,
 'Arctic': 34,
 'Antarctic': 35,
 'Sahara Desert': 36,
 'Tropical': 37,
 'sub-tropical': 38,
 

In [194]:
wikidataset.id_to_title

{39: 'Albedo',
 290: 'A',
 303: 'Alabama',
 305: 'Achilles',
 307: 'Abraham Lincoln',
 308: 'Aristotle',
 309: 'An American in Paris',
 316: 'Academy Award for Best Production Design',
 324: 'Academy Awards',
 12: 'Anarchism',
 25: 'Autism'}

In [195]:
wikidataset.article_graph.get_link_doc_freq()

<1x4906 sparse matrix of type '<class 'numpy.int64'>'
	with 4906 stored elements in Compressed Sparse Row format>

In [196]:
wikidataset.article_graph.get_doc_link_freq()

<11x4906 sparse matrix of type '<class 'numpy.int64'>'
	with 5039 stored elements in Compressed Sparse Row format>

In [197]:
wikidataset.seealso_graph.get_doc_link_freq().todense()

matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 1, 1, 1]])

In [198]:
wikidataset.seealso_graph.get_link_doc_freq().todense()

matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])

In [199]:
wikidataset.seealso_graph.wikilinks

{'WP:SEEALSO': 0,
 'Cool roof': 1,
 'Daisyworld': 2,
 'Emissivity': 3,
 'Exitance': 4,
 'Global dimming': 5,
 'Irradiance': 6,
 "Kirchhoff's law of thermal radiation": 7,
 'Opposition surge': 8,
 'Polar see-saw': 9,
 'Radar astronomy': 10,
 'Solar radiation management': 11,
 'Index of Alabama-related articles': 12,
 'Outline of Alabama': 13,
 'Outline of Abraham Lincoln': 14,
 'Grace Bedell': 15,
 'The Towers (Ohio State)': 16,
 'List of civil rights leaders': 17,
 'List of photographs of Abraham Lincoln': 18,
 'Lincoln (film)': 19,
 'film': 20,
 'Steven Spielberg': 21,
 'Linconia': 22,
 'BAFTA Award for Best Production Design': 23,
 "Critics' Choice Movie Award for Best Production Design": 24,
 'List of film awards': 25,
 'List of Academy Award records': 26,
 'List of actors with Academy Award nominations': 27,
 'List of superlative Academy Award winners and nominees': 28}

In [200]:
a, b, c = wikidataset.extract_article_info(handler.pages[1]['article_title'], handler.pages[1]['article_text'])

In [201]:
a

[]

In [202]:
b

['neurodevelopmental disorder',
 'Regressive autism',
 'developmental milestones',
 'Heritability of autism',
 'environmental factors',
 'rubella',
 'valproic acid',
 'cocaine',
 'pesticides',
 'fetal growth restriction',
 'autoimmune disease',
 'Controversies in autism',
 'Causes of autism',
 'MMR vaccine and autism',
 'nerve cell',
 'synapse',
 'Diagnostic and Statistical Manual of Mental Disorders',
 'Asperger syndrome',
 'pervasive developmental disorder not otherwise specified',
 'autism spectrum disorder',
 'Societal and cultural aspects of autism',
 'Autism rights movement',
 'neurodevelopmental disorder',
 'Remission (medicine)',
 'autism spectrum disorder',
 '#Classification',
 'Temple Grandin',
 'social communication',
 'neurotypical',
 'neural development',
 'toddler',
 'social norms',
 'eye contact',
 'turn-taking',
 'nonverbal autism',
 'Attachment (psychology)',
 'Attachment in children#Secure attachment',
 'Face perception#Autism',
 'alexithymia',
 'babbling',
 'echolali

In [203]:
len(b)

309

In [204]:
wikidataset.article_graph.get_link_doc_freq().todense()

matrix([[1, 1, 1, ..., 1, 1, 1]])

In [205]:
wikidataset.article_graph.wikilinks

{'File:Albedo-e hg.svg': 0,
 'diffuse reflection': 1,
 'sunlight': 2,
 'solar radiation': 3,
 'black body': 4,
 'Radiosity (radiometry)': 5,
 'irradiance': 6,
 'position of the Sun': 7,
 'reflectance': 8,
 'visible spectrum': 9,
 'climatology': 10,
 'astronomy': 11,
 'Leadership in Energy and Environmental Design': 12,
 'cloud cover': 13,
 'Johann Heinrich Lambert': 14,
 'Photometria': 15,
 'Deciduous forest': 16,
 'Earth': 17,
 'File:Ceres 2003 2004 clear sky total sky albedo.png': 18,
 'Earth observation': 19,
 'NASA': 20,
 'MODIS': 21,
 'Terra (satellite)': 22,
 'Aqua (satellite)': 23,
 'Suomi NPP': 24,
 'Joint Polar Satellite System': 25,
 'directional-hemispherical reflectance': 26,
 'bidirectional reflectance distribution function': 27,
 'greenhouse effect': 28,
 'ocean planet': 29,
 'climate change': 30,
 'solar zenith angle': 31,
 'bi-hemispherical reflectance': 32,
 'insolation': 33,
 'Arctic': 34,
 'Antarctic': 35,
 'Sahara Desert': 36,
 'Tropical': 37,
 'sub-tropical': 38,
 

In [208]:
wikidataset.article_graph.get_doc_link_freq().sum()

6651

### saving code

In [20]:
save_dir = '/home/cse/phd/anz198717/scratch/suchith_data/wikipedia/wikipedia-data-science/test_data_2'

In [21]:
wikidataset = WikiGraphDataset(matches=matches)

In [22]:
for i in range(len(handler.pages)):
    wikidataset.add_article(**handler.pages[i])

In [23]:
wikidataset.save_data(save_dir, tag='-1')

In [24]:
w2 = WikiGraphDataset(matches=matches)

In [25]:
w2.load_data(save_dir, tag='-1')

In [26]:
#check -- all done
wikidataset.article_graph.doc_to_rowindex == w2.article_graph.doc_to_rowindex

True

## Graph Handler

In [22]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    
    def __init__(self, matches=None):
        xml.sax.handler.ContentHandler.__init__(self)
        
        """
        PARSING VARIABLES: these variables will be useful
        while parsing the wikipedia dumps.
        """
        #basic storage for on-the-fly processing
        self._buffer = None
        self._values = {}
        self._current_tag = None
        
        #flags for handling special cases.
        self._add_page = True
        self._is_pageid = True
        
        """
        STORAGE VARIABLES: these variable will be used for
        storing the graph and content of the wikipedia dump.
        """
        self.wikidataset = WikiGraphDataset(matches=matches)
        
    def characters(self, content):
        if self._current_tag:
            self._buffer.append(content)
            
    def startElement(self, name, attrs):
        if name in ('title', 'text', 'ns'):
            self._current_tag = name
            self._buffer = []
            
        elif name == 'id' and self._is_pageid:
            self._is_pageid = False
            self._current_tag = name
            self._buffer = []
            
        elif name == 'redirect':
            article_title = self._values['article_title'].strip()
            target_title = attrs.getValue('title').strip()
            self.wikidataset.add_redirect(article_title, target_title)
            self._add_page = False
                   
    def endElement(self, name):
        if name == self._current_tag:
            self._values[f'article_{name}'] = ' '.join(self._buffer)
            self._current_tag = None
            
        elif name == 'page':
            if int(self._values['article_ns']):
                self._add_page = False
                
            """
            EXTRACT_DATA : The following code stores the data
            """
            if self._add_page:
                self.wikidataset.add_article(**self._values)
            """
            EXTRACT_DATA
            """
            
            self._add_page = True
            self._is_pageid = True

### test handler

In [9]:
data_path = sorted(partitions)[0]

matches = r'^([Ss]ee[ ]*|[Ss]ee[ ]*([Aa]lso|[Mm]ore|[Aa]ll)|[Ss]ee[ ]*[Aa]lso[ ]*\(.+\))$'

In [10]:
start = timer()

handler = WikiXmlHandler(matches)

parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(subprocess.Popen(['bzcat'], 
                         stdin = open(data_path), 
                         stdout = subprocess.PIPE).stdout):
    parser.feed(line)
    if len(handler.wikidataset.id_to_title) > 9:
        break
        
end = timer()
        
print(f'Searched through {len(handler.wikidataset.id_to_title)} articles.')
print(f'Processing time is {round(end - start)} secs.')

Searched through 10 articles.
Processing time is 3 secs.


#### test

In [11]:
save_dir = '/home/scai/phd/aiz218323/scratch/XML/wikipedia-data-science/test_data'

In [12]:
handler.wikidataset.save_data(save_dir, tag='_test')

In [14]:
handler.wikidataset.article_graph.get_doc_link_freq().todense()

matrix([[1, 1, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 1, 1]])

In [17]:
len(handler.wikidataset.article_graph.wikilinks)

4692

In [18]:
w2 = WikiGraphDataset(matches=matches)

In [19]:
w2.load_data(save_dir, tag='_test')

In [24]:
w2.redirects

{'AccessibleComputing': 'Computer accessibility',
 'AfghanistanHistory': 'History of Afghanistan',
 'AfghanistanGeography': 'Geography of Afghanistan',
 'AfghanistanPeople': 'Demographics of Afghanistan',
 'AfghanistanCommunications': 'Communications in Afghanistan',
 'AfghanistanTransportations': 'Transport in Afghanistan',
 'AfghanistanMilitary': 'Military of Afghanistan',
 'AfghanistanTransnationalIssues': 'Foreign relations of Afghanistan',
 'AssistiveTechnology': 'Assistive technology',
 'AmoeboidTaxa': 'Amoeba',
 'AlbaniaHistory': 'History of Albania',
 'AlbaniaPeople': 'Demographics of Albania',
 'AsWeMayThink': 'As We May Think',
 'AlbaniaGovernment': 'Politics of Albania',
 'AlbaniaEconomy': 'Economy of Albania',
 'AfroAsiaticLanguages': 'Afroasiatic languages',
 'ArtificalLanguages': 'Constructed language',
 'AbacuS': 'Abacus',
 'AbalonE': 'Abalone',
 'AbbadideS': 'Abbadid dynasty',
 'AbbesS': 'Abbess',
 'AbbevilleFrance': 'Abbeville',
 'AbbeY': 'Abbey',
 'AbboT': 'Abbot',
 '

## Create graph

In [9]:
def create_graph(data_path, save_dir, matches=None, limit=None, save=True, tag_extractor=None, back=True):
    
    handler = WikiXmlHandler(matches=matches)
    
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    
    for i, line in enumerate(subprocess.Popen(['bzcat'],
                                             stdin = open(data_path),
                                             stdout = subprocess.PIPE).stdout):
        try:
            parser.feed(line)
        except StopIteration:
            break
            
        if limit is not None and len(handler.wikidataset.id_to_title) >= limit:
            if back:
                return handler.wikidataset
            else:
                break
    
    if save:
        file_tag = ''
        if isinstance(tag_extractor, re.Pattern):
            parts_tag = tag_extractor.match(os.path.basename(data_path))
            if parts_tag:
                try:
                    file_tag = f'-{parts_tag.group(1)}-{parts_tag.group(2)}-{parts_tag.group(3)}'
                except:
                    file_tag = ''
        elif isinstance(tag_extractor, str):
            file_tag = tag_extractor
            
        handler.wikidataset.save_data(save_dir, tag=file_tag)
        print(f"** Completed processing {os.path.basename(data_path)}.", end='\r')

    del handler
    del parser
    gc.collect()
    
    return None

In [10]:
save_dir = '/home/scai/phd/aiz218323/scratch/XML/wikipedia-data-science/test_data'

tag_extractor = re.compile(f'({project}-{dump_date})-'+r'pages-articles-multistream([0-9]{1,2}).xml-(p[0-9]+p[0-9]+).bz2')

matches = r'^([Ss]ee[ ]*|[Ss]ee[ ]*([Aa]lso|[Mm]ore|[Aa]ll)|[Ss]ee[ ]*[Aa]lso[ ]*\(.+\))$'

In [94]:
data_path = partitions[0]
create_graph(data_path, save_dir, matches=matches, limit=100, save=True, tag_extractor=tag_extractor, back=False)

** Completed processing enwiki-20220420-pages-articles-multistream1.xml-p1p41242.bz2.

In [95]:
for data_path in partitions[0:4]:
    create_graph(data_path, save_dir, matches=matches, limit=100, save=True, tag_extractor=tag_extractor, back=False)

** Completed processing enwiki-20220420-pages-articles-multistream11.xml-p6899367p7054859.bz2.

## Profiling wikilinks

### loading wikilinks

In [25]:
def load_wikilinks_helper(data_path, save_dir, matches=None, tag_extractor=None):
    wikilinks = set()
    
    wikidataset = WikiGraphDataset(matches=matches)
    tag = extract_filetag(data_path, tag_extractor)
    wikidataset.load_graph(save_dir=save_dir, tag=tag)
    
    links = wikidataset.article_graph.wikilinks.keys()
    wikilinks.update(links)
    links = wikidataset.seealso_graph.wikilinks.keys()
    wikilinks.update(links)
    
    return wikilinks

In [26]:
save_dir = f'{dataset_home}/results'

tag_extractor = re.compile(f'({project}-{dump_date})-'+r'pages-articles-multistream([0-9]{1,2}).xml-(p[0-9]+p[0-9]+).bz2')

matches = r'^([Ss]ee[ ]*|[Ss]ee[ ]*([Aa]lso|[Mm]ore|[Aa]ll)|[Ss]ee[ ]*[Aa]lso[ ]*\(.+\))$'

load_wikilinks = partial(load_wikilinks_helper, save_dir=save_dir, matches=matches, tag_extractor=tag_extractor)

print(save_dir)

/home/cse/phd/anz198717/scratch/suchith_data/wikipedia/wikipedia-data-science/results


In [27]:
start = timer()

pool = Pool(processes = 24)
wikilinks = set()
for links in tqdm.tqdm(pool.imap_unordered(load_wikilinks, partitions), total = len(partitions)):
    wikilinks.update(links)
pool.close()
pool.join()

end = timer()

print(f'Number of links : {len(wikilinks)}')
print(f'Time taken to load wikilinks : {end-start:.4f} seconds')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [01:03<00:00,  1.02s/it]

Number of links : 21876982
Time taken to load wikilinks : 64.0401 seconds





In [28]:
with open(f'{dataset_home}/metadata/all_wikilinks.pkl', 'wb') as fout:
    pickle.dump(wikilinks, fout)

In [30]:
for i, link in enumerate(wikilinks):
    if i > 10:
        break
    print(i, link)

0 
1 2003 in Shooto#Shooto - 3/18 in Korakuen Hall
2 8-inch/45 41st Year Type naval gun
3 Espadon 50 class patrol vessel
4 Pista de Atletismo de Monzón
5 Palo's wedding
6 Nol Kan
7 Ministry of Colleges and Universities
8 Bazelevs Company
9 File:Popular Electronics Cover Jan 1975.jpg
10 Repentance Day


### : profile

In [83]:
def get_colon_profile(wikilinks):
    profile = {}
    links_with_colon = []
    
    for link in tqdm.tqdm(wikilinks):
        
        hash_parts = link.split('#')
        if len(hash_parts) > 1:
            link = hash_parts[0].strip()
            if not link:
                continue
        
        if ':' in link:
            links_with_colon.append(link)
            
        colon_parts = link.split(':')
        
        for i in range(len(colon_parts)-1):
            link_list = profile.setdefault(i, set())
            link_list.add(colon_parts[i].strip())
    return profile, links_with_colon

In [84]:
colon_profile = get_colon_profile(wikilinks)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 21876982/21876982 [00:26<00:00, 814384.25it/s]


In [88]:
colon_profile[1]

['File:Popular Electronics Cover Jan 1975.jpg',
 'File:2008AMPSnationalshow.jpg',
 'File:UHCL-SSB.jpg',
 'File:Operation Provide Hope Patch.jpg',
 'Image:Dallas-Semiconductor-DS1287-Real-Time-IC.jpg',
 'Category:1980 disestablishments in New Hampshire',
 'Category:Sardinian Action Party politicians',
 'File:Cochabamba panorama (cuadrado - square).jpg',
 'Category:Spongiologists',
 'File:NO road sign 210.svg',
 'wikt:escapade',
 ':de:Arno Fischer (Fotograf)Arno Fischer',
 'Category:Documents of the Roman Curia',
 'File:Fire-island-map.jpg',
 'File:Train of Tomorrow Dream Cloud sleeping car.jpg',
 'File:Tisch Mills Wisconsin Fire Department.jpg',
 'Category:Automatic number plate recognition',
 'File:Phone photography.jpg',
 'Category:Fall Out Boy members',
 'File:Brant Aesop.jpg',
 'Image:Sheffield-plaque-yale-new-haven-usa.jpg',
 'Image:Minotauros Myron NAMA 1664 n1.jpg',
 'File:dartmouthdam2.jpg',
 'Category:Mandaic language',
 'Category:Riot Engine games',
 'ISO 639:oda',
 'Image:San

In [86]:
link_part_cnt = []
for p, prefix in colon_profile[0].items():
    link_part_cnt.append([p+1, len(prefix)])
    
pd.DataFrame(link_part_cnt, columns=['link parts','count']).T

Unnamed: 0,0,1,2,3,4,5,6,7
link parts,1,2,3,4,5,6,7,8
count,53753,1818,830,92,8,1,1,1


In [87]:
for i in range(4, 8):
    print(i, ' : ', colon_profile[0][i])

4  :  {'45-3', '3B5D', 'Libro Primero', 'Mittelhochdeutsch/Konrad von Würzburg', 'Wurzelsysteme', '45 - 3', 'Página', 'Libro Segundo'}
5  :  {'7188'}
6  :  {'A7B6'}
7  :  {'EB06'}


In [79]:
print(0, ' : ', colon_profile[0])



In [80]:
print(1, ' : ', colon_profile[1])

1  :  {'', 'Players who have won the Magic', 'Football', "Popular Science Monthly/Volume 17/July 1880/Goethe's Farbenlehre", 'Fa', 'Video games based on Superman', 'Κατηγορία', 'www', 'Witnesses', 'Interactive', 'Desert Storm II', 'The Best of Shoukichi Kina', 'Chapter 6', 'Diess ist der Gotteskinder Last, TWV 1', 'Checkmate', '30 (8', "Missa sopra 'Es wird schier der letzte Tag herkommen', TWV 9", '00 to 17', 'Then', 'artist', 'captain', 'so', 'Kimetsu no Yaiba the Movie', 'Letters patent', 'Marvel Cinematic Universe', 'cs', 'Historic American Sheet Music', 'The Hohei 2', 'fj', "The River Merchant's Wife", 'toolforge', 'Er', 'Cast', 'POV', 'war', 'Incubator', 'Volume 3', 'Категория', 'mendocino', 'tpi', 'Tales From a Galaxy Far, Far Away', 'mad', 'pam', 'The Lord of the Rings Online', 'A Long Time Ago... Volume 6', 'io', 'Smyrna Fellowship Trust', 'literateprograms', 'DU', 'Back and Forth Series 5', 'Wet Hot American Summer', 'Treaty establishing the EEC - Annex IV', 'Sakura Taisen', 

In [81]:
print(2, ' : ', colon_profile[2])

2  :  {'', 'wp', 'Гурвинек', 'Wish List', 'Koh-Lanta', 'Kategória', '분류', 'Turn', 'Grafika', 'Datoteka', 'Birkin/Gainsbourg', 'History XV', 'Tadeo Jones 2', 'Utente', 'מיוחד', '476 A.D. Chapter One', 'Rheinsberg', 'az', 'Cerita Cinta', 'Patent N', 'Oregon Historical Quarterly/Volume 2/Hall J. Kelley', 'Spawn', 'Kép', 'Diskussion', 'Predefinição', 'Служебная', 'collaboration', 'J. Vollmann', 'Volume II', 'Puruhára myangekõi', 'Tzar', 'Decamerão', 'Suzzanna', 'Discuție', 'hero', '꽃파당', 'Tschuschen', 'Eden', 'cs', 'ଫାଇଲ', 'START', 'fi', 'Адмысловае', 'Spécial', 'Strona', 'A Study Prepared by the Department of Defense/I. C. Ho Chi Minh', 'Seznam německých názvů obcí a osad v Česku', 'Волки и овцы', 'Tiedosto', 'sl', 'BEEF', 'King_Arthur', '보컬 전쟁', 'Fájl', 'Мафия', 'ko', '사용자', 'Неуловимые', 'Викитека', 'category', 'Պատկեր', 'Incubator', 'R%C3%A9f%C3%A9rence', 'Micro-livestock', 'Категория', '91', 'Kuva', 'q', 'Yovie and His Friends', 'http', '27', 'ВП', 'Book One', 'wikisource', 'Wikidata'

## Combining partitions

In [23]:
save_dir = f'{dataset_home}/results'
matches = r'^([Ss]ee[ ]*|[Ss]ee[ ]*([Aa]lso|[Mm]ore|[Aa]ll)|[Ss]ee[ ]*[Aa]lso[ ]*\(.+\))$'
tag_extractor = re.compile(f'({project}-{dump_date})-'+r'pages-articles-multistream([0-9]{1,2}).xml-(p[0-9]+p[0-9]+).bz2')


### Partition `WikiGraphDataset`

* `matches` 
* `seealso_graph`
* `article_graph`
* `id_to_title`
* `wiki_content`
* `redirects`

In [24]:
def load_partition_graph_dataset(data_path, save_dir, matches=None, tag_extractor=None):
    wikidataset = WikiGraphDataset(matches=matches)
    tag = extract_filetag(data_path, tag_extractor)
    wikidataset.load_data(save_dir, tag=tag)
    
    return wikidataset

In [25]:
wikidataset = load_partition_graph_dataset(partitions[0], save_dir, matches=matches, tag_extractor=tag_extractor)

In [26]:
wikidataset.lower_id_title()
wikidataset.lower_redirects()

#### Visualize

In [86]:
wikidataset.seealso_graph.get_doc_link_freq(), wikidataset.seealso_graph.get_link_doc_freq()

(<9418x49534 sparse matrix of type '<class 'numpy.int64'>'
 	with 57398 stored elements in Compressed Sparse Row format>,
 <1x49534 sparse matrix of type '<class 'numpy.int64'>'
 	with 49534 stored elements in Compressed Sparse Row format>)

In [91]:
dict_head_random(wikidataset.id_to_title)

13966 : hosea
33230 : wolfgang Petersen
38494 : tiling
2685 : abdera, Thrace
32762 : vinegar
14092 : prince Henry the Navigator
26930 : sideshow
22832 : book of Omni
4517 : boudica
34548 : 2000


In [96]:
dict_head_random(wikidataset.redirects)

state Terrorism : state terrorism
mcGwire : maguire
military of Norfolk Island : norfolk Island
demographics of Norfolk Island : norfolk Island
malmo : malmö
central Dogma Of Genetics : central dogma of molecular biology
glycerine : glycerol
walter Carlos : wendy Carlos
gabrielle DAnnunzio : gabriele D'Annunzio
education vouchers : school voucher


#### filter wikilink in a partition

In [27]:
class FilterWikilinks:
    
    def __init__(self):
        self.subject_namespaces = {'user', 'wikipedia', 'wp', 'project', 'file', 'image', 'mediawiki', 
                                   'template', 't', 'help', 'h', 'category', 'cat', 'portal', 'p', 
                                   'draft', 'timedtext', 'module', 'special', 'media'}
        
        self.talk_namespaces = {'talk', 'user talk', 'wikipedia talk', 'wt', 'project talk', 'file talk', 
                                'image talk','mediawiki talk', 'template talk', 'help talk', 'category talk', 
                                'portal talk', 'draft talk', 'timedtext talk', 'module talk'}
        
        self.interwiki_links = {'wiktionary', 'wikt', 'wikinews', 'n', 'wikibooks', 'b', 'wikiquote','q', 
                                'wikisource','s', 'oldwikisource', 'wikispecies', 'species', 
                                'wikiversity', 'v', 'wikivoyage', 'voy', 'wikimedia','foundation', 'wmf', 
                                'commons', 'c', 'metawiki', 'metawikimedia', 'metawikipedia', 'meta' , 'm', 
                                'incubator', 'strategy', 'mediawikiwiki', 'mw', 'mediazilla', 'bugzilla'}
        
        self.language_code = re.compile(r'^[a-z][a-z]$')
        
    def remove_section_tags(self, link):
        hash_parts = link.split('#')
        if len(hash_parts) > 1:
            link = hash_parts[0].strip()  
        return link
    
    def filter_special_tags(self, link):
        colon_parts = link.split(':')
        part_num = 0
        for part in colon_parts:
            part = part.lower()
            if (part == "w") or (part == "en") or (part_num == 0 and not part):
                part_num += 1
            elif (part in self.subject_namespaces) or (part in self.talk_namespaces) \
            or (part in self.interwiki_links) or self.language_code.match(part):
                return ''
            else:
                break
        return ':'.join(colon_parts[part_num:])
    
    def lower_wikilink(self, link):
        if link: link = link[0].lower() + link[1:]
        return link
        
    def remove_underscore(self, link):
        return link.replace('_', ' ')
    
    def process_wikilink(self, link):
        link = self.remove_section_tags(link)
        link = self.filter_special_tags(link)
        link = self.lower_wikilink(link)
        link = self.remove_underscore(link)
        return link
    

In [28]:
wikigraph = wikidataset.article_graph
#wikigraph = wikidataset.seealso_graph

In [29]:
lf_mat = wikigraph.get_doc_link_freq()
df_mat = wikigraph.get_link_doc_freq()
wikilinks = wikigraph.wikilinks

lf_mat.shape, df_mat.shape, len(wikilinks)

((21066, 1428685), (1, 1428685), 1428685)

In [30]:
start = timer()

link_filter = FilterWikilinks()
wikigraph.filter_graph(link_filter.process_wikilink)

end = timer()
print(f"Time taken : {end - start}")

Time taken : 9.943983715958893


In [30]:
lf_mat = wikigraph.get_doc_link_freq()
df_mat = wikigraph.get_link_doc_freq()
wikilinks = wikigraph.wikilinks

lf_mat.shape, df_mat.shape, len(wikilinks)

((9385, 48466), (1, 48466), 48466)

##### Testing code : basic checks

In [528]:
def filtr_func(word):
    if word != "bad":
        return word.lower()
    return ''

In [549]:
docs = [["hello", "world", "hello", "bad"], ["Goodbye", "cruel", "World"], 
        ["goodbye", "HELLO", "CRUEL", "suchith"], ["bad"]]

doc_to_rowidx = {'a': 0, 'b':1, 'c':2, 'd':3}

indptr = [0]
indices = []
data = []
vocabulary = {}

link_doc_count = {}
for d in docs:
    for term in d:
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    indptr.append(len(indices))
    
    for term in set(d):
        link_doc_count[term] = link_doc_count.get(term, 0) + 1
        
df_indptr = [0]
df_indices = []
df = []

for link, cnt in link_doc_count.items():
    df_indices.append(vocabulary[link])
    df.append(cnt)
df_indptr.append(len(df_indices))

link_df = csr_matrix((df, df_indices, df_indptr),
                     shape=(1, len(vocabulary)),
                     dtype=int)

doc_lf = csr_matrix((data, indices, indptr), dtype=int)

In [550]:
link_df.toarray()

array([[1, 1, 2, 1, 1, 1, 1, 1, 1, 1]])

In [551]:
doc_lf.toarray()

array([[2, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

In [552]:
vocabulary, doc_to_rowidx

({'hello': 0,
  'world': 1,
  'bad': 2,
  'Goodbye': 3,
  'cruel': 4,
  'World': 5,
  'goodbye': 6,
  'HELLO': 7,
  'CRUEL': 8,
  'suchith': 9},
 {'a': 0, 'b': 1, 'c': 2, 'd': 3})

In [553]:
wg = WikilinkGraph()
doc_lf, link_df, vocabulary = wg.filter_graph_columns(doc_lf, link_df, vocabulary, filtr_func)
doc_lf, doc_to_rowidx = wg.filter_graph_rows(doc_lf, doc_to_rowidx)

In [554]:
doc_lf.toarray()

array([[2, 1, 0, 0, 0],
       [0, 1, 1, 1, 0],
       [1, 0, 1, 1, 1]])

In [555]:
link_df.toarray()

array([[2, 2, 2, 2, 1]])

In [556]:
vocabulary

{'hello': 0, 'world': 1, 'goodbye': 2, 'cruel': 3, 'suchith': 4}

In [557]:
doc_to_rowidx

{'a': 0, 'b': 1, 'c': 2}

### Combining `WikilinkGraph`

In [31]:
def load_article_wikilink_graph(data_path, save_dir, matches=None, tag_extractor=None):
    wikidataset = WikiGraphDataset(matches=matches)
    """
    load the saved graph depending on the way it was stored using the
    original partition file tag_extractor used in the def create_graph()
    function.
    """
    tag = extract_filetag(data_path, tag_extractor)
    wikidataset.load_graph(save_dir=save_dir, tag=tag)
    
    link_filter = FilterWikilinks()
    wikigraph = wikidataset.seealso_graph
    wikigraph.filter_graph(link_filter.process_wikilink)
    
    return wikigraph

In [32]:
wikilg1 = load_article_wikilink_graph(partitions[0], save_dir, tag_extractor=tag_extractor)
wikilg2 =  load_article_wikilink_graph(partitions[1], save_dir, tag_extractor=tag_extractor)

In [47]:
m1, v = wikilg1.get_doc_link_freq(), wikilg1.wikilinks
m2, v2 = wikilg2.get_doc_link_freq(), wikilg2.wikilinks

In [48]:
rearrage = np.zeros(len(v2), dtype=int)

for k, i in sorted(v2.items(), key=lambda x: x[1]):
    if k not in v:
        v[k] = len(v)
        
    rearrage[i] = v[k]

In [49]:
r, c = m1.shape[0]+m2.shape[0], len(v)

m = csr_matrix((r, c), dtype=int)

In [50]:
start = timer()

m[:m1.shape[0], :m1.shape[1]] = m1

end = timer()
print(f'Time taken : {end - start} seconds')

Time taken : 11.813745327992365 seconds


In [51]:
start = timer()

for i, c in enumerate(rearrage):
    m[m1.shape[0]:, c] = m2[:, i]

end = timer()
print(f'Time taken : {end - start} seconds')

Time taken : 400.5497639940586 seconds


In [52]:
m2 != m[m1.shape[0]:, rearrage]

<39887x77233 sparse matrix of type '<class 'numpy.bool_'>'
	with 0 stored elements in Compressed Sparse Row format>

In [53]:
m[:m1.shape[0], :m1.shape[1]] != m1

<9385x48466 sparse matrix of type '<class 'numpy.bool_'>'
	with 0 stored elements in Compressed Sparse Row format>

#### test code : basic checks

In [33]:
def build_matrix(docs):
    indptr = [0]
    indices = []
    data = []
    vocabulary = {}
    for d in docs:
        for term in d:
            index = vocabulary.setdefault(term, len(vocabulary))
            indices.append(index)
            data.append(1)
        indptr.append(len(indices))

    return vocabulary, csr_matrix((data, indices, indptr), dtype=int)

In [34]:
d1 = [["hello", "world", "hello"], ["goodbye", "cruel", "world"], ["cricket", "tennis", "squash"]]
d2 = [["world", "hello", "hello"], ["suchith", "prabhu", "prabhu"], ["bye", "tennis", "cricket"]]

v, m1 = build_matrix(d1)
v2, m2 = build_matrix(d2)

rearrage = np.zeros(len(v2), dtype=int)
for k, i in sorted(v2.items(), key=lambda x: x[1]):
    if k not in v:
        v[k] = len(v)
    rearrage[i] = v[k]

In [35]:
m1.toarray()

array([[2, 1, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 1]])

In [40]:
m1[:, 0] += m1[:, 1]

In [41]:
m1.toarray()

array([[4, 1, 0, 0, 0, 0, 0],
       [2, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 1]])

In [43]:
m1.getcol(0) 

<3x1 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [29]:
m2.toarray()

array([[1, 2, 0, 0, 0, 0, 0],
       [0, 0, 1, 2, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 1]])

In [30]:
r, c = m1.shape[0]+m2.shape[0], len(v)

m = csr_matrix((r, c), dtype=int)
"""
combine mat1
"""
m[:m1.shape[0], :m1.shape[1]] = m1
"""
combine mat2
"""
for i,c in enumerate(rearrage):
    m[m1.shape[0]:, c] = m2[:, i]

In [31]:
m.toarray()

array([[2, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [2, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 2, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 1]])

In [44]:
m[m1.shape[0]:, rearrage] != m2

<3x7 sparse matrix of type '<class 'numpy.bool_'>'
	with 0 stored elements in Compressed Sparse Row format>

### WikiGraphCombine

In [15]:
def multiprocessor(func, tasks, num_process=10):
    pool = Pool(processes=num_process)
    results = {}
    for x in tqdm.tqdm( pool.imap(func, tasks), total=len(tasks)):
        results.update(x)
    return results

In [16]:
class WikiGraphCombine:
    
    def __init__(self, partition_files):
        self.partition_files = partition_files
        
        self.parameters = {}
        self.parameters['id_to_title'] = {}
        self.parameters['redirects'] = {}
        self.parameters['wiki_content'] = {}
    
    def extract_filetag(self, data_path, tag_extractor=None):
        file_tag = ''
        if isinstance(tag_extractor, re.Pattern):
            parts_tag = tag_extractor.match(os.path.basename(data_path))
            if parts_tag:
                try:
                    file_tag = f'-{parts_tag.group(1)}-{parts_tag.group(2)}-{parts_tag.group(3)}'
                except:
                    file_tag = ''
        elif isinstance(tag_extractor, str):
            file_tag = tag_extractor
        return file_tag
    
    def load_param(self, data_path, save_dir, param='id_to_title', tag_extractor=None):
        wikidataset = WikiGraphDataset()
        tag = extract_filetag(data_path, tag_extractor)
        
        if param == 'id_to_title':
            wikidataset.load_idtotitle(save_dir=save_dir, tag=tag)
            wikidataset.lower_id_title()
            return wikidataset.id_to_title
        elif param == 'redirects':
            wikidataset.load_redirects(save_dir=save_dir, tag=tag)
            wikidataset.lower_redirects()
            return wikidataset.redirects
        elif param == 'wiki_content':
            wikidataset.load_wikicontent(save_dir=save_dir, tag=tag)
            return wikidataset.wiki_content
        else:
            wikidataset.load_graph(save_dir=save_dir, tag=tag)
            return wikidataset.article_graph, wikidataset.seealso_graph
        
    def combine_param(self, save_dir, param='id_to_title', tag_extractor=None):
        combine_helper = partial(self.load_param, param=param, save_dir=save_dir, tag_extractor=tag_extractor)
        self.parameters[param] = multiprocessor(combine_helper, self.partition_files)
        
    def save_idtotitle(self, save_dir, tag=''):
        os.makedirs(save_dir, exist_ok=True)
        map_file = f'{save_dir}/id_to_title{tag}.pickle'
        with open(map_file, 'wb') as f:
            pickle.dump(self.parameters['id_to_title'], f)
            
    def save_wikicontent(self, save_dir, tag=''):
        os.makedirs(save_dir, exist_ok=True)
        content_file = f'{save_dir}/wiki_content{tag}.pickle'
        with open(content_file, 'wb') as f:
            pickle.dump(self.parameters['wiki_content'], f)
            
    def save_redirects(self, save_dir, tag=''):
        os.makedirs(save_dir, exist_ok=True)
        redirect_file = f'{save_dir}/redirects{tag}.pickle'
        with open(redirect_file, 'wb') as f:
            pickle.dump(self.parameters['redirects'], f)
            

In [17]:
tag_extractor = re.compile(f'({project}-{dump_date})-'+r'pages-articles-multistream([0-9]{1,2}).xml-(p[0-9]+p[0-9]+).bz2')
results_dir = f'{dataset_home}/results'

In [151]:
combine_graph = WikiGraphCombine(partitions[:2])

In [143]:
combine_graph.combine_param(results_dir, param='id_to_title', tag_extractor=tag_extractor)
combine_graph.combine_param(results_dir, param='redirects', tag_extractor=tag_extractor)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  4.75it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.46s/it]


In [148]:
save_dir = f'{dataset_home}/combined'

combine_graph.save_idtotitle(save_dir, tag=f'-{project}-{dump_date}')
combine_graph.save_redirects(save_dir, tag=f'-{project}-{dump_date}')

In [156]:
article_graph, seealso_graph = combine_graph.load_param(partitions[0], results_dir, param='graphs', 
                                                        tag_extractor=tag_extractor)

In [157]:
article_graph.get_doc_link_freq()

<21066x1428685 sparse matrix of type '<class 'numpy.int64'>'
	with 3811343 stored elements in Compressed Sparse Row format>

In [None]:
link_filter = FilterWikilinks()
article_graph.filter_graph(link_filter.process_wikilink)

  self._set_arrayXarray_sparse(i, j, x)


In [None]:
article_graph.get_doc_link_freq()

In [120]:
id2title1 = combine_graph.load_redirects(partitions[0], save_dir, tag_extractor)

In [121]:
id2title2 = combine_graph.load_redirects(partitions[1], save_dir, tag_extractor)

In [122]:
comb_id2title = {}

In [123]:
comb_id2title.update(id2title1)

In [124]:
comb_id2title.update(id2title2)

In [125]:
len(comb_id2title)

219536

In [62]:
graph_combine.extract_filetag(partitions[0], tag_extractor=tag_extractor)

'-enwiki-20220420-1-p1p41242'

### Extra

In [147]:
def extract_filetag(data_path):
    file_tag = ''
    if isinstance(tag_extractor, re.Pattern):
        parts_tag = tag_extractor.match(os.path.basename(data_path))
        if parts_tag:
            try:
                file_tag = f'-{parts_tag.group(1)}-{parts_tag.group(2)}-{parts_tag.group(3)}'
            except:
                file_tag = ''
    elif isinstance(tag_extractor, str):
        file_tag = tag_extractor
    return file_tag

In [150]:
save_dir = '/home/scai/phd/aiz218323/scratch/XML/wikipedia-data-science/test_data'

tag_extractor = re.compile(f'({project}-{dump_date})-'+r'pages-articles-multistream([0-9]{1,2}).xml-(p[0-9]+p[0-9]+).bz2')

matches = r'^([Ss]ee[ ]*|[Ss]ee[ ]*([Aa]lso|[Mm]ore|[Aa]ll)|[Ss]ee[ ]*[Aa]lso[ ]*\(.+\))$'

In [148]:
wikidataset = WikiGraphDataset(matches=matches)

'-enwiki-20220420-1-p1p41242'

In [146]:
for data_path in partitions[0:4]:
    print(data_path)

/home/cse/phd/anz198717/scratch/suchith_data/wikipedia/wikipedia-data-science/datasets/enwiki-20220420-pages-articles-multistream1.xml-p1p41242.bz2
/home/cse/phd/anz198717/scratch/suchith_data/wikipedia/wikipedia-data-science/datasets/enwiki-20220420-pages-articles-multistream10.xml-p4045403p5399366.bz2
/home/cse/phd/anz198717/scratch/suchith_data/wikipedia/wikipedia-data-science/datasets/enwiki-20220420-pages-articles-multistream11.xml-p5399367p6899366.bz2
/home/cse/phd/anz198717/scratch/suchith_data/wikipedia/wikipedia-data-science/datasets/enwiki-20220420-pages-articles-multistream11.xml-p6899367p7054859.bz2
