In [1]:
import os
import requests
import pandas as pd
import mwparserfromhell
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup

In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

## Loading Wikipedia Dumps

In [3]:
project = 'enwiki'
dump_date = "20220420"
dataset_home = '/home/scai/phd/aiz218323/scratch/XML/wikipedia-data-science/'

In [4]:
import re

data_paths = []
file_info = []

prog = re.compile(f'{project}-{dump_date}'+r'*')
for file in os.listdir(f'{dataset_home}/datasets'):
    if prog.match(file):
        path = f'{dataset_home}/datasets/{file}'
        
        data_paths.append(path)
        file_size = os.stat(path).st_size/1e6
        file_articles = int(file.split('p')[-1].split('.')[0]) - int(file.split('p')[-2])
        file_info.append((file, file_size, file_articles))

## Parsing dumps

In [5]:
import bz2
import subprocess

data_path = data_paths[15]
data_path

'/home/scai/phd/aiz218323/scratch/XML/wikipedia-data-science//datasets/enwiki-20220420-pages-articles-multistream13.xml-p10672789p11659682.bz2'

In [6]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []
        
        self._redirects = {}
        self._add_page = True
        self._is_pageid = True
        
    def characters(self, content):
        if self._current_tag:
            self._buffer.append(content)
            
    def startElement(self, name, attrs):
        if name in ('title', 'text', 'ns'):
            self._current_tag = name
            self._buffer = []
        elif name == 'id' and self._is_pageid:
            self._is_pageid = False
            self._current_tag = name
            self._buffer = []    
        elif name == 'redirect':
            self._redirects[self._values['title']] = attrs.getValue('title').strip()
            self._add_page = False
            
            
    def endElement(self, name):
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)
            self._current_tag = None
                
        elif name == 'page':
            if int(self._values['ns']):
                self._add_page = False
                
            if self._add_page:
                self._pages.append( (self._values['title'], self._values['text'], self._values['id']) )
            
            self._add_page = True
            self._is_pageid = True

### Parsing redirect page

In [11]:
lines = []
for i, line in enumerate(subprocess.Popen(['bzcat'],
                                         stdin = open(data_path),
                                         stdout = subprocess.PIPE).stdout):
    lines.append(line)
    if i > 1e6:
        break

In [12]:
page = lines[:1000]
for i, p in enumerate(page):
    print(i, p)

0 b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n'
1 b'  <siteinfo>\n'
2 b'    <sitename>Wikipedia</sitename>\n'
3 b'    <dbname>enwiki</dbname>\n'
4 b'    <base>https://en.wikipedia.org/wiki/Main_Page</base>\n'
5 b'    <generator>MediaWiki 1.39.0-wmf.7</generator>\n'
6 b'    <case>first-letter</case>\n'
7 b'    <namespaces>\n'
8 b'      <namespace key="-2" case="first-letter">Media</namespace>\n'
9 b'      <namespace key="-1" case="first-letter">Special</namespace>\n'
10 b'      <namespace key="0" case="first-letter" />\n'
11 b'      <namespace key="1" case="first-letter">Talk</namespace>\n'
12 b'      <namespace key="2" case="first-letter">User</namespace>\n'
13 b'      <namespace key="3" case="first-letter">User talk</namespace>\n'
14 b'      <namespace key="4" case="first-lett

In [53]:
handler = WikiXmlHandler()

parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(page):
    parser.feed(line)

In [56]:
handler._redirects

{'Rational inference': 'Inference'}

__Putting it all together__

In [13]:
handler = WikiXmlHandler()

parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(subprocess.Popen(['bzcat'],
                                         stdin=open(data_path),
                                         stdout=subprocess.PIPE).stdout):
    parser.feed(line)
    
    if len(handler._pages) > 50:
        break

In [14]:
handler._pages

[('Dorsa Whiston',
  "'''Dorsa Whiston''' is a [[wrinkle ridge]] system at {{Lunar coords and quad cat|29.4|N|56.4|W|globe:Moon}} in [[Oceanus Procellarum]] on the [[Moon]]. It is 85 km long and was named after [[William Whiston]] in 1976. \n [[File:Dorsa Whiston AS15-97-13264-5.jpg|thumb|right|300px|Mosaic of [[Apollo 15]] images, showing part of Dorsa Whiston]] \n {{moon-stub}} \n [[Category:Ridges on the Moon|Whiston]]",
  '1369254'),
 ('Humor Monastery',
  "[[Image:Rares.jpg|thumb|Petru Rareş]] \n '''Humor Monastery''' located in [[Mănăstirea Humorului]], about 5 & nbsp;km north of the town of [[Gura Humorului]], [[Romania]]. It is a [[monastery]] for nuns dedicated to the [[Dormition]] of Virgin Mary, or [[Theotokos]]. It was constructed in 1530 by Voievod [[Petru Rareş]]  and his chancellor [[Teodor Bubuiog]]. The monastery was built over the foundation of a previous monastery that dated from around 1415. The Humor monastery was closed in 1786 and was not reopened until 1990. \n 

In [15]:
handler._redirects

{'Rational inference': 'Inference',
 'Ripcot': 'The Ripping Friends',
 'Delphic of Gamma Sigma Tau Fraternity': 'Delphic Fraternity',
 'Tir na n-og Awards': 'Tir na n-Og Award',
 'Chantiers de l’Atlantique': "Chantiers de l'Atlantique",
 'South Australian general election campaign, 2006': '2006 South Australian state election',
 'Sr. Maria Stanisia': 'Mary Stanisia',
 'Monica Kurkowski': 'Mary Stanisia',
 'Atlantoaxial': 'Atlanto-axial joint',
 'URGI': 'Avenue (store)',
 'Panagiotis Fassoulas': 'Panagiotis Fasoulas',
 'Dysplasia of the hip': 'Hip dysplasia',
 'Uk.yahoo.com': 'Yahoo!',
 'Twilight of the idols': 'Twilight of the Idols',
 'Factions in Supreme Commander': 'Supreme Commander (video game)',
 'Tir na n-Og awards': 'Tir na n-Og Award',
 'Thon the Egyptian': 'Polydamna',
 'Egg Harbor City (NJT Station)': 'Egg Harbor City station',
 'Nyrondal': 'Greyhawk',
 'William M. Fowler, Jr.': 'William M. Fowler',
 'Allendale (NJT Station)': 'Allendale station (NJ Transit)',
 'Independent 

### See also section filtering

In [7]:
matches = r'^([Ss]ee[ ]*|[Ss]ee[ ]*([Aa]lso|[Mm]ore|[Aa]ll)|[Ss]ee[ ]*[Aa]lso[ ]*\(.+\))$'

In [8]:
handler = WikiXmlHandler()

parser = xml.sax.make_parser()
parser.setContentHandler(handler)

headings = set()

prev_num_pages = 0
seealso_pages_idx = []

for i, line in enumerate(subprocess.Popen(['bzcat'],
                                         stdin=open(data_path),
                                         stdout=subprocess.PIPE).stdout):
    parser.feed(line)
    
    if len(handler._pages) > 1e5 or len(seealso_pages_idx) > 10:
        break
        
    if prev_num_pages < len(handler._pages):
        prev_num_pages += 1
        wiki = mwparserfromhell.parse(handler._pages[-1][1])
        for h in wiki.filter_headings():
            header = h.title.strip_code().strip()
            if re.search(matches, header):
            #if header.lower() == "see also":
                headings.add(header)
                seealso_pages_idx.append(prev_num_pages-1)

In [11]:
page_num_idx = 2
wiki = mwparserfromhell.parse(handler._pages[seealso_pages_idx[page_num_idx]][1])

In [12]:
wiki.strip_code()

' \n \n The College of Engineering and Applied Science is a college within the University of Wisconsin–Milwaukee. It offers bachelor, master and doctoral degrees in civil engineering, electrical engineering, industrial engineering, materials engineering, mechanical engineering, and computer science. \n \n Based on the statistical analysis by H.J. Newton, Professor of Statistics at Texas A & M University in 1997 on the National Research Council report issued in 1995, the school was  ranked 73rd nationally in the National Research Council (NRC) rankings, < ref > A Brief Summary of the NRC Rankings, Texas A & M University < /ref >  with its Civil Engineering program 69th, < ref > NRC Rankings in each of the 41 Areas: civil engineering, Texas A & M University < /ref >  Electronic Engineering 96th, < ref > NRC Rankings in each of the 41 Areas: Electronic Engineering, Texas A & M University < /ref >  Industrial Engineering 34th, < ref > NRC Rankings in each of the 41 Areas: Industrial Engine

In [14]:
for i, n in enumerate(wiki.nodes):
    print(i, n, type(n))

0 {{Infobox university 
 |image= < !-- Do NOT place a non-free image here without complying with WP:NFCC #10c -- > 
 |motto= ''Discover. Innovate. Lead.'' 
 |name=UW-Milwaukee College of Engineering and Applied Science 
 |type            = [[Public university|Public]] 
 |established     = 1964 < ref > [http://www4.uwm.edu/ceas/explore_ceas/about_ceas/ceas_history.cfm History of CEAS on the UW-Milwaukee website] < /ref > 
 |dean            = [[Brett Peters]] 
 |undergrad       = 1,568 < ref name=enrollment > [http://www4.uwm.edu/acad_aff/assessment/intranet/enroll/enrtodat/ay201314/cedall2139_passthrough.cfm Fall 2013 Headcount Report], available on UWM campus or off-campus with UWM student/staff logon < /ref > 
 |postgrad        = 399 < ref name=enrollment / > 
 |city            =[[Milwaukee]] 
 |state           =[[Wisconsin]] 
 |country         =[[United States]] 
 |campus          =[[University of Wisconsin–Milwaukee]] 
 |website= [http://www.uwm.edu/CEAS http://www.uwm.edu/CEAS] 
 |

In [15]:
wiki.remove_nodetype(inplace=True)

In [16]:
for i, n in enumerate(wiki.nodes):
    print(i, n)

0  
 
 The 
1 '''College of Engineering and Applied Science'''
2  is a college within the 
3 [[University of Wisconsin–Milwaukee]]
4 . It offers bachelor, master and doctoral degrees in 
5 [[civil engineering]]
6 , 
7 [[electrical engineering]]
8 , 
9 [[industrial engineering]]
10 , 
11 [[materials engineering]]
12 , 
13 [[mechanical engineering]]
14 , and 
15 [[computer science]]
16 . 
 
 Based on the statistical analysis by H.J. Newton, Professor of Statistics at Texas A & M University in 1997 on the 
17 [[United States National Research Council|National Research Council]]
18  report issued in 1995, the school was  ranked 73rd nationally in the 
19 [[United States National Research Council rankings|National Research Council]]
20  (NRC) rankings, < ref > 
21 [http://www.stat.tamu.edu/~jnewton/nrc_rankings/nrc1.html#RANKBYAREA A Brief Summary of the NRC Rankings]
22 , Texas A & M University < /ref >  with its 
23 [[Civil Engineering]]
24  program 69th, < ref > 
25 [http://www.stat.tamu

In [22]:
wiki.strip_code()

' \n \n The College of Engineering and Applied Science is a college within the University of Wisconsin–Milwaukee. It offers bachelor, master and doctoral degrees in civil engineering, electrical engineering, industrial engineering, materials engineering, mechanical engineering, and computer science. \n \n Based on the statistical analysis by H.J. Newton, Professor of Statistics at Texas A & M University in 1997 on the National Research Council report issued in 1995, the school was  ranked 73rd nationally in the National Research Council (NRC) rankings, < ref > A Brief Summary of the NRC Rankings, Texas A & M University < /ref >  with its Civil Engineering program 69th, < ref > NRC Rankings in each of the 41 Areas: civil engineering, Texas A & M University < /ref >  Electronic Engineering 96th, < ref > NRC Rankings in each of the 41 Areas: Electronic Engineering, Texas A & M University < /ref >  Industrial Engineering 34th, < ref > NRC Rankings in each of the 41 Areas: Industrial Engine

In [23]:
seealso_sections, all_sections = wiki.split_sections(matches=matches)

In [19]:
wikilinks = []
for section in all_sections:
    links = section.filter_wikilinks()
    for link in links:
        wikilinks.append(link.title)
wikilinks

['Civil Engineering',
 'Mechanics',
 'Computer Science',
 'Electrical Engineering',
 'Industrial engineering',
 'Materials Science',
 'Mechanical Engineering',
 'Satya Nadella',
 'Michael Dhuey',
 'Macintosh II',
 'iPod',
 'Luther Graef',
 'Graef Anhalt Schloemer  &  Associates Inc.',
 'American Society of Civil Engineers',
 'Phil Katz',
 'PKZIP',
 'Pradeep Rohatgi',
 'Cheng Xu',
 'aerodynamic',
 'American Society of Mechanical Engineers',
 'Y. Austin Chang',
 'Scott Yanoff',
 'Alan Kulwicki',
 'NASCAR Cup Series',
 'Category:University of Wisconsin–Milwaukee',
 'Category:Engineering schools and colleges in the United States',
 'Category:Engineering universities and colleges in Wisconsin']

In [26]:
wikilinks = []
for section in seealso_sections:
    links = section.filter_wikilinks()
    for link in links:
        wikilinks.append(link.title)
wikilinks

['Jantar-Mantar']

#### filtering article links

In [20]:
wikilinks = []
for section in all_sections:
    links = section.filter_wikilinks()
    for link in links:
        link_title = link.title
        
        link_sections = link_title.split(':')
        link_sections[0] = link_sections[0].strip()
        
        if len(link_sections) == 1:
            wikilinks.append(link_sections[0])
        elif len(link_sections) == 2 and not link_sections[0]:
            wikilinks.append(link_sections[1])

In [21]:
wikilinks

['Civil Engineering',
 'Mechanics',
 'Computer Science',
 'Electrical Engineering',
 'Industrial engineering',
 'Materials Science',
 'Mechanical Engineering',
 'Satya Nadella',
 'Michael Dhuey',
 'Macintosh II',
 'iPod',
 'Luther Graef',
 'Graef Anhalt Schloemer  &  Associates Inc.',
 'American Society of Civil Engineers',
 'Phil Katz',
 'PKZIP',
 'Pradeep Rohatgi',
 'Cheng Xu',
 'aerodynamic',
 'American Society of Mechanical Engineers',
 'Y. Austin Chang',
 'Scott Yanoff',
 'Alan Kulwicki',
 'NASCAR Cup Series']