<a href="https://colab.research.google.com/github/shainaraza/news-recommender-system/blob/master/downloading_and_parsing_wikinews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*   Finding and retrieve online data
*   Parsing XML using a SAX parser
*  Parsing Mediawiki content using mwparserfromhell
*  Running operations in parallel using multiprocessing and multithreading



In [0]:
import requests

# Parsing HTML
from bs4 import BeautifulSoup

# File system management
import os

# Searching for Wikipedia Dumps


In [2]:
#base_url = 'https://dumps.wikimedia.org/enwiki/'
base_url = 'https://dumps.wikimedia.org/enwikinews/'
index = requests.get(base_url).text
soup_index = BeautifulSoup(index, 'html.parser')

# Find the links that are dates of dumps
dumps = [a['href'] for a in soup_index.find_all('a') if 
         a.has_attr('href')]
dumps

#for now i am taking april 2020, later i will take a few from all these three years for tempioral dynamics insh a ALLAH 

['../',
 '20191101/',
 '20191120/',
 '20191201/',
 '20191220/',
 '20200101/',
 '20200120/',
 '20200201/',
 '20200220/',
 '20200301/',
 '20200401/',
 'latest/']

In [3]:
dump_url = base_url + '20200401/'

# Retrieve the html
dump_html = requests.get(dump_url).text
dump_html[:10]

'<!DOCTYPE '

In [4]:
# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')

# Find li elements with the class file
soup_dump.find_all('li', {'class': 'file'}, limit = 10) #[:4]

[<li class="file"><a href="/enwikinews/20200401/enwikinews-20200401-pages-articles-multistream.xml.bz2">enwikinews-20200401-pages-articles-multistream.xml.bz2</a> 45.7 MB</li>,
 <li class="file"><a href="/enwikinews/20200401/enwikinews-20200401-pages-articles-multistream-index.txt.bz2">enwikinews-20200401-pages-articles-multistream-index.txt.bz2</a> 1.2 MB</li>,
 <li class="file"><a href="/enwikinews/20200401/enwikinews-20200401-pages-meta-history.xml.7z">enwikinews-20200401-pages-meta-history.xml.7z</a> 394.3 MB</li>,
 <li class="file"><a href="/enwikinews/20200401/enwikinews-20200401-pages-meta-history.xml.bz2">enwikinews-20200401-pages-meta-history.xml.bz2</a> 646.7 MB</li>,
 <li class="file"><a href="/enwikinews/20200401/enwikinews-20200401-pages-logging.xml.gz">enwikinews-20200401-pages-logging.xml.gz</a> 110.4 MB</li>,
 <li class="file"><a href="/enwikinews/20200401/enwikinews-20200401-pages-meta-current.xml.bz2">enwikinews-20200401-pages-meta-current.xml.bz2</a> 194.7 MB</li>,
 

In [5]:
files = []

# Search through all files
for file in soup_dump.find_all('li', {'class': 'file'}):
    text = file.text
    # Select the relevant files
    if 'pages-articles' in text:
        files.append((text.split()[0], text.split()[1:]))
        
files[:5]

[('enwikinews-20200401-pages-articles-multistream.xml.bz2', ['45.7', 'MB']),
 ('enwikinews-20200401-pages-articles-multistream-index.txt.bz2',
  ['1.2', 'MB']),
 ('enwikinews-20200401-pages-articles.xml.bz2', ['39.9', 'MB'])]

In [10]:
files_to_download = [file[0] for file in files if '-pages-articles.xml' in file[0]]
files_to_download[-5:]

['enwikinews-20200401-pages-articles.xml.bz2']

Download Wikipedia Data


In [0]:
import sys

keras_home = '/content/drive/My Drive/Shaina- DL NRS/data preprocessing/wikinews/'

In [0]:
from urllib.request import urlretrieve
import os,sys
from zipfile import ZipFile
from pathlib import Path
from keras.utils import get_file

data_paths = []
file_info = []

# Iterate through each file
for file in files_to_download[:5]:
     path = keras_home + file
   

     if not os.path.exists( keras_home + file):
        print('Downloading')
        # If not, download the file
        data_paths.append(get_file(keras_home +file, dump_url+file))
       

        # Find the file size in MB
        file_size = os.stat(path).st_size / 1e6
        file_info.append((file, file_size))

     
    # If the file is already downloaded find some information
     else:
        data_paths.append(path)
        # Find the file size in MB
        file_size = os.stat(path).st_size / 1e6
        file_info.append((file, file_size))


In [14]:
print(f'There are {len(file_info)} partitions.')


There are 1 partitions.


In [16]:
print(f"The total size of files on disk is {file_df['size (MB)'].sum() / 1e3} GB")


The total size of files on disk is 0.041848355 GB


In [17]:
import bz2
import subprocess

data_path = data_paths[0]
data_path

'/content/drive/My Drive/Shaina- DL NRS/data preprocessing/wikinews/enwikinews-20200401-pages-articles.xml.bz2'

In [20]:
#The subprocess + bzcat approach is nearly twice as fast. Let's run this again and see what kind of data we have.


%%timeit -n 3 -r 3

lines = []

for i, line in enumerate(subprocess.Popen(['bzcat'], 
                         stdin = open(data_path), 
                         stdout = subprocess.PIPE).stdout):
    lines.append(line)
lines

3 loops, best of 3: 9.36 s per loop


**Parsing Approach
**

> In order to get useful information from this data, we have to parse it on two levels.
1.  Extract the titles and article text from the XML
2.   Extract relevant information from the article text


Parsing XML using XML, SAX


In [0]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))
      

In [22]:
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(subprocess.Popen(['bzcat'], 
                         stdin = open(data_path), 
                         stdout = subprocess.PIPE).stdout):
    parser.feed(line)
    
 
print([x[0] for x in handler._pages])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [0]:
handler._pages

In [24]:
!pip install mwparserfromhell 

Collecting mwparserfromhell
[?25l  Downloading https://files.pythonhosted.org/packages/23/03/4fb04da533c7e237c0104151c028d8bff856293d34e51d208c529696fb79/mwparserfromhell-0.5.4.tar.gz (135kB)
[K     |██▍                             | 10kB 18.5MB/s eta 0:00:01[K     |████▉                           | 20kB 1.7MB/s eta 0:00:01[K     |███████▎                        | 30kB 2.5MB/s eta 0:00:01[K     |█████████▊                      | 40kB 1.7MB/s eta 0:00:01[K     |████████████                    | 51kB 2.1MB/s eta 0:00:01[K     |██████████████▌                 | 61kB 2.5MB/s eta 0:00:01[K     |█████████████████               | 71kB 2.8MB/s eta 0:00:01[K     |███████████████████▍            | 81kB 2.2MB/s eta 0:00:01[K     |█████████████████████▊          | 92kB 2.5MB/s eta 0:00:01[K     |████████████████████████▏       | 102kB 2.7MB/s eta 0:00:01[K     |██████████████████████████▋     | 112kB 2.7MB/s eta 0:00:01[K     |█████████████████████████████   | 122kB 2.7MB/

In [25]:
import mwparserfromhell 

print(handler._pages[1][1])

# Create the wiki article
wiki = mwparserfromhell.parse(handler._pages[1][1])

{{historical|[[Special:Log/upload]]}} < ul > < li > 12:24, 3 Dec 2004 [[User:Stevertigo|Stevertigo]] uploaded  " [[:Image:JuergenLogo.png|JuergenLogo.png]] "   < em > (Juergen Logo - submitted to   & #91; & #91;meta:International logo contest]] under the  & #123; & #123;GFDL}} - modified as potential logo Wikinews ) < /em > < /li > 
 < li > 08:11, 8 Nov 2004 [[User:Tim|Tim]] uploaded  " [[:Image:Wiki.png|Wiki.png]] "   < em > (Draft logo for wikinews. From  & #91; & #91;m:Image:Wikinews-draftlogo.png]].  & #123; & #123;PD}}) < /em > < /li > 
 
 < /ul >


In [26]:
print(type(wiki))
wiki[:200]

<class 'mwparserfromhell.wikicode.Wikicode'>


'{{historical|[[Special:Log/upload]]}} < ul > < li > 12:24, 3 Dec 2004 [[User:Stevertigo|Stevertigo]] uploaded  " [[:Image:JuergenLogo.png|JuergenLogo.png]] "   < em > (Juergen Logo - submitted to   & '

In [27]:
wikilinks = [x.title for x in wiki.filter_wikilinks()]
print(f'There are {len(wikilinks)} wikilinks.')
wikilinks[:5]

There are 5 wikilinks.


['Special:Log/upload',
 'User:Stevertigo',
 ':Image:JuergenLogo.png',
 'User:Tim',
 ':Image:Wiki.png']

In [28]:
external_links = [(x.title, x.url) for x in wiki.filter_external_links()]
print(f'There are {len(external_links)} external links.')
external_links[:50]

There are 0 external links.


[]

In [0]:
import re

def process_article(title, text, timestamp, template = 'Cite news'):
    """Process a wikipedia article looking for template"""
    
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    
    # Search through templates for the template
    matches = wikicode.filter_templates(matches = template)
    
    # Filter out errant matches
    matches = [x for x in matches if x.name.strip_code().strip().lower() == template.lower()]
    
    if len(matches) >= 1:
        # template_name = matches[0].name.strip_code().strip()

        # Extract information from infobox
        properties = {param.name.strip_code().strip(): param.value.strip_code().strip() 
                      for param in matches[0].params
                      if param.value.strip_code().strip()}

        # Extract internal wikilinks
        wikilinks = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]

        # Extract external links
        exlinks = [x.url.strip_code().strip() for x in wikicode.filter_external_links()]

        # Find approximate length of article
        text_length = len(wikicode.strip_code().strip())

        return (title, properties, wikilinks, exlinks, timestamp, text_length)

In [0]:
def process_article(title, text, timestamp):
   wikicode = mwparserfromhell.parse(text)
   wikilinks = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]

        # Extract external links
   exlinks = [x.url.strip_code().strip() for x in wikicode.filter_external_links()]

         # Find approximate length of article
   text_length = len(wikicode.strip_code().strip())

   return (title,  wikilinks, exlinks, timestamp, text_length)

In [0]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Parse through XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._news = []
        self._article_count = 0
        self._non_matches = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._article_count += 1
            # Search through the page to see if the page is a newsitem
            news_item = process_article(**self._values )
            # Append to the list of news
            if news_item:
                self._news.append(news_item)

In [40]:
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(subprocess.Popen(['bzcat'], 
                         stdin = open(data_path), 
                         stdout = subprocess.PIPE).stdout):
    parser.feed(line)
 
        
print(f'Searched through {handler._article_count} .')

Searched through 99352 .


In [42]:
handler._news


[('File:Wiki.png', [], [], '2005-06-29T02:57:00Z', 14),
 ('Wikinews:Upload log',
  ['Special:Log/upload',
   'User:Stevertigo',
   ':Image:JuergenLogo.png',
   'User:Tim',
   ':Image:Wiki.png'],
  [],
  '2011-02-28T10:03:43Z',
  440),
 ('Main Page',
  ['MediaWiki:Common.css/Main Page',
   'MediaWiki:Common.css',
   'Template:Latest news',
   'Category:Portal',
   'Category:No publish',
   'ar:الصفحة الرئيسية',
   'bg:Начална страница',
   'bs:Početna strana',
   'ca:Portada',
   'cs:Wikizprávy:Hlavní strana',
   'de:Hauptseite',
   'eo:Ĉefpaĝo',
   'el:Κύρια Σελίδα',
   'es:Portada',
   'fa:صفحهٔ اصلی',
   'fi:Etusivu',
   'fr:Accueil',
   'he:עמוד ראשי',
   'it:Pagina principale',
   'ja:メインページ',
   'ko:대문',
   'no:Forside',
   'pl:Strona główna',
   'pt:Página principal',
   'ro:Pagina principală',
   'ru:Заглавная страница',
   'sq:Faqja Kryesore',
   'sr:Главна страна',
   'sv:Huvudsida',
   'ta:முதற் பக்கம்',
   'th:หน้าหลัก',
   'tr:Ana Sayfa',
   'uk:Головна',
   'zh:Wikinews:首页

In [43]:
# Uncompress the file if not already uncompressed
if not os.path.exists('/content/drive/My Drive/Shaina- DL NRS/data preprocessing/wikinews/articles11.xml'):
    subprocess.call(["bzcat '/content/drive/My Drive/Shaina- DL NRS/data preprocessing/wikinews/enwikinews-20200401-pages-articles.xml.bz2' >> '/content/drive/My Drive/Shaina- DL NRS/data preprocessing/wikinews/articles11.xml'"],
                    shell = True)
else:
    print('Already uncompressed')

Already uncompressed


In [44]:
!wc '/content/drive/My Drive/Shaina- DL NRS/data preprocessing/wikinews/articles11.xml'


  3704634  20903437 204994714 /content/drive/My Drive/Shaina- DL NRS/data preprocessing/wikinews/articles11.xml


In [47]:
from timeit import default_timer as timer

start = timer()
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Parse the entire file
for i, line in enumerate(subprocess.Popen(['bzcat'], 
                         stdin = open(data_path), 
                         stdout = subprocess.PIPE).stdout):
    if (i + 1) % 10000 == 0:
        print(f'Processed {i + 1} lines so far.', end = '\r')
    try:
        parser.feed(line)
    except StopIteration:
        break
    
end = timer()
news = handler._news

print(f'\nSearched through {handler._article_count} articles.')
print(f'\nFound {len(news)} newsbooks .')


Searched through 99352 articles.

Found 99352 newsbooks .


In [0]:

import json

# Save list of books
with open('/content/drive/My Drive/Shaina- DL NRS/data preprocessing/wikinews/p15_books.ndjson', 'wt') as fout:
    for l in books:
        fout.write(json.dumps(l) + '\n')

In [0]:
news_in = []

# Read in list of news
with open('/content/drive/My Drive/Shaina- DL NRS/data preprocessing/wikinews/p15_books.ndjson', 'rt') as fin:
    for l in fin.readlines():
       news_in.append(json.loads(l))

In [51]:
news_in


[['File:Wiki.png', [], [], '2005-06-29T02:57:00Z', 14],
 ['Wikinews:Upload log',
  ['Special:Log/upload',
   'User:Stevertigo',
   ':Image:JuergenLogo.png',
   'User:Tim',
   ':Image:Wiki.png'],
  [],
  '2011-02-28T10:03:43Z',
  440],
 ['Main Page',
  ['MediaWiki:Common.css/Main Page',
   'MediaWiki:Common.css',
   'Template:Latest news',
   'Category:Portal',
   'Category:No publish',
   'ar:الصفحة الرئيسية',
   'bg:Начална страница',
   'bs:Početna strana',
   'ca:Portada',
   'cs:Wikizprávy:Hlavní strana',
   'de:Hauptseite',
   'eo:Ĉefpaĝo',
   'el:Κύρια Σελίδα',
   'es:Portada',
   'fa:صفحهٔ اصلی',
   'fi:Etusivu',
   'fr:Accueil',
   'he:עמוד ראשי',
   'it:Pagina principale',
   'ja:メインページ',
   'ko:대문',
   'no:Forside',
   'pl:Strona główna',
   'pt:Página principal',
   'ro:Pagina principală',
   'ru:Заглавная страница',
   'sq:Faqja Kryesore',
   'sr:Главна страна',
   'sv:Huvudsida',
   'ta:முதற் பக்கம்',
   'th:หน้าหลัก',
   'tr:Ana Sayfa',
   'uk:Головна',
   'zh:Wikinews:首页

In [0]:

# Save list of books
with open('../data/p15_books.ndjson', 'wt') as fout:
    for l in books:
        fout.write(json.dumps(l) + '\n')

FileNotFoundError: ignored