In [1]:
import os
import requests
import pandas as pd
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup

In [2]:
%matplotlib inline

### Downloading Wikipedia Dumps

In [3]:
# Wikimedia project whose dumps are listed, and the root URL of its dump index.
project = 'enwiki'
base_url = 'https://dumps.wikimedia.org/' + project + '/'

In [4]:
base_url

'https://dumps.wikimedia.org/enwiki/'

In [5]:
# Fetch the HTML directory listing of available dump dates.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as a listing.
index_response = requests.get(base_url, timeout=30)
index_response.raise_for_status()
index = index_response.text
index

'<html>\r\n<head><title>Index of /enwiki/</title></head>\r\n<body bgcolor="white">\r\n<h1>Index of /enwiki/</h1><hr><pre><a href="../">../</a>\r\n<a href="20220201/">20220201/</a>                                          21-Mar-2022 01:28                   -\r\n<a href="20220220/">20220220/</a>                                          02-Apr-2022 01:27                   -\r\n<a href="20220301/">20220301/</a>                                          02-May-2022 01:25                   -\r\n<a href="20220320/">20220320/</a>                                          02-May-2022 01:27                   -\r\n<a href="20220401/">20220401/</a>                                          07-Apr-2022 21:43                   -\r\n<a href="20220420/">20220420/</a>                                          22-Apr-2022 07:09                   -\r\n<a href="20220501/">20220501/</a>                                          03-May-2022 05:35                   -\r\n<a href="latest/">latest/</a>             

In [6]:
# Parse the listing with the parser that ships with the Python standard library.
soup_index = BeautifulSoup(markup=index, features='html.parser')

In [7]:
# Link text of every anchor that carries an href attribute
# (one entry per dump directory, plus the parent-directory link).
dumps = [anchor.get_text() for anchor in soup_index.find_all('a', href=True)]
dumps

['../',
 '20220201/',
 '20220220/',
 '20220301/',
 '20220320/',
 '20220401/',
 '20220420/',
 '20220501/',
 'latest/']

In [8]:
# Dump snapshot to download; must be one of the directory names listed in `dumps`.
dump_date = '20220420'
dump_url = f'{base_url}{dump_date}/'

# Fail fast on a stalled connection or an HTTP error instead of parsing an
# error page as if it were the dump's file listing.
dump_response = requests.get(dump_url, timeout=30)
dump_response.raise_for_status()
dump_html = dump_response.text

soup_dump = BeautifulSoup(dump_html, 'html.parser')

In [9]:
# Preview the first four file entries; `limit` stops the search early, so no
# extra slice is needed.
soup_dump.find_all('li', {'class': 'file'}, limit=4)

[<li class="file"><a href="/enwiki/20220420/enwiki-20220420-pages-articles-multistream.xml.bz2">enwiki-20220420-pages-articles-multistream.xml.bz2</a> 19.3 GB</li>,
 <li class="file"><a href="/enwiki/20220420/enwiki-20220420-pages-articles-multistream-index.txt.bz2">enwiki-20220420-pages-articles-multistream-index.txt.bz2</a> 230.0 MB</li>,
 <li class="file"><a href="/enwiki/20220420/enwiki-20220420-pages-articles-multistream1.xml-p1p41242.bz2">enwiki-20220420-pages-articles-multistream1.xml-p1p41242.bz2</a> 247.4 MB</li>,
 <li class="file"><a href="/enwiki/20220420/enwiki-20220420-pages-articles-multistream-index1.txt-p1p41242.bz2">enwiki-20220420-pages-articles-multistream-index1.txt-p1p41242.bz2</a> 221 KB</li>]

In [10]:
# Collect (filename, size tokens) for every file entry in the dump listing.
# The entry text is split on single spaces: the first token is the filename,
# the remaining tokens (e.g. ['19.3', 'GB']) are kept as a list.
files = []

for entry in soup_dump.find_all('li', {'class': 'file'}):
    name, *size_tokens = entry.text.split(' ')
    files.append((name, size_tokens))

In [11]:
files[:4]

[('enwiki-20220420-pages-articles-multistream.xml.bz2', ['19.3', 'GB']),
 ('enwiki-20220420-pages-articles-multistream-index.txt.bz2', ['230.0', 'MB']),
 ('enwiki-20220420-pages-articles-multistream1.xml-p1p41242.bz2',
  ['247.4', 'MB']),
 ('enwiki-20220420-pages-articles-multistream-index1.txt-p1p41242.bz2',
  ['221', 'KB'])]

In [12]:
# Keep only the partitioned multistream article dumps: the filename must
# contain every one of these markers.
required_markers = ('.xml-p', 'pages-articles', 'multistream')
file_to_download = [
    name for name, _size in files
    if all(marker in name for marker in required_markers)
]

In [13]:
file_to_download[-5:]

['enwiki-20220420-pages-articles-multistream27.xml-p63975910p65475909.bz2',
 'enwiki-20220420-pages-articles-multistream27.xml-p65475910p66975909.bz2',
 'enwiki-20220420-pages-articles-multistream27.xml-p66975910p68475909.bz2',
 'enwiki-20220420-pages-articles-multistream27.xml-p68475910p69975909.bz2',
 'enwiki-20220420-pages-articles-multistream27.xml-p69975910p70585441.bz2']

In [None]:
import sys
import ssl
import tensorflow as tf

# WARNING: this disables TLS certificate verification for every urllib-based
# HTTPS request made by this kernel (including the Keras downloads below).
# NOTE(review): presumably a workaround for a missing CA bundle on the compute
# node -- confirm, and prefer fixing the CA store over bypassing verification.
ssl._create_default_https_context = ssl._create_unverified_context

# Root directory for downloaded dumps. Set WIKI_DATASET_HOME to run the
# notebook on another machine; the default is the original author's path.
dataset_home = os.environ.get(
    'WIKI_DATASET_HOME',
    '/home/scai/phd/aiz218323/scratch/XML/wikipedia-data-science/',
)

In [None]:
# Download every selected partition (skipping ones already on disk) and record
# (filename, size in MB, page-id span) for each.
data_path = []
file_info = []

for file in file_to_download:
    # Location where a previously downloaded copy is expected
    # (<dataset_home>/datasets/<file>).
    path = os.path.join(dataset_home, 'datasets', file)

    if os.path.exists(path):
        local_path = path
    else:
        # dump_url already ends with '/', so strip it before joining to avoid
        # a '//' in the origin URL.
        local_path = tf.keras.utils.get_file(
            origin=f"{dump_url.rstrip('/')}/{file}", cache_dir=dataset_home
        )
    data_path.append(local_path)

    # Stat the path that was actually returned: get_file may place the file
    # somewhere other than `path`, in which case os.stat(path) would fail.
    file_size = os.stat(local_path).st_size / 1e6  # bytes -> MB (10**6 bytes)

    # Filenames end in '-p<first>p<last>.bz2'; pull out the two page ids.
    id_range = file.rsplit('.', 1)[0].rsplit('-p', 1)[1]
    first_id, last_id = (int(part) for part in id_range.split('p'))
    # Span of page ids covered by the partition (last - first).
    # NOTE(review): presumably used as a rough proxy for the article count;
    # an inclusive count of ids would be one larger -- confirm which is wanted.
    file_articles = last_id - first_id
    file_info.append((file, file_size, file_articles))

__Looking at the size of the files.__

In [167]:
# The four largest partitions by on-disk size, biggest first.
largest_files = sorted(file_info, key=lambda info: info[1], reverse=True)[:4]
pd.DataFrame(largest_files, columns=['filename', 'size (MB)', '--'])

Unnamed: 0,filename,size (MB),--
0,enwiki-20220420-pages-articles-multistream9.xm...,556.144802,1109141
1,enwiki-20220420-pages-articles-multistream10.x...,549.290108,1353963
2,enwiki-20220420-pages-articles-multistream11.x...,533.662197,1499999
3,enwiki-20220420-pages-articles-multistream8.xm...,512.258438,802148


In [168]:
print(f"Number of partitions : {len(file_info)}")

Number of partitions : 62


In [150]:
# NOTE(review): the recorded output of this cell ends in '/datasets', which does
# not match the value assigned to dataset_home in the configuration cell above.
# Looks like stale state from out-of-order execution -- restart and re-run to confirm.
dataset_home

'/home/scai/phd/aiz218323/scratch/XML/wikipedia-data-science/datasets'

In [139]:
data_path[0]

'/home/scai/phd/aiz218323/.keras/datasets/enwiki-20220420-pages-articles-multistream1.xml-p1p41242.bz2'

In [140]:
# Show only the first few records rather than dumping the whole list.
file_info[:5]

[('enwiki-20220420-pages-articles-multistream1.xml-p1p41242.bz2',
  259.423102,
  41241)]

In [129]:
import bz2
import subprocess

In [141]:
# Read only the first line of the first downloaded partition.
lines = []
# The context manager closes the bz2 handle; the bare loop left it open.
with bz2.BZ2File(data_path[0], 'r') as dump_file:
    for line in dump_file:
        lines.append(line)
        break  # one line is all that is needed

In [142]:
lines

[b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n']