In [1]:
import requests
from xml.etree import ElementTree
import time
import re
from collections import defaultdict
from pathlib import Path
import os

In [2]:
def extract_see_also(page_text):
    try:
        lines         = page_text.splitlines()
        see_index     = lines.index('==See also==')
        ref_index     = lines.index('==References==')
        raw_titles    = lines[see_index+1:ref_index-1]
        regex         = '.*\[\[(.*)\]\]'
        parsed_titles = []
        for title in raw_titles:
            re_result = re.search(regex, title)
            if re_result:
                parsed_titles.append(re_result.group(1))

        return parsed_titles
    except:
        return []

In [3]:
def generate_page_name_from_title(title):
    return '_'.join(title.split())

In [4]:
def get_wikipedia_page(page_title, delay = 3):
    try:
        api_url           = f'https://en.wikipedia.org/wiki/Special:Export/{page_title}'
        req               = requests.get(api_url)
        time.sleep(5)
        page_text         = req.text
        xml_root          = ElementTree.fromstring(page_text)
        page_content      = xml_root\
                .find('{http://www.mediawiki.org/xml/export-0.10/}page')\
                .find('{http://www.mediawiki.org/xml/export-0.10/}revision')\
                .find('{http://www.mediawiki.org/xml/export-0.10/}text')
        page_content_text = page_content.text
        see_also_titles   = extract_see_also(page_content_text)
        see_also_links    = [generate_page_name_from_title(title) for title in see_also_titles] 
        page_dict         = {
            'title'   : page_title,
            'content' : page_content_text,
            'see_also': see_also_links
        }

        return page_dict
    except:
        print(f'Problem downloading {page_title}')
        return None

In [5]:
def mine_graph(entry_points, n = 10):
    queues     = [[point] for point in entry_points]
    downloaded = set()
    i          = 0
    documents  = defaultdict(list)
    
    while len(downloaded) < n:
        if not any(queue for queue in queues):
            print('all queues are empty, exiting.')
            break
        print(100 * len(downloaded) / n, '%')
        queue       = queues[i % len(queues)]
        i          += 1
        if not queue:
            continue
        page_title, category  = queue.pop(0)
        if page_title in downloaded:
            print(f'{page_title} already downloaded')
            continue
        downloaded.add(page_title)
        page_dict = get_wikipedia_page(page_title)
        
        if page_dict is None:
            continue
        
        documents[category].append(page_dict)
        new_queue_elems = [(title, category) for title in page_dict['see_also']]
        queue.extend(new_queue_elems)

    return documents

In [10]:
def save_documents(documents, data_folder = Path('../data/')):
    for category in documents:
        os.makedirs(data_folder / category, exist_ok = True)
        for page in documents[category]:
            title   = page['title'].replace('/', '_')
            content = page['content']
            with open(data_folder / category / title, 'w') as page_file:
                page_file.write(content)

In [7]:
entry_points = [
    ('French_Revolution'                       , 'history'),
    ('Aleppo_offensive_(October–December_2013)', 'history'),
    ('World_War_II'                            , 'history'),
    ('Algebraic_graph_theory'                  , 'math'),
    ('Machine_learning'                        , 'math'),
    ('Game_theory'                             , 'math'),
    ('Astronomy'                               , 'space'),
    ('Universe'                                , 'space'),
    ('Pluto'                                   , 'space'),
    ('Linguistics'                             , 'language'),
    ('Translation'                             , 'language'),
    ('Toki_Pona'                               , 'language'),
    ('Napster'                                 , 'tech'),
    ('Freenet'                                 , 'tech'),
    ('Neuralink'                               , 'tech'),
    ('For_the_World'                           , 'music'),
    ('Pixies'                                  , 'music'),
    ('Jazz'                                    , 'music'),
]

In [8]:
documents = mine_graph(entry_points, 2000)

0.0 %
0.05 %
0.1 %
0.15 %
0.2 %
0.25 %
0.3 %
0.35 %
0.4 %
0.45 %
0.5 %
0.55 %
0.6 %
0.65 %
0.7 %
0.75 %
0.8 %
0.85 %
0.9 %
0.95 %
0.95 %
0.95 %
1.0 %
1.0 %
1.0 %
1.05 %
1.05 %
1.05 %
1.1 %
1.15 %
1.2 %
1.2 %
1.25 %
1.25 %
1.3 %
1.3 %
1.35 %
1.4 %
1.4 %
1.4 %
1.45 %
1.45 %
1.45 %
1.5 %
1.5 %
1.5 %
1.55 %
1.6 %
1.65 %
1.65 %
1.7 %
1.7 %
Problem downloading Big_Bang_(Korean_band)|Big_Bang
1.75 %
1.75 %
1.8 %
1.85 %
1.85 %
1.85 %
1.9 %
1.9 %
1.9 %
1.95 %
1.95 %
1.95 %
2.0 %
2.05 %
2.1 %
2.1 %
2.15 %
2.15 %
2.2 %
2.2 %
2.25 %
2.3 %
2.3 %
2.3 %
2.35 %
2.35 %
2.35 %
2.4 %
2.4 %
2.4 %
2.45 %
2.5 %
2.55 %
2.55 %
2.6 %
2.6 %
2.65 %
2.65 %
2.7 %
2.75 %
2.75 %
2.75 %
2.8 %
2.8 %
2.8 %
2.85 %
2.85 %
2.85 %
2.9 %
2.95 %
3.0 %
3.0 %
3.05 %
3.05 %
3.05 %
3.05 %
3.1 %
3.15 %
3.15 %
3.15 %
3.2 %
3.2 %
3.2 %
3.25 %
3.25 %
3.25 %
3.3 %
3.35 %
3.4 %
3.4 %
3.45 %
3.45 %
3.45 %
3.45 %
3.5 %
3.55 %
3.55 %
3.55 %
3.6 %
3.6 %
3.6 %
3.65 %
3.65 %
3.65 %
3.7 %
3.75 %
3.8 %
3.8 %
3.85 %
3.85 %
3.85 %
3.85 %
3.9 %


16.05 %
16.05 %
16.05 %
16.1 %
16.15 %
16.2 %
16.2 %
16.25 %
16.25 %
16.25 %
16.25 %
List_of_jazz_festivals already downloaded
16.25 %
16.25 %
16.25 %
16.25 %
16.25 %
16.25 %
16.25 %
16.3 %
16.3 %
16.3 %
16.35 %
16.4 %
16.45 %
16.45 %
16.5 %
16.5 %
16.5 %
16.5 %
List_of_music_festivals already downloaded
16.5 %
16.5 %
16.5 %
16.5 %
16.5 %
16.5 %
16.5 %
16.55 %
16.55 %
16.55 %
16.6 %
Problem downloading :Category:Translation_associations|Translation_associations
16.65 %
16.7 %
16.7 %
Freenet already downloaded
16.7 %
16.7 %
16.7 %
16.7 %
16.75 %
16.75 %
16.75 %
16.75 %
16.75 %
16.75 %
16.75 %
16.8 %
16.8 %
16.8 %
16.85 %
16.9 %
16.95 %
16.95 %
17.0 %
17.0 %
17.0 %
17.0 %
17.05 %
17.05 %
17.05 %
17.05 %
17.05 %
17.05 %
17.05 %
17.1 %
17.1 %
17.1 %
17.15 %
17.2 %
17.25 %
17.25 %
17.3 %
17.3 %
17.3 %
17.3 %
17.35 %
17.35 %
17.35 %
17.35 %
17.35 %
17.35 %
17.35 %
17.4 %
17.4 %
17.4 %
17.45 %
Problem downloading :Category:Translation_scholars|Translation_scholars
17.5 %
International_auxilia

26.9 %
26.9 %
26.95 %
26.95 %
26.95 %
26.95 %
Music_Canada|Canada already downloaded
26.95 %
26.95 %
26.95 %
26.95 %
26.95 %
26.95 %
26.95 %
27.0 %
27.0 %
27.0 %
27.05 %
27.1 %
Asemic_writing already downloaded
27.1 %
27.1 %
27.15 %
27.15 %
27.15 %
27.15 %
27.2 %
27.2 %
27.2 %
27.2 %
27.2 %
27.2 %
27.2 %
27.25 %
27.25 %
27.25 %
27.3 %
Translation already downloaded
27.3 %
Problem downloading AUI_(constructed_language)|aUI
27.35 %
27.35 %
27.4 %
27.4 %
27.4 %
27.4 %
27.45 %
27.45 %
27.45 %
27.45 %
27.45 %
27.45 %
27.45 %
27.5 %
27.5 %
27.5 %
27.55 %
False_friend already downloaded
27.55 %
27.6 %
27.6 %
27.65 %
27.65 %
27.65 %
27.65 %
Federation_of_the_Italian_Music_Industry|Italy already downloaded
27.65 %
27.65 %
27.65 %
27.65 %
27.65 %
27.65 %
27.65 %
27.7 %
27.7 %
27.7 %
27.75 %
27.8 %
27.85 %
27.85 %
27.9 %
27.9 %
27.9 %
27.9 %
27.95 %
27.95 %
27.95 %
27.95 %
27.95 %
27.95 %
27.95 %
28.0 %
28.0 %
28.0 %
28.05 %
28.1 %
28.15 %
28.15 %
28.2 %
28.2 %
28.2 %
28.2 %
IFPI already download

37.45 %
37.45 %
37.45 %
37.5 %
37.55 %
37.6 %
37.6 %
Anonymous_remailer already downloaded
37.6 %
37.6 %
37.6 %
37.6 %
Problem downloading International_Federation_of_the_Phonographic_Industry|IFPI
37.65 %
37.65 %
37.65 %
37.65 %
37.65 %
37.65 %
37.65 %
37.7 %
37.7 %
37.7 %
37.75 %
37.8 %
37.85 %
37.85 %
Data_privacy already downloaded
37.85 %
37.85 %
37.85 %
37.85 %
List_of_best-selling_music_artists already downloaded
37.85 %
37.85 %
37.85 %
37.85 %
37.85 %
37.85 %
37.85 %
37.9 %
37.9 %
37.9 %
Problem downloading Espanca_script|Espanca
37.95 %
38.0 %
38.05 %
38.05 %
38.1 %
38.1 %
38.1 %
38.1 %
38.15 %
38.15 %
38.15 %
38.15 %
38.15 %
38.15 %
38.15 %
38.2 %
38.2 %
38.2 %
38.25 %
38.3 %
38.35 %
38.35 %
38.4 %
38.4 %
38.4 %
38.4 %
List_of_best-selling_singles_in_the_United_States#Top_ten_best-selling_artists|List_of_best_selling_digital_artists_in_the_United_States already downloaded
38.4 %
38.4 %
38.4 %
38.4 %
38.4 %
38.4 %
38.4 %
38.45 %
38.45 %
38.45 %
Rohonc_Codex already downloaded


48.0 %
48.0 %
48.0 %
48.05 %
Lexicology already downloaded
48.05 %
48.1 %
48.1 %
Problem downloading Privacy_Act_of_1974|U.S._Privacy_Act_of_1974
48.15 %
48.15 %
48.15 %
48.15 %
Global_music_industry_market_share_data already downloaded
48.15 %
48.15 %
48.15 %
48.15 %
48.15 %
48.15 %
48.15 %
48.2 %
48.2 %
48.2 %
48.25 %
48.3 %
48.35 %
48.35 %
48.4 %
48.4 %
48.4 %
48.4 %
Problem downloading List_of_number-one_hits_(France)|List_of_number_one_hits_in_France
48.45 %
48.45 %
48.45 %
48.45 %
48.45 %
48.45 %
48.45 %
48.5 %
48.5 %
48.5 %
48.55 %
48.6 %
Adamorobe_Sign_Language already downloaded
48.6 %
48.6 %
48.65 %
48.65 %
48.65 %
48.65 %
48.7 %
48.7 %
48.7 %
48.7 %
48.7 %
48.7 %
48.7 %
48.75 %
48.75 %
48.75 %
48.8 %
48.85 %
Martha's_Vineyard_Sign_Language already downloaded
48.85 %
48.85 %
48.9 %
48.9 %
48.9 %
48.9 %
48.95 %
48.95 %
48.95 %
48.95 %
48.95 %
48.95 %
48.95 %
49.0 %
49.0 %
49.0 %
49.05 %
49.1 %
Nicaraguan_Sign_Language already downloaded
49.1 %
49.1 %
49.15 %
49.15 %
49.15 %
49

58.3 %
58.3 %
58.3 %
58.3 %
58.3 %
58.3 %
58.3 %
58.35 %
58.35 %
58.35 %
58.4 %
58.45 %
Historical_linguistics already downloaded
58.45 %
58.45 %
58.5 %
58.5 %
58.5 %
58.5 %
58.55 %
58.55 %
58.55 %
58.55 %
58.55 %
58.55 %
58.55 %
58.6 %
58.6 %
58.6 %
58.65 %
58.7 %
Origin_of_language already downloaded
58.7 %
58.7 %
58.75 %
58.75 %
58.75 %
58.75 %
58.8 %
58.8 %
58.8 %
58.8 %
58.8 %
58.8 %
58.8 %
Big_Bang already downloaded
58.8 %
58.8 %
58.8 %
58.85 %
Skopos_theory already downloaded
58.85 %
58.9 %
58.9 %
58.95 %
58.95 %
58.95 %
58.95 %
59.0 %
59.0 %
59.0 %
59.0 %
59.0 %
59.0 %
59.0 %
Bit-string_physics already downloaded
59.0 %
59.0 %
59.0 %
59.05 %
Translation already downloaded
59.05 %
Universal_language already downloaded
59.05 %
59.05 %
59.1 %
59.1 %
59.1 %
59.1 %
59.15 %
59.15 %
59.15 %
59.15 %
59.15 %
59.15 %
59.15 %
59.2 %
59.2 %
59.2 %
Problem downloading Natural_language_processing|Natural_Language_Processing
59.25 %
Translation_criticism already downloaded
59.25 %
59.3 %
59.

68.15 %
68.15 %
68.15 %
68.2 %
Applied_linguistics already downloaded
68.2 %
Mobile_translation already downloaded
68.2 %
68.2 %
BeeGFS already downloaded
68.2 %
68.2 %
68.2 %
68.2 %
68.25 %
68.25 %
68.25 %
68.25 %
68.25 %
68.25 %
68.25 %
68.3 %
68.3 %
68.3 %
68.35 %
Back-translation already downloaded
68.35 %
68.4 %
68.4 %
68.45 %
68.45 %
68.45 %
68.45 %
Problem downloading Template:Australian_music_charts|Australian_music_charts
68.5 %
68.5 %
68.5 %
68.5 %
68.5 %
68.5 %
68.5 %
68.55 %
68.55 %
68.55 %
68.6 %
Bible_translations already downloaded
68.6 %
Phraselator already downloaded
68.6 %
68.6 %
68.65 %
68.65 %
68.65 %
68.65 %
68.7 %
68.7 %
68.7 %
68.7 %
68.7 %
68.7 %
68.7 %
68.75 %
68.75 %
68.75 %
68.8 %
Bilingual_dictionary already downloaded
68.8 %
68.85 %
68.85 %
Cloud_collaboration already downloaded
68.85 %
68.85 %
68.85 %
68.85 %
68.9 %
68.9 %
68.9 %
68.9 %
68.9 %
68.9 %
68.9 %
68.95 %
68.95 %
68.95 %
69.0 %
Calque#Translation|Calque already downloaded
69.0 %
Universal_languag

74.1 %
74.1 %
74.1 %
74.15 %
National_Translation_Mission already downloaded
74.15 %
74.2 %
74.2 %
74.25 %
74.25 %
74.25 %
74.25 %
List_of_college_nickname_changes_in_the_United_States already downloaded
74.25 %
74.25 %
74.25 %
74.25 %
74.25 %
74.25 %
74.25 %
74.3 %
74.3 %
74.3 %
74.35 %
Paraphrase already downloaded
74.35 %
74.4 %
74.4 %
74.45 %
74.45 %
74.45 %
74.45 %
List_of_college_sports_team_nicknames already downloaded
74.45 %
74.45 %
74.45 %
74.45 %
74.45 %
74.45 %
74.45 %
74.5 %
74.5 %
74.5 %
74.55 %
Phono-semantic_matching already downloaded
74.55 %
74.6 %
74.6 %
74.65 %
74.65 %
74.65 %
74.65 %
List_of_college_team_nicknames_in_the_United_States already downloaded
74.65 %
74.65 %
74.65 %
74.65 %
74.65 %
74.65 %
74.65 %
Astrobotic_Technology already downloaded
74.65 %
74.65 %
74.65 %
74.7 %
Postediting already downloaded
74.7 %
Lexicography already downloaded
74.7 %
74.7 %
74.75 %
74.75 %
74.75 %
74.75 %
List_of_college_mascots_in_the_United_States already downloaded
74.75 %
7

80.6 %
80.6 %
80.6 %
80.6 %
80.6 %
80.6 %
80.6 %
Problem downloading Spirit_rover|''Spirit''_rover
80.65 %
80.65 %
80.65 %
80.7 %
80.75 %
80.8 %
80.8 %
Onion_routing already downloaded
80.8 %
80.8 %
80.8 %
80.8 %
80.85 %
80.85 %
80.85 %
80.85 %
80.85 %
80.85 %
80.85 %
80.9 %
80.9 %
80.9 %
80.95 %
81.0 %
Gibberish already downloaded
81.0 %
81.0 %
Tor_(anonymity_network) already downloaded
81.0 %
81.0 %
81.0 %
81.0 %
81.05 %
81.05 %
81.05 %
81.05 %
81.05 %
81.05 %
81.05 %
81.1 %
81.1 %
81.1 %
81.15 %
81.2 %
81.25 %
81.25 %
Pseudonymous_remailer already downloaded
81.25 %
81.25 %
81.25 %
81.25 %
Styles_of_house_music already downloaded
81.25 %
81.25 %
81.25 %
81.25 %
81.25 %
81.25 %
81.25 %
Problem downloading Family_Portrait_(Voyager)|Family_Portrait
81.3 %
81.3 %
81.3 %
81.35 %
81.4 %
81.45 %
81.45 %
Penet_remailer already downloaded
81.45 %
81.45 %
81.45 %
81.45 %
Problem downloading :Category:Deep_house_producers|List_of_deep_house_music_artists
81.5 %
81.5 %
81.5 %
81.5 %
81.5 %
81.5

Problem downloading Cognitive_development|development
87.25 %
Language_industry already downloaded
87.25 %
87.3 %
87.3 %
Problem downloading CNET_Networks|CNET_News
87.35 %
87.35 %
87.35 %
87.35 %
87.4 %
87.4 %
87.4 %
87.4 %
87.4 %
87.4 %
87.4 %
87.45 %
87.45 %
87.45 %
Problem downloading Cognitive_interventions|interventions
87.5 %
Language_interpretation already downloaded
87.5 %
87.55 %
87.55 %
87.6 %
87.6 %
87.6 %
87.6 %
87.65 %
87.65 %
87.65 %
87.65 %
87.65 %
87.65 %
87.65 %
87.7 %
87.7 %
87.7 %
Problem downloading Cognitive_module|module
87.75 %
Language_localisation already downloaded
87.75 %
Costermonger already downloaded
87.75 %
87.75 %
Electronic_Frontier_Foundation already downloaded
87.75 %
87.75 %
87.75 %
87.75 %
87.8 %
87.8 %
87.8 %
87.8 %
87.8 %
87.8 %
87.8 %
Yuri's_Night already downloaded
87.8 %
87.8 %
87.8 %
Problem downloading Cognitive_neuropsychology|neuropsychology
87.85 %
Language_professional already downloaded
87.85 %
87.9 %
87.9 %
Electronic_Frontier_Foundati

93.4 %
93.4 %
93.4 %
93.4 %
93.4 %
93.4 %
93.4 %
Space_exploration already downloaded
93.4 %
93.4 %
93.4 %
93.45 %
Translation_studies already downloaded
93.45 %
93.5 %
93.5 %
Cypherpunk_anonymous_remailer already downloaded
93.5 %
93.5 %
93.5 %
93.5 %
93.55 %
93.55 %
93.55 %
93.55 %
93.55 %
93.55 %
93.55 %
93.6 %
93.6 %
93.6 %
93.65 %
Translation-quality_standards already downloaded
93.65 %
93.7 %
93.7 %
Mixmaster_anonymous_remailer already downloaded
93.7 %
93.7 %
93.7 %
93.7 %
93.75 %
93.75 %
93.75 %
93.75 %
93.75 %
93.75 %
93.75 %
93.8 %
93.8 %
93.8 %
93.85 %
Transliteration already downloaded
93.85 %
93.9 %
93.9 %
Mixminion already downloaded
93.9 %
93.9 %
93.9 %
93.9 %
List_of_artists_who_reached_number_one_on_the_UK_Singles_Chart already downloaded
93.9 %
93.9 %
93.9 %
93.9 %
93.9 %
93.9 %
93.9 %
Problem downloading Shuttle:_the_Space_Flight_Simulator_(Virgin_game)|Space_Flight_Simulator
93.95 %
93.95 %
93.95 %
94.0 %
Untranslatability already downloaded
94.0 %
94.05 %
94.05 %
I

In [11]:
save_documents(documents)