# Downloading Wikipedia Articles for Indexing

This notebook downloads ~5,000 Wikipedia articles programmatically using the `wikipedia` library, and saves the results for subsequent indexing.

In [None]:
import itertools
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache

import pandas as pd
import wikipedia
from tqdm import tqdm

In [31]:
# Where the downloaded articles are written; the directory is created if missing.
OUTPUT_DIR = "../data/input"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Query the English-language Wikipedia.
wikipedia.set_lang("en")

## Step 1: Collect article titles via two-letter prefixes

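We generate all 676 two-letter combinations (`aa`, `ab`, ..., `zz`) and use each one as a Wikipedia search query. For each query, we retrieve up to 10 article titles. Because the same title can be returned by more than one query, duplicates are removed afterwards.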

In [46]:
@lru_cache(maxsize=None)
def list_prefix(prefix: str, results: int = 10) -> list:
    """Search Wikipedia for ``prefix`` and return the titles it finds.

    Args:
        prefix: Text passed to ``wikipedia.search`` as the query.
        results: Maximum number of titles requested from the search.

    Returns:
        The value returned by ``wikipedia.search`` for this query. Calls are
        memoized by ``lru_cache`` on their arguments, so a repeated call
        returns the cached object without issuing a new request.
    """
    # Short pause before each uncached request to avoid throttling.
    time.sleep(0.1)
    return wikipedia.search(prefix, results=results)

In [None]:
# generate two-letter prefixes (aa, ab, ..., zz)
letters = "abcdefghijklmnopqrstuvwxyz"
# Materialized as a list rather than a generator: a generator is exhausted
# after one pass, so re-running the collection cell without re-running this
# one would silently submit zero searches.
prefixes = ["".join(p) for p in itertools.product(letters, repeat=2)]

In [None]:
# Accumulates every title returned by the searches (duplicates included).
collected_titles = []

with ThreadPoolExecutor(max_workers=3) as executor:
    # Queue one search per prefix, then drain the results in submission order.
    futures = [executor.submit(list_prefix, prefix) for prefix in prefixes]

    for future in tqdm(futures):
        try:
            collected_titles.extend(future.result())
        except Exception as e:
            # A failed search is logged and skipped so the remaining ones still run.
            logging.error(f"Error processing future: {e}")

100%|██████████| 676/676 [02:35<00:00,  4.34it/s]


In [7]:
# Collapse repeated titles, then put the survivors in sorted order.
unique_titles = set(collected_titles)
collected_titles = sorted(unique_titles)

# Number of distinct titles found.
len(collected_titles)

6581

## Step 2: Download each article's content

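Using the list of unique titles, we fetch the page details (title, content, URL, and summary) for each article. Titles that resolve to a disambiguation page or to no page at all are skipped.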

In [11]:
@lru_cache(maxsize=None)
def cached_get_page(title: str) -> dict:
    """Fetch one Wikipedia page and return its main fields as a plain dict.

    Args:
        title: Title passed to ``wikipedia.page``.

    Returns:
        A dict with the keys ``"title"``, ``"content"``, ``"url"`` and
        ``"summary"``, read from the fetched page object. Successful results
        are memoized by ``lru_cache``, so the same dict object is returned on
        repeated calls; treat it as read-only.

    Raises:
        Any exception raised by ``wikipedia.page`` propagates to the caller.
        ``lru_cache`` stores return values only, so a failed lookup is
        attempted again on the next call.
    """
    # Short pause before each uncached request to avoid throttling.
    time.sleep(0.1)
    page = wikipedia.page(title)

    return {
        "title": page.title,
        "content": page.content,
        "url": page.url,
        "summary": page.summary,
    }

In [24]:
articles = []
# Titles that did not produce an article, grouped by failure reason, so the gap
# between requested and downloaded pages is recorded instead of silently lost.
skipped_titles = {"disambiguation": [], "missing": [], "error": []}

with ThreadPoolExecutor(max_workers=4) as executor:
    # Queue every download up front; results are consumed in submission order.
    futures = [executor.submit(cached_get_page, title) for title in collected_titles]

    # zip() has no length, so tqdm needs the total to draw a full progress bar.
    for title, future in tqdm(zip(collected_titles, futures), total=len(futures)):
        try:
            articles.append(future.result())
        except wikipedia.exceptions.DisambiguationError:
            # Title is ambiguous between several pages: skip it, but remember it.
            skipped_titles["disambiguation"].append(title)
        except wikipedia.exceptions.PageError:
            # No page matched the title: skip it, but remember it.
            skipped_titles["missing"].append(title)
        except Exception as e:
            # Anything else (e.g. network failures) is logged and the run continues.
            skipped_titles["error"].append(title)
            logging.error(f"Error processing future: {e}")

# Show how many titles were dropped, and why.
{reason: len(titles) for reason, titles in skipped_titles.items()}

 74%|███████▍  | 4867/6581 [04:56<01:15, 22.72it/s]ERROR:root:Error processing future: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?prop=revisions&rvprop=content&rvparse=&rvlimit=1&titles=Rx+%28disambiguation%29&format=json&action=query (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fd90ecf1f10>: Failed to establish a new connection: [Errno 101] Network is unreachable'))
 89%|████████▉ | 5876/6581 [05:47<00:36, 19.06it/s]ERROR:root:Error processing future: HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?prop=info%7Cpageprops&inprop=url&ppprop=disambiguation&redirects=&titles=volkswagen+r&format=json&action=query (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fd90f310590>: Failed to establish a new connection: [Errno 101] Network is unreachable'))
100%|██████████| 6581/6581 [06:33<00:00, 16.74it/s]


In [25]:
# Number of pages that were fetched successfully and appended to `articles`.
len(articles)

4573

In [30]:
# Name the columns explicitly so the frame (and any file written from it) has
# the same schema even when `articles` is empty; for a non-empty list of page
# dicts this selects exactly the keys each dict already carries.
articles_df = pd.DataFrame(articles, columns=["title", "content", "url", "summary"])

In [40]:
# Preview a few rows. The fixed seed makes the preview reproducible on re-run,
# and capping the size avoids the error `sample` raises when asked for more
# rows than the frame contains.
articles_df.sample(min(5, len(articles_df)), random_state=42)

Unnamed: 0,title,content,url,summary
151,AAI RQ-2 Pioneer,The AAI RQ-2 Pioneer is an unmanned aerial veh...,https://en.wikipedia.org/wiki/AAI_RQ-2_Pioneer,The AAI RQ-2 Pioneer is an unmanned aerial veh...
2717,News,News is information about current events. This...,https://en.wikipedia.org/wiki/News,News is information about current events. This...
1451,H. L. Green Company,H. L. Green was a five and dime store chain in...,https://en.wikipedia.org/wiki/H._L._Green_Company,H. L. Green was a five and dime store chain in...
153,A,"A, or a, is the first letter and the first vow...",https://en.wikipedia.org/wiki/A,"A, or a, is the first letter and the first vow..."
1814,JK Place,J.K.Place is a chain of Italian boutique hotel...,https://en.wikipedia.org/wiki/JK_Place,J.K.Place is a chain of Italian boutique hotel...


## Step 3: Save the articles

We save the DataFrame as a Snappy-compressed Parquet file (`wikipedia_articles.parquet`) in `OUTPUT_DIR` for later use.

In [35]:
# Destination file inside the output directory.
parquet_path = os.path.join(OUTPUT_DIR, "wikipedia_articles.parquet")

# Write the articles without the DataFrame index, using pyarrow with Snappy compression.
articles_df.to_parquet(parquet_path, engine="pyarrow", compression="snappy", index=False)