## Get the list of subcategory URLs under 'Wikipedia list cleanup'

In [1]:
import pandas as pd
import random
import requests
import time

from bs4 import BeautifulSoup
from typing import List, Tuple, Dict

In [2]:
# Root of the English Wikipedia site; the scraped hrefs are site-relative
# ("/wiki/...") and get this prefix to become absolute URLs.
BASE_URL = "https://en.wikipedia.org"

# Landing page of the 'Wikipedia list cleanup' maintenance category whose
# subcategories are collected below.
CATEGORY_URL = "/".join((BASE_URL, "wiki", "Category:Wikipedia_list_cleanup"))

def fetch_subcategory_links(category_url: str) -> List[str]:
    """
    Collect the 'Wikipedia list cleanup from ...' subcategory URLs of a page.

    The page is downloaded once; every ``<bdi>`` element whose first linked
    ``<a>`` points at a ``/wiki/Category:Wikipedia_list_cleanup_from...`` path
    contributes one absolute URL to the result.

    Args:
        category_url (str): The full URL to the category page.

    Returns:
        List[str]: Absolute subcategory URLs in document order, or an empty
        list when the request fails.
    """
    wanted_prefix = "/wiki/Category:Wikipedia_list_cleanup_from"

    try:
        page = requests.get(category_url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Error fetching category page: {e}")
        return []

    parsed = BeautifulSoup(page.text, "html.parser")

    # One candidate anchor (or None) per <bdi> element.
    anchors = [bdi.find("a", href=True) for bdi in parsed.find_all("bdi")]

    return [
        BASE_URL + anchor["href"]
        for anchor in anchors
        if anchor is not None and anchor["href"].startswith(wanted_prefix)
    ]

def main() -> List[str]:
    """
    Fetch and print the subcategories of 'Wikipedia list cleanup'.

    Returns:
        List[str]: The subcategory URLs found under CATEGORY_URL, or an empty
        list when none were found.
    """
    print("📂 Fetching subcategories under 'Wikipedia list cleanup'...\n")
    found = fetch_subcategory_links(CATEGORY_URL)

    if found:
        print(f"✅ Found {len(found)} subcategories.\n")
        for link in found:
            print(f"🔗 {link}")
        return found

    # Nothing came back (request failed or no matching links on the page).
    print("⚠️ No subcategories found.")
    return []

# Run the subcategory scrape when this cell/script is executed directly and
# keep the resulting URL list in `category_urls`, which the article-link
# extraction step reads.
if __name__ == "__main__":
    category_urls = main()

📂 Fetching subcategories under 'Wikipedia list cleanup'...

✅ Found 161 subcategories.

🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2010
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2010
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_February_2011
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2011
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_April_2011
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_May_2011
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_July_2011
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_August_2011
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_September_2011
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2011
🔗 https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_January_2012
🔗 https://en.wik

## 🔍 Extracting Article Links from Wikipedia Cleanup Categories

In [3]:
def extract_article_links(category_url: str) -> List[Tuple[str, str]]:
    """
    Scrape the (title, URL) pairs listed on one Wikipedia category page.

    Only anchors matched by ``div.mw-category li a`` that carry both a
    ``title`` attribute and a site-relative ``/wiki/`` href are kept. Progress
    is printed before the request and after parsing.

    Args:
        category_url (str): Full URL to a Wikipedia category page.

    Returns:
        List[Tuple[str, str]]: (title, full_url) tuples in document order, or
        an empty list when the request fails.
    """
    print(f"\n🔍 Extracting from: {category_url}")
    try:
        page = requests.get(category_url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Failed to fetch page: {e}")
        return []

    parsed = BeautifulSoup(page.text, "html.parser")
    found = []

    for anchor in parsed.select("div.mw-category li a"):
        label, path = anchor.get("title"), anchor.get("href")

        # Skip anchors lacking a title or href, and anything that is not a
        # site-relative wiki link.
        if not (label and path):
            continue
        if not path.startswith("/wiki/"):
            continue

        found.append((label, BASE_URL + path))

    print(f"✅ Found {len(found)} articles.")
    return found


def main(category_urls: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """
    Scrape each category page and group the article links by category.

    Every article found is printed as ``- title: link``; a grand total is
    printed once all categories have been visited.

    Args:
        category_urls (List[str]): List of Wikipedia category page URLs.

    Returns:
        Dict[str, List[Dict[str, str]]]: Category URL mapped to a list of
        single-entry ``{title: link}`` dicts, one per article.
    """
    cleanup_dict: Dict[str, List[Dict[str, str]]] = {}
    total_articles = 0

    for category_url in category_urls:
        pairs = extract_article_links(category_url)
        total_articles += len(pairs)

        entries: List[Dict[str, str]] = []
        for title, link in pairs:
            print(f"- {title}: {link}")
            entries.append({title: link})

        cleanup_dict[category_url] = entries

    print(f"\n🧮 Total articles extracted: {total_articles}")
    return cleanup_dict


# Scrape every category in `category_urls` when this cell/script is executed
# directly; the resulting {category_url: [{title: link}, ...]} mapping is kept
# in `cleanup_dict`, which the article-processing step iterates over.
if __name__ == "__main__":
    cleanup_dict = main(category_urls)


🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2010
✅ Found 2 articles.
- Dorothea Brande: https://en.wikipedia.org/wiki/Dorothea_Brande
- DisplayPort: https://en.wikipedia.org/wiki/DisplayPort

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2010
✅ Found 3 articles.
- Atínale al precio: https://en.wikipedia.org/wiki/At%C3%ADnale_al_precio
- List of Dominican Republic films: https://en.wikipedia.org/wiki/List_of_Dominican_Republic_films
- Instructional design: https://en.wikipedia.org/wiki/Instructional_design

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_February_2011
✅ Found 1 articles.
- California Sun: https://en.wikipedia.org/wiki/California_Sun

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2011
✅ Found 1 articles.
- List of people educated at Haileybury (Melbourne): https://en.wikipedia.org/wiki/Lis

✅ Found 4 articles.
- Hiroshi Kamiya: https://en.wikipedia.org/wiki/Hiroshi_Kamiya
- Old Crown Brewing Corporation: https://en.wikipedia.org/wiki/Old_Crown_Brewing_Corporation
- List of programs broadcast by A-Channel: https://en.wikipedia.org/wiki/List_of_programs_broadcast_by_A-Channel
- Stefaan Verhulst: https://en.wikipedia.org/wiki/Stefaan_Verhulst

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_February_2014
✅ Found 1 articles.
- Tomoaki Maeno: https://en.wikipedia.org/wiki/Tomoaki_Maeno

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2014
✅ Found 3 articles.
- List of fictional Asian countries: https://en.wikipedia.org/wiki/List_of_fictional_Asian_countries
- List of McDonald's marketing campaigns: https://en.wikipedia.org/wiki/List_of_McDonald%27s_marketing_campaigns
- Roger Craig Vogel: https://en.wikipedia.org/wiki/Roger_Craig_Vogel

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wi

✅ Found 6 articles.
- Axis of evil: https://en.wikipedia.org/wiki/Axis_of_evil
- Bigg Boss Kannada season 1: https://en.wikipedia.org/wiki/Bigg_Boss_Kannada_season_1
- Takis Fotopoulos: https://en.wikipedia.org/wiki/Takis_Fotopoulos
- Henry Hugglemonster: https://en.wikipedia.org/wiki/Henry_Hugglemonster
- Hiroaki Miura: https://en.wikipedia.org/wiki/Hiroaki_Miura
- Yuko Sasamoto: https://en.wikipedia.org/wiki/Yuko_Sasamoto

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_October_2015
✅ Found 29 articles.
- Bakusō Kyōdai Let's & Go!!: https://en.wikipedia.org/wiki/Bakus%C5%8D_Ky%C5%8Ddai_Let%27s_%26_Go!!
- Hades in popular culture: https://en.wikipedia.org/wiki/Hades_in_popular_culture
- Hera Pheri (film series): https://en.wikipedia.org/wiki/Hera_Pheri_(film_series)
- Mitsuo Iwata: https://en.wikipedia.org/wiki/Mitsuo_Iwata
- Ami Kawai: https://en.wikipedia.org/wiki/Ami_Kawai
- Machiko Kawana: https://en.wikipedia.org/wiki/Machiko_Kawana
- Ayako K

✅ Found 2 articles.
- Roman Catholic Archdiocese of Cambrai: https://en.wikipedia.org/wiki/Roman_Catholic_Archdiocese_of_Cambrai
- Chromium Embedded Framework: https://en.wikipedia.org/wiki/Chromium_Embedded_Framework

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_October_2016
✅ Found 6 articles.
- Hideo Ishikawa: https://en.wikipedia.org/wiki/Hideo_Ishikawa
- Yui Makino: https://en.wikipedia.org/wiki/Yui_Makino
- Yuji Moriyama: https://en.wikipedia.org/wiki/Yuji_Moriyama
- Mrinal Kanti Sen: https://en.wikipedia.org/wiki/Mrinal_Kanti_Sen
- Tayum: https://en.wikipedia.org/wiki/Tayum
- The Yale Record: https://en.wikipedia.org/wiki/The_Yale_Record

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2016
✅ Found 8 articles.
- Bigg Boss Kannada season 2: https://en.wikipedia.org/wiki/Bigg_Boss_Kannada_season_2
- Bigg Boss Kannada season 3: https://en.wikipedia.org/wiki/Bigg_Boss_Kannada_season_3
- Bigg Boss

✅ Found 6 articles.
- Charles Anthony Fager: https://en.wikipedia.org/wiki/Charles_Anthony_Fager
- List of Musical: The Prince of Tennis productions: https://en.wikipedia.org/wiki/List_of_Musical:_The_Prince_of_Tennis_productions
- Los Americans: https://en.wikipedia.org/wiki/Los_Americans
- Musical: The Prince of Tennis discography: https://en.wikipedia.org/wiki/Musical:_The_Prince_of_Tennis_discography
- Shamsabad, Farrukhabad: https://en.wikipedia.org/wiki/Shamsabad,_Farrukhabad
- Social media mining: https://en.wikipedia.org/wiki/Social_media_mining

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_September_2018
✅ Found 2 articles.
- Stadion Kantrida: https://en.wikipedia.org/wiki/Stadion_Kantrida
- Kidderpore: https://en.wikipedia.org/wiki/Kidderpore

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_October_2018
✅ Found 4 articles.
- Bjørn Lynne: https://en.wikipedia.org/wiki/Bj%C3%B8rn_Lynne
- Offset agree

✅ Found 2 articles.
- Babylon 5 Wars: https://en.wikipedia.org/wiki/Babylon_5_Wars
- Haruko Momoi: https://en.wikipedia.org/wiki/Haruko_Momoi

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_July_2020
✅ Found 4 articles.
- Martin A. Hainz: https://en.wikipedia.org/wiki/Martin_A._Hainz
- Kamalamba Navavarna Kritis: https://en.wikipedia.org/wiki/Kamalamba_Navavarna_Kritis
- Martin Schadt: https://en.wikipedia.org/wiki/Martin_Schadt
- Translink (Queensland): https://en.wikipedia.org/wiki/Translink_(Queensland)

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_August_2020
✅ Found 7 articles.
- Bachok District: https://en.wikipedia.org/wiki/Bachok_District
- Bailhongal: https://en.wikipedia.org/wiki/Bailhongal
- Belavanaki: https://en.wikipedia.org/wiki/Belavanaki
- Bhadravati, Maharashtra: https://en.wikipedia.org/wiki/Bhadravati,_Maharashtra
- Culture of Finland: https://en.wikipedia.org/wiki/Culture_of_Finland
- É

✅ Found 5 articles.
- Akwa Ibom State: https://en.wikipedia.org/wiki/Akwa_Ibom_State
- Children's fantasy: https://en.wikipedia.org/wiki/Children%27s_fantasy
- List of former professional sports teams in Houston: https://en.wikipedia.org/wiki/List_of_former_professional_sports_teams_in_Houston
- List of potato chip brands: https://en.wikipedia.org/wiki/List_of_potato_chip_brands
- Salt Lake City: https://en.wikipedia.org/wiki/Salt_Lake_City

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2022
✅ Found 9 articles.
- AMOLED: https://en.wikipedia.org/wiki/AMOLED
- Hilaire du Berrier: https://en.wikipedia.org/wiki/Hilaire_du_Berrier
- Suzi Ferrer: https://en.wikipedia.org/wiki/Suzi_Ferrer
- Jim Hougan: https://en.wikipedia.org/wiki/Jim_Hougan
- List of institutions of higher education in Chandigarh: https://en.wikipedia.org/wiki/List_of_institutions_of_higher_education_in_Chandigarh
- Szymon Lenkowski: https://en.wikipedia.org/wiki/Szymon_Lenkows

✅ Found 3 articles.
- Pasha: https://en.wikipedia.org/wiki/Pasha
- Thérèse of Lisieux: https://en.wikipedia.org/wiki/Th%C3%A9r%C3%A8se_of_Lisieux
- Timeline of space exploration: https://en.wikipedia.org/wiki/Timeline_of_space_exploration

🔍 Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2023
✅ Found 9 articles.
- Daisy (given name): https://en.wikipedia.org/wiki/Daisy_(given_name)
- Government Model Boys Higher Secondary School: https://en.wikipedia.org/wiki/Government_Model_Boys_Higher_Secondary_School
- Kagaznagar, Telangana: https://en.wikipedia.org/wiki/Kagaznagar,_Telangana
- List of universities in Zambia: https://en.wikipedia.org/wiki/List_of_universities_in_Zambia
- Pukkattupadi: https://en.wikipedia.org/wiki/Pukkattupadi
- Imperial, royal and noble ranks: https://en.wikipedia.org/wiki/Imperial,_royal_and_noble_ranks
- Vivek Sachidanand: https://en.wikipedia.org/wiki/Vivek_Sachidanand
- Sirpur (T): https://en.wikipedia.org/wiki/Sirpur

✅ Found 16 articles.
- Crave Entertainment: https://en.wikipedia.org/wiki/Crave_Entertainment
- Dandeli: https://en.wikipedia.org/wiki/Dandeli
- Gandhidham: https://en.wikipedia.org/wiki/Gandhidham
- List of insurance companies in Kenya: https://en.wikipedia.org/wiki/List_of_insurance_companies_in_Kenya
- List of top 10 singles in 2020 (France): https://en.wikipedia.org/wiki/List_of_top_10_singles_in_2020_(France)
- List of top 10 singles in 2021 (France): https://en.wikipedia.org/wiki/List_of_top_10_singles_in_2021_(France)
- List of top 10 singles in 2022 (France): https://en.wikipedia.org/wiki/List_of_top_10_singles_in_2022_(France)
- List of top 10 singles in 2023 (France): https://en.wikipedia.org/wiki/List_of_top_10_singles_in_2023_(France)
- List of vaping bans in the United States: https://en.wikipedia.org/wiki/List_of_vaping_bans_in_the_United_States
- Media circus: https://en.wikipedia.org/wiki/Media_circus
- Steven Mithen: https://en.wikipedia.org/wiki/Steven_Mithen
- Newent

✅ Found 20 articles.
- Ancona: https://en.wikipedia.org/wiki/Ancona
- Attock: https://en.wikipedia.org/wiki/Attock
- Batala: https://en.wikipedia.org/wiki/Batala
- Bhimavaram: https://en.wikipedia.org/wiki/Bhimavaram
- Dantla, Rajasthan: https://en.wikipedia.org/wiki/Dantla,_Rajasthan
- Etawah: https://en.wikipedia.org/wiki/Etawah
- List of fictional horses: https://en.wikipedia.org/wiki/List_of_fictional_horses
- Ghanche District: https://en.wikipedia.org/wiki/Ghanche_District
- Human geography: https://en.wikipedia.org/wiki/Human_geography
- List of institutions of higher education in Tripura: https://en.wikipedia.org/wiki/List_of_institutions_of_higher_education_in_Tripura
- Jarral Shareef: https://en.wikipedia.org/wiki/Jarral_Shareef
- Jhumri Telaiya: https://en.wikipedia.org/wiki/Jhumri_Telaiya
- Koderma subdivision: https://en.wikipedia.org/wiki/Koderma_subdivision
- List of educational institutions in Nagpur: https://en.wikipedia.org/wiki/List_of_educational_institutions_in_Nagp

✅ Found 12 articles.
- 2025 NBA playoffs: https://en.wikipedia.org/wiki/2025_NBA_playoffs
- Begusarai: https://en.wikipedia.org/wiki/Begusarai
- Sheila Kitzinger: https://en.wikipedia.org/wiki/Sheila_Kitzinger
- Philip J. Landrigan: https://en.wikipedia.org/wiki/Philip_J._Landrigan
- List of films shot on digital video prior to 2012: https://en.wikipedia.org/wiki/List_of_films_shot_on_digital_video_prior_to_2012
- List of universities in Cameroon: https://en.wikipedia.org/wiki/List_of_universities_in_Cameroon
- List of University of New South Wales faculty: https://en.wikipedia.org/wiki/List_of_University_of_New_South_Wales_faculty
- Queer Contemporary Art of Southwest Asia and North Africa: https://en.wikipedia.org/wiki/Queer_Contemporary_Art_of_Southwest_Asia_and_North_Africa
- List of rose cultivars named after people: https://en.wikipedia.org/wiki/List_of_rose_cultivars_named_after_people
- Bill Schnee: https://en.wikipedia.org/wiki/Bill_Schnee
- Lydia Sigourney bibliography: https

### Process Wikipedia Cleanup Articles

In [4]:
import time
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from typing import Dict, List


def clean_messages(title: str, url: str) -> pd.DataFrame:
    """
    Build one row per cleanup banner found on a Wikipedia article.

    Banner text is read from ``div.mbox-text-span`` elements and the category
    names from the list items inside ``div#mw-normal-catlinks``. An article
    with no banners still yields a single row whose ``cleanup_message`` is
    ``None``.

    Args:
        title (str): Article title.
        url (str): Full Wikipedia article URL.

    Returns:
        pd.DataFrame: Columns ``title``, ``url``, ``cleanup_message`` and
        ``categories`` (the same category list on every row). A completely
        empty DataFrame (no columns) is returned when the request fails.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Failed to fetch page {url}: {e}")
        return pd.DataFrame()

    parsed = BeautifulSoup(page.text, 'html.parser')

    # Category names shown in the article footer, if that block exists.
    cat_container = parsed.find('div', id='mw-normal-catlinks')
    if cat_container is not None:
        categories = [item.get_text(strip=True) for item in cat_container.select('ul li')]
    else:
        categories = []

    # Text of every cleanup banner; a lone None keeps banner-less articles
    # represented by exactly one row.
    messages = [
        box.get_text(strip=True, separator=' ')
        for box in parsed.find_all('div', class_='mbox-text-span')
    ]
    if not messages:
        messages = [None]

    row_count = len(messages)
    return pd.DataFrame({
        'title': [title] * row_count,
        'url': [url] * row_count,
        'cleanup_message': messages,
        'categories': [categories] * row_count,
    })


def extract_characteristics(wiki_url: str) -> Dict[str, str]:
    """
    Collect the heading, lead paragraph and infobox rows of a Wikipedia page.

    The result always has a ``"Title"`` key (``"N/A"`` when the
    ``h1#firstHeading`` element is missing). ``"Summary"`` is present only if
    a non-empty paragraph exists directly under ``div.mw-parser-output``.
    Each infobox row that has both a header and a data cell adds one
    ``header text -> cell text`` entry.

    Args:
        wiki_url (str): Wikipedia article URL.

    Returns:
        Dict[str, str]: Extracted fields, or an empty dict when the request
        fails.
    """
    try:
        page = requests.get(wiki_url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Failed to fetch {wiki_url}: {e}")
        return {}

    parsed = BeautifulSoup(page.text, 'html.parser')

    # Page heading
    heading = parsed.find("h1", id="firstHeading")
    info = {"Title": "N/A" if heading is None else heading.get_text(strip=True)}

    # Lead: the first top-level paragraph that is not blank
    paragraph_texts = (
        p.get_text(strip=True) for p in parsed.select("div.mw-parser-output > p")
    )
    lead = next((text for text in paragraph_texts if text), None)
    if lead is not None:
        info["Summary"] = lead

    # Infobox rows, keyed by their header cell text
    infobox = parsed.find("table", class_="infobox")
    rows = infobox.find_all("tr") if infobox is not None else []
    for row in rows:
        header, cell = row.th, row.td
        if header is None or cell is None:
            continue
        field = header.get_text(" ", strip=True)
        info[field] = cell.get_text(" ", strip=True)

    return info


# Driver: visit every article collected in `cleanup_dict`, gather its cleanup
# banners with clean_messages(), and assemble one DataFrame (`overall_df`)
# holding all rows.
RESULT_COLUMNS = ['title', 'url', 'cleanup_message', 'categories']

# Defined up front so the name exists even if the loop is interrupted.
overall_df = pd.DataFrame(columns=RESULT_COLUMNS)

# Per-article frames are collected here and concatenated once after the loop;
# concatenating inside the loop would re-copy the growing frame every time.
frames = []
count = 0

for category_url, article_list in cleanup_dict.items():
    print(f"\n📂 Processing category: {category_url}")

    for article in article_list:
        # Each entry is a single-item {title: url} dict.
        title, url = next(iter(article.items()))
        count += 1
        print(f"\n🔢 [{count}] Processing: {title}")

        # Only the cleanup banners are collected. extract_characteristics()
        # is deliberately not called here: its result was never stored, so
        # calling it would just download every page a second time.
        df = clean_messages(title, url)
        if not df.empty:
            # A failed fetch yields an empty frame; it contributes no rows.
            frames.append(df)

        # Polite crawling
        sleep_time = random.uniform(1, 5)
        print(f"⏱ Sleeping for {sleep_time:.2f} seconds...\n")
        time.sleep(sleep_time)

if frames:
    overall_df = pd.concat(frames, ignore_index=True)

# Final summary
print(f"\n✅ Completed. Total articles processed: {count}")
display(overall_df)


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2010

🔢 [1] Processing: Dorothea Brande
⏱ Sleeping for 3.33 seconds...


🔢 [2] Processing: DisplayPort
⏱ Sleeping for 1.07 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2010

🔢 [3] Processing: Atínale al precio
⏱ Sleeping for 1.21 seconds...


🔢 [4] Processing: List of Dominican Republic films
⏱ Sleeping for 2.55 seconds...


🔢 [5] Processing: Instructional design
⏱ Sleeping for 2.59 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_February_2011

🔢 [6] Processing: California Sun
⏱ Sleeping for 1.88 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2011

🔢 [7] Processing: List of people educated at Haileybury (Melbourne)
⏱ Sleeping for 2.05 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Categor

⏱ Sleeping for 3.58 seconds...


🔢 [68] Processing: List of McDonald's marketing campaigns
⏱ Sleeping for 2.45 seconds...


🔢 [69] Processing: Roger Craig Vogel
⏱ Sleeping for 1.22 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_May_2014

🔢 [70] Processing: Pattern recognition
⏱ Sleeping for 2.16 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_June_2014

🔢 [71] Processing: Craig Barnett
⏱ Sleeping for 4.27 seconds...


🔢 [72] Processing: Romolo Calabrese
⏱ Sleeping for 2.40 seconds...


🔢 [73] Processing: Harry Reis
⏱ Sleeping for 4.11 seconds...


🔢 [74] Processing: Theora
⏱ Sleeping for 4.64 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_July_2014

🔢 [75] Processing: Trusted Computing
⏱ Sleeping for 3.07 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_August_2014

🔢 


🔢 [153] Processing: Machiko Kawana
⏱ Sleeping for 3.19 seconds...


🔢 [154] Processing: Ayako Kawasumi
⏱ Sleeping for 4.38 seconds...


🔢 [155] Processing: Shōko Kikuchi
⏱ Sleeping for 2.64 seconds...


🔢 [156] Processing: Yūji Kishi
⏱ Sleeping for 4.45 seconds...


🔢 [157] Processing: Mapfre
⏱ Sleeping for 2.42 seconds...


🔢 [158] Processing: Yuuki Matsuda
⏱ Sleeping for 1.23 seconds...


🔢 [159] Processing: Dai Matsumoto
⏱ Sleeping for 1.05 seconds...


🔢 [160] Processing: Ui Miyazaki
⏱ Sleeping for 4.33 seconds...


🔢 [161] Processing: Takahiro Mizushima
⏱ Sleeping for 1.92 seconds...


🔢 [162] Processing: Shunichi Nagasaki
⏱ Sleeping for 3.35 seconds...


🔢 [163] Processing: Kumiko Nishihara
⏱ Sleeping for 3.40 seconds...


🔢 [164] Processing: Sakura Nogawa
⏱ Sleeping for 3.77 seconds...


🔢 [165] Processing: Mitsuru Ogata
⏱ Sleeping for 4.80 seconds...


🔢 [166] Processing: Makiko Ohmoto
⏱ Sleeping for 1.78 seconds...


🔢 [167] Processing: Ryōko Ono
⏱ Sleeping for 3.15 seconds..


🔢 [251] Processing: The Yale Record
⏱ Sleeping for 2.27 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2016

🔢 [252] Processing: Bigg Boss Kannada season 2
⏱ Sleeping for 1.05 seconds...


🔢 [253] Processing: Bigg Boss Kannada season 3
⏱ Sleeping for 3.10 seconds...


🔢 [254] Processing: Bigg Boss Kannada season 4
⏱ Sleeping for 1.76 seconds...


🔢 [255] Processing: List of family-and-homemaking blogs
⏱ Sleeping for 1.10 seconds...


🔢 [256] Processing: List of Slovenian mathematicians
⏱ Sleeping for 1.66 seconds...


🔢 [257] Processing: List of non-alcoholic mixed drinks
⏱ Sleeping for 4.48 seconds...


🔢 [258] Processing: Yui Sakakibara
⏱ Sleeping for 3.13 seconds...


🔢 [259] Processing: Peter Zizka
⏱ Sleeping for 4.15 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2016

🔢 [260] Processing: Al-Izhar Pondok Labu
⏱ Sleeping for 1.80 seconds...


🔢 [261] Pr


🔢 [327] Processing: Chris Ruppenthal
⏱ Sleeping for 1.31 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2018

🔢 [328] Processing: List of Georgia Southern University alumni
⏱ Sleeping for 3.66 seconds...


🔢 [329] Processing: Elis Gruffydd
⏱ Sleeping for 2.09 seconds...


🔢 [330] Processing: List of Galaxy Angel characters
⏱ Sleeping for 2.10 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2018

🔢 [331] Processing: Fourth International (post-reunification)
⏱ Sleeping for 3.76 seconds...


🔢 [332] Processing: Kazuo Kumakura
⏱ Sleeping for 1.63 seconds...


🔢 [333] Processing: Janez Strnad
⏱ Sleeping for 2.98 seconds...


🔢 [334] Processing: Stuart Zender
⏱ Sleeping for 1.11 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_January_2019

🔢 [335] Processing: Richard Hammond
⏱ Sleeping for 3.38 seconds...


🔢 


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_August_2020

🔢 [411] Processing: Bachok District
⏱ Sleeping for 3.21 seconds...


🔢 [412] Processing: Bailhongal
⏱ Sleeping for 3.37 seconds...


🔢 [413] Processing: Belavanaki
⏱ Sleeping for 2.33 seconds...


🔢 [414] Processing: Bhadravati, Maharashtra
⏱ Sleeping for 1.32 seconds...


🔢 [415] Processing: Culture of Finland
⏱ Sleeping for 1.29 seconds...


🔢 [416] Processing: École de management de Normandie
⏱ Sleeping for 4.60 seconds...


🔢 [417] Processing: List of almshouses in the United Kingdom
⏱ Sleeping for 3.31 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_September_2020

🔢 [418] Processing: Archives of Physical Medicine and Rehabilitation
⏱ Sleeping for 2.65 seconds...


🔢 [419] Processing: List of battles 1601–1800
⏱ Sleeping for 1.03 seconds...


🔢 [420] Processing: BOCHK Bauhinia Bowl
⏱ Sleeping for 3.79 seconds...


🔢 [421] Pr

⏱ Sleeping for 2.36 seconds...


🔢 [494] Processing: Children's fantasy
⏱ Sleeping for 3.66 seconds...


🔢 [495] Processing: List of former professional sports teams in Houston
⏱ Sleeping for 1.63 seconds...


🔢 [496] Processing: List of potato chip brands
⏱ Sleeping for 4.75 seconds...


🔢 [497] Processing: Salt Lake City
⏱ Sleeping for 4.29 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2022

🔢 [498] Processing: AMOLED
⏱ Sleeping for 4.38 seconds...


🔢 [499] Processing: Hilaire du Berrier
⏱ Sleeping for 4.11 seconds...


🔢 [500] Processing: Suzi Ferrer
⏱ Sleeping for 4.12 seconds...


🔢 [501] Processing: Jim Hougan
⏱ Sleeping for 1.90 seconds...


🔢 [502] Processing: List of institutions of higher education in Chandigarh
⏱ Sleeping for 4.84 seconds...


🔢 [503] Processing: Szymon Lenkowski
⏱ Sleeping for 3.67 seconds...


🔢 [504] Processing: List of English words of Etruscan origin
⏱ Sleeping for 4.97 seconds...


🔢 [505]

⏱ Sleeping for 3.08 seconds...


🔢 [583] Processing: Government Model Boys Higher Secondary School
⏱ Sleeping for 3.99 seconds...


🔢 [584] Processing: Kagaznagar, Telangana
⏱ Sleeping for 2.04 seconds...


🔢 [585] Processing: List of universities in Zambia
⏱ Sleeping for 2.49 seconds...


🔢 [586] Processing: Pukkattupadi
⏱ Sleeping for 3.48 seconds...


🔢 [587] Processing: Imperial, royal and noble ranks
⏱ Sleeping for 2.52 seconds...


🔢 [588] Processing: Vivek Sachidanand
⏱ Sleeping for 1.71 seconds...


🔢 [589] Processing: Sirpur (T)
⏱ Sleeping for 4.96 seconds...


🔢 [590] Processing: Territory of the Islamic State
⏱ Sleeping for 1.53 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_April_2023

🔢 [591] Processing: Baddi
⏱ Sleeping for 3.61 seconds...


🔢 [592] Processing: Bauchi (city)
⏱ Sleeping for 3.37 seconds...


🔢 [593] Processing: Jan de Hartog
⏱ Sleeping for 1.32 seconds...


🔢 [594] Processing: Pangala
⏱ Sleeping for 1

⏱ Sleeping for 4.97 seconds...


🔢 [678] Processing: List of top 10 singles in 2021 (France)
⏱ Sleeping for 1.74 seconds...


🔢 [679] Processing: List of top 10 singles in 2022 (France)
⏱ Sleeping for 3.20 seconds...


🔢 [680] Processing: List of top 10 singles in 2023 (France)
⏱ Sleeping for 3.55 seconds...


🔢 [681] Processing: List of vaping bans in the United States
⏱ Sleeping for 3.44 seconds...


🔢 [682] Processing: Media circus
⏱ Sleeping for 4.82 seconds...


🔢 [683] Processing: Steven Mithen
⏱ Sleeping for 4.38 seconds...


🔢 [684] Processing: Newent
⏱ Sleeping for 2.23 seconds...


🔢 [685] Processing: Osun State
⏱ Sleeping for 4.96 seconds...


🔢 [686] Processing: Shaheed Benazirabad District
⏱ Sleeping for 2.75 seconds...


🔢 [687] Processing: Pratibha Singhi
⏱ Sleeping for 4.46 seconds...


🔢 [688] Processing: Use of nigger in the arts
⏱ Sleeping for 4.02 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_February_2024

🔢 


🔢 [780] Processing: Patiala
⏱ Sleeping for 1.90 seconds...


🔢 [781] Processing: Shaki, Oyo
⏱ Sleeping for 3.01 seconds...


🔢 [782] Processing: Jon M. Sweeney
⏱ Sleeping for 3.04 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_September_2024

🔢 [783] Processing: Ancona
⏱ Sleeping for 4.52 seconds...


🔢 [784] Processing: Attock
⏱ Sleeping for 2.17 seconds...


🔢 [785] Processing: Batala
⏱ Sleeping for 4.13 seconds...


🔢 [786] Processing: Bhimavaram
⏱ Sleeping for 1.71 seconds...


🔢 [787] Processing: Dantla, Rajasthan
⏱ Sleeping for 3.43 seconds...


🔢 [788] Processing: Etawah
⏱ Sleeping for 3.84 seconds...


🔢 [789] Processing: List of fictional horses
⏱ Sleeping for 1.15 seconds...


🔢 [790] Processing: Ghanche District
⏱ Sleeping for 4.37 seconds...


🔢 [791] Processing: Human geography
⏱ Sleeping for 3.65 seconds...


🔢 [792] Processing: List of institutions of higher education in Tripura
⏱ Sleeping for 1.58 seconds...


🔢 [


🔢 [886] Processing: Nozomi Tsuji
⏱ Sleeping for 1.66 seconds...


📂 Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_April_2025

🔢 [887] Processing: Behala
⏱ Sleeping for 1.95 seconds...


🔢 [888] Processing: Francis Boyle
⏱ Sleeping for 2.82 seconds...


🔢 [889] Processing: List of demons in fiction
⏱ Sleeping for 3.80 seconds...


🔢 [890] Processing: Institute of Development Studies
⏱ Sleeping for 4.59 seconds...


🔢 [891] Processing: Lagavulin distillery
⏱ Sleeping for 3.55 seconds...


🔢 [892] Processing: Edward E. Lawler
⏱ Sleeping for 1.79 seconds...


🔢 [893] Processing: List of programmes broadcast by CNA
⏱ Sleeping for 1.17 seconds...


🔢 [894] Processing: List of protests and demonstrations in the United States by size
⏱ Sleeping for 1.63 seconds...


🔢 [895] Processing: Spanish Steps
⏱ Sleeping for 2.73 seconds...


🔢 [896] Processing: Ukrainians in Russia
⏱ Sleeping for 2.10 seconds...


📂 Processing category: https://en.wikipedia.org

Unnamed: 0,title,url,cleanup_message,categories
0,Dorothea Brande,https://en.wikipedia.org/wiki/Dorothea_Brande,This article has multiple issues. Please help ...,"[Illinois Institute of Technology alumni, 1893..."
1,Dorothea Brande,https://en.wikipedia.org/wiki/Dorothea_Brande,This article may need to be rewritten to compl...,"[Illinois Institute of Technology alumni, 1893..."
2,Dorothea Brande,https://en.wikipedia.org/wiki/Dorothea_Brande,This article may be in need of reorganization ...,"[Illinois Institute of Technology alumni, 1893..."
3,Dorothea Brande,https://en.wikipedia.org/wiki/Dorothea_Brande,This article contains a list that has not been...,"[Illinois Institute of Technology alumni, 1893..."
4,DisplayPort,https://en.wikipedia.org/wiki/DisplayPort,This article may be too technical for most rea...,"[Digital display connectors, VESA, Computer co..."
...,...,...,...,...
2291,Lydia Sigourney bibliography,https://en.wikipedia.org/wiki/Lydia_Sigourney_...,This article relies largely or entirely on a s...,"[Bibliographies by writer, Bibliographies of A..."
2292,Tulane Journal of Law & Sexuality,https://en.wikipedia.org/wiki/Tulane_Journal_o...,This section does not cite any sources . Pleas...,"[American law journals, Tulane University Law ..."
2293,Tulane Journal of Law & Sexuality,https://en.wikipedia.org/wiki/Tulane_Journal_o...,This article may contain unverified or indiscr...,"[American law journals, Tulane University Law ..."
2294,Kaveh Akbar,https://en.wikipedia.org/wiki/Kaveh_Akbar,This section contains a list that has not been...,"[21st-century American poets, 21st-century Ame..."


In [5]:
# Persist the collected rows to data/wikipedia.csv. The data/ directory is
# created first because to_csv() does not create missing parent folders and
# would otherwise raise OSError on a fresh checkout.
from pathlib import Path

output_path = Path('data/wikipedia.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
overall_df.to_csv(output_path)