## Get List of URLs under 'Wikipedia list cleanup'

In [1]:
import pandas as pd
import random
import requests
import time

from bs4 import BeautifulSoup
from typing import List, Tuple, Dict

In [2]:
# Root of the English Wikipedia site; relative hrefs scraped from pages are
# appended to it to build absolute URLs.
BASE_URL = "https://en.wikipedia.org"
# Page of the 'Wikipedia list cleanup' category whose subcategories are crawled.
CATEGORY_URL = f"{BASE_URL}/wiki/Category:Wikipedia_list_cleanup"

def fetch_subcategory_links(category_url: str) -> List[str]:
    """
    Collect the dated 'Wikipedia list cleanup from ...' subcategory URLs.

    The page at ``category_url`` is downloaded and every ``<bdi>`` element is
    inspected; the first anchor inside it is kept when its href starts with
    ``/wiki/Category:Wikipedia_list_cleanup_from``.

    Args:
        category_url (str): The full URL to the category page.

    Returns:
        List[str]: Absolute subcategory URLs in page order, or an empty list
        when the page could not be fetched.
    """
    try:
        page = requests.get(category_url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"‚ùå Error fetching category page: {e}")
        return []

    wanted_prefix = "/wiki/Category:Wikipedia_list_cleanup_from"
    parsed = BeautifulSoup(page.text, "html.parser")

    # One candidate anchor (or None) per <bdi> element.
    anchors = (bdi.find("a", href=True) for bdi in parsed.find_all("bdi"))
    return [
        BASE_URL + anchor["href"]
        for anchor in anchors
        if anchor and anchor["href"].startswith(wanted_prefix)
    ]

def main() -> List[str]:
    """
    Fetch and print the subcategory URLs found under CATEGORY_URL.

    Returns:
        List[str]: The subcategory URLs, or an empty list when none were found.
    """
    print("üìÇ Fetching subcategories under 'Wikipedia list cleanup'...\n")
    links = fetch_subcategory_links(CATEGORY_URL)

    if links:
        print(f"‚úÖ Found {len(links)} subcategories.\n")
        for link in links:
            print(f"üîó {link}")
        return links

    print("‚ö†Ô∏è No subcategories found.")
    return []

# Run the subcategory discovery and keep the resulting URLs in `category_urls`.
if __name__ == "__main__":
    category_urls = main()

üìÇ Fetching subcategories under 'Wikipedia list cleanup'...

‚úÖ Found 161 subcategories.

üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2010
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2010
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_February_2011
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2011
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_April_2011
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_May_2011
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_July_2011
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_August_2011
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_September_2011
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2011
üîó https://en.wikipedia.org/wiki/Category:Wikipedia_list_clea

## 🔍 Extracting Article Links from Wikipedia Cleanup Categories

In [3]:
def extract_article_links(category_url: str) -> List[Tuple[str, str]]:
    """
    Scrape the member links listed on one Wikipedia category page.

    Only anchors matched by ``div.mw-category li a`` that carry a ``title``
    attribute and an ``href`` starting with ``/wiki/`` are kept.

    Args:
        category_url (str): Full URL to a Wikipedia category page.

    Returns:
        List[Tuple[str, str]]: (title, full_url) pairs in page order; empty
        when the request fails.
    """
    print(f"\nüîç Extracting from: {category_url}")
    try:
        page = requests.get(category_url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"‚ùå Failed to fetch page: {e}")
        return []

    anchors = BeautifulSoup(page.text, "html.parser").select("div.mw-category li a")

    found = []
    for anchor in anchors:
        title, href = anchor.get("title"), anchor.get("href")
        # Skip anchors missing either attribute or pointing outside /wiki/.
        if not title or not href or not href.startswith("/wiki/"):
            continue
        found.append((title, BASE_URL + href))

    print(f"‚úÖ Found {len(found)} articles.")
    return found


def main(category_urls: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """
    Gather the article links of every given category page.

    Each article is printed as it is collected, and a grand total is printed
    once all categories have been visited.

    Args:
        category_urls (List[str]): List of Wikipedia category page URLs.

    Returns:
        Dict[str, List[Dict[str, str]]]: Category URL mapped to a list of
        single-entry ``{title: url}`` dicts, one per article.
    """
    articles_by_category = {}
    total_articles = 0

    for category_url in category_urls:
        pairs = extract_article_links(category_url)
        total_articles += len(pairs)

        entries = []
        for title, link in pairs:
            print(f"- {title}: {link}")
            entries.append({title: link})
        articles_by_category[category_url] = entries

    print(f"\nüßÆ Total articles extracted: {total_articles}")
    return articles_by_category


# Crawl every subcategory collected in `category_urls` and keep the
# category -> articles mapping in `cleanup_dict`.
if __name__ == "__main__":
    cleanup_dict = main(category_urls)


üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2010
‚úÖ Found 2 articles.
- Dorothea Brande: https://en.wikipedia.org/wiki/Dorothea_Brande
- DisplayPort: https://en.wikipedia.org/wiki/DisplayPort

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2010
‚úÖ Found 3 articles.
- At√≠nale al precio: https://en.wikipedia.org/wiki/At%C3%ADnale_al_precio
- List of Dominican Republic films: https://en.wikipedia.org/wiki/List_of_Dominican_Republic_films
- Instructional design: https://en.wikipedia.org/wiki/Instructional_design

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_February_2011
‚úÖ Found 1 articles.
- California Sun: https://en.wikipedia.org/wiki/California_Sun

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2011
‚úÖ Found 1 articles.
- List of people educated at Haileybury (Melbourne): https://en.w

‚úÖ Found 4 articles.
- Hiroshi Kamiya: https://en.wikipedia.org/wiki/Hiroshi_Kamiya
- Old Crown Brewing Corporation: https://en.wikipedia.org/wiki/Old_Crown_Brewing_Corporation
- List of programs broadcast by A-Channel: https://en.wikipedia.org/wiki/List_of_programs_broadcast_by_A-Channel
- Stefaan Verhulst: https://en.wikipedia.org/wiki/Stefaan_Verhulst

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_February_2014
‚úÖ Found 1 articles.
- Tomoaki Maeno: https://en.wikipedia.org/wiki/Tomoaki_Maeno

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2014
‚úÖ Found 3 articles.
- List of fictional Asian countries: https://en.wikipedia.org/wiki/List_of_fictional_Asian_countries
- List of McDonald's marketing campaigns: https://en.wikipedia.org/wiki/List_of_McDonald%27s_marketing_campaigns
- Roger Craig Vogel: https://en.wikipedia.org/wiki/Roger_Craig_Vogel

üîç Extracting from: https://en.wikipedia.org/w

‚úÖ Found 6 articles.
- Axis of evil: https://en.wikipedia.org/wiki/Axis_of_evil
- Bigg Boss Kannada season 1: https://en.wikipedia.org/wiki/Bigg_Boss_Kannada_season_1
- Takis Fotopoulos: https://en.wikipedia.org/wiki/Takis_Fotopoulos
- Henry Hugglemonster: https://en.wikipedia.org/wiki/Henry_Hugglemonster
- Hiroaki Miura: https://en.wikipedia.org/wiki/Hiroaki_Miura
- Yuko Sasamoto: https://en.wikipedia.org/wiki/Yuko_Sasamoto

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_October_2015
‚úÖ Found 29 articles.
- Bakus≈ç Ky≈çdai Let's & Go!!: https://en.wikipedia.org/wiki/Bakus%C5%8D_Ky%C5%8Ddai_Let%27s_%26_Go!!
- Hades in popular culture: https://en.wikipedia.org/wiki/Hades_in_popular_culture
- Hera Pheri (film series): https://en.wikipedia.org/wiki/Hera_Pheri_(film_series)
- Mitsuo Iwata: https://en.wikipedia.org/wiki/Mitsuo_Iwata
- Ami Kawai: https://en.wikipedia.org/wiki/Ami_Kawai
- Machiko Kawana: https://en.wikipedia.org/wiki/Machiko_Kawana


‚úÖ Found 2 articles.
- Roman Catholic Archdiocese of Cambrai: https://en.wikipedia.org/wiki/Roman_Catholic_Archdiocese_of_Cambrai
- Chromium Embedded Framework: https://en.wikipedia.org/wiki/Chromium_Embedded_Framework

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_October_2016
‚úÖ Found 6 articles.
- Hideo Ishikawa: https://en.wikipedia.org/wiki/Hideo_Ishikawa
- Yui Makino: https://en.wikipedia.org/wiki/Yui_Makino
- Yuji Moriyama: https://en.wikipedia.org/wiki/Yuji_Moriyama
- Mrinal Kanti Sen: https://en.wikipedia.org/wiki/Mrinal_Kanti_Sen
- Tayum: https://en.wikipedia.org/wiki/Tayum
- The Yale Record: https://en.wikipedia.org/wiki/The_Yale_Record

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2016
‚úÖ Found 8 articles.
- Bigg Boss Kannada season 2: https://en.wikipedia.org/wiki/Bigg_Boss_Kannada_season_2
- Bigg Boss Kannada season 3: https://en.wikipedia.org/wiki/Bigg_Boss_Kannada_season_3

‚úÖ Found 6 articles.
- Charles Anthony Fager: https://en.wikipedia.org/wiki/Charles_Anthony_Fager
- List of Musical: The Prince of Tennis productions: https://en.wikipedia.org/wiki/List_of_Musical:_The_Prince_of_Tennis_productions
- Los Americans: https://en.wikipedia.org/wiki/Los_Americans
- Musical: The Prince of Tennis discography: https://en.wikipedia.org/wiki/Musical:_The_Prince_of_Tennis_discography
- Shamsabad, Farrukhabad: https://en.wikipedia.org/wiki/Shamsabad,_Farrukhabad
- Social media mining: https://en.wikipedia.org/wiki/Social_media_mining

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_September_2018
‚úÖ Found 2 articles.
- Stadion Kantrida: https://en.wikipedia.org/wiki/Stadion_Kantrida
- Kidderpore: https://en.wikipedia.org/wiki/Kidderpore

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_October_2018
‚úÖ Found 4 articles.
- Bj√∏rn Lynne: https://en.wikipedia.org/wiki/Bj%C3%B8rn_Lynne
-

‚úÖ Found 2 articles.
- Babylon 5 Wars: https://en.wikipedia.org/wiki/Babylon_5_Wars
- Haruko Momoi: https://en.wikipedia.org/wiki/Haruko_Momoi

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_July_2020
‚úÖ Found 4 articles.
- Martin A. Hainz: https://en.wikipedia.org/wiki/Martin_A._Hainz
- Kamalamba Navavarna Kritis: https://en.wikipedia.org/wiki/Kamalamba_Navavarna_Kritis
- Martin Schadt: https://en.wikipedia.org/wiki/Martin_Schadt
- Translink (Queensland): https://en.wikipedia.org/wiki/Translink_(Queensland)

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_August_2020
‚úÖ Found 7 articles.
- Bachok District: https://en.wikipedia.org/wiki/Bachok_District
- Bailhongal: https://en.wikipedia.org/wiki/Bailhongal
- Belavanaki: https://en.wikipedia.org/wiki/Belavanaki
- Bhadravati, Maharashtra: https://en.wikipedia.org/wiki/Bhadravati,_Maharashtra
- Culture of Finland: https://en.wikipedia.org/wiki/Culture_of

‚úÖ Found 5 articles.
- Akwa Ibom State: https://en.wikipedia.org/wiki/Akwa_Ibom_State
- Children's fantasy: https://en.wikipedia.org/wiki/Children%27s_fantasy
- List of former professional sports teams in Houston: https://en.wikipedia.org/wiki/List_of_former_professional_sports_teams_in_Houston
- List of potato chip brands: https://en.wikipedia.org/wiki/List_of_potato_chip_brands
- Salt Lake City: https://en.wikipedia.org/wiki/Salt_Lake_City

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2022
‚úÖ Found 9 articles.
- AMOLED: https://en.wikipedia.org/wiki/AMOLED
- Hilaire du Berrier: https://en.wikipedia.org/wiki/Hilaire_du_Berrier
- Suzi Ferrer: https://en.wikipedia.org/wiki/Suzi_Ferrer
- Jim Hougan: https://en.wikipedia.org/wiki/Jim_Hougan
- List of institutions of higher education in Chandigarh: https://en.wikipedia.org/wiki/List_of_institutions_of_higher_education_in_Chandigarh
- Szymon Lenkowski: https://en.wikipedia.org/wiki/Szymon_

‚úÖ Found 3 articles.
- Pasha: https://en.wikipedia.org/wiki/Pasha
- Th√©r√®se of Lisieux: https://en.wikipedia.org/wiki/Th%C3%A9r%C3%A8se_of_Lisieux
- Timeline of space exploration: https://en.wikipedia.org/wiki/Timeline_of_space_exploration

üîç Extracting from: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2023
‚úÖ Found 9 articles.
- Daisy (given name): https://en.wikipedia.org/wiki/Daisy_(given_name)
- Government Model Boys Higher Secondary School: https://en.wikipedia.org/wiki/Government_Model_Boys_Higher_Secondary_School
- Kagaznagar, Telangana: https://en.wikipedia.org/wiki/Kagaznagar,_Telangana
- List of universities in Zambia: https://en.wikipedia.org/wiki/List_of_universities_in_Zambia
- Pukkattupadi: https://en.wikipedia.org/wiki/Pukkattupadi
- Imperial, royal and noble ranks: https://en.wikipedia.org/wiki/Imperial,_royal_and_noble_ranks
- Vivek Sachidanand: https://en.wikipedia.org/wiki/Vivek_Sachidanand
- Sirpur (T): https://en.wikipedia.org/wi

‚úÖ Found 16 articles.
- Crave Entertainment: https://en.wikipedia.org/wiki/Crave_Entertainment
- Dandeli: https://en.wikipedia.org/wiki/Dandeli
- Gandhidham: https://en.wikipedia.org/wiki/Gandhidham
- List of insurance companies in Kenya: https://en.wikipedia.org/wiki/List_of_insurance_companies_in_Kenya
- List of top 10 singles in 2020 (France): https://en.wikipedia.org/wiki/List_of_top_10_singles_in_2020_(France)
- List of top 10 singles in 2021 (France): https://en.wikipedia.org/wiki/List_of_top_10_singles_in_2021_(France)
- List of top 10 singles in 2022 (France): https://en.wikipedia.org/wiki/List_of_top_10_singles_in_2022_(France)
- List of top 10 singles in 2023 (France): https://en.wikipedia.org/wiki/List_of_top_10_singles_in_2023_(France)
- List of vaping bans in the United States: https://en.wikipedia.org/wiki/List_of_vaping_bans_in_the_United_States
- Media circus: https://en.wikipedia.org/wiki/Media_circus
- Steven Mithen: https://en.wikipedia.org/wiki/Steven_Mithen
- Newe

‚úÖ Found 20 articles.
- Ancona: https://en.wikipedia.org/wiki/Ancona
- Attock: https://en.wikipedia.org/wiki/Attock
- Batala: https://en.wikipedia.org/wiki/Batala
- Bhimavaram: https://en.wikipedia.org/wiki/Bhimavaram
- Dantla, Rajasthan: https://en.wikipedia.org/wiki/Dantla,_Rajasthan
- Etawah: https://en.wikipedia.org/wiki/Etawah
- List of fictional horses: https://en.wikipedia.org/wiki/List_of_fictional_horses
- Ghanche District: https://en.wikipedia.org/wiki/Ghanche_District
- Human geography: https://en.wikipedia.org/wiki/Human_geography
- List of institutions of higher education in Tripura: https://en.wikipedia.org/wiki/List_of_institutions_of_higher_education_in_Tripura
- Jarral Shareef: https://en.wikipedia.org/wiki/Jarral_Shareef
- Jhumri Telaiya: https://en.wikipedia.org/wiki/Jhumri_Telaiya
- Koderma subdivision: https://en.wikipedia.org/wiki/Koderma_subdivision
- List of educational institutions in Nagpur: https://en.wikipedia.org/wiki/List_of_educational_institutions_in_Na

‚úÖ Found 12 articles.
- 2025 NBA playoffs: https://en.wikipedia.org/wiki/2025_NBA_playoffs
- Begusarai: https://en.wikipedia.org/wiki/Begusarai
- Sheila Kitzinger: https://en.wikipedia.org/wiki/Sheila_Kitzinger
- Philip J. Landrigan: https://en.wikipedia.org/wiki/Philip_J._Landrigan
- List of films shot on digital video prior to 2012: https://en.wikipedia.org/wiki/List_of_films_shot_on_digital_video_prior_to_2012
- List of universities in Cameroon: https://en.wikipedia.org/wiki/List_of_universities_in_Cameroon
- List of University of New South Wales faculty: https://en.wikipedia.org/wiki/List_of_University_of_New_South_Wales_faculty
- Queer Contemporary Art of Southwest Asia and North Africa: https://en.wikipedia.org/wiki/Queer_Contemporary_Art_of_Southwest_Asia_and_North_Africa
- List of rose cultivars named after people: https://en.wikipedia.org/wiki/List_of_rose_cultivars_named_after_people
- Bill Schnee: https://en.wikipedia.org/wiki/Bill_Schnee
- Lydia Sigourney bibliography: htt

### Process Wikipedia Cleanup Articles

In [4]:
import time
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from typing import Dict, List


def clean_messages(title: str, url: str) -> pd.DataFrame:
    """
    Extracts cleanup messages and categories from a Wikipedia page.

    One row is produced per ``div.mbox-text-span`` element found on the page.
    When the page has none, a single row with ``cleanup_message`` set to
    ``None`` is returned so the article still appears in the result.

    Args:
        title (str): Article title.
        url (str): Full Wikipedia article URL.

    Returns:
        pd.DataFrame: DataFrame with columns: title, url, cleanup_message,
        categories. If the request fails, an empty DataFrame with those same
        columns is returned.
    """
    columns = ['title', 'url', 'cleanup_message', 'categories']

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"‚ùå Failed to fetch page {url}: {e}")
        # Keep the documented schema even on failure so callers can rely on
        # the column names being present.
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract cleanup messages; fall back to a single None entry so that an
    # article without any message box still yields one row.
    messages = [
        mbox.get_text(strip=True, separator=' ')
        for mbox in soup.find_all('div', class_='mbox-text-span')
    ]
    if not messages:
        messages = [None]

    # Extract categories
    cat_div = soup.find('div', id='mw-normal-catlinks')
    categories = (
        [li.get_text(strip=True) for li in cat_div.select('ul li')]
        if cat_div
        else []
    )

    row_count = len(messages)
    return pd.DataFrame(
        {
            'title': [title] * row_count,
            'url': [url] * row_count,
            'cleanup_message': messages,
            # Give every row its own list so mutating one cell later cannot
            # silently change the others.
            'categories': [list(categories) for _ in range(row_count)],
        },
        columns=columns,
    )


def extract_characteristics(wiki_url: str) -> Dict[str, str]:
    """
    Pull the heading, lead paragraph and infobox rows out of a Wikipedia page.

    Args:
        wiki_url (str): Wikipedia article URL.

    Returns:
        Dict[str, str]: ``"Title"`` (``"N/A"`` when the heading is missing),
        ``"Summary"`` (only when a non-empty paragraph exists) and one entry
        per infobox row that has both a header and a data cell. Empty when
        the request fails.
    """
    try:
        page = requests.get(wiki_url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"‚ùå Failed to fetch {wiki_url}: {e}")
        return {}

    soup = BeautifulSoup(page.text, 'html.parser')

    # Title
    heading = soup.find("h1", id="firstHeading")
    info = {"Title": heading.get_text(strip=True) if heading else "N/A"}

    # Summary: first direct-child paragraph that contains any text.
    paragraph_texts = (
        p.get_text(strip=True) for p in soup.select("div.mw-parser-output > p")
    )
    summary = next((text for text in paragraph_texts if text), None)
    if summary is not None:
        info["Summary"] = summary

    # Infobox: header cell text becomes the key, data cell text the value.
    infobox = soup.find("table", class_="infobox")
    rows = infobox.find_all("tr") if infobox else []
    for row in rows:
        header, cell = row.th, row.td
        if not (header and cell):
            continue
        key = header.get_text(" ", strip=True)
        info[key] = cell.get_text(" ", strip=True)

    return info


# Crawl every article listed in `cleanup_dict` and build `overall_df`, one row
# per cleanup message (columns: title, url, cleanup_message, categories).
#
# Per-article frames are collected in a list and concatenated once at the end;
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which grows quadratically with the number of articles.
article_frames = []

# Characteristics returned by extract_characteristics, keyed by article URL,
# kept so the result of that request is not thrown away.
# NOTE(review): extract_characteristics requests the same URL that
# clean_messages requests again below; drop the call if this data is unused.
characteristics_by_url = {}

count = 0

for category_url, article_list in cleanup_dict.items():
    print(f"\nüìÇ Processing category: {category_url}")

    for article in article_list:
        # Each entry is a single-item {title: url} dict.
        title, url = next(iter(article.items()))
        count += 1
        print(f"\nüî¢ [{count}] Processing: {title}")

        # Extract details and messages
        characteristics_by_url[url] = extract_characteristics(url)

        df = clean_messages(title, url)
        # Failed fetches come back empty; leaving them out keeps the final
        # concat free of empty entries.
        if not df.empty:
            article_frames.append(df)

        # Polite crawling
        sleep_time = random.uniform(1, 5)
        print(f"‚è± Sleeping for {sleep_time:.2f} seconds...\n")
        time.sleep(sleep_time)

# pd.concat raises on an empty list, so fall back to an empty frame that
# still carries the expected columns.
if article_frames:
    overall_df = pd.concat(article_frames, ignore_index=True)
else:
    overall_df = pd.DataFrame(columns=['title', 'url', 'cleanup_message', 'categories'])

# Final summary
print(f"\n‚úÖ Completed. Total articles processed: {count}")
display(overall_df)


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2010

üî¢ [1] Processing: Dorothea Brande
‚è± Sleeping for 3.33 seconds...


üî¢ [2] Processing: DisplayPort
‚è± Sleeping for 1.07 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2010

üî¢ [3] Processing: At√≠nale al precio
‚è± Sleeping for 1.21 seconds...


üî¢ [4] Processing: List of Dominican Republic films
‚è± Sleeping for 2.55 seconds...


üî¢ [5] Processing: Instructional design
‚è± Sleeping for 2.59 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_February_2011

üî¢ [6] Processing: California Sun
‚è± Sleeping for 1.88 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2011

üî¢ [7] Processing: List of people educated at Haileybury (Melbourne)
‚è± Sleeping for 2.05 seconds...


üìÇ Process

‚è± Sleeping for 3.58 seconds...


üî¢ [68] Processing: List of McDonald's marketing campaigns
‚è± Sleeping for 2.45 seconds...


üî¢ [69] Processing: Roger Craig Vogel
‚è± Sleeping for 1.22 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_May_2014

üî¢ [70] Processing: Pattern recognition
‚è± Sleeping for 2.16 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_June_2014

üî¢ [71] Processing: Craig Barnett
‚è± Sleeping for 4.27 seconds...


üî¢ [72] Processing: Romolo Calabrese
‚è± Sleeping for 2.40 seconds...


üî¢ [73] Processing: Harry Reis
‚è± Sleeping for 4.11 seconds...


üî¢ [74] Processing: Theora
‚è± Sleeping for 4.64 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_July_2014

üî¢ [75] Processing: Trusted Computing
‚è± Sleeping for 3.07 seconds...


üìÇ Processing category: https://en.wikipedia.org/wik


üî¢ [153] Processing: Machiko Kawana
‚è± Sleeping for 3.19 seconds...


üî¢ [154] Processing: Ayako Kawasumi
‚è± Sleeping for 4.38 seconds...


üî¢ [155] Processing: Sh≈çko Kikuchi
‚è± Sleeping for 2.64 seconds...


üî¢ [156] Processing: Y≈´ji Kishi
‚è± Sleeping for 4.45 seconds...


üî¢ [157] Processing: Mapfre
‚è± Sleeping for 2.42 seconds...


üî¢ [158] Processing: Yuuki Matsuda
‚è± Sleeping for 1.23 seconds...


üî¢ [159] Processing: Dai Matsumoto
‚è± Sleeping for 1.05 seconds...


üî¢ [160] Processing: Ui Miyazaki
‚è± Sleeping for 4.33 seconds...


üî¢ [161] Processing: Takahiro Mizushima
‚è± Sleeping for 1.92 seconds...


üî¢ [162] Processing: Shunichi Nagasaki
‚è± Sleeping for 3.35 seconds...


üî¢ [163] Processing: Kumiko Nishihara
‚è± Sleeping for 3.40 seconds...


üî¢ [164] Processing: Sakura Nogawa
‚è± Sleeping for 3.77 seconds...


üî¢ [165] Processing: Mitsuru Ogata
‚è± Sleeping for 4.80 seconds...


üî¢ [166] Processing: Makiko Ohmoto
‚è± Sleeping for 1.78 


üî¢ [251] Processing: The Yale Record
‚è± Sleeping for 2.27 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2016

üî¢ [252] Processing: Bigg Boss Kannada season 2
‚è± Sleeping for 1.05 seconds...


üî¢ [253] Processing: Bigg Boss Kannada season 3
‚è± Sleeping for 3.10 seconds...


üî¢ [254] Processing: Bigg Boss Kannada season 4
‚è± Sleeping for 1.76 seconds...


üî¢ [255] Processing: List of family-and-homemaking blogs
‚è± Sleeping for 1.10 seconds...


üî¢ [256] Processing: List of Slovenian mathematicians
‚è± Sleeping for 1.66 seconds...


üî¢ [257] Processing: List of non-alcoholic mixed drinks
‚è± Sleeping for 4.48 seconds...


üî¢ [258] Processing: Yui Sakakibara
‚è± Sleeping for 3.13 seconds...


üî¢ [259] Processing: Peter Zizka
‚è± Sleeping for 4.15 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2016

üî¢ [260] Processing: Al-Izhar P


üî¢ [327] Processing: Chris Ruppenthal
‚è± Sleeping for 1.31 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_November_2018

üî¢ [328] Processing: List of Georgia Southern University alumni
‚è± Sleeping for 3.66 seconds...


üî¢ [329] Processing: Elis Gruffydd
‚è± Sleeping for 2.09 seconds...


üî¢ [330] Processing: List of Galaxy Angel characters
‚è± Sleeping for 2.10 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_December_2018

üî¢ [331] Processing: Fourth International (post-reunification)
‚è± Sleeping for 3.76 seconds...


üî¢ [332] Processing: Kazuo Kumakura
‚è± Sleeping for 1.63 seconds...


üî¢ [333] Processing: Janez Strnad
‚è± Sleeping for 2.98 seconds...


üî¢ [334] Processing: Stuart Zender
‚è± Sleeping for 1.11 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_January_2019

üî¢ [335] Processing:


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_August_2020

üî¢ [411] Processing: Bachok District
‚è± Sleeping for 3.21 seconds...


üî¢ [412] Processing: Bailhongal
‚è± Sleeping for 3.37 seconds...


üî¢ [413] Processing: Belavanaki
‚è± Sleeping for 2.33 seconds...


üî¢ [414] Processing: Bhadravati, Maharashtra
‚è± Sleeping for 1.32 seconds...


üî¢ [415] Processing: Culture of Finland
‚è± Sleeping for 1.29 seconds...


üî¢ [416] Processing: √âcole de management de Normandie
‚è± Sleeping for 4.60 seconds...


üî¢ [417] Processing: List of almshouses in the United Kingdom
‚è± Sleeping for 3.31 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_September_2020

üî¢ [418] Processing: Archives of Physical Medicine and Rehabilitation
‚è± Sleeping for 2.65 seconds...


üî¢ [419] Processing: List of battles 1601‚Äì1800
‚è± Sleeping for 1.03 seconds...


üî¢ [420] Processing: BOCHK 

‚è± Sleeping for 2.36 seconds...


üî¢ [494] Processing: Children's fantasy
‚è± Sleeping for 3.66 seconds...


üî¢ [495] Processing: List of former professional sports teams in Houston
‚è± Sleeping for 1.63 seconds...


üî¢ [496] Processing: List of potato chip brands
‚è± Sleeping for 4.75 seconds...


üî¢ [497] Processing: Salt Lake City
‚è± Sleeping for 4.29 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_March_2022

üî¢ [498] Processing: AMOLED
‚è± Sleeping for 4.38 seconds...


üî¢ [499] Processing: Hilaire du Berrier
‚è± Sleeping for 4.11 seconds...


üî¢ [500] Processing: Suzi Ferrer
‚è± Sleeping for 4.12 seconds...


üî¢ [501] Processing: Jim Hougan
‚è± Sleeping for 1.90 seconds...


üî¢ [502] Processing: List of institutions of higher education in Chandigarh
‚è± Sleeping for 4.84 seconds...


üî¢ [503] Processing: Szymon Lenkowski
‚è± Sleeping for 3.67 seconds...


üî¢ [504] Processing: List of English words o

‚è± Sleeping for 3.08 seconds...


üî¢ [583] Processing: Government Model Boys Higher Secondary School
‚è± Sleeping for 3.99 seconds...


üî¢ [584] Processing: Kagaznagar, Telangana
‚è± Sleeping for 2.04 seconds...


üî¢ [585] Processing: List of universities in Zambia
‚è± Sleeping for 2.49 seconds...


üî¢ [586] Processing: Pukkattupadi
‚è± Sleeping for 3.48 seconds...


üî¢ [587] Processing: Imperial, royal and noble ranks
‚è± Sleeping for 2.52 seconds...


üî¢ [588] Processing: Vivek Sachidanand
‚è± Sleeping for 1.71 seconds...


üî¢ [589] Processing: Sirpur (T)
‚è± Sleeping for 4.96 seconds...


üî¢ [590] Processing: Territory of the Islamic State
‚è± Sleeping for 1.53 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_April_2023

üî¢ [591] Processing: Baddi
‚è± Sleeping for 3.61 seconds...


üî¢ [592] Processing: Bauchi (city)
‚è± Sleeping for 3.37 seconds...


üî¢ [593] Processing: Jan de Hartog
‚è± Sleeping for 1.

‚è± Sleeping for 4.97 seconds...


üî¢ [678] Processing: List of top 10 singles in 2021 (France)
‚è± Sleeping for 1.74 seconds...


üî¢ [679] Processing: List of top 10 singles in 2022 (France)
‚è± Sleeping for 3.20 seconds...


üî¢ [680] Processing: List of top 10 singles in 2023 (France)
‚è± Sleeping for 3.55 seconds...


üî¢ [681] Processing: List of vaping bans in the United States
‚è± Sleeping for 3.44 seconds...


üî¢ [682] Processing: Media circus
‚è± Sleeping for 4.82 seconds...


üî¢ [683] Processing: Steven Mithen
‚è± Sleeping for 4.38 seconds...


üî¢ [684] Processing: Newent
‚è± Sleeping for 2.23 seconds...


üî¢ [685] Processing: Osun State
‚è± Sleeping for 4.96 seconds...


üî¢ [686] Processing: Shaheed Benazirabad District
‚è± Sleeping for 2.75 seconds...


üî¢ [687] Processing: Pratibha Singhi
‚è± Sleeping for 4.46 seconds...


üî¢ [688] Processing: Use of nigger in the arts
‚è± Sleeping for 4.02 seconds...


üìÇ Processing category: https://en.wikipedia.org


üî¢ [780] Processing: Patiala
‚è± Sleeping for 1.90 seconds...


üî¢ [781] Processing: Shaki, Oyo
‚è± Sleeping for 3.01 seconds...


üî¢ [782] Processing: Jon M. Sweeney
‚è± Sleeping for 3.04 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_September_2024

üî¢ [783] Processing: Ancona
‚è± Sleeping for 4.52 seconds...


üî¢ [784] Processing: Attock
‚è± Sleeping for 2.17 seconds...


üî¢ [785] Processing: Batala
‚è± Sleeping for 4.13 seconds...


üî¢ [786] Processing: Bhimavaram
‚è± Sleeping for 1.71 seconds...


üî¢ [787] Processing: Dantla, Rajasthan
‚è± Sleeping for 3.43 seconds...


üî¢ [788] Processing: Etawah
‚è± Sleeping for 3.84 seconds...


üî¢ [789] Processing: List of fictional horses
‚è± Sleeping for 1.15 seconds...


üî¢ [790] Processing: Ghanche District
‚è± Sleeping for 4.37 seconds...


üî¢ [791] Processing: Human geography
‚è± Sleeping for 3.65 seconds...


üî¢ [792] Processing: List of institutions o


üî¢ [886] Processing: Nozomi Tsuji
‚è± Sleeping for 1.66 seconds...


üìÇ Processing category: https://en.wikipedia.org/wiki/Category:Wikipedia_list_cleanup_from_April_2025

üî¢ [887] Processing: Behala
‚è± Sleeping for 1.95 seconds...


üî¢ [888] Processing: Francis Boyle
‚è± Sleeping for 2.82 seconds...


üî¢ [889] Processing: List of demons in fiction
‚è± Sleeping for 3.80 seconds...


üî¢ [890] Processing: Institute of Development Studies
‚è± Sleeping for 4.59 seconds...


üî¢ [891] Processing: Lagavulin distillery
‚è± Sleeping for 3.55 seconds...


üî¢ [892] Processing: Edward E. Lawler
‚è± Sleeping for 1.79 seconds...


üî¢ [893] Processing: List of programmes broadcast by CNA
‚è± Sleeping for 1.17 seconds...


üî¢ [894] Processing: List of protests and demonstrations in the United States by size
‚è± Sleeping for 1.63 seconds...


üî¢ [895] Processing: Spanish Steps
‚è± Sleeping for 2.73 seconds...


üî¢ [896] Processing: Ukrainians in Russia
‚è± Sleeping for 2.10 se

Unnamed: 0,title,url,cleanup_message,categories
0,Dorothea Brande,https://en.wikipedia.org/wiki/Dorothea_Brande,This article has multiple issues. Please help ...,"[Illinois Institute of Technology alumni, 1893..."
1,Dorothea Brande,https://en.wikipedia.org/wiki/Dorothea_Brande,This article may need to be rewritten to compl...,"[Illinois Institute of Technology alumni, 1893..."
2,Dorothea Brande,https://en.wikipedia.org/wiki/Dorothea_Brande,This article may be in need of reorganization ...,"[Illinois Institute of Technology alumni, 1893..."
3,Dorothea Brande,https://en.wikipedia.org/wiki/Dorothea_Brande,This article contains a list that has not been...,"[Illinois Institute of Technology alumni, 1893..."
4,DisplayPort,https://en.wikipedia.org/wiki/DisplayPort,This article may be too technical for most rea...,"[Digital display connectors, VESA, Computer co..."
...,...,...,...,...
2291,Lydia Sigourney bibliography,https://en.wikipedia.org/wiki/Lydia_Sigourney_...,This article relies largely or entirely on a s...,"[Bibliographies by writer, Bibliographies of A..."
2292,Tulane Journal of Law & Sexuality,https://en.wikipedia.org/wiki/Tulane_Journal_o...,This section does not cite any sources . Pleas...,"[American law journals, Tulane University Law ..."
2293,Tulane Journal of Law & Sexuality,https://en.wikipedia.org/wiki/Tulane_Journal_o...,This article may contain unverified or indiscr...,"[American law journals, Tulane University Law ..."
2294,Kaveh Akbar,https://en.wikipedia.org/wiki/Kaveh_Akbar,This section contains a list that has not been...,"[21st-century American poets, 21st-century Ame..."


In [5]:
# Persist the crawl results. The target directory is created first because
# to_csv does not create missing parent directories, and a failure here would
# lose the output of the whole crawl.
from pathlib import Path

output_path = Path('data/wikipedia.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
overall_df.to_csv(output_path)