In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def scrape_wikipedia_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find(id="firstHeading").text
    text = soup.find(id="mw-content-text").get_text(separator=' ', strip=True)
    words = text.split()[:1000]
    truncated_text = ' '.join(words)
    return title, truncated_text

In [6]:
def get_random_wikipedia_titles(total_count):
    titles = []
    URL = "https://en.wikipedia.org/w/api.php"

    while len(titles) < total_count:
        PARAMS = {
            "action": "query",
            "format": "json",
            "list": "random",
            "rnnamespace": "0",  # Main namespace
            "rnlimit": min(500, total_count - len(titles))  # Fetch in batches of 500
        }

        response = requests.get(url=URL, params=PARAMS)
        data = response.json()

        for item in data['query']['random']:
            titles.append(item['title'])

    return titles

In [7]:
# Fetch random page titles
random_titles = get_random_wikipedia_titles(100000)
print(len(random_titles))

# Scrape and store data
data = []
for title in random_titles:
    page_url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
    try:
        page_title, page_text = scrape_wikipedia_page(page_url)
        data.append({'Title': page_title, 'Text': page_text})
    except Exception as e:
        print(f"Error scraping {page_url}: {e}")

# Save data to CSV
df = pd.DataFrame(data)
df.to_csv("wikipedia_data.csv", index=False)

print("Data scraping completed and saved to 'wikipedia_data.csv'")

100000


KeyboardInterrupt: 