In [1]:
import csv
import re
import requests
from bs4 import BeautifulSoup

# Scrape Wikipedia's Special:RecentChanges page and save one
# (revision ID, title) row per listed change to recent_changes.csv.

# URL of the Recent Changes page
url = "https://en.wikipedia.org/wiki/Special:RecentChanges?hidebots=1&hidecategorization=1&hideWikibase=1&limit=5000&days=150&urlversion=2"

# requests applies no timeout by default, so without one this cell could
# block forever on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30

# Send a GET request to the URL
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all list items with class 'mw-changeslist-line'
    changes = soup.find_all("li", class_="mw-changeslist-line")

    # One dict per change, keyed by the CSV column names
    data = []

    for change in changes:
        # The first anchor inside the list item is used as the revision link.
        # NOTE(review): assumes that anchor's href ends with the revision ID -
        # confirm against the current page markup.
        link = change.find("a")

        # An anchor may be missing, or present without an href attribute;
        # normalise both cases to "" so re.search always receives a string
        # instead of raising TypeError on None.
        href = (link.get("href") or "") if link is not None else ""

        # The trailing run of digits in the href is taken as the revision ID
        match = re.search(r"\d+$", href)
        revision_id = match.group() if match else "Unknown"

        # Find the title of the article within the relevant span tag
        title_span = change.find("span", class_="mw-title")
        title = title_span.get_text() if title_span else "Unknown Title"

        data.append({"Revision ID": revision_id, "Title": title})

    # Write header plus all collected rows to the CSV file
    with open("recent_changes.csv", "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["Revision ID", "Title"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print("Data has been saved to recent_changes.csv")
else:
    print("Error:", response.status_code)


Data has been saved to recent_changes.csv


In [2]:
import csv

def filter_unknown_rows(input_file, output_file):
    """Copy a CSV file, dropping rows whose first column is exactly 'Unknown'.

    The comparison is case-sensitive and applies to every row, including the
    first one; a header row is kept simply because its first cell is not
    'Unknown'. Blank lines in the input (which ``csv.reader`` yields as empty
    lists) are skipped instead of raising ``IndexError``.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the UTF-8 encoded CSV file to (over)write.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # All kept rows are collected before the output is opened, so the
        # input is fully read even when both paths name the same file.
        # `row and ...` guards the row[0] access against empty rows.
        rows = [row for row in reader if row and row[0] != 'Unknown']

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(rows)

# Source CSV to read and destination CSV for the filtered copy
input_file, output_file = 'recent_changes.csv', 'filtered_recent_changes.csv'

# Drop the rows whose first column is 'Unknown' and save the rest
filter_unknown_rows(input_file=input_file, output_file=output_file)

print("Filtered data written to", output_file)


Filtered data written to filtered_recent_changes.csv
