# In [26]:  (notebook cell prompt)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

base_url = 'https://paperswithcode.com/datasets'

# Start from a fresh CSV that holds only the header row; mode 'w' truncates
# any file left over from an earlier run.
csv_header = ['Source', 'Task Name']
with open('./scraping.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerow(csv_header)

def scrape_datasets_from_page(page_url, timeout=30):
    """Scrape every dataset linked from one listing page.

    Downloads ``page_url``, selects the anchors that are direct children of
    ``div.dataset-wide-box`` elements, resolves each ``href`` against
    ``base_url`` and hands the resulting URL to ``scrape_dataset_page``.

    Args:
        page_url: URL of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        None. A listing page that cannot be downloaded (connection error,
        timeout or HTTP error status) is reported with ``print`` and skipped.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        print(f'Skipping listing page {page_url}: {error}')
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.select('div.dataset-wide-box > a'):
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        scrape_dataset_page(urljoin(base_url, href))

def scrape_dataset_page(dataset_url, timeout=30, output_path='./scraping.csv'):
    """Scrape one dataset page and append what was found to the CSV file.

    Rows are appended as ``[source, value]`` in this order:

    * ``Benchmark``: the cleaned text of the second ``<td>`` of every row in
      the table with id ``benchmarks-table``.
    * ``List Task``: the cleaned text of every ``<li>`` inside ``<ul>``
      elements with class ``list-unstyled`` or id ``remaining-tasks``.
    * ``Dataset Loader``: ``"<link text> - <digits>"`` for every row of the
      ``<ul>`` with class ``dataloader-implementations``.

    When the benchmarks table or the loader list is missing, a single
    ``Not Found at <dataset_url>`` row is written for that source instead.

    Args:
        dataset_url: URL of the dataset page to fetch.
        timeout: Seconds to wait for the server before giving up.
        output_path: CSV file the rows are appended to.

    Returns:
        None. A page that cannot be downloaded (connection error, timeout or
        HTTP error status) is reported with ``print`` and nothing is written
        for it.
    """
    try:
        response = requests.get(dataset_url, timeout=timeout)
        # An error page must not be recorded as a dataset with missing data.
        response.raise_for_status()
    except requests.RequestException as error:
        print(f'Skipping dataset page {dataset_url}: {error}')
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    rows = [
        *_benchmark_rows(soup, dataset_url),
        *_list_task_rows(soup),
        *_dataset_loader_rows(soup, dataset_url),
    ]
    if not rows:
        return

    # Open the file once per page rather than once per row.
    with open(output_path, 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(rows)


def _clean_task_name(text):
    """Return ``text`` with every '-' and ':' removed and outer whitespace stripped."""
    return text.replace('-', '').replace(':', '').strip()


def _benchmark_rows(soup, dataset_url):
    """Build the ``Benchmark`` rows from the table with id ``benchmarks-table``."""
    table = soup.find('table', id='benchmarks-table')
    if table is None:
        return [['Benchmark', f'Not Found at {dataset_url}']]

    # Fall back to the table itself when the markup has no <tbody>.
    body = table.find('tbody')
    if body is None:
        body = table

    rows = []
    for table_row in body.find_all('tr'):
        cells = table_row.find_all('td')
        if len(cells) >= 2:
            rows.append(['Benchmark', _clean_task_name(cells[1].text)])
    return rows


def _list_task_rows(soup):
    """Build the ``List Task`` rows from the task ``<ul>`` lists on the page."""
    task_lists = (
        soup.find_all('ul', class_='list-unstyled')
        + soup.find_all('ul', id='remaining-tasks')
    )
    rows = []
    seen = set()
    for task_list in task_lists:
        # A <ul> matching both the class and the id query is returned twice;
        # emit its items only once.
        if id(task_list) in seen:
            continue
        seen.add(id(task_list))
        for item in task_list.find_all('li'):
            rows.append(['List Task', _clean_task_name(item.text)])
    return rows


def _dataset_loader_rows(soup, dataset_url):
    """Build the ``Dataset Loader`` rows from ``ul.dataloader-implementations``."""
    container = soup.find('ul', class_='dataloader-implementations')
    if container is None:
        return [['Dataset Loader', f'Not Found at {dataset_url}']]

    rows = []
    for loader_row in container.find_all('div', class_='row'):
        children = loader_row.find_all(recursive=False)
        if len(children) < 2:
            continue
        link = children[0].find('a', class_='code-table-link')
        if link is None:
            # No loader link in this row, so there is no name to record.
            continue
        # split()/join collapses line breaks and runs of whitespace.
        loader_name = ' '.join(link.text.split())
        loader_detail = ' '.join(children[1].text.split())
        # Keeps every digit of the second column; presumably this is the
        # repository star count - TODO confirm against the page markup.
        stars_count = ''.join(filter(str.isdigit, loader_detail))
        rows.append(['Dataset Loader', f"{loader_name} - {stars_count}"])
    return rows

# Visit listing pages 1 through 100 in order, printing each page number as a
# progress marker before that page is scraped.
first_page, last_page = 1, 100
for page in range(first_page, last_page + 1):
    print(page)
    page_url = f"{base_url}?page={page}"
    scrape_datasets_from_page(page_url)


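# Captured cell output (progress was printed for pages 1-3, then execution
# stopped with KeyboardInterrupt):
# 1
# 2
# 3
#
# KeyboardInterrupt: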