In [19]:
import os
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

# ZenRows API key, read from the environment so the secret is never stored in
# the notebook or its version history. Set ZENROWS_API_KEY before starting the
# kernel. When the variable is missing the key falls back to an empty string;
# requests sent with an empty key are expected to be rejected by ZenRows.
# NOTE(review): a literal key used to be hard-coded here -- treat it as leaked
# and rotate it.
ZENROWS_API_KEY = os.environ.get("ZENROWS_API_KEY", "")

# Listing URL; the page number is appended directly after "page=".
BASE_URL = "https://sourceforge.net/software/saas/?sort=rating_count&page="

def fetch_with_zenrows(page, timeout=60):
    """Fetch one listing page through the ZenRows API.

    Args:
        page: Page number appended to ``BASE_URL`` to build the target URL.
        timeout: Seconds to wait for the ZenRows response. Without a timeout
            ``requests`` waits indefinitely, so a single stalled connection
            would block its worker thread forever.

    Returns:
        A ``(page, html)`` tuple. ``html`` is the response body when the
        status code is 200, and ``None`` for any other status or when the
        request raised. Failures are printed rather than raised.
    """
    url = f"{BASE_URL}{page}"
    try:
        params = {
            'url': url,
            'apikey': ZENROWS_API_KEY,
        }
        response = requests.get(
            'https://api.zenrows.com/v1/',
            params=params,
            timeout=timeout,
        )

        if response.status_code == 200:
            return page, response.text

        print(f"ZenRows request failed for page {page}: {response.status_code}, {response.text}")
        return page, None
    except Exception as e:
        # Deliberately broad: timeouts and connection errors are reported
        # and turned into a (page, None) result instead of propagating.
        print(f"Error with ZenRows request for page {page}: {e}")
        return page, None

def parse_page(page, html_content):
    """Collect the text of every ``.subtitle`` element in a page's HTML.

    Args:
        page: Page number the HTML belongs to; not used by the parsing itself.
        html_content: Raw HTML string, or a falsy value (``None`` / ``""``)
            when there is nothing to parse.

    Returns:
        A list with the whitespace-stripped text of each element matching the
        CSS selector ``.subtitle``; an empty list when ``html_content`` is
        falsy.
    """
    if html_content:
        document = BeautifulSoup(html_content, 'html.parser')
        texts = []
        for node in document.select('.subtitle'):
            texts.append(node.get_text(strip=True))
        return texts

    return []

def iterate_pages_parallel(start_page=1, end_page=1000, workers=50):
    """Fetch pages ``start_page``..``end_page`` concurrently and gather subtitles.

    Fetching runs on a thread pool; parsing happens in the calling thread as
    each result is consumed.

    Args:
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive).
        workers: Maximum number of worker threads in the pool.

    Returns:
        A flat list of subtitle strings, ordered by page number. Pages whose
        fetch failed, or whose processing raised, contribute nothing.
    """
    subtitles = []

    with ThreadPoolExecutor(max_workers=workers) as executor:
        # Remember which page each future belongs to so that an unexpected
        # error can still be attributed to a page number.
        futures = {
            executor.submit(fetch_with_zenrows, page): page
            for page in range(start_page, end_page + 1)
        }

        # Consume in submission order (dicts preserve insertion order) so the
        # collected subtitles stay sorted by page.
        for future, submitted_page in futures.items():
            try:
                page, html_content = future.result()
                print(f"Processing page {page}...")

                if html_content is None:
                    # A failed fetch yields None; report it as such instead of
                    # logging "fetched successfully" with zero items.
                    print(f"Page {page} could not be fetched. Skipping.")
                    continue

                # Parse the page and collect subtitles
                page_subtitles = parse_page(page, html_content)
                subtitles.extend(page_subtitles)

                print(f"Page {page} fetched successfully. Found {len(page_subtitles)} .subtitle items. Total: {len(subtitles)}")
            except Exception as e:
                print(f"Error processing page {submitted_page}: {e}")

    return subtitles

if __name__ == "__main__":
    import csv

    # Crawl listing pages 1 through 1000 with a pool of 50 worker threads.
    all_subtitles = iterate_pages_parallel(start_page=1, end_page=1000, workers=50)
    print(f"Total subtitles retrieved: {len(all_subtitles)}")

    # Persist the result: one subtitle per row, single column, no header row.
    with open('subtitles.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows([text] for text in all_subtitles)


Processing page 1...
Page 1 fetched successfully. Found 30 .subtitle items. Total: 30
Processing page 2...
Page 2 fetched successfully. Found 30 .subtitle items. Total: 60
Processing page 3...
Page 3 fetched successfully. Found 30 .subtitle items. Total: 90
Processing page 4...
Page 4 fetched successfully. Found 30 .subtitle items. Total: 120
Processing page 5...
Page 5 fetched successfully. Found 30 .subtitle items. Total: 150
Processing page 6...
Page 6 fetched successfully. Found 30 .subtitle items. Total: 180
Processing page 7...
Page 7 fetched successfully. Found 30 .subtitle items. Total: 210
Processing page 8...
Page 8 fetched successfully. Found 30 .subtitle items. Total: 240
Processing page 9...
Page 9 fetched successfully. Found 30 .subtitle items. Total: 270
Processing page 10...
Page 10 fetched successfully. Found 30 .subtitle items. Total: 300
Processing page 11...
Page 11 fetched successfully. Found 30 .subtitle items. Total: 330
Processing page 12...
Page 12 fetched succ

In [8]:
# Sanity check: show how many subtitles the crawl collected.
# NOTE(review): relies on `all_subtitles` already being defined in the kernel
# by the scraping cell -- run that cell first.
print(len(all_subtitles))

5970


In [6]:
import csv

# Write every collected subtitle as its own single-column row (no header).
# Depends on `all_subtitles` produced by the scraping cell.
with open('sourceforge_companies.csv', 'w', newline='', encoding='utf-8') as file:
    rows = ([subtitle] for subtitle in all_subtitles)
    csv.writer(file).writerows(rows)
