# In [12]:
import asyncio
import os

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from zenrows import ZenRowsClient

# Initialize ZenRowsClient.
# The API key is read from the ZENROWS_API_KEY environment variable so the
# credential is not stored in the source file.
# NOTE(review): a literal key used to be hard-coded on this line; treat that
# key as exposed and rotate it.
client = ZenRowsClient(os.environ.get("ZENROWS_API_KEY", ""), concurrency=5, retries=1)

def _text_or_none(tag):
    """Return the stripped text of *tag*, or ``None`` when *tag* is missing."""
    return tag.text.strip() if tag is not None else None


async def scrape_models_from_page(page_content):
    """Extract the model cards found in a listing page.

    The function is a coroutine only so that existing callers can keep
    awaiting it; it does not await anything itself.

    Args:
        page_content: HTML markup of the page, as a string.

    Returns:
        A list of dicts with the keys ``name``, ``url``, ``likes``,
        ``rating`` and ``downloads``. Values are the stripped text of the
        matching elements (``url`` is the raw ``href``). A metric whose
        element is absent from the card is reported as ``None``. Cards
        without a title link carrying an ``href`` are skipped.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    data = []

    for model in soup.find_all('div', class_='print-list-item'):
        link_tag = model.find('a', class_='link clamp-two-lines')
        if link_tag is None or not link_tag.get('href'):
            # Without a URL the entry cannot be identified or de-duplicated
            # by the caller, so it is dropped instead of raising.
            continue

        # find() returns None when nothing matches, so the nested lookups
        # are only attempted when the wrapping element exists.
        rating_box = model.find('div', class_='rating')
        downloads_box = model.find('div', class_='small-icon downloads')
        rating_span = (
            rating_box.find('span', class_='number ml-1')
            if rating_box is not None else None
        )
        downloads_span = (
            downloads_box.find('span', class_='ml-1')
            if downloads_box is not None else None
        )

        data.append({
            'name': link_tag.text.strip(),
            'url': link_tag['href'],
            'likes': _text_or_none(model.find('span', class_='count cursor-pointer')),
            'rating': _text_or_none(rating_span),
            'downloads': _text_or_none(downloads_span),
        })

    return data

async def save_data_to_csv(data, file_path='models.csv'):
    """Write *data* as a CSV file, omitting the DataFrame index column.

    Args:
        data: Anything ``pandas.DataFrame`` accepts; the caller in this file
            passes a list of dicts.
        file_path: Destination of the CSV file.
    """
    pd.DataFrame(data).to_csv(file_path, index=False)
async def main(url='https://www.printables.com/model?o=download_count&period=all-time',
               max_results=10000, scroll_pause_time=2):
    """Scroll a listing page, collect its models and save them to CSV.

    The page is opened in a Chromium window and scrolled to the bottom
    repeatedly. After every scroll the whole page is parsed again and the
    entries whose URL has not been seen yet are kept. The loop ends once
    ``max_results`` entries have been collected or a scroll yields nothing
    new.

    Args:
        url: Listing page to open.
        max_results: Maximum number of entries to save.
        scroll_pause_time: Seconds to wait after each scroll so that new
            content can load.
    """
    results = []
    # Kept across iterations so each scroll only costs one set lookup per
    # entry instead of rebuilding the set from all results collected so far.
    seen_urls = set()

    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            await page.goto(url)

            while len(results) < max_results:
                # Scroll down to the bottom and wait for new content to load
                await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
                await asyncio.sleep(scroll_pause_time)

                content = await page.content()
                added = 0
                for item in await scrape_models_from_page(content):
                    # Membership is checked entry by entry, so duplicates
                    # inside the same batch are filtered as well.
                    if item['url'] in seen_urls:
                        continue
                    seen_urls.add(item['url'])
                    results.append(item)
                    added += 1

                if added == 0:
                    break  # Stop if no new results are found
        finally:
            # Close the browser even when navigation or parsing fails.
            await browser.close()

    saved = results[:max_results]
    await save_data_to_csv(saved)
    print(f"Saved {len(saved)} models to models.csv")

# Run the main function
if __name__ == "__main__":
    # asyncio.run() refuses to start inside an already running event loop
    # (the situation in a notebook), so detect that case first.
    # get_running_loop() raises RuntimeError when no loop is running.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    if running_loop is None:
        # Plain script: start a fresh event loop. Errors raised by main()
        # propagate with a full traceback instead of being reduced to a
        # one-line message.
        asyncio.run(main())
    else:
        # A loop is already running: schedule main() on it. The reference is
        # kept so the task is not garbage-collected; in a notebook it can be
        # awaited from a cell with `await main_task`. A top-level `await`
        # here would be a SyntaxError in a regular Python script.
        main_task = running_loop.create_task(main())


# Output: Saved 3600 models to models.csv
