In [3]:
import asyncio
import random
import nest_asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from bs4 import BeautifulSoup
import pandas as pd
from tenacity import retry, stop_after_attempt, wait_exponential

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Function to fetch URLs from the CSV file
def fetch_urls_from_csv(file_path='links_new.csv'):
    df = pd.read_csv(file_path)
    return df['url'].tolist()

# Function to save the last scraped URL index
def save_last_scraped_index(index, file_path='last_scraped_index.txt'):
    with open(file_path, 'w') as f:
        f.write(str(index))

# Function to load the last scraped URL index
def load_last_scraped_index(file_path='last_scraped_index.txt'):
    try:
        with open(file_path, 'r') as f:
            return int(f.read().strip())
    except FileNotFoundError:
        return 0

def random_delay(min_seconds=1, max_seconds=5):
    return random.uniform(min_seconds, max_seconds)

async def simulate_scrolling(page):
    height = await page.evaluate('document.body.scrollHeight')
    for i in range(0, height, random.randint(100, 300)):
        await page.evaluate(f'window.scrollTo(0, {i})')
        await asyncio.sleep(random_delay(0.1, 0.5))

async def simulate_mouse_movement(page):
    width = await page.evaluate('document.body.clientWidth')
    height = await page.evaluate('document.body.clientHeight')
    for _ in range(random.randint(5, 10)):
        x = random.randint(0, width)
        y = random.randint(0, height)
        await page.mouse.move(x, y)
        await asyncio.sleep(random_delay(0.1, 0.3))

async def handle_cookies_and_popups(page):
    try:
        await page.click('button#accept-cookies', timeout=5000)
    except PlaywrightTimeoutError:
        pass

async def rate_limit():
    await asyncio.sleep(random_delay(2, 5))

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
]

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def fetch_page_content(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(user_agent=random.choice(user_agents))
        page = await context.new_page()
        try:
            await page.goto(url, timeout=60000)
            await handle_cookies_and_popups(page)
            await asyncio.sleep(random_delay())
            await simulate_scrolling(page)
            await simulate_mouse_movement(page)
            html_content = await page.content()
        except PlaywrightTimeoutError:
            print(f"Timeout error while navigating to {url}")
            html_content = None
        finally:
            await browser.close()
        return html_content

def parse_html_content(html_content):
    if html_content is None:
        return None

    doc = BeautifulSoup(html_content, 'html.parser')
    models = doc.find_all('div', class_='content')

    if not models:
        return None

    all_data = []
    for model in models:
        attributes = model.find('div', class_='attributes').find_all('market-attribute') if model.find('div', class_='attributes') else []
        data = {
            'name': model.find('h1', class_='model-name mb-0').text.strip() if model.find('h1', class_='model-name mb-0') else None,
            'user': model.find('a', class_='user-card')['href'] if model.find('a', class_='user-card') else None, 
            'summary': model.find('div', class_='summary').text.strip() if model.find('div', class_='summary') else None,
            'description': model.find('div', class_='user-inserted').text.strip() if model.find('div', class_='user-inserted') else None,
            'time': attributes[0].find('span').text.strip() if len(attributes) > 0 else None,
            'file_num': attributes[1].find('span').text.strip() if len(attributes) > 1 else None,
            'layer_size': attributes[2].find('span').text.strip() if len(attributes) > 2 else None,
            'head_size': attributes[3].find('span').text.strip() if len(attributes) > 3 else None,
            'weight': attributes[5].find('span').text.strip() if len(attributes) > 5 else None,
            'likes': model.find('div', class_='stats-item cursor-pointer').text.strip() if model.find('div', class_='stats-item cursor-pointer') else None,
            'downloads': model.find('div', 'model-stats').find_all('div', 'stats-item')[1].text.strip() if model.find('div', 'model-stats') else None,
            'visitors': model.find('div', 'model-stats').find_all('div', 'stats-item')[3].text.strip() if model.find('div', 'model-stats') else None,
            'updated': model.find('div', 'model-stats').find_all('div', 'stats-item')[4].text.strip() if model.find('div', 'model-stats') else None,
            'tags_main': [a.text.strip() for a in model.find('div', 'detail-header').find('div', 'breadcrumbs').find_all('a')] if model.find('div', 'detail-header') else None, 
            'tags_user': [a.text.strip() for a in model.find('div', 'tags-wrapper d-inline-flex flex-wrap').find_all('a', 'badge')] if model.find('div', 'tags-wrapper d-inline-flex flex-wrap') else None
        }
        all_data.append(data)
    return all_data

async def scrape_model(url):
    html_content = await fetch_page_content(url)
    data = parse_html_content(html_content)
    return data

async def scrape_models(urls, max_urls=None):
    all_data = []
    semaphore = asyncio.Semaphore(5)
    last_scraped_index = load_last_scraped_index()
    start_index = last_scraped_index
    results_to_save = []

    async def scrape_with_semaphore(url):
        await rate_limit()
        async with semaphore:
            return await scrape_model(url)

    tasks = []
    for i in range(start_index, len(urls)):
        if max_urls is not None and i >= start_index + max_urls:
            break
        url = urls[i]
        print(f"Visiting: {url}")
        html_content = await fetch_page_content(url)
        if html_content:
            print(f"First 100 characters: {html_content[:500]}")
            data = parse_html_content(html_content)
            if data:
                print(f"Scraping successful for: {url}")
                results_to_save.extend(data)
                if len(results_to_save) >= 100:
                    all_data.extend(results_to_save)
                    results_to_save = []
                    df = pd.DataFrame(all_data)
                    df.to_csv('scraped_data_partial.csv', index=False)
                    print("Partial data saved to scraped_data_partial.csv")
            else:
                print(f"Scraping failed for: {url}")
        else:
            print(f"Failed to fetch content for: {url}")
        tasks.append((url, asyncio.create_task(scrape_with_semaphore(url))))
        save_last_scraped_index(i)

    results = await asyncio.gather(*[task for _, task in tasks])
    for url, result in zip([url for url, _ in tasks], results):
        if result:
            all_data.extend(result)
        else:
            print(f"Skipping empty page: {url}")

    if results_to_save:
        all_data.extend(results_to_save)

    return all_data

async def main():
    urls = fetch_urls_from_csv()
    max_urls = 1100

    result = await scrape_models(urls, max_urls)
    if result:
        df = pd.DataFrame(result)
        df.to_csv('scraped_data_1100.csv', index=False)
        print("Data saved to scraped_data_900.csv")
    else:
        print("No data to save")

if __name__ == "__main__":
    asyncio.run(main())


Visiting: https://www.printables.com/model/866943-fidget-squish-fidget
First 100 characters: <!DOCTYPE html><html lang="en" prefix="og: http://ogp.me/ns#"><head>
  <meta charset="utf-8">
  <title>Fidget: Squish Fidget by KingTut | Download free STL model | Printables.com</title>
  <base href="/">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="preload" href="/assets/loading/loading-wheel-4s.svg" as="image">
  <link rel="icon" type="image/png" sizes="32x32" href="/assets/favicons/favicon-32x32.png">
  <link rel="icon" type="image/png" sizes="16x16" hre
Scraping successful for: https://www.printables.com/model/866943-fidget-squish-fidget
Visiting: https://www.printables.com/model/203925-3dbenchy-s3d-kossel-gcode
First 100 characters: <!DOCTYPE html><html lang="en" prefix="og: http://ogp.me/ns#"><head>
  <meta charset="utf-8">
  <title>3DBenchy S3D Kossel gcode by Joon Chor | Download free STL model | Printables.com</title>
  <base href="/">
  <meta nam