In [16]:
def load_proxies(path="proxies.txt"):
    with open(path, "r") as f:
        return [line.strip() for line in f if line.strip()]


In [18]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import json
import random

# Initial seed domains
SEED_DOMAINS = [
    # "https:#www.virgio.com",
    # "https:#www.tatacliq.com",
    # "https:#www.nykaafashion.com",
    # "https:#www.westside.com/collections/polo-t-shirts-for-men",
    "https://www.tatacliq.com/mens-clothing-casual-wear-t-shirts-polos/c-msh1116100"
]

# Product URL patterns
PRODUCT_PATTERNS = [r'/product/', r'/p/', r'/item/', r'/shop/', r'/details/', r'/sku/']

# Max pages to crawl per domain to avoid infinite crawling
MAX_PAGES = 1000
HEADERS = {
    "User-Agent": random.choice([
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "CrawltonBot/1.0 (+https:#github.com/yourgithub/crawlton)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1"
    ])
}

# Optional: list of proxies (use real proxies here)
PROXIES = load_proxies()

# Storage for output
results = {}

import random
import asyncio

# List of real user agents
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    # Add more if you like
]

# Optional: list of proxies (use real proxies here)
PROXIES = [
    # "http:#user:pass@proxy1.com:port",
    # "http:#proxy2.com:port",
]

async def fetch(session, url, retries=3):
    for attempt in range(retries):
        try:
            headers = {
                "User-Agent": "Googlebot"
            }
            # proxy = random.choice(PROXIES) if PROXIES else None
            # kwargs = {"proxy": proxy} if proxy else {}

            async with session.get(url, headers=headers, timeout=10) as response:
                if response.status != 200:
                    raise Exception(f"Blocked with status {response.status}")
                if 'text/html' in response.headers.get('Content-Type', ''):
                    return await response.text()

        except Exception as e:
            print(f"[!] Fetch failed ({attempt + 1}/{retries}) {url}: {e}")
            await asyncio.sleep(2 ** attempt + random.random())  # exponential backoff

    return None  # after all retries fail


def extract_links(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for a in soup.find_all('a', href=True):
        href = urljoin(base_url, a['href'])
        if is_same_domain(base_url, href):
            links.add(href.split('#')[0])  # Remove fragments
    return links

def is_same_domain(base, target):
    return urlparse(base).netloc == urlparse(target).netloc

def is_product_url(url):
    return any(re.search(pattern, url) for pattern in PRODUCT_PATTERNS)

async def crawl_domain(domain):
    visited = set()
    to_visit = set([domain])
    product_urls = set()

    async with aiohttp.ClientSession() as session:
        while to_visit and len(visited) < MAX_PAGES:
            url = to_visit.pop()
            if url in visited:
                continue
            visited.add(url)

            print(f"[~] Crawling: {url}")
            html = await fetch(session, url)
            if not html:
                continue

            links = extract_links(html, url)
            for link in links:
                if link not in visited:
                    if is_product_url(link):
                        print(f"[+] Found product: {link}")
                        product_urls.add(link)
                    to_visit.add(link)

    results[urlparse(domain).netloc] = sorted(product_urls)

async def main():
    tasks = [crawl_domain(domain) for domain in SEED_DOMAINS]
    await asyncio.gather(*tasks)

    # Save results to a JSON file
    with open("product_urls.json", "w") as f:
        json.dump(results, f, indent=2)

    print("\n✅ Done! Product URLs saved to 'product_urls.json'.")

if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()
    await main()


[~] Crawling: https://www.tatacliq.com/mens-clothing-casual-wear-t-shirts-polos/c-msh1116100
[!] Fetch failed (1/3) https://www.tatacliq.com/mens-clothing-casual-wear-t-shirts-polos/c-msh1116100: Blocked with status 403
[!] Fetch failed (2/3) https://www.tatacliq.com/mens-clothing-casual-wear-t-shirts-polos/c-msh1116100: Blocked with status 403
[!] Fetch failed (3/3) https://www.tatacliq.com/mens-clothing-casual-wear-t-shirts-polos/c-msh1116100: Blocked with status 403

✅ Done! Product URLs saved to 'product_urls.json'.


In [10]:
# Fetch HTML from the given endpoint
url = "https:#www.westside.com/collections/polo-t-shirts-for-men"

async def fetch_html():
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
        if html:
            print("HTML fetched successfully!")
            return html
        else:
            print("Failed to fetch HTML.")
            return None

# Run the fetch_html coroutine
html_content = await fetch_html()

HTML fetched successfully!


In [55]:
from urllib.parse import urljoin, urlparse
async def extract_filtered_links(content, base_url, filtered_link):
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all(href=True):
        if tag.name == "link":
            continue
        link = tag.get("href", None)
        if link is None:
            continue
        if link.startswith("/"):
            link = urljoin(base_url, link)
        if urlparse(link).netloc == urlparse(base_url).netloc:
            if link not in filtered_link:
                filtered_link.append(link.split('?')[0])  # Remove fragments
    return filtered_link

In [66]:
import time
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import asyncio
# Fetch HTML from the given endpoint
# url = "https://www.westside.com/collections/polo-t-shirts-for-men"
url = "https://www.tatacliq.com/mens-clothing-casual-wear-t-shirts-polos/c-msh1116100"
# url = "https://www.virgio.com/collections/the-party-edit"
# url = "https://www.nykaafashion.com/women/westernwear/shirts/c/7623"
base_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
filtered_link = []
async def slow_scroll_to_bottom(page, step=500, delay=0.5):
    scroll_count = 1
    while True:
        previous_height = step * scroll_count
        await page.evaluate(f"window.scrollBy(0, {step})")
        time.sleep(delay)

        new_height = await page.evaluate("() => document.body.scrollHeight")

        content = await page.content()
        await extract_filtered_links(content, base_url, filtered_link)
        if previous_height > new_height:
            break
        scroll_count += 1
        # if scroll_count > 10:
        #     break

async def scroll_to_bottom(page, scroll_delay=2.0, max_scrolls=50):
    """Scrolls to the bottom of the page to load dynamic content."""
    previous_height = await page.evaluate("() => document.body.scrollHeight")
    
    for _ in range(max_scrolls):
        await page.evaluate(f"window.scrollTo(0, {int(previous_height*0.8)})")
        time.sleep(scroll_delay)
        
        new_height = await page.evaluate("() => document.body.scrollHeight")
        if new_height == previous_height:
            break
        previous_height = new_height

async def fetch_rendered_html(url):
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            page = await browser.new_page()
            # async def block_requests(route, request):
            #     if request.resource_type in ["image", "stylesheet", "font", "media"]:
            #         await route.abort()
            #     else:
            #         await route.continue_()

            # await page.route("**/*", block_requests)
            await page.goto(url, wait_until='domcontentloaded')

            # Select using DOM query style
            # script_text = await page.locator('script[type="application/ld+json"]').first.inner_text()
            
            # print("Raw script content from DOM:")
            # print(script_text)

            await page.wait_for_timeout(3000)
            # await page.wait_for_selector("img", timeout=10000)
            await slow_scroll_to_bottom(page)

        # Get all product elements
            items = page.locator('a')

            content = await page.content()
            await browser.close()
            return content
    except Exception as e:
        print(f"An error occurred while launching the browser: {e}")

# Fetch the fully rendered HTML
html_content = await fetch_rendered_html(url)

In [61]:
len(filtered_link), filtered_link
# pattern_p_urls = [u for u in filtered_link if '/p/' in u]
# len(pattern_p_urls), pattern_p_urls

(41,
 ['https://www.virgio.com/',
  'https://www.virgio.com/collections/all',
  'https://www.virgio.com/irl',
  'https://www.virgio.com/know-your-size',
  'https://www.virgio.com/sustainability',
  'https://www.virgio.com/account',
  'https://www.virgio.com/products/party-wear-maroon-embellished-midi-dress',
  'https://www.virgio.com/products/party-wear-black-embellished-dolman-top',
  'https://www.virgio.com/products/party-wear-black-embellished-midi-dress',
  'https://www.virgio.com/products/party-wear-maroon-embellished-dolman-top',
  'https://www.virgio.com/products/ruffle-romance-black-ruffle-midi-dress',
  'https://www.virgio.com/products/papillon-charm-one-shoulder-maxi-dress',
  'https://www.virgio.com/products/ruffle-romance-blue-ruffle-dress-with-slit',
  'https://www.virgio.com/products/ruffle-romance-bell-sleeve-ruffle-dress',
  'https://www.virgio.com/products/papillon-charm-butterfly-printed-shirt',
  'https://www.virgio.com/products/ruffle-romance-long-sleeve-ruffle-dres

In [37]:
from urllib.parse import urlparse
from collections import defaultdict, Counter
import re

# Sample list of URLs
urls = filtered_link

# Optional: Remove query params
def clean_url(url):
    parsed = urlparse(url)
    return parsed.scheme + "://" + parsed.netloc + parsed.path

# Step 1: Clean and group by pattern
def get_pattern(url):
    url = clean_url(url)
    # Replace numeric parts with {id}
    pattern = re.sub(r'\d+', '{id}', url)
    return pattern

# Step 2: Build groups
group_map = defaultdict(list)
for url in urls:
    pattern = get_pattern(url)
    group_map[pattern].append(url)

# Step 3: Find the most frequent group
most_common_group = max(group_map.values(), key=len)

# Output
print("Most common pattern group:")
for u in most_common_group:
    print(u)


Most common pattern group:
https://www.nykaafashion.com/lp/about-us
https://www.nykaafashion.com/lp/about-us


In [6]:
import json
from playwright.sync_api import sync_playwright

def extract_json_ld(url):
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        page.goto(url, wait_until='networkidle')

        # Locate all <script type="application/ld+json"> tags
        scripts = page.locator('script[type="application/ld+json"]')
        count = scripts.count()

        print(f"Found {count} JSON-LD script(s).")

        for i in range(count):
            raw_json = scripts.nth(i).inner_text()
            try:
                data = json.loads(raw_json)
                print(json.dumps(data, indent=2))
            except json.JSONDecodeError:
                print(f"Skipping script {i} - not valid JSON.")

        browser.close()

# Example usage
extract_json_ld('https://www.nykaafashion.com/women/westernwear/shirts/c/7623')


Error: It looks like you are using Playwright Sync API inside the asyncio loop.
Please use the Async API instead.