In [12]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

async def crawl_with_playwright(url: str):
    """Render ``url`` in a Chromium browser and return the page's HTML.

    A new non-headless Chromium instance is launched for every call.
    Navigation waits for Playwright's "networkidle" state and then sleeps a
    further two seconds before the DOM is serialized.

    Args:
        url: Address of the page to load.

    Returns:
        The result of ``page.content()`` for the loaded page.

    Raises:
        Any error raised by Playwright while launching, navigating or reading
        the page. The browser is closed before the error propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            await page.goto(url, wait_until="networkidle")
            # NOTE(review): presumably a grace period for late client-side
            # rendering after the network goes idle — confirm it is needed.
            await asyncio.sleep(2)

            return await page.content()
        finally:
            # Always release the browser, including when navigation fails or
            # times out; otherwise the window/process is left behind.
            await browser.close()
    
def extract_xml_from_html(html: str):
    """Return the first known sitemap/feed root element embedded in ``html``.

    The markup is parsed with BeautifulSoup's ``html.parser`` and searched for
    the tags ``sitemapindex``, ``urlset``, ``feed`` and ``rss``, in that
    priority order (not document order).

    Args:
        html: Markup to search. ``None`` or an empty string is treated as
            "nothing found".

    Returns:
        The matched element serialized with ``str()``, or ``None`` when no
        candidate tag is present. A notice is printed in the ``None`` case.
    """
    if html:
        soup = BeautifulSoup(html, "html.parser")

        # Check for any embedded XML-looking structure
        xml_tags = ["sitemapindex", "urlset", "feed", "rss"]  # common root XML tags
        for tag in xml_tags:
            # Search once and reuse the hit instead of walking the tree twice.
            element = soup.find(tag)
            if element is not None:
                return str(element)

    print("❌ Couldn't find XML in the HTML.")
    return None
    

In [None]:
from usp.tree import sitemap_tree_for_homepage, InvalidSitemap, sitemap_from_str
from collections import deque

list_of_websites = [
    "https://www.westside.com/",
    "https://virgio.com/",
    "https://www.tatacliq.com/",
    "https://nykaafashion.com/"
]
all_urls = []
for website in list_of_websites:
    print(f"Fetching sitemap for {website}")
    tree = sitemap_tree_for_homepage(website)

    count = 0
    for page in tree.all_pages():
        all_urls.append(page.url)
        count += 1
    
    if count == 0:
        print(" No URLs found in the sitemap. Maybe something failed!")
        for map in tree.all_sitemaps():
            if isinstance(map, InvalidSitemap):
                print('Invalid Sitemap:', map)
                print(' 403 ' in getattr(map, 'reason', ''))
                url_queue = deque([map.url])
                while(len(url_queue) > 0):

                    # Gather a batch of URLs to crawl at once (e.g., 5 at a time)
                    batch_size = 5
                    batch = []
                    while len(url_queue)>0 and len(batch) < batch_size:
                        url = url_queue.pop()
                        batch.append(url)
                        print('Fetching:', url)

                    # Use asyncio to crawl multiple URLs in parallel
                    async def fetch_and_process(urls):
                        tasks = [crawl_with_playwright(u) for u in urls]
                        results = await asyncio.gather(*tasks)
                        return results
                    xml_contents = await fetch_and_process(batch)

                    # Process the fetched HTML content
                    for xml_content in xml_contents:
                        xml_str = extract_xml_from_html(xml_content)
                        if xml_str:
                            new_tree = sitemap_from_str(xml_str)
                            for da in new_tree.all_pages():
                                all_urls.append(da.url)
                            for da in new_tree.all_sitemaps():
                                url_queue.append(da.url)

In [41]:
# Total number of URLs gathered into `all_urls` by the sitemap crawl above.
len(all_urls)

828292

In [42]:
import joblib

# Relative paths of the serialized model and (judging by the file name) its
# TF-IDF vectorizer.
MODEL_PATH = "../model/model.pkl"
VECTORIZER_PATH = "../model/tfidf_vectorizer.pkl"

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
model = joblib.load(MODEL_PATH)
vectorizer = joblib.load(VECTORIZER_PATH)


def preprocess_url(url: str) -> str:
    """Turn a URL into the lower-case, space-separated text given to the vectorizer.

    Every "https://" and "http://" occurrence is removed, and each hyphen and
    slash is replaced by a single space.
    """
    text = url.lower()
    # Substitutions run strictly in this order.
    substitutions = (
        ("https://", ""),
        ("http://", ""),
        ("-", " "),
        ("/", " "),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

def predict_url(url: str) -> dict:
    """Classify a URL with the loaded model and report the predicted label's probability.

    Args:
        url: Raw URL; it is normalized with ``preprocess_url`` and vectorized
            with the module-level ``vectorizer`` before prediction.

    Returns:
        ``{"label": int, "confidence": float}`` where ``confidence`` is the
        probability of the predicted label, rounded to three decimals.
    """
    processed = preprocess_url(url)
    vectorized = vectorizer.transform([processed])
    pred = model.predict(vectorized)[0]
    probabilities = model.predict_proba(vectorized)[0]

    # For scikit-learn style estimators the predict_proba columns follow
    # model.classes_, so translate the label into its column position.
    # Indexing with the raw label is only correct when labels are 0..n-1;
    # that is kept as the fallback for models without a classes_ attribute.
    classes = getattr(model, "classes_", None)
    column = list(classes).index(pred) if classes is not None else pred
    proba = probabilities[column]
    return {"label": int(pred), "confidence": round(float(proba), 3)}


In [None]:
# Sort every collected URL into product / non-product records according to
# the predicted label (label 1 goes to `product_urls`).
non_product_urls = []
product_urls = []
for url in all_urls:
    prediction = predict_url(url)
    is_product = prediction["label"] == 1
    if not is_product:
        # Echo rejected URLs together with their prediction.
        print("This URL is not a product page:", url, prediction)

    record = {
        "url": url,
        "label": prediction["label"],
        "confidence": prediction["confidence"],
        # Third "/"-separated piece, i.e. the host part of "scheme://host/...".
        "domain": url.split("/")[2],
    }
    bucket = product_urls if is_product else non_product_urls
    bucket.append(record)

In [48]:
import json

# Persist the product-page records as indented JSON in the working directory.
with open("product_urls.json", "w") as out_file:
    json.dump(product_urls, fp=out_file, indent=2)
print(f"Saved {len(product_urls)} product URLs to product_urls.json")

Saved 824761 product URLs to product_urls.json


--------- The cells below build the labelled dataset (dataset.json) ---------

In [10]:
from urllib.parse import urlparse
import json

# A URL is labelled as a product page (label 1) when any of its path segments
# equals one of these markers; every other URL gets label 0.
PRODUCT_PATH_SEGMENTS = {"products", "p"}

parsed_links = []
count = 0  # number of URLs labelled 1
for u in all_urls:
    # Only the path component is stored; scheme, host and query are dropped.
    path = urlparse(u).path
    is_product = not PRODUCT_PATH_SEGMENTS.isdisjoint(path.split('/'))
    if is_product:
        count += 1
    parsed_links.append({"url": path, "label": 1 if is_product else 0})

In [11]:
# Dump the labelled path records as indented JSON in the working directory.
with open("dataset.json", "w") as dataset_file:
    json.dump(parsed_links, fp=dataset_file, indent=2)
print("Saved parsed_links to dataset.json")

Saved parsed_links to dataset.json
