In [28]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

async def crawl_with_playwright(url: str, headless: bool = False, settle_seconds: float = 2) -> str:
    """Load ``url`` in Chromium via Playwright and return the rendered page HTML.

    Args:
        url: Address to navigate to.
        headless: Whether to launch Chromium without a visible window.
            Defaults to ``False`` (a visible browser window).
        settle_seconds: Extra time to wait after the network goes idle before
            the page content is read. Defaults to 2 seconds.

    Returns:
        The serialized HTML of the page as returned by ``page.content()``.

    Raises:
        Whatever Playwright raises on launch or navigation failure; the browser
        is still closed in that case.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()

            await page.goto(url, wait_until="networkidle")
            # Give late-running scripts a moment to finish after the network is idle.
            await asyncio.sleep(settle_seconds)

            return await page.content()
        finally:
            # Close the browser even when navigation or content retrieval fails.
            await browser.close()
    
def extract_xml_from_html(html: str):
    """Return the first known XML root element found in ``html`` as a string.

    The markup is parsed with BeautifulSoup's ``html.parser`` and searched for
    the tag names ``sitemapindex``, ``urlset``, ``feed`` and ``rss``, in that
    priority order. If none is present a message is printed and ``None`` is
    returned.
    """
    document = BeautifulSoup(html, "html.parser")

    # Common root tags of sitemap / feed documents, checked in priority order.
    for root_name in ("sitemapindex", "urlset", "feed", "rss"):
        node = document.find(root_name)
        if node:
            return str(node)

    print("❌ Couldn't find XML in the HTML.")
    return None
    

In [50]:
from usp.tree import sitemap_tree_for_homepage, InvalidSitemap, sitemap_from_str
from collections import deque

list_of_websites = [
    "https://www.westside.com/",
    # "https://virgio.com/",
    # "https://www.tatacliq.com/",
    # "https://nykaafashion.com/"
]
all_urls = []
for website in list_of_websites:
    print(f"Fetching sitemap for {website}")
    tree = sitemap_tree_for_homepage(website)

    count = 0
    for page in tree.all_pages():
        all_urls.append(page.url)
        count += 1
    
    if count == 0:
        print("❌ No URLs found in the sitemap. Maybe something failed!")
        for map in tree.all_sitemaps():
            if isinstance(map, InvalidSitemap):
                print('Invalid Sitemap:', map)
                print(' 403 ' in getattr(map, 'reason', ''))
                url_queue = deque([map.url])
                while(len(url_queue) > 0):

                    # Gather a batch of URLs to crawl at once (e.g., 5 at a time)
                    batch_size = 5
                    batch = []
                    while len(url_queue)>0 and len(batch) < batch_size:
                        url = url_queue.pop()
                        batch.append(url)
                        print('Fetching:', url)

                    # Use asyncio to crawl multiple URLs in parallel
                    async def fetch_and_process(urls):
                        tasks = [crawl_with_playwright(u) for u in urls]
                        results = await asyncio.gather(*tasks)
                        return results
                    xml_contents = await fetch_and_process(batch)

                    # Process the fetched HTML content
                    for xml_content in xml_contents:
                        xml_str = extract_xml_from_html(xml_content)
                        if xml_str:
                            new_tree = sitemap_from_str(xml_str)
                            for da in new_tree.all_pages():
                                all_urls.append(da.url)
                            for da in new_tree.all_sitemaps():
                                url_queue.append(da.url)

Fetching sitemap for https://www.westside.com/


Unable to gunzip response <usp.web_client.requests_client.RequestsWebClientSuccessResponse object at 0x153acdb10>, maybe it's a non-gzipped sitemap: Unable to gunzip data: Not a gzipped file (b'# ')
Unable to gunzip response <usp.web_client.requests_client.RequestsWebClientSuccessResponse object at 0x1527d8090>, maybe it's a non-gzipped sitemap: Unable to gunzip data: Not a gzipped file (b'<?')
Unable to gunzip response <usp.web_client.requests_client.RequestsWebClientSuccessResponse object at 0x15577ce10>, maybe it's a non-gzipped sitemap: Unable to gunzip data: Not a gzipped file (b'<?')
Unable to gunzip response <usp.web_client.requests_client.RequestsWebClientSuccessResponse object at 0x150487ed0>, maybe it's a non-gzipped sitemap: Unable to gunzip data: Not a gzipped file (b'<?')
Unable to gunzip response <usp.web_client.requests_client.RequestsWebClientSuccessResponse object at 0x112dcb950>, maybe it's a non-gzipped sitemap: Unable to gunzip data: Not a gzipped file (b'<?')
Unabl

In [31]:
# Display the URLs collected so far (no filtering is applied in this cell).
all_urls

['https://www.virgio.com/products/mini-geo-print-wrap-dress',
 'https://www.virgio.com/products/blushing-pink-shirt-dress',
 'https://www.virgio.com/products/cherry-charm-overlap-v-dress-with-pockets',
 'https://www.virgio.com/products/animal-print-v-neck-mini-dress',
 'https://www.virgio.com/products/60s-mini-burst-of-style-with-pockets',
 'https://www.virgio.com/products/citrus-splash-polka-dot-dress-with-pockets',
 'https://www.virgio.com/products/chic-block-print-shirt-dress-with-pockets',
 'https://www.virgio.com/products/daisy-daydream-shoulder-dress-with-pockets',
 'https://www.virgio.com/products/colorful-blooms-fiesta-frock-with-pockets',
 'https://www.virgio.com/products/dancing-dots-little-black-dress-with-pockets',
 'https://www.virgio.com/products/elasticated-floral-v-neck-dress-with-pockets',
 'https://www.virgio.com/products/fiery-animal-print-frock-with-pockets',
 'https://www.virgio.com/products/floral-fantasy-poplin-mini-dress-with-pockets',
 'https://www.virgio.com/p

In [34]:
# Read the saved product links, one per line, dropping surrounding
# whitespace and skipping blank lines.
product_links = []
with open("product_links.txt", "r") as links_file:
    for raw_line in links_file:
        cleaned = raw_line.strip()
        if cleaned:
            product_links.append(cleaned)
print(f"Loaded {len(product_links)} links.")

Loaded 3566 links.


In [35]:
# Merge the crawled URLs into product_links, reporting the ones already present.
# A set mirrors the list so each membership check is a hash lookup instead of
# a scan over the whole (multi-thousand element) list.
known_links = set(product_links)
for url in all_urls:
    if url in known_links:
        print(f"{url} already exists in the list.")
    else:
        product_links.append(url)
        known_links.add(url)

In [40]:
from urllib.parse import urlparse
import json

# Build one record per link: the URL path plus a binary label.
# A link is labelled 1 (product page) when its path contains a 'products' or
# 'p' segment, otherwise 0. `count` tallies the links labelled 1.
#
# NOTE(review): an earlier version carried a `product` flag that was never set
# to 1, so its "skip once more than 300 rows exist" branch could never run.
# That dead code is removed here; every link is kept, exactly as before. If a
# cap on product rows is wanted it has to be reintroduced deliberately.
parsed_links = []
count = 0
for u in product_links:
    path = urlparse(u).path
    segments = path.split('/')
    is_product = 'products' in segments or 'p' in segments
    if is_product:
        count += 1
    parsed_links.append({
        "url": path,
        "label": 1 if is_product else 0,
    })

373

In [44]:

# Split the records into two parallel lists: URL paths and their labels.
# (Only the path is stored in each record; add the domain here if needed.)
urls = []
labels = []
for item in parsed_links:
    urls.append(item["url"])
    labels.append(item["label"])

# Normalise a URL string into space-separated lowercase text.
def preprocess(url):
    """Lowercase ``url`` and turn every '-', '/' and '_' into a single space."""
    separators_to_space = str.maketrans("-/_", "   ")
    lowered = url.lower()
    return lowered.translate(separators_to_space)

# Apply the text normalisation to every URL path and preview the first five.
processed_urls = list(map(preprocess, urls))
processed_urls[:5]

[' lounge dreams oversized pure cotton shirt p 16220765',
 ' twenty dresses by nykaa fashion light brown solid ruffle full sleeves shirt p 14343958',
 ' ombrello teapot blue lace shirt with inner p 17920972',
 ' peach panda white georgette shirt with gather details p 18230129',
 ' style quotient women blue and white stripes mandarin neck shirt p 18937139']

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Character n-gram TF-IDF (3 to 5 characters, within word boundaries) over the
# normalised URL text.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so its vocabulary and IDF weights also reflect the test rows.
vectorizer = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
)
X = vectorizer.fit_transform(processed_urls)


In [46]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
# NOTE(review): the split is not stratified by label — consider whether
# `stratify=labels` is wanted given the class imbalance in the reports below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)


In [47]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Fit a 100-tree random forest with a fixed seed, then predict the held-out rows.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print("🔍 Random Forest Performance:")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)


🔍 Random Forest Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        76
           1       1.00      1.00      1.00       940

    accuracy                           1.00      1016
   macro avg       1.00      1.00      1.00      1016
weighted avg       1.00      1.00      1.00      1016



In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# Baseline for comparison: logistic regression capped at 200 solver iterations.
lr = LogisticRegression(max_iter=200).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Per-class precision / recall / F1 on the same held-out rows.
print("\n🔍 Logistic Regression Performance:")
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)



🔍 Logistic Regression Performance:
              precision    recall  f1-score   support

           0       1.00      0.80      0.89        76
           1       0.98      1.00      0.99       940

    accuracy                           0.99      1016
   macro avg       0.99      0.90      0.94      1016
weighted avg       0.99      0.99      0.98      1016



In [49]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
def preprocess_url(url: str) -> str:
    """Normalise a full URL into the same text form the classifier was trained on.

    The training rows are built from ``urlparse(u).path`` passed through
    ``preprocess`` (lowercase; '-', '/' and '_' become spaces). This function
    applies exactly that transformation, so scheme, domain, query string and
    fragment are dropped and only the path contributes to the features.

    Args:
        url: Absolute URL (e.g. ``https://host/products/some-item``). A string
            without a scheme is treated by ``urlparse`` as a bare path.

    Returns:
        The lowercased path with '-', '/' and '_' replaced by spaces.
    """
    # Local import keeps this cell runnable on its own.
    from urllib.parse import urlparse

    path = urlparse(url).path
    return path.lower().replace("-", " ").replace("/", " ").replace("_", " ")

def predict_url_label(url, model: RandomForestClassifier, vectorizer: TfidfVectorizer) -> int:
    """Predict the label of one URL with an already-fitted vectorizer and model.

    The URL is normalised with ``preprocess_url``, vectorised as a one-item
    batch, and the model's prediction for that single row is returned.
    """
    # transform() expects an iterable of documents, hence the one-element list.
    features = vectorizer.transform([preprocess_url(url)])
    predicted = model.predict(features)
    return predicted[0]

# Classify every crawled URL with the random forest and print the verdict.
for url in all_urls:
    response = predict_url_label(url=url, model=rf, vectorizer=vectorizer)
    verdict = "Not a product page:" if response == np.int64(0) else "Product page:"
    print(verdict, url)


Product page: https://www.virgio.com/products/mini-geo-print-wrap-dress
Product page: https://www.virgio.com/products/blushing-pink-shirt-dress
Product page: https://www.virgio.com/products/cherry-charm-overlap-v-dress-with-pockets
Product page: https://www.virgio.com/products/animal-print-v-neck-mini-dress
Product page: https://www.virgio.com/products/60s-mini-burst-of-style-with-pockets
Product page: https://www.virgio.com/products/citrus-splash-polka-dot-dress-with-pockets
Product page: https://www.virgio.com/products/chic-block-print-shirt-dress-with-pockets
Product page: https://www.virgio.com/products/daisy-daydream-shoulder-dress-with-pockets
Product page: https://www.virgio.com/products/colorful-blooms-fiesta-frock-with-pockets
Product page: https://www.virgio.com/products/dancing-dots-little-black-dress-with-pockets
Product page: https://www.virgio.com/products/elasticated-floral-v-neck-dress-with-pockets
Product page: https://www.virgio.com/products/fiery-animal-print-frock-w