In [2]:
# Check for required Scrapy and Playwright dependencies
!pip list | grep -E "scrapy|playwright"

playwright                1.54.0
scrapy-playwright         0.0.44


In [3]:
# imports
import html
import os
from urllib.parse import urljoin

from dotenv import load_dotenv
from parsel import Selector

In [4]:
# Load environment variables (a local .env file is read if present)
load_dotenv()

# Website URLs to scrape
websites_url = [
    "https://www.chevrolet.ca/en/trucks/silverado-1500",
    "https://www.chevrolet.ca/en/suvs/previous-year-equinox",
]

# Configuration (same as scrapper.py)
# os.getenv always returns a string, and every non-empty string (including
# "False" and "0") is truthy. Parse the flag explicitly so that DEV=False or an
# unset DEV really selects production mode.
DEV_MODE = os.getenv("DEV", "False").strip().lower() in {"1", "true", "yes", "on"}
# LOCAL_URL = f"file://{os.path.join(os.getcwd(), 'silverado_navbar.html')}"
LOCAL_URL = "silverado_main.html"

# In DEV mode the notebook works on the saved sample page instead of the live site.
SCRAP_WEBSITE = LOCAL_URL if DEV_MODE else websites_url[0]

print(f"DEV_MODE: {DEV_MODE}")
print(f"URLs to scrape: {SCRAP_WEBSITE}")


DEV_MODE: True
URLs to scrape: silverado_main.html


In [5]:
import json

from scrapy import Request
from scrapy.spiders import Spider
from scrapy_playwright.page import PageMethod

# Baseline Scrapy settings used in both modes (same as scrapper.py)
custom_settings = {
    "ROBOTSTXT_OBEY": True,
    "LOG_LEVEL": "WARNING",
    "DEFAULT_REQUEST_HEADERS": {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-CA,en;q=0.9",
    },
    "USER_AGENT": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0 Safari/537.36"
    ),
}

# Production mode additionally routes downloads through Playwright and throttles requests
if not DEV_MODE:
    custom_settings.update(
        DOWNLOAD_HANDLERS=dict.fromkeys(
            ("http", "https"),
            "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        ),
        TWISTED_REACTOR="twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        PLAYWRIGHT_BROWSER_TYPE="chromium",
        PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT=60000,
        AUTOTHROTTLE_ENABLED=True,
        AUTOTHROTTLE_START_DELAY=1.0,
        AUTOTHROTTLE_MAX_DELAY=10.0,
        CONCURRENT_REQUESTS=1,
    )

print("✅ Scrapy configuration loaded (same as scrapper.py)")
print(f"Settings configured for {'DEV' if DEV_MODE else 'PRODUCTION'} mode")

✅ Scrapy configuration loaded (same as scrapper.py)
Settings configured for DEV mode


In [6]:
# Resolve the sample page relative to the notebook's working directory.
# NOTE(review): when SCRAP_WEBSITE is a live URL this path will not exist, so the
# except branch below runs and `selector` is left as None.
file_path = "./" + os.path.join("samples", SCRAP_WEBSITE)
print(file_path)
try:
    with open(file_path, "r", encoding="utf-8") as f:
        html_content = f.read()

    # Create Scrapy selector (replaces BeautifulSoup)
    selector = Selector(text=html_content)
    print("✅ Loaded local HTML file with Scrapy Selector")
    print(f"File size: {len(html_content):,} characters")

except FileNotFoundError:
    # Later cells call selector.xpath(...) and will fail while selector is None.
    print("❌ Local HTML file not found. Will use live URLs.")
    selector = None

./samples/silverado_main.html
✅ Loaded local HTML file with Scrapy Selector
File size: 1,305,466 characters


In [7]:
# Tags that never get a node of their own: dfs() still walks into them and
# hoists their serialized children up to the parent.
EXCLUDE = {
    # inert / non-content elements
    "script",
    "style",
    "noscript",
    "template",
    "br",
    # generic HTML containers
    "div",
    "span",
    "section",
    "nav",
    "article",
    # site-specific custom elements
    "adv-col",  # adds a column, we just need the main grid
    "adv-grid",
    "gb-adv-grid",
    "gb-wrapper",
    "gb-responsive-image",
    "gb-tab-nav",  # usually just adds nodes in tree, wraps an unordered list
    "gb-secondary-nav",
    "gb-main-flyout",
    "gb-sub-flyout",
    "gb-sublinks",
}

# Tags that dfs() replaces with their only child when they carry no class,
# no direct text and exactly one serialized child.
WRAPPERS = {"aside", "footer", "header", "main", "picture"}


def own_text(el):
    """Return the text nodes that are direct children of ``el``.

    Each text node is stripped, empty ones are dropped, and the rest are
    joined with a single space. Text inside child elements is not included.
    """
    stripped = (chunk.strip() for chunk in el.xpath("./text()").getall())
    return " ".join(filter(None, stripped))


def all_text(el):
    """Return the text selected by ``::text`` on ``el`` with whitespace collapsed.

    All fragments are joined, then every run of whitespace is reduced to a
    single space and leading/trailing whitespace is removed.
    """
    fragments = el.css("::text").getall()
    words = " ".join(fragments).split()
    return " ".join(words)


def _append(kids, node):
    if node is None:
        return
    if isinstance(node, list):
        kids.extend(node)
    else:
        kids.append(node)


def is_internal_link(u: str | None) -> bool:
    if not u:
        return False
    u = u.strip().split()[0]
    return u.startswith("/") or not u.startswith(("http://", "https://", "www."))


def _norm_url(base, u):
    if not u:
        return None
    return urljoin(base, u.strip().split()[0])


# def _attrs(el):
#     # keep all attributes verbatim
#     return dict(el.attrib)


def parse_json(raw):
    """Decode a JSON payload stored in an HTML attribute.

    The value is HTML-unescaped and ``\\/`` sequences are turned into ``/``
    before decoding. If decoding fails, a second attempt is made with
    non-breaking spaces replaced by regular spaces.

    Returns:
        ``None`` for a falsy input; the cleaned-up string when neither attempt
        decodes; otherwise the decoded data. When the decoded data is a dict,
        the price-disclosure keys are removed from each of its dict values
        (one level deep only).
    """
    if not raw:
        return None

    text = html.unescape(raw).replace("\\/", "/")

    for candidate in (text, text.replace("\u00a0", " ")):
        try:
            data = json.loads(candidate)
            break
        except json.JSONDecodeError:
            continue
    else:
        # neither variant is valid JSON: hand back the cleaned-up text
        return text

    if isinstance(data, dict):
        nested_dicts = (entry for entry in data.values() if isinstance(entry, dict))
        for entry in nested_dicts:
            for unwanted in ("asShownPriceDisclosure", "startingPriceDisclosure"):
                entry.pop(unwanted, None)

    return data


# -------- serializers (must accept children) --------
def serialize_a(el, base, children):
    """Serialize an ``<a>`` element into an ``{"a": {...}}`` node.

    ``href`` is resolved against ``base``; ``link_type`` is ``None`` when the
    element has no ``href``. ``content`` holds the serialized children, or
    ``None`` when there are none.
    """
    attrs = el.attrib
    href = attrs.get("href")

    link_type = None
    if href:
        link_type = "internal" if is_internal_link(href) else "external"

    anchor = {
        "text": all_text(el),
        "title": attrs.get("title", ""),
        "href": _norm_url(base, href),
        "link_type": link_type,
        "target": attrs.get("target"),
        "content": children or None,
    }
    return {"a": anchor}


def serialize_button_like(el, base, children):
    """Serialize a button-like element into a ``{"button": {...}}`` node.

    The target URL comes from ``href`` or, failing that, ``formaction``.
    All ``data-*`` attributes are copied through, followed by ``title``,
    ``aria-haspopup`` and ``aria-expanded`` when present.
    """
    attrs = el.attrib
    action = attrs.get("href") or attrs.get("formaction")

    link_type = None
    if action:
        link_type = "internal" if is_internal_link(action) else "external"

    button = {
        "text": all_text(el),
        "url": _norm_url(base, action),
        "link_type": link_type,
        "classname": attrs.get("class", ""),
        "content": children or None,
    }
    # data-* attributes first, then the selected extras (same order as before)
    button.update(
        (name, value) for name, value in attrs.items() if name.startswith("data-")
    )
    for name in ("title", "aria-haspopup", "aria-expanded"):
        if name in attrs:
            button[name] = attrs[name]

    return {"button": button}


def serialize_img(el, base, _children):
    """Serialize an ``<img>`` element into an ``{"img": {...}}`` node.

    ``src`` is resolved against ``base``; ``link_type`` is ``None`` when there
    is no ``src``. All ``data-*`` attributes are copied through. Children are
    ignored.
    """
    attrs = el.attrib
    src = attrs.get("src")

    link_type = None
    if src:
        link_type = "internal" if is_internal_link(src) else "external"

    image = {
        "src": _norm_url(base, src),
        "alt": attrs.get("alt"),
        "title": attrs.get("title"),
        "link_type": link_type,
        "loading": attrs.get("loading"),
    }
    image.update(
        (name, value) for name, value in attrs.items() if name.startswith("data-")
    )
    return {"img": image}


def serialize_source(el, base, _children):
    """Serialize a ``<source>`` element into a ``{"source": {...}}`` node.

    ``srcset`` is split on commas; the first token of each candidate (the URL,
    without its width/density descriptor) is resolved against ``base``.
    Children are ignored.
    """
    attrs = el.attrib
    raw_srcset = (attrs.get("srcset") or "").replace("\n", " ")

    candidates = (entry.split() for entry in raw_srcset.split(","))
    resolved = [_norm_url(base, tokens[0]) for tokens in candidates if tokens]

    return {
        "source": {
            "media": attrs.get("media"),
            "height": attrs.get("height"),
            "width": attrs.get("width"),
            "srcset": [url for url in resolved if url],
            "data_aspectratio": attrs.get("data-aspectratio"),
        }
    }


def serialize_heading(el, _base, _children):
    """Serialize a heading element as ``{"heading": <direct text>}``.

    Only the element's own text nodes are used (via ``own_text``); the base
    URL and the serialized children are ignored.
    """
    heading_text = own_text(el)
    return {"heading": heading_text}


def serialize_gb_dynamic_text(el, _base, _children):
    """Serialize a ``<gb-dynamic-text>`` element.

    Emits its collapsed text (``None`` when empty), the ``country`` attribute
    and the decoded ``regional-information-json`` attribute. The base URL and
    the children are ignored.
    """
    attrs = el.attrib
    text = all_text(el)
    regional = parse_json(attrs.get("regional-information-json"))

    payload = {
        "text": text or None,
        "country": attrs.get("country"),
        "regional_information": regional,
    }
    return {"gb-dynamic-text": payload}


def serialize_myaccount_flyout(el, base, children):
    """Serialize a ``<gb-myaccount-flyout>`` element.

    The three JSON-carrying attributes are decoded with ``parse_json``;
    ``content`` keeps the serialized children, or ``None`` when there are none.
    """
    attrs = el.attrib
    # output key -> name of the attribute holding its JSON payload
    json_fields = (
        ("auth_flyout", "authflyoutdata"),
        ("auth_links", "authlinkdata"),
        ("fallback", "fallbackdata"),
    )

    flyout = {"flyoutstate": attrs.get("flyoutstate")}
    for key, attr_name in json_fields:
        flyout[key] = parse_json(attrs.get(attr_name))
    flyout["content"] = children or None  # preserve nested nodes if any

    return {"gb-myaccount-flyout": flyout}


def _attrs_copy(el):
    return dict(el.attrib) if el.attrib else {}


def _pop_cls(attrs):
    cls = attrs.pop("class", None)
    return cls, attrs


def serialize_li(el, _base, children):
    """Serialize an ``<li>`` element into an ``{"item": {...}}`` node.

    Only non-empty parts are emitted: ``li_class``, the remaining ``attrs``,
    the element's direct ``text`` and the serialized ``content``.
    """
    li_class, remaining = _pop_cls(_attrs_copy(el))
    text = own_text(el)

    item = {}
    if li_class:
        item["li_class"] = li_class
    if remaining:
        item["attrs"] = remaining
    if text:
        item["text"] = text
    if children:
        item["content"] = children

    return {"item": item}


def _serialize_list(kind, el, base, children):
    """Serialize a list element under the key ``kind`` (``"ul"`` or ``"ol"``).

    ``children`` are already serialized by ``dfs``. Nodes produced by
    ``serialize_li`` are unwrapped into ``items`` (always present, possibly
    empty); anything else is kept under ``content``. The ``class`` attribute
    is dropped and the remaining attributes go under ``attrs``.
    """

    def _is_item(node):
        return isinstance(node, dict) and "item" in node

    items = [node["item"] for node in children if _is_item(node)]
    other = [node for node in children if not _is_item(node)]

    _, remaining = _pop_cls(_attrs_copy(el))

    body = {}
    if remaining:
        body["attrs"] = remaining
    body["items"] = items
    if other:
        body["content"] = other

    return {kind: body}


def serialize_ul(el, base, children):
    """Serialize a ``<ul>`` element; delegates to ``_serialize_list``."""
    return _serialize_list(kind="ul", el=el, base=base, children=children)


def serialize_ol(el, base, children):
    """Serialize an ``<ol>`` element; delegates to ``_serialize_list``."""
    return _serialize_list(kind="ol", el=el, base=base, children=children)


def serialize_p(el, base, children):
    """Serialize a ``<p>`` element into a ``{"paragraph": ...}`` node.

    Args:
        el: Selector for the paragraph element.
        base: Base URL (unused here; kept for the common serializer signature).
        children: Already-serialized child nodes, or ``None``.

    Returns:
        ``None`` when the paragraph has no text, children, class or other
        attributes. Otherwise ``{"paragraph": [children...]}`` when there are
        serialized children (unchanged from before), and
        ``{"paragraph": {"text": ...}}`` for a text-only paragraph. A paragraph
        with only attributes still yields ``{"paragraph": {}}``.
    """
    attrs = _attrs_copy(el)
    cls = attrs.pop("class", None)
    txt = all_text(el)

    # drop generic <br> children and flatten nested lists
    clean_children = []
    for ch in children or []:
        if isinstance(ch, dict) and ch.get("tag") == "br":
            continue
        _append(clean_children, ch)

    # return None if paragraph is effectively empty
    if not txt and not clean_children and not cls and not attrs:
        return None

    if clean_children:
        # NOTE(review): text sitting directly in the <p> next to child
        # elements is still not emitted here; doing so would change this
        # list-shaped value. Confirm the desired shape before extending.
        return {"paragraph": clean_children}

    # Previously the collected text was computed and then discarded, so a
    # plain text paragraph came out as {"paragraph": {}}.
    return {"paragraph": {"text": txt} if txt else {}}


def serialize_disclosure(el=None, _base=None, children=None):
    """Serialize a ``<gb-disclosure>`` element (placeholder implementation).

    ``dfs`` calls every ``NATIVE`` serializer as ``(el, base, children)``. The
    previous zero-argument stub raised ``TypeError`` on that call and only
    worked because ``dfs`` catches the error and falls back to the generic
    serializer. This version accepts the standard arguments and performs that
    same fallback explicitly, so the produced tree is unchanged.

    Returns:
        ``None`` when called without an element (the old zero-argument
        behaviour), otherwise the ``serialize_generic`` node for ``el``.
    """
    if el is None:
        return None
    return serialize_generic(el, children)


# Tag name -> dedicated serializer; dfs() calls each one as (el, base, children).
NATIVE = {
    "a": serialize_a,
    "button": serialize_button_like,
    "input": serialize_button_like,  # gated below
    "img": serialize_img,
    "source": serialize_source,
    "gb-dynamic-text": serialize_gb_dynamic_text,
    # h1 .. h6 all share the heading serializer
    **{f"h{level}": serialize_heading for level in range(1, 7)},
    "ul": serialize_ul,
    "ol": serialize_ol,
    "li": serialize_li,
    "p": serialize_p,
    "gb-myaccount-flyout": serialize_myaccount_flyout,
    "gb-disclosure": serialize_disclosure,
}


def serialize_generic(el, children):
    """Fallback serializer for elements without a dedicated handler.

    Returns ``{"tag": <lower-cased tag name>}`` plus ``text`` (the element's
    direct text) and ``content`` (serialized children) when they are non-empty.
    """
    node = {"tag": el.root.tag.lower()}
    optional_parts = (("text", own_text(el)), ("content", children))
    node.update((key, value) for key, value in optional_parts if value)
    return node


# -------- unified DFS --------
def dfs(el, base):
    """Depth-first serialization of ``el`` and its descendants.

    Returns a single node (dict), a list of nodes when ``el`` is an excluded
    tag whose children are hoisted, or ``None`` when nothing is produced.
    """
    tag = el.root.tag.lower()

    # children are serialized first in every case, so no subtree is skipped
    serialized_children = []
    for child in el.xpath("./*"):
        _append(serialized_children, dfs(child, base))

    # 1) excluded tags disappear; their children move up to the parent
    if tag in EXCLUDE:
        return serialized_children or None

    # 2) dedicated serializer, except for <input> types that are not buttons
    serializer = NATIVE.get(tag)
    if tag == "input" and el.attrib.get("type") not in {"button", "submit", "reset"}:
        serializer = None
    if serializer is not None:
        try:
            return serializer(el, base, serialized_children)
        except Exception:
            # a failing serializer degrades to the generic representation
            return serialize_generic(el, serialized_children)

    # 3) a bare wrapper around a single child is replaced by that child
    if tag in WRAPPERS:
        has_class = bool(el.attrib.get("class", "").strip())
        if not has_class and not own_text(el) and len(serialized_children) == 1:
            return serialized_children[0]

    # 4) generic element
    return serialize_generic(el, serialized_children)


In [8]:
# Serialize the global navigation template and show it as JSON
BASE = "https://www.chevrolet.ca/"
root = selector.xpath("//gb-global-nav/template[@id='gb-global-nav-content']")
tree = [dfs(ch, BASE) for ch in root.xpath("./*")]
tree = [n for n in tree if n is not None]  # dfs returns None for empty subtrees
print(json.dumps(tree, indent=2, ensure_ascii=False))


[]


In [9]:
# Serialize the main content area and show it as JSON
BASE = "https://www.chevrolet.ca/"
root = selector.xpath("//main[@id='gb-main-content']")
tree = [dfs(ch, BASE) for ch in root.xpath("./*")]
tree = [n for n in tree if n is not None]  # dfs returns None for empty subtrees
print(json.dumps(tree, indent=2, ensure_ascii=False))


[
  {
    "tag": "link"
  },
  [
    {
      "tag": "gb-main-link",
      "content": [
        {
          "a": {
            "text": "Silverado",
            "title": "",
            "href": "https://www.chevrolet.ca/en/trucks/silverado-1500",
            "link_type": "external",
            "target": "_self",
            "content": null
          }
        }
      ]
    },
    {
      "tag": "gb-main-link",
      "content": [
        {
          "a": {
            "text": "Specs",
            "title": "",
            "href": "https://www.chevrolet.ca/byo-vc/client/en/CA/chevrolet/silverado/2025/silverado-1500/trims/compare%20",
            "link_type": "external",
            "target": "_self",
            "content": null
          }
        }
      ]
    },
    {
      "tag": "gb-main-link",
      "content": [
        {
          "a": {
            "text": "Accessories",
            "title": "",
            "href": "https://www.chevrolet.ca/en/trucks/silverado-1500/accessories",
   

In [10]:
# Get all unique tags from the root element
def get_all_unique_tags(element):
    """
    Collect the lower-cased tag names of ``element`` and all its descendants.

    Objects without a ``root`` attribute, or whose ``root`` has no ``tag``,
    are skipped.
    """
    candidates = [element, *element.xpath(".//*")]
    return {
        node.root.tag.lower()
        for node in candidates
        if hasattr(node, "root") and hasattr(node.root, "tag")
    }


# Extract all unique tags from the root element
# (`root` is whatever the most recently executed tree-building cell selected)
unique_tags = get_all_unique_tags(root)

print(f"Total unique tags found: {len(unique_tags)}")
print("\nAll unique tags (sorted alphabetically):")
for tag in sorted(unique_tags):
    print(f"  - {tag}")

# Raw set repr; its ordering is not sorted
print(f"\nAs a set: {unique_tags}")

Total unique tags found: 46

All unique tags (sorted alphabetically):
  - a
  - adv-col
  - adv-grid
  - adv-slides
  - br
  - button
  - circle
  - circle-container
  - div
  - g
  - gb-360-colorizer
  - gb-360-colorizer-notification
  - gb-adv-grid
  - gb-adv-scrl
  - gb-button
  - gb-content-well
  - gb-disclosure
  - gb-dynamic-text
  - gb-expander
  - gb-expander-tab-nav
  - gb-main-flyout
  - gb-main-link
  - gb-responsive-image
  - gb-secondary-nav
  - gb-static-image
  - gb-sub-flyout
  - gb-sublinks
  - gb-target-zone
  - gb-video-brightcove
  - gb-wrapper
  - h1
  - h2
  - h3
  - h4
  - img
  - li
  - link
  - p
  - path
  - picture
  - script
  - source
  - span
  - sup
  - svg
  - ul

As a set: {'gb-360-colorizer-notification', 'gb-button', 'gb-360-colorizer', 'h2', 'g', 'adv-grid', 'gb-content-well', 'gb-secondary-nav', 'gb-target-zone', 'gb-sublinks', 'h4', 'p', 'adv-col', 'h3', 'sup', 'img', 'div', 'gb-responsive-image', 'br', 'svg', 'link', 'gb-main-flyout', 'gb-wrapper

In [11]:
# Tag names collected for the main-content subtree
main_div = {
    "a", "adv-col", "adv-grid", "adv-slides", "br", "button",
    "circle", "circle-container", "div", "g",
    "gb-360-colorizer", "gb-360-colorizer-notification",
    "gb-adv-grid", "gb-adv-scrl", "gb-button", "gb-content-well",
    "gb-disclosure", "gb-dynamic-text", "gb-expander", "gb-expander-tab-nav",
    "gb-main-flyout", "gb-main-link", "gb-responsive-image", "gb-secondary-nav",
    "gb-static-image", "gb-sub-flyout", "gb-sublinks", "gb-target-zone",
    "gb-video-brightcove", "gb-wrapper",
    "h1", "h2", "h3", "h4",
    "img", "li", "link", "p", "path", "picture",
    "script", "source", "span", "sup", "svg", "ul",
}

# Tag names collected for the navigation subtree, kept as the pasted bullet list
nav_div = """- a
    - adv-col
    - br
    - button
    - div
    - gb-adv-grid
    - gb-button
    - gb-disclosure
    - gb-dynamic-text
    - gb-flyout
    - gb-myaccount-flyout
    - gb-myaccount-nav
    - gb-region-dropdown
    - gb-region-selector
    - gb-responsive-image
    - gb-tab-nav
    - gb-target-zone
    - gb-wrapper
    - h2
    - h3
    - h4
    - h6
    - img
    - li
    - nav
    - p
    - picture
    - source
    - span
    - template
    - ul"""

# parse the bullet list into a clean set of tags (one leading "-" is removed)
lines = [entry for entry in map(str.strip, nav_div.splitlines()) if entry]
items = {entry.removeprefix("-").strip() for entry in lines}

# convert to dict (tag -> index). Change value if you prefer a different format.
# The index follows set iteration order, so it is not stable between kernels.
nav_dict = dict(zip(items, range(len(items))))

# compute elements present in main_div but not in items
exclusive_main_only = main_div - items

print(f"Exclusive elements in main_div (not in items): {len(exclusive_main_only)}")
for tag in sorted(exclusive_main_only):
    print(f" - {tag}")

Exclusive elements in main_div (not in items): 24
 - adv-grid
 - adv-slides
 - circle
 - circle-container
 - g
 - gb-360-colorizer
 - gb-360-colorizer-notification
 - gb-adv-scrl
 - gb-content-well
 - gb-expander
 - gb-expander-tab-nav
 - gb-main-flyout
 - gb-main-link
 - gb-secondary-nav
 - gb-static-image
 - gb-sub-flyout
 - gb-sublinks
 - gb-video-brightcove
 - h1
 - link
 - path
 - script
 - sup
 - svg


## All unique tags (sorted alphabetically):
  - a
  - adv-col
  - br
  - button
  - div
  - gb-adv-grid
  - gb-button
  - gb-disclosure
  - gb-dynamic-text
  - gb-flyout
  - gb-myaccount-flyout
  - gb-myaccount-nav
  - gb-region-dropdown
  - gb-region-selector
  - gb-responsive-image
  - gb-tab-nav
  - gb-target-zone
  - gb-wrapper
  - h2
  - h3
  - h4
  - h6
  - img
  - li
  - nav
  - p
  - picture
  - source
  - span
  - template
  - ul

## Comparison: Basic Scraping vs JavaScript-Rendered Scraping

The Scrapy + Playwright script gives you the **complete, fully-rendered HTML** that a human user would see in their browser. This includes:

### What the Scrapy script captured:
- **Complete trim information**: WT, Custom, LT, RST, Custom Trail Boss, LTZ, LT Trail Boss, High Country, ZR2
- **Fully loaded JavaScript content**
- **Dynamic pricing and specifications**
- **Interactive elements rendered as static HTML**

### Difference from basic `requests` approach:
- `requests` + `BeautifulSoup` = Raw server HTML (often incomplete)
- `Scrapy` + `Playwright` = Full browser-rendered HTML (complete content)

Next, let's fetch the disclosures JSON endpoint with a small Scrapy spider:

In [12]:


import re, json, html, asyncio
from scrapy import Spider, signals
from scrapy.crawler import CrawlerRunner
from scrapy.utils.reactor import install_reactor
from twisted.internet.defer import ensureDeferred

# Jupyter-safe: integrate Twisted with the running asyncio loop
# NOTE(review): presumably this must run before any other Twisted reactor is
# installed in the kernel — confirm on a fresh kernel.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

# Endpoint fetched by the spider below; its body is parsed as JSON
URL = "https://www.chevrolet.ca/content/chevrolet/na/ca/en/index.disclosurespurejson.html"

class ChevyDisclosuresSpider(Spider):
    """Spider that fetches ``URL`` and yields its body decoded as JSON."""

    name = "chevy_disclosures"
    start_urls = [URL]
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
        "USER_AGENT": "Mozilla/5.0",
        "LOG_LEVEL": "ERROR",
    }

    @staticmethod
    def _parse_json_body(text: str):
        """Decode ``text`` as JSON, trying progressively more lenient variants.

        Order of attempts: the HTML-unescaped, stripped text; the same text
        with ``\\/`` and non-breaking spaces normalized; the widest
        ``{...}`` / ``[...]`` span found in the normalized text.

        Returns ``None`` for empty input, the decoded value on the first
        success, or ``{"raw": <unescaped text>}`` when every attempt fails.
        """
        if not text:
            return None

        unescaped = html.unescape(text).strip()

        def _candidates():
            # produced lazily so later variants are only built when needed
            yield unescaped
            normalized = unescaped.replace("\\/", "/").replace("\u00a0", " ")
            yield normalized
            embedded = re.search(r"(\{.*\}|\[.*\])", normalized, flags=re.S)
            if embedded:
                yield embedded.group(1)

        for candidate in _candidates():
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue
        return {"raw": unescaped}

    def parse(self, response):
        """Yield one item holding the response URL and the decoded body."""
        text_nodes = response.xpath("//body//text()").getall()
        body_text = "\n".join(node for node in text_nodes if node.strip())
        yield {"url": response.url, "data": self._parse_json_body(body_text)}

# collect to memory and also write to file
items = []
runner = CrawlerRunner(settings={
    "FEEDS": {"disclosures.json": {"format": "json", "encoding": "utf8", "indent": 2, "overwrite": True}}
})
runner.signals.connect(lambda item, response, spider: items.append(dict(item)), signal=signals.item_scraped)

await ensureDeferred(runner.crawl(ChevyDisclosuresSpider))

print(f"Scraped {len(items)} item(s)")
print("Top-level keys:", list(items[0]["data"].keys())[:15] if items and isinstance(items[0].get("data"), dict) else type(items[0].get("data")))

AttributeError: 'CrawlerRunner' object has no attribute 'signals'

In [None]:
# Simple fallback fetch + parse (useful in notebooks where Scrapy may be heavy)
# This cell fetches the page, extracts text inside <body>, and calls the notebook's parse_json to decode the JSON body.
import requests, json, html, re
from parsel import Selector

URL = "https://www.chevrolet.ca/content/chevrolet/na/ca/en/index.disclosurespurejson.html"
# NOTE(review): this User-Agent identifies the request as Googlebot — confirm
# that is acceptable for this site before running against production.
headers = {"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}

resp = requests.get(URL, headers=headers, timeout=30)
resp.raise_for_status()

sel = Selector(text=resp.text)
# join visible text nodes from body (matches the Scrapy approach used earlier)
body_text = "\n".join(t for t in sel.xpath("//body//text()").getall() if t.strip())

# try to use parse_json from earlier cells; if not available, define a local fallback
try:
    data = parse_json(body_text)  # parse_json is defined earlier in the notebook
except NameError:
    def parse_json(raw):
        """Local stand-in for the notebook's parse_json.

        Decodes ``raw`` as JSON after HTML-unescaping, retries with
        non-breaking spaces replaced, then tries the widest {...} / [...]
        span. Returns None for empty input and the cleaned text (or
        {"raw": text}) when nothing decodes.
        """
        del_json = ["asShownPriceDisclosure", "startingPriceDisclosure"]
        if not raw:
            return None
        s = html.unescape(raw).replace("\\/", "/")
        try:
            data = json.loads(s)
        except json.JSONDecodeError:
            try:
                # Replace the actual NBSP character, as the main parse_json
                # does. The previous "\\u00a0" literal only matched the
                # six-character escape text, never a real NBSP.
                data = json.loads(s.replace("\u00a0", " "))
            except json.JSONDecodeError:
                m = re.search(r"(\{.*\}|\[.*\])", s, flags=re.S)
                if m:
                    try:
                        return json.loads(m.group(1))
                    except json.JSONDecodeError:
                        return {"raw": s}
                return s

        # strip the price-disclosure keys from each nested dict (one level deep)
        if isinstance(data, dict):
            for value in data.values():
                if isinstance(value, dict):
                    for dk in del_json:
                        value.pop(dk, None)
        return data

    data = parse_json(body_text)

print("Parsed type:", type(data))
# show a short preview
preview = json.dumps(data, indent=2, ensure_ascii=False)[:2000]
print(preview)

# write to file
with open("disclosures.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Wrote disclosures.json")
