In [2]:
# Check for required Scrapy and Playwright dependencies
!pip list | grep -E "scrapy|playwright"

playwright                1.54.0
scrapy-playwright         0.0.44


In [47]:
# imports
import html
import os
from urllib.parse import urljoin

from dotenv import load_dotenv
from parsel import Selector

In [4]:
# Load environment variables
load_dotenv()

# Website URLs to scrape
websites_url = [
    "https://www.chevrolet.ca/en/trucks/silverado-1500",
    "https://www.chevrolet.ca/en/suvs/previous-year-equinox",
]

# Configuration (same as scrapper.py)
# os.getenv() returns a string, and every non-empty string -- including the
# literal "False" -- is truthy. Parse the flag explicitly so that DEV=False
# (or an unset DEV) really selects production mode.
DEV_MODE = os.getenv("DEV", "False").strip().lower() in {"1", "true", "yes", "on"}
# LOCAL_URL = f"file://{os.path.join(os.getcwd(), 'silverado_navbar.html')}"
LOCAL_URL = "silverado_navbar.html"

# Dev mode targets the saved sample page; otherwise the first live URL.
SCRAP_WEBSITE = LOCAL_URL if DEV_MODE else websites_url[0]

print(f"DEV_MODE: {DEV_MODE}")
print(f"URLs to scrape: {SCRAP_WEBSITE}")


DEV_MODE: True
URLs to scrape: silverado_navbar.html


In [5]:
import json

from scrapy import Request
from scrapy.spiders import Spider
from scrapy_playwright.page import PageMethod

# Custom settings for Scrapy (same as scrapper.py)
custom_settings = dict(
    ROBOTSTXT_OBEY=True,
    LOG_LEVEL="WARNING",
    DEFAULT_REQUEST_HEADERS={
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-CA,en;q=0.9",
    },
    USER_AGENT=(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0 Safari/537.36"
    ),
)

# Add Playwright settings for production mode
if not DEV_MODE:
    # Both URL schemes are routed through the same Playwright download handler.
    custom_settings["DOWNLOAD_HANDLERS"] = dict.fromkeys(
        ("http", "https"),
        "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    )
    custom_settings["TWISTED_REACTOR"] = (
        "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
    )
    custom_settings["PLAYWRIGHT_BROWSER_TYPE"] = "chromium"
    custom_settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"] = 60000
    custom_settings["AUTOTHROTTLE_ENABLED"] = True
    custom_settings["AUTOTHROTTLE_START_DELAY"] = 1.0
    custom_settings["AUTOTHROTTLE_MAX_DELAY"] = 10.0
    custom_settings["CONCURRENT_REQUESTS"] = 1

print("✅ Scrapy configuration loaded (same as scrapper.py)")
print(f"Settings configured for {'DEV' if DEV_MODE else 'PRODUCTION'} mode")

✅ Scrapy configuration loaded (same as scrapper.py)
Settings configured for DEV mode


In [6]:
# Sample pages live under ./samples, relative to the working directory.
file_path = f"./{os.path.join('samples', SCRAP_WEBSITE)}"
print(file_path)
try:
    with open(file_path, encoding="utf-8") as f:
        html_content = f.read()
except FileNotFoundError:
    # No local sample: leave `selector` unset so later steps can tell.
    print("❌ Local HTML file not found. Will use live URLs.")
    selector = None
else:
    # Create Scrapy selector (replaces BeautifulSoup)
    selector = Selector(text=html_content)
    print("✅ Loaded local HTML file with Scrapy Selector")
    print(f"File size: {len(html_content):,} characters")

./samples/silverado_navbar.html
✅ Loaded local HTML file with Scrapy Selector
File size: 2,934,810 characters


In [None]:
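# NOTE: superseded first draft of the serializers and DFS, kept commented out
# for reference only. The active implementation is defined in a later cell.
# from urllib.parse import urljoin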

# EXCLUDE = {
#     "script",
#     "style",
#     "noscript",
#     "template",
#     "gb-adv-grid",
#     "gb-wrapper",
#     "gb-responsive-image",
#     "adv-col",
#     "span",
# }
# CONTAINERS = {
#     "div",
#     "section",
#     "nav",
#     "header",
#     "footer",
#     "main",
#     "article",
#     "aside",
#     "serialize_gb_button",
#     "gb-button",
#     "picture",
# }


# def text_of(elem):
#     # return " ".join(elem.css("::text").getall()).strip()
#     return " ".join(
#         text for text in " ".join(elem.css("::text").getall()).strip().split()
#     )


# def parse_regional_info_json(elem):
#     # raw = elem.attrib.get("regional-information-json")
#     raw = elem
#     if not raw:
#         return None
#     s = html.unescape(raw).replace("\\/", "/")
#     try:
#         return json.loads(s)
#     except json.JSONDecodeError:
#         # try stripping NBSP; otherwise return the raw string
#         try:
#             return json.loads(s.replace("\u00a0", " ").replace("\xa0", " "))
#         except json.JSONDecodeError:
#             return s


# def serialize_dynamic_text(elem, _):
#     regional_info_json = parse_regional_info_json(
#         elem.attrib.get("regional-information-json")
#     )
#     # regional_info_json = elem.attrib.get("regional-information-json")

#     # regional_info = None
#     # if regional_info_json:
#     #     # Unescape HTML entities (&lt;, &gt;, &amp;, &nbsp;) and normalize escaped slashes
#     #     print("Regional Information in the text ! ")
#     #     unescaped = html.unescape(regional_info_json).replace("\\/", "/")
#     #     try:
#     #         regional_info = json.loads(unescaped)
#     #     except json.JSONDecodeError:
#     #         try:
#     #             regional_info = json.loads(
#     #                 unescaped.replace("\u00a0", " ").replace("\xa0", " ")
#     #             )
#     #         except json.JSONDecodeError:
#     #             regional_info = unescaped

#     txt = text_of(elem)
#     print("Found a Dynamic Text")
#     return {
#         "gb-dynamic-text": {
#             "text": txt if txt else {},
#             "class": elem.attrib.get("class", ""),
#             "country": elem.attrib.get("country"),
#             "regional_information": regional_info_json,
#         }
#     }


# def serialize_heading(elem: Selector):
#     # Use ./text() to get only direct text nodes, not descendant text
#     parts = [t.strip() for t in elem.xpath("./text()").getall()]
#     text = " ".join(p for p in parts if p)
#     return {
#         "heading": {
#             "classes": elem.attrib.get("class", ""),
#             "text": text,
#         }
#     }


# # def serialize_heading(elem: Selector):
# #     parts = [t.strip() for t in elem.xpath("./text()").getall()]
# #     text = " ".join(p for p in parts if p)
# #     return {
# #         "heading": {
# #             "classes": elem.attrib.get("class", ""),
# #             "text": text,
# #             # "text": " ".join(
# #             #     text for text in " ".join(elem.css("::text").getall()).strip().split()
# #             # ),
# #         }
# #     }


# def serialize_a(el: str, base: str):
#     href = el.attrib.get("href")

#     return {
#         "a": {
#             "text": text_of(el),
#             "title": el.attrib.get("title", ""),
#             "href": urljoin(base, href) if href else None,
#             "link_type": ("internal" if is_internal_Link(href) else "external"),
#             "classes": el.attrib.get("class", ""),
#             **({"type": el.attrib["type"]} if "type" in el.attrib else {}),
#             **({"target": el.attrib.get("target")} if "target" in el.attrib else {}),
#         }
#     }


# # ...rest of the cell unchanged...
# def serialize_picture_source(elem, base):
#     """
#     <source
#         media="(min-width: 0px)"
#         class="gb-rectangle-image-responsive"
#         height="1000"
#         width="2000"
#         data-aspectratio="2.0"
#         srcset="
#             /content/dam/chevrolet/na/canada/english/index/crossovers-suvs/2025-trax/jelly/2025-trax-1sa-gvr-driver-front-3quarter-nav.jpg?imwidth=800  1x,
#             /content/dam/chevrolet/na/canada/english/index/crossovers-suvs/2025-trax/jelly/2025-trax-1sa-gvr-driver-front-3quarter-nav.jpg?imwidth=1600 2x
#         "
#     />
#     """
#     # a source tag for the picture tag might look like this
#     src = elem.attrib.get("srcset")
#     srcs = [cl_s for ech_s in src.split(",") for cl_s in ech_s.strip().split("\n")]
#     return {
#         "picture_source": {
#             "media": elem.attrib.get("media"),
#             "height": elem.attrib.get("height"),
#             "width": elem.attrib.get("width"),
#             "srcset": srcs,
#             "link_type": ("internal" if is_internal_Link(src) else "external"),
#             "classes": elem.attrib.get("class", ""),
#             "data_aspectratio": elem.attrib.get("data-aspectratio").split(","),
#         }
#     }


# def serialize_button(el, base):
#     """Button may look like this:
#     <button
#         class="gn-main-button stat-icon-link"
#         flyout="primary-0"
#         href="/content/chevrolet/na/ca/en/portablenavigation/simplified-nav/primary-navigation/hamburger-menu/hamburger-menu.html"
#         data-dtm="global nav"
#         data-hamburger-menu="true"
#         data-flyout-pagetitle="hamburger-menu"
#     >

#     or

#     <button
#         class="gn-main-button stat-text-link"
#         flyout="primary-1"
#         href="/content/chevrolet/na/ca/en/portablenavigation/simplified-nav/primary-navigation/vehicles/vehicles.html"
#         title="Vehicles"
#         data-dtm="global nav"
#         data-dtm2="Vehicles"
#         data-flyout-pagetitle="vehicles"
#     >

#     or

#     <button
#         class="gn-main-button hide-large stat-text-link"
#         flyout="more-flyout"
#         aria-haspopup="true"
#         aria-expanded="false"
#         data-dtm="global nav"
#         data-dtm2="More"
#     >
#     """

#     act = el.attrib.get("href") or el.attrib.get("formaction")
#     full_url = urljoin(base, act) if act else None

#     return {
#         "button": {
#             "text": text_of(el),
#             "url": full_url,
#             "flyout": el.attrib.get("flyout"),
#             "data_dtm": el.attrib.get("data-dtm"),
#             "data_dtm2": el.attrib.get("data-dtm2"),
#             "link_type": ("internal" if is_internal_Link(act) else "external")
#             if act
#             else "NA",
#             "classname": el.attrib.get("class", ""),
#             **({"type": el.attrib["type"]} if "type" in el.attrib else {}),
#             **(
#                 {"disabled": "disabled"}
#                 if "disabled" in el.attrib or el.attrib.get("aria-disabled") == "true"
#                 else {}
#             ),
#             **({"title": el.attrib["title"]} if "title" in el.attrib else {}),
#             **(
#                 {"data_hamburger_menu": el.attrib["data-hamburger-menu"]}
#                 if "data-hamburger-menu" in el.attrib
#                 else {}
#             ),
#             **(
#                 {"data_flyout_pagetitle": el.attrib["data-flyout-pagetitle"]}
#                 if "data-flyout-pagetitle" in el.attrib
#                 else {}
#             ),
#             **(
#                 {"aria-haspopup": el.attrib["aria-haspopup"]}
#                 if "aria-haspopup" in el.attrib
#                 else {}
#             ),
#             **(
#                 {"aria-expanded": el.attrib["aria-expanded"]}
#                 if "aria-expanded" in el.attrib
#                 else {}
#             ),
#         }
#     }


# def is_internal_Link(link: str | None) -> bool:
#     if link:
#         # check if this link starts with '/' or it doesnt start with "http://" or "https://" or "www."
#         return (
#             True
#             if link.startswith("/")
#             or not any(
#                 link.startswith(prefix) for prefix in ("http://", "https://", "www.")
#             )
#             else False
#         )
#     return False


# def serialize_image(el, base):
#     src = el.attrib.get("src")
#     return {
#         "img": {
#             "src": urljoin(base, src) if src else None,
#             "classes": el.attrib.get("class", ""),
#             "alt": el.attrib.get("alt"),
#             "title": el.attrib.get("title"),
#             "link_type": ("internal" if is_internal_Link(src) else "external"),
#             **({"imwidth": el.attrib.get("title")} if "imwidth" in el.attrib else {}),
#             **({"loading": el.attrib.get("loading")} if "loading" in el.attrib else {}),
#         }
#     }


# NATIVE = {
#     "a": serialize_a,
#     "button": serialize_button,
#     "input": serialize_button,  # handles type=button/submit/reset
#     "img": serialize_image,
#     "source": serialize_picture_source,
#     "gb-dynamic-text": serialize_dynamic_text,
# }


# def _append(kids, node):
#     if node is None:
#         return
#     if isinstance(node, list):
#         kids.extend(node)
#     else:
#         kids.append(node)


# def dfs(el, base):
#     tag = el.root.tag.lower()

#     if tag in EXCLUDE:
#         kids = []
#         for ch in el.xpath("./*"):
#             _append(kids, dfs(ch, base))
#         return kids if kids else None

#     if tag in NATIVE:
#         if tag == "input" and el.attrib.get("type") not in {
#             "button",
#             "submit",
#             "reset",
#         }:
#             pass
#         else:
#             try:
#                 return NATIVE[tag](el, base)
#             except Exception as e:
#                 # swallow and continue so one bad <source> doesn't kill traversal
#                 return {"_skip": {"tag": tag, "error": str(e)}}

#     if tag == "ul":
#         items = []
#         for li in el.xpath("./li"):
#             _append(items, dfs(li, base))
#         return {"ul": items} if items else None

#     if tag == "li":
#         kids, txt = [], text_of(el)
#         for ch in el.xpath("./*"):
#             _append(kids, dfs(ch, base))
#         return {"li": {"text": txt, "content": kids}} if (kids or txt) else None

#     if tag in CONTAINERS:
#         kids = []
#         for ch in el.xpath("./*"):
#             _append(kids, dfs(ch, base))
#         classname = el.attrib.get("class", "")
#         if not classname and len(kids) == 1:
#             return kids[0]
#         result = {"type": tag, "class": classname, "content": kids}
#         for attr in [
#             "data-hamburger-menu",
#             "data-province-selector-enabled",
#             "role",
#             "aria-hidden",
#             "aria-label",
#             "flyout-id",
#             "close-button-label",
#         ]:
#             if attr in el.attrib:
#                 result[attr] = el.attrib[attr]
#         return result

#     if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
#         return {tag: serialize_heading(el)}
#     if tag == "p":
#         txt = text_of(el)
#         return {"p": txt} if txt else None

#     kids, txt = [], text_of(el)
#     for ch in el.xpath("./*"):
#         _append(kids, dfs(ch, base))
#     if kids or txt:
#         return {"tag": tag, "components": kids, **({"text": txt} if txt else {})}
#     return None

In [None]:
# root = selector.xpath("//gb-global-nav/template[@id='gb-global-nav-content']")[0]
# BASE = "https://www.chevrolet.ca/"
# tree = [n for n in (dfs(ch, BASE) for ch in root.xpath("./*")) if n is not None]
# print(json.dumps(tree, indent=2, ensure_ascii=False))

In [None]:
# Tags that never get a node of their own: dfs() drops the element itself and
# splices its serialized children into the parent.
EXCLUDE = {
    # standard HTML tags
    "script", "style", "noscript", "template", "span",
    # site-specific custom elements
    "gb-adv-grid", "gb-wrapper", "gb-responsive-image", "adv-col",
    "gb-tab-nav",  # usually just adds nodes in tree, wraps an unordered list
}

# Tags that dfs() replaces by their only child when they have no class and no
# direct text.
WRAPPERS = {
    "div", "section", "nav", "header", "footer",
    "main", "article", "aside", "picture",
}


def own_text(el):
    """Return the direct text of ``el`` as one space-separated string.

    Only the nodes matched by ``./text()`` are used. Each one is stripped,
    empty results are discarded, and the rest are joined with single spaces.
    """
    pieces = []
    for raw in el.xpath("./text()").getall():
        cleaned = raw.strip()
        if cleaned:
            pieces.append(cleaned)
    return " ".join(pieces)


def all_text(el):
    """Return every text node matched by ``::text`` under ``el`` as one string.

    All fragments are concatenated and each run of whitespace is collapsed to
    a single space; leading and trailing whitespace is removed.
    """
    fragments = el.css("::text").getall()
    words = " ".join(fragments).split()
    return " ".join(words)


def _append(kids, node):
    """Add a serialized node to ``kids`` in place.

    ``None`` is ignored, a ``list`` is spliced in element by element, and any
    other value is added as a single entry.
    """
    if node is not None:
        kids.extend(node if isinstance(node, list) else [node])


def is_internal_link(u: str | None) -> bool:
    if not u:
        return False
    u = u.strip().split()[0]
    return u.startswith("/") or not u.startswith(("http://", "https://", "www."))


def _norm_url(base, u):
    """Resolve a raw link value against ``base`` into an absolute URL.

    Only the first whitespace-separated token of ``u`` is used, which drops
    ``srcset`` descriptors such as ``1x`` or ``800w``.

    Args:
        base: Base URL passed to ``urljoin``.
        u: Raw attribute value, or ``None``.

    Returns:
        The joined URL, or ``None`` when ``u`` is ``None``, empty or
        whitespace-only.
    """
    if not u:
        return None
    tokens = u.split()
    if not tokens:
        # Whitespace-only value: nothing to resolve (avoids an IndexError).
        return None
    return urljoin(base, tokens[0])


def _attrs(el):
    """Return a new ``dict`` holding every attribute of ``el`` verbatim."""
    return {name: value for name, value in el.attrib.items()}


def parse_json(raw):
    """Decode a JSON value stored in an HTML attribute.

    HTML entities are unescaped and ``\\/`` is rewritten to ``/`` first. The
    text is then parsed as JSON, with one retry in which non-breaking spaces
    are replaced by ordinary spaces.

    Returns:
        ``None`` for falsy input, the decoded JSON value on success, or the
        unescaped text itself when both parse attempts fail.
    """
    if not raw:
        return None
    text = html.unescape(raw).replace("\\/", "/")
    for candidate in (text, text.replace("\u00a0", " ")):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return text


# -------- serializers (must accept children) --------
def serialize_a(el, base, children):
    """Serialize an ``<a>`` element as ``{"a": {...}}``.

    ``text`` comes from ``all_text``, ``href`` is resolved against ``base``
    with ``_norm_url``, and ``link_type`` is ``"internal"`` or ``"external"``
    according to ``is_internal_link`` on the raw ``href`` (``None`` when there
    is no ``href``). ``content`` holds the already-serialized children, or
    ``None`` when there are none.
    """
    attributes = el.attrib
    href = attributes.get("href")

    anchor = {
        "text": all_text(el),
        "title": attributes.get("title", ""),
        "href": _norm_url(base, href),
    }
    if href:
        anchor["link_type"] = "internal" if is_internal_link(href) else "external"
    else:
        anchor["link_type"] = None
    anchor["classes"] = attributes.get("class", "")
    anchor["target"] = attributes.get("target")
    anchor["content"] = children or None
    return {"a": anchor}


def serialize_button_like(el, base, children):
    """Serialize a button-like element as ``{"button": {...}}``.

    The target is the ``href`` attribute or, failing that, ``formaction``.
    It is resolved with ``_norm_url`` and classified with
    ``is_internal_link`` (``link_type`` is ``None`` when there is no target).
    Every ``data-*`` attribute is copied under its own name, followed by
    ``title``, ``aria-haspopup`` and ``aria-expanded`` when present.
    """
    attributes = el.attrib
    action = attributes.get("href") or attributes.get("formaction")

    button = {
        "text": all_text(el),
        "url": _norm_url(base, action),
    }
    if action:
        button["link_type"] = "internal" if is_internal_link(action) else "external"
    else:
        button["link_type"] = None
    button["classname"] = attributes.get("class", "")
    button["content"] = children or None

    # data-* attributes keep their original names and document order.
    for name in attributes:
        if name.startswith("data-"):
            button[name] = attributes[name]
    for name in ("title", "aria-haspopup", "aria-expanded"):
        if name in attributes:
            button[name] = attributes[name]
    return {"button": button}


def serialize_img(el, base, _children):
    """Serialize an ``<img>`` element as ``{"img": {...}}``.

    ``src`` is resolved against ``base`` with ``_norm_url``. ``link_type`` is
    derived from the raw ``src`` via ``is_internal_link`` and is ``None`` when
    ``src`` is missing. All ``data-*`` attributes are appended under their own
    names. Serialized children are ignored.
    """
    attributes = el.attrib
    src = attributes.get("src")

    image = {
        "src": _norm_url(base, src),
        "classes": attributes.get("class", ""),
        "alt": attributes.get("alt"),
        "title": attributes.get("title"),
    }
    if src:
        image["link_type"] = "internal" if is_internal_link(src) else "external"
    else:
        image["link_type"] = None
    image["loading"] = attributes.get("loading")
    image.update(
        {name: attributes[name] for name in attributes if name.startswith("data-")}
    )
    return {"img": image}


def serialize_source(el, base, _children):
    """Serialize a ``<source>`` element as ``{"source": {...}}``.

    ``srcset`` is split on commas. The first whitespace-separated token of
    each candidate is resolved with ``_norm_url``, and empty candidates or
    unresolved URLs are dropped. Serialized children are ignored.
    """
    attributes = el.attrib
    raw_srcset = (attributes.get("srcset") or "").replace("\n", " ")

    # One token list per comma-separated candidate; the URL is the first token.
    candidates = (candidate.split() for candidate in raw_srcset.split(","))
    resolved = [_norm_url(base, tokens[0]) for tokens in candidates if tokens]

    source = {
        "media": attributes.get("media"),
        "height": attributes.get("height"),
        "width": attributes.get("width"),
        "srcset": [url for url in resolved if url],
        "classes": attributes.get("class", ""),
        "data_aspectratio": attributes.get("data-aspectratio"),
    }
    return {"source": source}


def serialize_heading(el, _base, _children):
    """Serialize a heading as ``{"heading": <direct text>}``.

    Only the element's own text (``own_text``) is kept; ``_base`` and the
    serialized children are not used.
    """
    heading_text = own_text(el)
    return dict(heading=heading_text)


def serialize_gb_dynamic_text(el, _base, _children):
    """Serialize a ``<gb-dynamic-text>`` element.

    ``text`` is the result of ``all_text`` (``None`` when empty), and
    ``regional_information`` is the ``regional-information-json`` attribute
    decoded with ``parse_json``. ``_base`` and the serialized children are
    not used.
    """
    attributes = el.attrib
    payload = {
        "text": all_text(el) or None,
        "class": attributes.get("class", ""),
        "country": attributes.get("country"),
    }
    payload["regional_information"] = parse_json(
        attributes.get("regional-information-json")
    )
    return {"gb-dynamic-text": payload}


def serialize_myaccount_flyout(el, base, children):
    """Serialize a ``<gb-myaccount-flyout>`` element.

    The ``authflyoutdata``, ``authlinkdata`` and ``fallbackdata`` attributes
    are decoded with ``parse_json``. ``content`` keeps the serialized children,
    or ``None`` when there are none. ``base`` is not used.
    """
    attributes = el.attrib
    flyout = {
        "class": attributes.get("class", ""),
        "flyoutstate": attributes.get("flyoutstate"),
    }
    # Output key -> name of the attribute carrying its JSON payload.
    for key, attr_name in (
        ("auth_flyout", "authflyoutdata"),
        ("auth_links", "authlinkdata"),
        ("fallback", "fallbackdata"),
    ):
        flyout[key] = parse_json(attributes.get(attr_name))
    flyout["content"] = children or None  # preserve nested nodes if any
    return {"gb-myaccount-flyout": flyout}


def _attrs_copy(el):
    """Return a fresh ``dict`` copy of ``el.attrib`` (``{}`` when it is falsy)."""
    attributes = el.attrib
    if not attributes:
        return {}
    return dict(attributes)


def _pop_cls(attrs):
    """Remove ``"class"`` from ``attrs`` in place.

    Returns:
        ``(class_value, attrs)``, where ``class_value`` is ``None`` if the key
        was absent and ``attrs`` is the same (now class-free) dict.
    """
    return attrs.pop("class", None), attrs


def serialize_li(el, _base, children):
    """Serialize an ``<li>`` element as ``{"item": {...}}``.

    The inner dict only contains the keys that have a truthy value:
    ``li_class`` (the ``class`` attribute), ``attrs`` (all other attributes),
    ``text`` (the element's own text) and ``content`` (serialized children).
    """
    li_class, rest = _pop_cls(_attrs_copy(el))
    txt = own_text(el)

    item = {}
    if li_class:
        item["li_class"] = li_class
    if rest:
        item["attrs"] = rest
    if txt:
        item["text"] = txt
    if children:
        item["content"] = children
    return {"item": item}


def _serialize_list(kind, el, base, children):
    """Build the node for a list element, keyed by ``kind`` (``"ul"``/``"ol"``).

    ``children`` are already serialized. Dict entries carrying an ``"item"``
    key are unwrapped into ``items``; everything else goes to ``content``.
    ``items`` is always present, while ``class``, ``attrs`` and ``content``
    appear only when non-empty. ``base`` is not used.
    """

    def _is_item(entry):
        return isinstance(entry, dict) and "item" in entry

    items = [entry["item"] for entry in children if _is_item(entry)]
    other = [entry for entry in children if not _is_item(entry)]
    cls, rest = _pop_cls(_attrs_copy(el))

    body = {}
    if cls:
        body["class"] = cls
    if rest:
        body["attrs"] = rest
    body["items"] = items
    if other:
        body["content"] = other
    return {kind: body}


def serialize_ul(el, base, children):
    """Serialize a ``<ul>`` element via ``_serialize_list`` under the ``"ul"`` key."""
    return _serialize_list(kind="ul", el=el, base=base, children=children)


def serialize_ol(el, base, children):
    """Serialize an ``<ol>`` element via ``_serialize_list`` under the ``"ol"`` key."""
    return _serialize_list(kind="ol", el=el, base=base, children=children)


# Dispatch table used by dfs(): lower-cased tag name -> serializer taking
# (el, base, children).
NATIVE = {
    "a": serialize_a,
    "button": serialize_button_like,
    "input": serialize_button_like,  # gated below
    "img": serialize_img,
    "source": serialize_source,
    "gb-dynamic-text": serialize_gb_dynamic_text,
    # h1 .. h6 all share the same serializer
    **{f"h{level}": serialize_heading for level in range(1, 7)},
    "ul": serialize_ul,
    "ol": serialize_ol,
    "li": serialize_li,
    "gb-myaccount-flyout": serialize_myaccount_flyout,
}


def serialize_generic(el, children):
    """Serialize any element without a dedicated serializer.

    Returns a dict that always has ``tag`` (lower-cased) and, when non-empty,
    ``attrs`` (all attributes), ``text`` (the element's own text) and
    ``content`` (serialized children), in that order.
    """
    node = {"tag": el.root.tag.lower()}
    optional = (
        ("attrs", _attrs(el)),
        ("text", own_text(el)),
        ("content", children),
    )
    node.update((key, value) for key, value in optional if value)
    return node


# -------- unified DFS --------
def dfs(el, base):
    """Serialize ``el`` and its element descendants depth-first.

    Args:
        el: Selector-like element exposing ``root.tag``, ``attrib`` and
            ``xpath``.
        base: Base URL handed to the tag serializers for resolving links.

    Returns:
        * a list of serialized children (or ``None`` if there are none) when
          the tag is in ``EXCLUDE`` -- the element itself is dropped;
        * the result of the matching ``NATIVE`` serializer, or the generic
          node if that serializer raises (the failure is logged as a warning);
        * the single child when the tag is in ``WRAPPERS`` and the element has
          no class, no direct text and exactly one serialized child;
        * otherwise the ``serialize_generic`` node.
    """
    import logging

    tag = el.root.tag.lower()

    # Children are serialized first for every element, excluded or not.
    children = []
    for child in el.xpath("./*"):
        _append(children, dfs(child, base))

    # 1) drop excluded wrappers but keep their children
    if tag in EXCLUDE:
        return children or None

    # 2) dedicated serializer; <input> qualifies only for button-like types,
    #    every other input falls through to the generic node
    handler = NATIVE.get(tag)
    is_plain_input = tag == "input" and el.attrib.get("type") not in {
        "button",
        "submit",
        "reset",
    }
    if handler is not None and not is_plain_input:
        try:
            return handler(el, base, children)
        except Exception as exc:
            # Best effort: one bad element must not abort the traversal, but
            # the failure should stay visible instead of vanishing silently.
            logging.getLogger(__name__).warning(
                "serializer for <%s> failed (%s: %s); using generic node",
                tag,
                type(exc).__name__,
                exc,
            )
            return serialize_generic(el, children)

    # 3) flatten trivial wrappers
    if tag in WRAPPERS:
        cls = el.attrib.get("class", "").strip()
        if not cls and not own_text(el) and len(children) == 1:
            return children[0]

    # 4) generic element
    return serialize_generic(el, children)


In [63]:
BASE = "https://www.chevrolet.ca/"
# The global-nav markup sits inside a <template>; take the first match.
root = selector.xpath("//gb-global-nav/template[@id='gb-global-nav-content']")[0]
# Serialize every top-level child and discard the ones that produced nothing.
tree = list(
    filter(
        lambda node: node is not None,
        map(lambda child: dfs(child, BASE), root.xpath("./*")),
    )
)
print(json.dumps(tree, indent=2, ensure_ascii=False))


[
  {
    "tag": "div",
    "attrs": {
      "class": "gn-extras"
    },
    "content": [
      {
        "button": {
          "text": "",
          "url": null,
          "link_type": null,
          "classname": "gb-visually-hide-show gb-skip-to-main-content gb-body1",
          "content": null
        }
      }
    ]
  },
  {
    "tag": "div",
    "attrs": {
      "class": "gn-aria-label"
    }
  },
  {
    "tag": "div",
    "attrs": {
      "class": "gn-logo-container"
    },
    "content": [
      {
        "a": {
          "text": "",
          "title": "Home",
          "href": "https://www.chevrolet.ca/en",
          "link_type": "external",
          "classes": "stat-image-link",
          "target": null,
          "content": [
            {
              "img": {
                "src": "https://www.chevrolet.ca/content/dam/chevrolet/na/canada/english/index/navigation-icons/global-icons/chevrolet-bowtie-120.svg",
                "classes": "gn-logo gn-nav-logo-image",
       

In [64]:
# Get all unique tags from the root element
def get_all_unique_tags(element):
    """
    Collect the lower-cased tag names of ``element`` and all its descendants.

    Nodes lacking a ``root`` attribute, or whose ``root`` has no ``tag``, are
    skipped. Descendants are obtained with the ``.//*`` XPath query.
    """
    missing = object()  # sentinel: tells "attribute absent" apart from any value
    found = set()

    for node in [element, *element.xpath(".//*")]:
        tag = getattr(getattr(node, "root", missing), "tag", missing)
        if tag is not missing:
            found.add(tag.lower())

    return found


# Extract all unique tags from the root element
# (`root` is the element selected in the tree-building cell above.)
unique_tags = get_all_unique_tags(root)

# Report the count, then one tag per line in alphabetical order.
print(f"Total unique tags found: {len(unique_tags)}")
print("\nAll unique tags (sorted alphabetically):")
for tag in sorted(unique_tags):
    print(f"  - {tag}")

# Raw set repr; set ordering is arbitrary, unlike the sorted list above.
print(f"\nAs a set: {unique_tags}")

Total unique tags found: 31

All unique tags (sorted alphabetically):
  - a
  - adv-col
  - br
  - button
  - div
  - gb-adv-grid
  - gb-button
  - gb-disclosure
  - gb-dynamic-text
  - gb-flyout
  - gb-myaccount-flyout
  - gb-myaccount-nav
  - gb-region-dropdown
  - gb-region-selector
  - gb-responsive-image
  - gb-tab-nav
  - gb-target-zone
  - gb-wrapper
  - h2
  - h3
  - h4
  - h6
  - img
  - li
  - nav
  - p
  - picture
  - source
  - span
  - template
  - ul

As a set: {'gb-responsive-image', 'gb-adv-grid', 'div', 'ul', 'span', 'nav', 'h2', 'a', 'gb-target-zone', 'gb-region-dropdown', 'picture', 'img', 'gb-tab-nav', 'gb-flyout', 'adv-col', 'h4', 'template', 'gb-wrapper', 'button', 'p', 'gb-myaccount-flyout', 'h3', 'gb-dynamic-text', 'h6', 'gb-button', 'gb-myaccount-nav', 'br', 'source', 'gb-region-selector', 'gb-disclosure', 'li'}


## Comparison: Basic Scraping vs JavaScript-Rendered Scraping

The Scrapy + Playwright script gives you the **complete, fully-rendered HTML** that a human user would see in their browser. This includes:

### What the Scrapy script captured:
- **Complete trim information**: WT, Custom, LT, RST, Custom Trail Boss, LTZ, LT Trail Boss, High Country, ZR2
- **Fully loaded JavaScript content**
- **Dynamic pricing and specifications**
- **Interactive elements rendered as static HTML**

### Difference from basic `requests` approach:
- `requests` + `BeautifulSoup` = Raw server HTML (often incomplete)
- `Scrapy` + `Playwright` = Full browser-rendered HTML (complete content)

Let's parse the rendered Silverado page: