In [3]:
!pip list | grep "beautiful"

beautifulsoup4            4.12.3


In [4]:
websites_url = [
    "https://www.chevrolet.ca/en/trucks/silverado-1500",
    "https://www.chevrolet.ca/en/suvs/previous-year-equinox",
]

In [5]:
import requests
from bs4 import BeautifulSoup

url = websites_url[1]
response = requests.get(url)  # Send HTTP GET
html = response.text  # Page HTML


In [6]:
soup = BeautifulSoup(html, "lxml")

In [7]:
soup

<!DOCTYPE HTML>
<html dir="ltr" lang="en-CA">
<head>
<meta content="generic-templates-page" name="template"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="The all-new 2025 Equinox compact SUV offers versatile interiors, smart driver technologies, &amp; sleek, eye-catching designs - taking your drive to the next level." name="description"/>
<link crossorigin="" href="//brands.gm-cdn.com" rel="preconnect"/>
<link crossorigin="" href="//players.brightcove.net" rel="preconnect"/>
<link crossorigin="" href="//assets.adobedtm.com" rel="preconnect"/>
<script>
            var getLangObj = "[{\x22langCode\x22:\x22en\x22,\x22langHref\x22:\x22https:\/\/www.chevrolet.ca\/en\/en\/suvs\/previous\u002Dyear\u002Dequinox\x22,\x22currentLangCode\x22:\x22en\x22},{\x22langCode\x22:\x22fr\x22,\x22langHref\x22:\x22https:\/\/www.chevrolet.ca\/fr\/

In [8]:
with open("scrap_exquinox.html", "w") as f:
    f.write(str(soup))


In [9]:
# Get page title
title = soup.title.string

# Find first <h1>
h1 = soup.find("h1").text

# Find all links
links = [a["href"] for a in soup.find_all("a", href=True)]

# Find all paragraphs
paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]


In [10]:
title

'2025 Chevrolet Equinox | Compact SUV | Chevrolet Canada'

## Comparison: Basic Scraping vs JavaScript-Rendered Scraping

The Scrapy + Playwright script gives you the **complete, fully-rendered HTML** that a human user would see in their browser. This includes:

### What the Scrapy script captured:
- **Complete trim information**: WT, Custom, LT, RST, Custom Trail Boss, LTZ, LT Trail Boss, High Country, ZR2
- **Fully loaded JavaScript content**
- **Dynamic pricing and specifications**
- **Interactive elements rendered as static HTML**

### Difference from basic `requests` approach:
- `requests` + `BeautifulSoup` = Raw server HTML (often incomplete)
- `Scrapy` + `Playwright` = Full browser-rendered HTML (complete content)

Let's parse the rendered Silverado page:

## New Approach: Pure Scrapy Method

Instead of using BeautifulSoup for parsing, let's use Scrapy's built-in selector system which is more powerful and efficient:

In [None]:
# Install required packages for Scrapy + Playwright
!pip install scrapy scrapy-playwright

In [None]:
import json
from pathlib import Path

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class ImprovedCarSpider(scrapy.Spider):
    name = "car_scraper"

    # Custom settings for JavaScript rendering
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_BROWSER_TYPE": "chromium",
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 30000,
        "USER_AGENT": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "CONCURRENT_REQUESTS": 1,
        "DOWNLOAD_DELAY": 2,
    }

    def __init__(self, urls=None, *args, **kwargs):
        super(ImprovedCarSpider, self).__init__(*args, **kwargs)
        self.start_urls = urls or [
            "https://www.chevrolet.ca/en/trucks/silverado-1500",
            "https://www.chevrolet.ca/en/suvs/previous-year-equinox",
        ]
        self.extracted_data = []

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                meta={
                    "playwright": True,
                    "playwright_context": "default",
                    "playwright_context_kwargs": {
                        "viewport": {"width": 1366, "height": 768},
                    },
                    "playwright_page_methods": [
                        ("wait_for_load_state", "domcontentloaded"),
                        ("wait_for_load_state", "networkidle"),
                    ],
                },
                callback=self.parse,
            )

    def parse(self, response):
        # Extract data using Scrapy selectors (instead of BeautifulSoup)

        # Basic page info
        page_data = {
            "url": response.url,
            "title": response.css("title::text").get(),
            "meta_description": response.css(
                'meta[name="description"]::attr(content)'
            ).get(),
        }

        # Extract headings
        page_data["headings"] = {
            "h1": response.css("h1::text").getall(),
            "h2": response.css("h2::text").getall(),
            "h3": response.css("h3::text").getall(),
        }

        # Extract all links
        page_data["links"] = [
            {"text": link.css("::text").get(), "url": link.css("::attr(href)").get()}
            for link in response.css("a[href]")
        ]

        # Extract prices (common patterns)
        page_data["prices"] = response.css(
            '*[class*="price"], *[class*="Price"], *[class*="cost"]::text'
        ).getall()

        # Extract vehicle-specific info
        page_data["vehicle_info"] = self.extract_vehicle_info(response)

        # Save to file
        filename = f"{response.url.split('/')[-1]}_scrapy_data.json"
        with open(filename, "w") as f:
            json.dump(page_data, f, indent=2)

        self.extracted_data.append(page_data)
        self.logger.info(f"Extracted data from {response.url} -> {filename}")

        return page_data

    def extract_vehicle_info(self, response):
        """Extract vehicle-specific information"""
        vehicle_info = {}

        # Look for trim levels in various ways
        vehicle_info["trims"] = response.css(
            '*[class*="trim"], *[class*="Trim"]::text'
        ).getall()

        # Look for specifications
        vehicle_info["specs"] = response.css(
            '*[class*="spec"], *[class*="Spec"]::text'
        ).getall()

        # Look for features
        vehicle_info["features"] = response.css(
            '*[class*="feature"], *[class*="Feature"]::text'
        ).getall()

        # Extract any text containing vehicle model info
        text_content = response.css("body::text").getall()
        vehicle_info["model_mentions"] = [
            text.strip()
            for text in text_content
            if any(
                keyword in text.lower()
                for keyword in ["silverado", "equinox", "2025", "2024"]
            )
        ]

        return vehicle_info


print("✅ Scrapy spider class created!")
print("This spider will:")
print("- Use Playwright for JavaScript rendering")
print("- Extract data using Scrapy selectors (no BeautifulSoup)")
print("- Save structured data as JSON files")

In [None]:
# Function to run the spider from notebook
def run_scrapy_spider(urls=None):
    """Run the Scrapy spider with the given URLs"""

    # Set up the crawler process
    process = CrawlerProcess(
        {
            "USER_AGENT": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
            "ROBOTSTXT_OBEY": False,
            "DOWNLOAD_DELAY": 1,
            "CONCURRENT_REQUESTS": 1,
        }
    )

    # Add the spider
    spider_args = {"urls": urls} if urls else {}
    process.crawl(ImprovedCarSpider, **spider_args)

    # Start the spider
    print("🚀 Starting Scrapy spider...")
    process.start()  # This will block until crawling is finished
    print("✅ Scraping completed!")


# Alternative: Run spider using command line (recommended for notebooks)
def run_spider_via_file():
    """Create a standalone spider file and run it"""

    # URLs to scrape
    urls_to_scrape = [
        "https://www.chevrolet.ca/en/trucks/silverado-1500",
        "https://www.chevrolet.ca/en/suvs/previous-year-equinox",
    ]

    print("💡 To run the spider, use this command in terminal:")
    print("scrapy runspider -s ROBOTSTXT_OBEY=False improved_scraper.py")
    print()
    print("📝 Creating improved_scraper.py file...")

    return urls_to_scrape


# Show available options
print("🎯 Two ways to run Scrapy without BeautifulSoup:")
print("1. run_scrapy_spider() - Run directly from notebook")
print("2. run_spider_via_file() - Create file and run from terminal")
print()
print("📊 Data extraction uses Scrapy selectors:")
print("- response.css('selector') - CSS selectors")
print("- response.xpath('//xpath') - XPath selectors")
print("- response.css('::text').get() - Extract text")
print("- response.css('::attr(href)').get() - Extract attributes")

In [None]:
# Demonstration: Scrapy Selectors vs BeautifulSoup
# Let's show how to parse the existing HTML files using Scrapy selectors

from scrapy import Selector

# Load the existing HTML file
with open("silverado1500.html", "r", encoding="utf-8") as f:
    html_content = f.read()

# Create a Scrapy selector (replaces BeautifulSoup)
selector = Selector(text=html_content)

print("🕷️ SCRAPY SELECTOR APPROACH (replaces BeautifulSoup)")
print("=" * 60)

# Extract title using Scrapy selector
title = selector.css("title::text").get()
print(f"📄 Title: {title}")

# Extract meta description
meta_desc = selector.css('meta[name="description"]::attr(content)').get()
print(
    f"📝 Description: {meta_desc[:100]}..."
    if meta_desc
    else "📝 Description: Not found"
)

# Extract all headings
h1_tags = selector.css("h1::text").getall()
print(f"🔤 H1 headings found: {len(h1_tags)}")
for i, h1 in enumerate(h1_tags[:3], 1):  # Show first 3
    print(f"   {i}. {h1.strip()}")

# Extract links (first 5)
links = selector.css("a[href]")
print(f"🔗 Links found: {len(links)}")
for i, link in enumerate(links[:5], 1):
    link_text = link.css("::text").get() or "No text"
    link_url = link.css("::attr(href)").get()
    print(f"   {i}. {link_text.strip()[:30]}... -> {link_url}")

print(f"\n📊 File size: {len(html_content):,} characters")
print("✅ All extracted using Scrapy selectors - NO BeautifulSoup needed!")

In [11]:
# Parse the JavaScript-rendered Silverado page
from bs4 import BeautifulSoup

with open("silverado1500.html", "r", encoding="utf-8") as f:
    rendered_html = f.read()

# Create soup from the fully-rendered content
silverado_soup = BeautifulSoup(rendered_html, "lxml")

print(f"File size: {len(rendered_html):,} characters")
print(
    f"Title: {silverado_soup.title.string if silverado_soup.title else 'No title found'}"
)

File size: 3,063,072 characters
Title: 
      2025 Chevrolet Silverado 1500 | Pickup Truck | Chevrolet Canada
    


In [12]:
# Extract vehicle trim information that was captured by Playwright
import re

# Search for trim level information
trim_pattern = r"WT:Custom:LT:RST:Custom Trail Boss:LTZ:LT Trail Boss:High Country:ZR2"
trim_matches = re.findall(trim_pattern, rendered_html)

print("Silverado 1500 Trim Levels Found:")
if trim_matches:
    trims = trim_matches[0].split(":")
    for i, trim in enumerate(trims, 1):
        print(f"{i}. {trim}")
else:
    print("No trim information found")

# Search for year information
year_matches = re.findall(r"2025.*silverado", rendered_html, re.IGNORECASE)
print(f"\nFound {len(year_matches)} references to 2025 Silverado")

# Check total content size vs basic scraping
print(f"\nRendered HTML size: {len(rendered_html):,} characters")
print("This contains the FULL content a user would see!")

Silverado 1500 Trim Levels Found:
1. WT
2. Custom
3. LT
4. RST
5. Custom Trail Boss
6. LTZ
7. LT Trail Boss
8. High Country
9. ZR2

Found 648 references to 2025 Silverado

Rendered HTML size: 3,063,072 characters
This contains the FULL content a user would see!
