In [1]:

import requests

# Page of the Woolworths half-price specials listing to download.
target_url = 'https://www.woolworths.com.au/shop/browse/specials/half-price?pageNumber=40'

# Browser-like User-Agent header sent with the request
# (presumably so the site serves its regular page; not verified here).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Value passed to requests.get(timeout=...), in seconds.
timeout_seconds = 15

# Maximum number of characters of the downloaded HTML echoed to the cell output.
preview_chars = 1000

# Define the result before the request so that later cells can safely test
# `if html_content:` even when every branch below ends in an exception
# handler; without this the name would be undefined and raise NameError.
html_content = None

print(f"准备爬取网页: {target_url}")

try:
    response = requests.get(target_url, headers=headers, timeout=timeout_seconds)
    # Turn 4xx/5xx responses into requests.exceptions.HTTPError.
    response.raise_for_status()
    print(f"请求成功，状态码: {response.status_code}")
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    print(f"网页编码识别为: {response.encoding}")
    html_content = response.text
    # Show only a bounded preview; the full document stays in html_content.
    print(html_content[:preview_chars])
    if len(html_content) > preview_chars:
        print(f"... (输出已截断，共 {len(html_content)} 个字符)")
except requests.exceptions.Timeout as e:
    print(f"请求超时 ({timeout_seconds}秒): {e}")
except requests.exceptions.HTTPError as e:
    print(f"HTTP 错误: {e}")
except requests.exceptions.ConnectionError as e:
    print(f"网络连接错误: {e}")
except requests.exceptions.RequestException as e:
    # Any other failure reported by requests.
    print(f"请求过程中发生错误: {e}")
except Exception as e:
    # Last-resort handler: report the error instead of aborting the cell.
    print(f"发生未知错误: {e}")

准备爬取网页: https://www.woolworths.com.au/shop/browse/specials/half-price?pageNumber=40
请求成功，状态码: 200
网页编码识别为: utf-8
<!DOCTYPE html>
<html lang="en-AU" data-critters-container>
  <head>
    <title>Woolworths Supermarket - Buy Groceries Online</title>
    <base href="/">
    
      <meta name="isAdaptive" content="true">
      <meta name="title" content="Woolworths Supermarket - Buy Groceries Online">
      <meta name="description" content="Shop online at Woolworths for your groceries. We’re here to help you eat fresher & healthier food, 7 days a week with low prices always.">
      <meta name="url" content="https://www.woolworths.com.au/shop/browse/specials/half-price">
      
      <meta property="og:title" content="Woolworths Supermarket - Buy Groceries Online">
      <meta property="og:description" content="Shop online at Woolworths for your groceries. We’re here to help you eat fresher & healthier food, 7 days a week with low prices always.">
      
      <meta property="og:url" conten

In [2]:
from bs4 import BeautifulSoup

# Parse the HTML held in `html_content` (assigned by an earlier cell) and
# report every <shared-web-component-wrapper> element found in it.
if not html_content:
    # Nothing to parse: html_content is falsy (for example None or "").
    print("\n未能获取到 HTML 内容，因此无法进行解析。")
else:
    print("\n--- 开始解析 HTML 内容 ---")
    try:
        # 'html.parser' is Python's built-in parser and needs no extra
        # install; 'lxml' (pip install lxml) is an alternative if speed or
        # tolerance of messy markup matters.
        soup = BeautifulSoup(html_content, 'html.parser')

        # find_all returns a list of every matching Tag, or an empty list
        # when nothing matches.
        wrapper_items = soup.find_all('shared-web-component-wrapper')

        if not wrapper_items:
            print("在页面中没有找到任何 '<shared-web-component-wrapper>' 标签。")
        else:
            print(f"成功找到了 {len(wrapper_items)} 个 '<shared-web-component-wrapper>' 元素:")

            for position, item in enumerate(wrapper_items, start=1):
                print(f"\n--- 第 {position} 个元素 ---")

                # Adapt the extraction below to what is actually needed.
                # Other options on each element include item.prettify()
                # for the formatted HTML, item.find('h2') for the first
                # nested heading, and item.find_all('a') to walk nested
                # links (link.get('href'), link.get_text(strip=True)).

                # Plain text of the element: pieces joined by a single
                # space, each stripped of surrounding whitespace.
                text_content = item.get_text(separator=" ", strip=True)
                print(f"提取到的文本: {text_content}")

                # Optional attributes; 'class' comes back as a list of names.
                element_id = item.get('id')
                class_names = item.get('class')
                if element_id:
                    print(f"元素 ID: {element_id}")
                if class_names:
                    joined_classes = ' '.join(class_names)
                    print(f"元素 Class: {joined_classes}")

    except Exception as exc:
        # Report any error raised while parsing or extracting.
        print(f"解析 HTML 时发生错误: {exc}")


--- 开始解析 HTML 内容 ---
在页面中没有找到任何 '<shared-web-component-wrapper>' 标签。
