基本爬蟲

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import quote
import time

def get_coupang_search_results(
    search_keyword: str, advanced_keywords: list[str]
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Scrape the Coupang Taiwan search page and filter the products found.

    The search page is loaded in headless Chrome, its HTML is parsed with
    BeautifulSoup, and one record is built per product card.

    Args:
        search_keyword: Search phrase; it is URL-encoded into the ``q``
            query parameter.
        advanced_keywords: Substrings that must ALL occur in a card's full
            text for the product to count as a match. With an empty list
            every product matches.

    Returns:
        A ``(results, matched_results)`` tuple. ``results`` holds one dict
        per parsed card with the keys ``title``, ``full_text``, ``price``
        and ``unit_price`` (``"N/A"`` when the element is missing);
        ``matched_results`` is the subset satisfying ``advanced_keywords``.
    """
    encoded_keyword = quote(search_keyword)
    url = f"https://www.tw.coupang.com/search?q={encoded_keyword}&channel=user"

    options = Options()
    # Run Chrome without a visible window; drop this flag to watch the browser.
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--lang=zh-TW")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    try:
        driver.get(url)
        # Fixed pause before reading the page; presumably gives the
        # JavaScript-rendered result cards time to appear.
        time.sleep(5)
        html = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page raised,
        # so no Chrome/chromedriver process is left running.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    product_cards = soup.select(
        "div.SearchResult_searchResultProduct___h6E9"
    )

    results = []
    matched_results = []

    for card in product_cards:
        try:
            # Entire visible text of the card, used for keyword filtering.
            full_text = card.get_text(separator=" ", strip=True)

            # Short title (title element only).
            title_tag = card.select_one("div.Product_title__8K0xk")
            title = title_tag.get_text(strip=True) if title_tag else "N/A"

            # Sale price.
            price_tag = card.select_one(
                "span.Product_salePricePrice__2FbsL span"
            )
            price = price_tag.get_text(strip=True) if price_tag else "N/A"

            # Price per unit.
            unit_price_tag = card.select_one("div.Product_unitPrice__QQPdR")
            unit_price = (
                unit_price_tag.get_text(strip=True) if unit_price_tag else "N/A"
            )

            product = {
                "title": title,
                "full_text": full_text,
                "price": price,
                "unit_price": unit_price,
            }

            results.append(product)

            # Advanced filter: every keyword must occur in the full card text.
            if all(kw in full_text for kw in advanced_keywords):
                matched_results.append(product)

        except Exception as e:
            # Best effort: report the broken card and keep processing the rest.
            print("解析錯誤：", e)
            continue

    return results, matched_results

# Script entry point
if __name__ == "__main__":
    # Primary search phrase.
    search_keyword = "味丹 氣泡水"

    # Advanced filter terms (a product must contain all of them).
    advanced_keywords = ["檸檬風味"]

    # Run the scrape.
    all_products, filtered_products = get_coupang_search_results(
        search_keyword, advanced_keywords
    )

    print(f"搜尋關鍵字：{search_keyword}")
    # all_products holds every scraped card; it is intentionally not printed
    # here. Loop over it the same way as below to inspect the full result set.

    print(f"\n進階條件符合項目（包含：{'、'.join(advanced_keywords)}）：")
    for idx, product in enumerate(filtered_products, start=1):
        # One print call per product: four detail lines plus a separator rule.
        print(
            f"{idx}. 標題: {product['title']}",
            f"   價格: {product['price']}",
            f"   每單位: {product['unit_price']}",
            f"   完整內容: {product['full_text']}",
            "-" * 60,
            sep="\n",
        )

搜尋關鍵字：味丹 氣泡水

進階條件符合項目（包含：檸檬風味）：
1. 標題: 味丹 多喝水 MORE氣泡水 檸檬風味, 560ml, 24瓶
   價格: $287
   每單位: ($2.14/100ml)
   完整內容: 味丹 多喝水 MORE氣泡水 檸檬風味, 560ml, 24瓶 特價 46折 $624 $287 ($2.14/100ml) 7折 優惠券 明天 5/13 (二) 預計送達 免運 滿 $490 ( 911 )
------------------------------------------------------------


寫入資料庫

In [25]:
import os
import pymysql
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import quote
from dotenv import load_dotenv

# Load the variables defined in the .env file.
load_dotenv()

# MariaDB connection settings, read from the environment.
DB_HOST = os.getenv("DB_HOST")
# Fall back to 3306 when DB_PORT is unset OR set to an empty string (for
# example a bare `DB_PORT=` line), which would otherwise make int() raise
# ValueError.
DB_PORT = int(os.getenv("DB_PORT") or 3306)
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

def insert_into_db(data_list):
    """Store scraped product records in the ``coupang_products`` table.

    Connects with the module-level ``DB_*`` settings, creates the table when
    it does not exist yet, inserts one row per record and commits. If any
    statement raises, the commit is never reached.

    Args:
        data_list: Iterable of dicts, each providing the keys
            ``search_keyword``, ``title``, ``full_text``, ``price``,
            ``unit_price`` and ``timestamp``.

    Raises:
        KeyError: If a record lacks one of the required keys. This is
            detected before the database connection is opened.
    """
    # Build the parameter tuples first so a malformed record fails fast,
    # before any database work is done.
    rows = [
        (
            data["search_keyword"],
            data["title"],
            data["full_text"],
            data["price"],
            data["unit_price"],
            data["timestamp"],
        )
        for data in data_list
    ]

    connection = pymysql.connect(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=DB_PASSWORD,
        database=DB_NAME,
        charset='utf8mb4'
    )

    # NOTE(review): relies on PyMySQL >= 1.0, where leaving the
    # `with connection` block closes the connection — confirm installed version.
    with connection:
        with connection.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS coupang_products (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    search_keyword VARCHAR(255),
                    title TEXT,
                    full_text TEXT,
                    price VARCHAR(50),
                    unit_price VARCHAR(50),
                    timestamp DATETIME
                );
            """)
            if rows:
                # Values are bound as parameters (never string-formatted);
                # executemany sends the batch instead of one execute() per row.
                cursor.executemany("""
                    INSERT INTO coupang_products (search_keyword, title, full_text, price, unit_price, timestamp)
                    VALUES (%s, %s, %s, %s, %s, %s);
                """, rows)
        connection.commit()

def get_coupang_search_results(
    search_keyword: str, advanced_keywords: list[str]
) -> list[dict[str, object]]:
    """Scrape the Coupang Taiwan search page into database-ready records.

    The search page is loaded in headless Chrome, its HTML is parsed with
    BeautifulSoup, and one record is built per product card. All records of
    one call share the same timestamp.

    Args:
        search_keyword: Search phrase; it is URL-encoded into the ``q``
            query parameter and also stored in every record.
        advanced_keywords: Accepted but not used by this implementation;
            every scraped product is returned unfiltered.

    Returns:
        A list of dicts with the keys ``search_keyword``, ``title``,
        ``full_text``, ``price``, ``unit_price`` (``"N/A"`` when the element
        is missing) and ``timestamp`` (a naive ``datetime`` taken once per
        call).
    """
    encoded_keyword = quote(search_keyword)
    url = f"https://www.tw.coupang.com/search?q={encoded_keyword}&channel=user"

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--lang=zh-TW")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    try:
        driver.get(url)
        # Fixed pause before reading the page; presumably gives the
        # JavaScript-rendered result cards time to appear.
        time.sleep(5)
        html = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page raised,
        # so no Chrome/chromedriver process is left running.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    product_cards = soup.select(
        "div.SearchResult_searchResultProduct___h6E9"
    )

    # Single timestamp so all rows of this scrape can be grouped together.
    now = datetime.now()
    results = []

    for card in product_cards:
        try:
            full_text = card.get_text(separator=" ", strip=True)
            title_tag = card.select_one("div.Product_title__8K0xk")
            title = title_tag.get_text(strip=True) if title_tag else "N/A"
            price_tag = card.select_one(
                "span.Product_salePricePrice__2FbsL span"
            )
            price = price_tag.get_text(strip=True) if price_tag else "N/A"
            unit_price_tag = card.select_one("div.Product_unitPrice__QQPdR")
            unit_price = (
                unit_price_tag.get_text(strip=True) if unit_price_tag else "N/A"
            )

            product = {
                "search_keyword": search_keyword,
                "title": title,
                "full_text": full_text,
                "price": price,
                "unit_price": unit_price,
                "timestamp": now,
            }
            results.append(product)

        except Exception as e:
            # Best effort: report the broken card and keep processing the rest.
            print("解析錯誤：", e)
            continue

    return results

if __name__ == "__main__":
    # Search phrase and filter terms handed to the scraper.
    search_keyword = "味丹 氣泡水"
    advanced_keywords = ["檸檬風味"]

    # Scrape first, then persist everything that was collected.
    results = get_coupang_search_results(search_keyword, advanced_keywords)
    insert_into_db(results)

    # Confirmation message once the insert has returned.
    print("資料已寫入 MariaDB。")


資料已寫入 MariaDB。
