In [9]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import csv
from datetime import datetime, timedelta
import time
import random

def get_random_proxy():
    # Add your proxy list here
    proxies = [
        'http://proxy1.example.com:8080',
        'http://proxy2.example.com:8080',
        # Add more proxies as needed
    ]
    return {'http': random.choice(proxies), 'https': random.choice(proxies)} if proxies else None

def google_news_search(keyword, date, max_retries=5, base_delay=60):
    formatted_date = date.strftime("%Y/%m/%d")
    encoded_keyword = urllib.parse.quote(keyword)
    url = (
        f"https://www.google.com/search?q={encoded_keyword}&tbm=nws"
        f"&tbs=cdr:1,cd_min:{formatted_date},cd_max:{formatted_date}"
    )
    headers = {
        "User-Agent": random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        ]),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "Referer": "https://www.google.com/",
        "DNT": "1"
    }

    for attempt in range(max_retries):
        try:
            if attempt > 0:
                # Exponential backoff with jitter
                delay = base_delay * (2 ** attempt) + random.uniform(10, 30)
                print(f"第 {attempt + 1} 次重試，等待 {delay:.1f} 秒...")
                time.sleep(delay)

            proxies = get_random_proxy()
            response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
            print(f"搜尋網址: {url}")
            print(f"回應狀態: {response.status_code}")
            print(f"回應內容大小: {len(response.text)}")

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                results = []

                # 方法 1：新格式 (常見於舊新聞)
                for item in soup.select('div.SoaBEf'):
                    title_tag = item.select_one('div.MBeuO')
                    link_tag = item.find('a', href=True)
                    if title_tag and link_tag:
                        results.append({
                            'date': formatted_date,
                            'keyword': keyword,
                            'title': title_tag.text.strip(),
                            'link': link_tag['href']
                        })

                # 方法 2：舊格式 (dbsr)
                for item in soup.select('div.dbsr'):
                    title_tag = item.select_one('div.JheGif span') or item.select_one('div.JheGif')
                    link = item.a['href']
                    title = title_tag.text if title_tag else "No title"
                    results.append({
                        'date': formatted_date,
                        'keyword': keyword,
                        'title': title,
                        'link': link
                    })

                return results
            elif response.status_code == 429:
                print(f"遇到速率限制 (429 錯誤)，進行第 {attempt + 1} 次重試")
                # Add additional delay for rate limit
                time.sleep(random.uniform(120, 180))
                continue
            else:
                print(f"遇到未預期的狀態碼: {response.status_code}")

        except Exception as e:
            print(f"錯誤 (第 {attempt + 1} 次): {str(e)}")
            time.sleep(random.uniform(30, 60))

    print("已達到最大重試次數，跳過此日期")
    return []

def save_results_to_csv(results, filename='google_news_results.csv'):
    with open(filename, mode='w', newline='', encoding='utf-8-sig') as file:
        writer = csv.DictWriter(file, fieldnames=['date', 'keyword', 'title', 'link'])
        writer.writeheader()
        for row in results:
            writer.writerow(row)

if __name__ == "__main__":
    keyword = "鴻海"
    start_date = datetime.strptime("2010-10-19", "%Y-%m-%d")
    end_date = datetime.strptime("2010-10-22", "%Y-%m-%d")

    all_results = []
    current_date = start_date

    while current_date <= end_date:
        print(f"\n搜尋 {keyword} 於 {current_date.strftime('%Y-%m-%d')}...")
        results = google_news_search(keyword, current_date)
        print(f"找到 {len(results)} 則新聞")
        all_results.extend(results)

        # 增加間隔時間至 90-150 秒
        if current_date < end_date:
            sleep_time = 90 + random.random() * 60
            print(f"等待 {sleep_time:.1f} 秒後進行下一次搜尋...")
            time.sleep(sleep_time)

        current_date += timedelta(days=1)

    save_results_to_csv(all_results)
    print(f"\n✅ 完成，共儲存 {len(all_results)} 則新聞至 CSV 檔。")



搜尋 鴻海 於 2010-10-19...
錯誤 (第 1 次): HTTPSConnectionPool(host='www.google.com', port=443): Max retries exceeded with url: /search?q=%E9%B4%BB%E6%B5%B7&tbm=nws&tbs=cdr:1,cd_min:2010/10/19,cd_max:2010/10/19 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb969797250>: Failed to establish a new connection: [Errno -2] Name or service not known')))
錯誤 (第 1 次): HTTPSConnectionPool(host='www.google.com', port=443): Max retries exceeded with url: /search?q=%E9%B4%BB%E6%B5%B7&tbm=nws&tbs=cdr:1,cd_min:2010/10/19,cd_max:2010/10/19 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb969797250>: Failed to establish a new connection: [Errno -2] Name or service not known')))
第 2 次重試，等待 134.2 秒...
第 2 次重試，等待 134.2 秒...
錯誤 (第 2 次): HTTPSConnectionPool(host='www.google.com', port=443): Max retries exceeded with url: /search?q=%E9%B4%BB%E6%B5%B7&tbm=nws&tbs=cdr:1,cd_mi

KeyboardInterrupt: 

# playwright version

In [56]:
import nest_asyncio
nest_asyncio.apply()

import asyncio
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import csv
import random
import time

MAX_NEWS = 1000  # 最多抓 1000 筆

def human_delay(a=1.5, b=3.0):
    time.sleep(random.uniform(a, b))


async def google_news_search_all_pages(keyword, date1, date2):
    formatted_date1 = date1.strftime("%m/%d/%Y")
    formatted_date2 = date2.strftime("%m/%d/%Y")

    url = (
        f"https://www.google.com.tw/search?q={keyword}&hl=zh-TW&gl=tw&tbm=nws"
        f"&tbs=cdr:1,cd_min:{formatted_date1},cd_max:{formatted_date2}"
    )

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()
        await page.goto(url)

        await page.wait_for_timeout(3000)
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.wait_for_timeout(2000)

        results = []
        page_num = 1

        while True:
            print(f"📄 擷取第 {page_num} 頁 ({date1.date()} ~ {date2.date()})")

            articles = await page.query_selector_all('div.SoaBEf, div.dbsr')
            for article in articles:
                try:
                    title_elem = await article.query_selector('div.MBeuO, div.JheGif')
                    link_elem = await article.query_selector('a')
                    date_elem = await article.query_selector('div[class*="OSrXXb"] span')

                    title = await title_elem.inner_text() if title_elem else ''
                    link = await link_elem.get_attribute('href') if link_elem else ''
                    pub_date = await date_elem.inner_text() if date_elem else 'N/A'

                    results.append({
                        'date': f"{formatted_date1}~{formatted_date2}",
                        'keyword': keyword,
                        'title': title,
                        'link': link,
                        'publish_date': pub_date
                    })

                    if len(results) >= MAX_NEWS:
                        break

                except Exception:
                    continue

            if len(results) >= MAX_NEWS:
                break

            try:
                next_button = await page.query_selector('#pnnext')
                if next_button:
                    await next_button.click()
                    page_num += 1
                    await page.wait_for_timeout(1500)
                else:
                    break
            except Exception:
                break

        await browser.close()
        return results


def save_results_to_csv(results, filename='google_news_results_playwright.csv'):
    with open(filename, mode='w', newline='', encoding='utf-8-sig') as file:
        writer = csv.DictWriter(file, fieldnames=['date', 'keyword', 'title', 'link', 'publish_date'])
        writer.writeheader()
        for row in results:
            writer.writerow(row)


async def main():
    keyword = "鴻海財經"
    start_date = datetime.strptime("2010-10-19", "%Y-%m-%d")
    end_date = datetime.strptime("2024-10-22", "%Y-%m-%d")
    delta = timedelta(days=30)

    print(f"\n🔍 搜尋 {keyword} - {start_date.strftime('%Y-%m-%d')} ~ {end_date.strftime('%Y-%m-%d')}")

    all_results = []
    current_date = start_date

    while current_date < end_date and len(all_results) < MAX_NEWS:
        next_date = min(current_date + delta, end_date)
        results = await google_news_search_all_pages(keyword, current_date, next_date)
        all_results.extend(results)

        print(f"🧮 累積 {len(all_results)} 筆資料")

        if len(all_results) >= MAX_NEWS:
            break

        current_date = next_date + timedelta(days=1)

    save_results_to_csv(all_results[:MAX_NEWS])
    print(f"\n✅ 完成，共儲存 {len(all_results[:MAX_NEWS])} 則新聞到 CSV 檔。")


# ▶️ 執行
await main()



🔍 搜尋 鴻海財經 - 2010-10-19 ~ 2024-10-22
📄 擷取第 1 頁 (2010-10-19 ~ 2010-11-18)
🧮 累積 0 筆資料
📄 擷取第 1 頁 (2010-11-19 ~ 2010-12-19)
🧮 累積 0 筆資料
📄 擷取第 1 頁 (2010-12-20 ~ 2011-01-19)
🧮 累積 1 筆資料
📄 擷取第 1 頁 (2011-01-20 ~ 2011-02-19)
🧮 累積 1 筆資料
📄 擷取第 1 頁 (2011-02-20 ~ 2011-03-22)
🧮 累積 1 筆資料
📄 擷取第 1 頁 (2011-03-23 ~ 2011-04-22)
🧮 累積 1 筆資料
📄 擷取第 1 頁 (2011-04-23 ~ 2011-05-23)
🧮 累積 3 筆資料
📄 擷取第 1 頁 (2011-05-24 ~ 2011-06-23)
🧮 累積 6 筆資料
📄 擷取第 1 頁 (2011-06-24 ~ 2011-07-24)
🧮 累積 8 筆資料
📄 擷取第 1 頁 (2011-07-25 ~ 2011-08-24)
🧮 累積 9 筆資料
📄 擷取第 1 頁 (2011-08-25 ~ 2011-09-24)
🧮 累積 10 筆資料
📄 擷取第 1 頁 (2011-09-25 ~ 2011-10-25)
🧮 累積 12 筆資料
📄 擷取第 1 頁 (2011-10-26 ~ 2011-11-25)
🧮 累積 12 筆資料
📄 擷取第 1 頁 (2011-11-26 ~ 2011-12-26)
🧮 累積 16 筆資料
📄 擷取第 1 頁 (2011-12-27 ~ 2012-01-26)
🧮 累積 17 筆資料
📄 擷取第 1 頁 (2012-01-27 ~ 2012-02-26)
🧮 累積 18 筆資料
📄 擷取第 1 頁 (2012-02-27 ~ 2012-03-28)
🧮 累積 22 筆資料
📄 擷取第 1 頁 (2012-03-29 ~ 2012-04-28)
🧮 累積 25 筆資料
📄 擷取第 1 頁 (2012-04-29 ~ 2012-05-29)
🧮 累積 30 筆資料
📄 擷取第 1 頁 (2012-05-30 ~ 2012-06-29)
🧮 累積 31 筆資料
📄 擷取第 1 頁 (20

Future exception was never retrieved
future: <Future finished exception=Error('Connection closed')>
playwright._impl._api_types.Error: Connection closed


📄 擷取第 2 頁 (2017-08-03 ~ 2017-09-02)
🧮 累積 581 筆資料
📄 擷取第 1 頁 (2017-09-03 ~ 2017-10-03)
📄 擷取第 2 頁 (2017-09-03 ~ 2017-10-03)
📄 擷取第 3 頁 (2017-09-03 ~ 2017-10-03)
🧮 累積 600 筆資料
📄 擷取第 1 頁 (2017-10-04 ~ 2017-11-03)
📄 擷取第 2 頁 (2017-10-04 ~ 2017-11-03)
📄 擷取第 3 頁 (2017-10-04 ~ 2017-11-03)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2017-11-04 ~ 2017-12-04)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2017-12-05 ~ 2018-01-04)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2018-01-05 ~ 2018-02-04)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2018-02-05 ~ 2018-03-07)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2018-03-08 ~ 2018-04-07)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2018-04-08 ~ 2018-05-08)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2018-05-09 ~ 2018-06-08)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2018-06-09 ~ 2018-07-09)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2018-07-10 ~ 2018-08-09)
🧮 累積 621 筆資料
📄 擷取第 1 頁 (2018-08-10 ~ 2018-09-09)


CancelledError: 

使用不同年份分開

In [None]:
import nest_asyncio
nest_asyncio.apply()

import asyncio
import csv
import random
import time
from datetime import datetime, timedelta
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

# 🔧 可調參數
MAX_NEWS = 1000
DELAY_RANGE = (1.5, 3.0)
HEADLESS = False
CSV_FILE = 'google_news_results_playwright 2024-01-01_2024-12-31.csv'



def human_delay(a=1.5, b=3.0):
    time.sleep(random.uniform(a, b))


async def google_news_search_all_pages(keyword, date1, date2):
    formatted_date1 = date1.strftime("%m/%d/%Y")
    formatted_date2 = date2.strftime("%m/%d/%Y")

    url = (
        f"https://www.google.com.tw/search?q={keyword}&hl=zh-TW&gl=tw&tbm=nws"
        f"&tbs=cdr:1,cd_min:{formatted_date1},cd_max:{formatted_date2}"
    )

    results = []
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=HEADLESS)
            context = await browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                                                           "Chrome/122.0.0.0 Safari/537.36")
            page = await context.new_page()

            await page.goto(url)
            await page.wait_for_timeout(3000)

            await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
            await page.wait_for_timeout(2000)

            page_num = 1
            while True:
                print(f"📄 擷取第 {page_num} 頁 ({date1.date()} ~ {date2.date()})")

                articles = await page.query_selector_all('div.SoaBEf, div.dbsr')
                for article in articles:
                    try:
                        title_elem = await article.query_selector('div.MBeuO, div.JheGif')
                        link_elem = await article.query_selector('a')
                        date_elem = await article.query_selector('div[class*="OSrXXb"] span')

                        title = await title_elem.inner_text() if title_elem else ''
                        link = await link_elem.get_attribute('href') if link_elem else ''
                        pub_date = await date_elem.inner_text() if date_elem else 'N/A'

                        results.append({
                            'date': f"{formatted_date1}~{formatted_date2}",
                            'keyword': keyword,
                            'title': title,
                            'link': link,
                            'publish_date': pub_date
                        })

                        if len(results) >= MAX_NEWS:
                            break

                    except Exception as e:
                        print(f"⚠️ 擷取新聞時錯誤：{e}")
                        continue

                if len(results) >= MAX_NEWS:
                    break

                try:
                    next_button = await page.query_selector('#pnnext')
                    if next_button:
                        await next_button.click()
                        page_num += 1
                        await page.wait_for_timeout(1500)
                    else:
                        break
                except PlaywrightTimeoutError:
                    print("⚠️ 找不到下一頁或逾時")
                    break

            await browser.close()

    except Exception as e:
        print(f"🚨 瀏覽器錯誤：{e}")

    return results


def save_results_to_csv(results, filename=CSV_FILE):
    try:
        with open(filename, mode='w', newline='', encoding='utf-8-sig') as file:
            writer = csv.DictWriter(file, fieldnames=['date', 'keyword', 'title', 'link', 'publish_date'])
            writer.writeheader()
            for row in results:
                writer.writerow(row)
    except Exception as e:
        print(f"❌ 寫入 CSV 錯誤：{e}")


async def main():
    keyword = "鴻海 財經"
    start_date = datetime.strptime("2024-01-01", "%Y-%m-%d")
    end_date = datetime.strptime("2024-12-31", "%Y-%m-%d")
    delta = timedelta(days=30)

    print(f"\n🔍 搜尋「{keyword}」 - {start_date.date()} 到 {end_date.date()}")

    all_results = []
    current_date = start_date

    while current_date < end_date and len(all_results) < MAX_NEWS:
        next_date = min(current_date + delta, end_date)

        try:
            results = await google_news_search_all_pages(keyword, current_date, next_date)
            all_results.extend(results)
            print(f"🧮 累積 {len(all_results)} 筆資料")

            # ✅ 每次取得新資料就即時寫入 CSV
            save_results_to_csv(all_results[:MAX_NEWS])

        except Exception as e:
            print(f"❗ 處理 {current_date.date()} ~ {next_date.date()} 錯誤：{e}")

        if len(all_results) >= MAX_NEWS:
            break

        current_date = next_date + timedelta(days=1)
        human_delay(*DELAY_RANGE)

    print(f"\n✅ 完成，共儲存 {len(all_results[:MAX_NEWS])} 則新聞到 CSV 檔。")


# ▶️ 執行
await main()



🔍 搜尋「鴻海 財經」 - 2024-01-01 到 2024-12-31
📄 擷取第 1 頁 (2024-01-01 ~ 2024-01-31)
📄 擷取第 2 頁 (2024-01-01 ~ 2024-01-31)
📄 擷取第 3 頁 (2024-01-01 ~ 2024-01-31)
📄 擷取第 4 頁 (2024-01-01 ~ 2024-01-31)
📄 擷取第 5 頁 (2024-01-01 ~ 2024-01-31)
🧮 累積 44 筆資料
📄 擷取第 1 頁 (2024-02-01 ~ 2024-03-02)
📄 擷取第 2 頁 (2024-02-01 ~ 2024-03-02)
📄 擷取第 3 頁 (2024-02-01 ~ 2024-03-02)
📄 擷取第 4 頁 (2024-02-01 ~ 2024-03-02)
📄 擷取第 5 頁 (2024-02-01 ~ 2024-03-02)
🧮 累積 87 筆資料
📄 擷取第 1 頁 (2024-03-03 ~ 2024-04-02)
📄 擷取第 2 頁 (2024-03-03 ~ 2024-04-02)
📄 擷取第 3 頁 (2024-03-03 ~ 2024-04-02)
📄 擷取第 4 頁 (2024-03-03 ~ 2024-04-02)
📄 擷取第 5 頁 (2024-03-03 ~ 2024-04-02)
🧮 累積 131 筆資料
📄 擷取第 1 頁 (2024-04-03 ~ 2024-05-03)
📄 擷取第 2 頁 (2024-04-03 ~ 2024-05-03)
📄 擷取第 3 頁 (2024-04-03 ~ 2024-05-03)
📄 擷取第 4 頁 (2024-04-03 ~ 2024-05-03)
📄 擷取第 5 頁 (2024-04-03 ~ 2024-05-03)
🧮 累積 181 筆資料
📄 擷取第 1 頁 (2024-05-04 ~ 2024-06-03)
📄 擷取第 2 頁 (2024-05-04 ~ 2024-06-03)
📄 擷取第 3 頁 (2024-05-04 ~ 2024-06-03)
📄 擷取第 4 頁 (2024-05-04 ~ 2024-06-03)
📄 擷取第 5 頁 (2024-05-04 ~ 2024-06-03)
🧮 累積 230 筆資

In [83]:
import pandas as pd
import glob

# 找出所有目標 CSV 檔案
file_paths = sorted(glob.glob('google_news_results_playwright*.csv'))

# 合併所有檔案
all_data = []
for file in file_paths:
    df = pd.read_csv(file)
    all_data.append(df)

# 合併成一個 DataFrame
merged_df = pd.concat(all_data, ignore_index=True)

# 將日期欄位轉成 datetime 格式並排序（假設欄位叫 'date'）
merged_df['date'] = pd.to_datetime(merged_df['date'], errors='coerce')
merged_df = merged_df.sort_values(by='date')

# 儲存為新檔案
merged_df.to_csv('google_news_results_playwright_merged_sorted.csv', index=False)


  merged_df['date'] = pd.to_datetime(merged_df['date'], errors='coerce')


In [2]:
import asyncio
from playwright.async_api import async_playwright

async def fetch_articles():
    test_urls = {
        "wealth": "https://www.wealth.com.tw/articles/de60abf4-f462-4fca-9976-5d2c7c7185da",
        "chinatimes": "https://www.chinatimes.com/newspapers/20110428000002-260202",
        "ltn": "https://ec.ltn.com.tw/article/paper/499221"
    }

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        for site, url in test_urls.items():
            try:
                await page.goto(url, timeout=20000)
                await page.wait_for_load_state('domcontentloaded')

                content_element = None

                if "wealth.com.tw" in url:
                    await page.wait_for_selector(".qdr-paragraph", timeout=10000)
                    paragraph_elements = await page.query_selector_all(".qdr-paragraph span[style*='white-space: pre-wrap']")
                    content_list = [await elem.inner_text() for elem in paragraph_elements]
                    content = "\n".join(content_list)

                elif "chinatimes.com" in url:
                    try:
                        await page.wait_for_selector(".article-body", timeout=8000)
                        content_element = await page.query_selector(".article-body")
                    except:
                        try:
                            await page.wait_for_selector(".article-content", timeout=8000)
                            content_element = await page.query_selector(".article-content")
                        except:
                            content_element = None

                elif "ltn.com.tw" in url:
                    await page.wait_for_selector(".text", timeout=10000)
                    content_element = await page.query_selector(".text")

                if content_element:
                    content = await content_element.inner_text()
                    print(f"\n✅ {site} 內文前500字：\n", content.strip()[:500])
                else:
                    print(f"❌ {site} 找不到內容元素")

            except Exception as e:
                print(f"❌ {site} 發生錯誤：{e}")

        await browser.close()

await fetch_articles()


Error: 
╔════════════════════════════════════════════════════════════════════════════════════════════════╗
║ Looks like you launched a headed browser without having a XServer running.                     ║
║ Set either 'headless: true' or use 'xvfb-run <your-playwright-app>' before running Playwright. ║
║                                                                                                ║
║ <3 Playwright Team                                                                             ║
╚════════════════════════════════════════════════════════════════════════════════════════════════╝
=========================== logs ===========================
<launching> /home/r11011101/.cache/ms-playwright/chromium-1080/chrome-linux/chrome --disable-field-trial-config --disable-background-networking --enable-features=NetworkService,NetworkServiceInProcess --disable-background-timer-throttling --disable-backgrounding-occluded-windows --disable-back-forward-cache --disable-breakpad --disable-client-side-phishing-detection --disable-component-extensions-with-background-pages --disable-component-update --no-default-browser-check --disable-default-apps --disable-dev-shm-usage --disable-extensions --disable-features=ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate --allow-pre-commit-input --disable-hang-monitor --disable-ipc-flooding-protection --disable-popup-blocking --disable-prompt-on-repost --disable-renderer-backgrounding --force-color-profile=srgb --metrics-recording-only --no-first-run --enable-automation --password-store=basic --use-mock-keychain --no-service-autorun --export-tagged-pdf --no-sandbox --user-data-dir=/tmp/playwright_chromiumdev_profile-yNlTZM --remote-debugging-pipe --no-startup-window
<launched> pid=1450050
[pid=1450050][err] [1450050:1450050:0511/230417.371613:ERROR:ozone_platform_x11.cc(240)] Missing X server or $DISPLAY
[pid=1450050][err] [1450050:1450050:0511/230417.371637:ERROR:env.cc(255)] The platform failed to initialize.  Exiting.
============================================================

In [3]:
import requests, trafilatura
from newspaper import Article
from readability import Document
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import asyncio

test_urls = {
    "wealth": "https://www.wealth.com.tw/articles/de60abf4-f462-4fca-9976-5d2c7c7185da",
    "chinatimes": "https://www.chinatimes.com/newspapers/20110428000002-260202",
    "ltn": "https://ec.ltn.com.tw/article/paper/499221"
}

async def fallback_with_browser(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()
        await page.goto(url, timeout=30000)
        await page.wait_for_timeout(3000)
        text = await page.inner_text("body")
        await browser.close()
        print("✅ Playwright 擷取成功：\n", text[:1500])

async def extract_from_url(name, url):
    print(f"\n🚀 正在處理：{name} - {url}")
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/114.0.0.0 Safari/537.36"}

    try:
        html = requests.get(url, headers=headers, timeout=15).text
    except:
        html = None

    # 方法 1：trafilatura
    if html:
        text = trafilatura.extract(html)
        if text and len(text.strip()) > 100:
            print("✅ Trafilatura 擷取成功：\n", text[:1500])
            return

    # 方法 2：newspaper3k
    try:
        article = Article(url); article.download(); article.parse()
        if article.text.strip():
            print("✅ Newspaper 擷取成功：\n", article.text[:1500])
            return
    except:
        pass

    # 方法 3：readability
    try:
        summary = BeautifulSoup(Document(html).summary(), "html.parser").get_text()
        if len(summary.strip()) > 100:
            print("✅ Readability 擷取成功：\n", summary[:1500])
            return
    except:
        pass

    # 方法 4：瀏覽器備援
    print("🚨 所有方法失敗，改用瀏覽器擷取...")
    await fallback_with_browser(url)

async def main():
    for name, url in test_urls.items():
        await extract_from_url(name, url)

# ✅ Jupyter Notebook 中執行這一行：
await main()



🚀 正在處理：wealth - https://www.wealth.com.tw/articles/de60abf4-f462-4fca-9976-5d2c7c7185da
🚨 所有方法失敗，改用瀏覽器擷取...


Error: 
╔════════════════════════════════════════════════════════════════════════════════════════════════╗
║ Looks like you launched a headed browser without having a XServer running.                     ║
║ Set either 'headless: true' or use 'xvfb-run <your-playwright-app>' before running Playwright. ║
║                                                                                                ║
║ <3 Playwright Team                                                                             ║
╚════════════════════════════════════════════════════════════════════════════════════════════════╝
=========================== logs ===========================
<launching> /home/r11011101/.cache/ms-playwright/chromium-1080/chrome-linux/chrome --disable-field-trial-config --disable-background-networking --enable-features=NetworkService,NetworkServiceInProcess --disable-background-timer-throttling --disable-backgrounding-occluded-windows --disable-back-forward-cache --disable-breakpad --disable-client-side-phishing-detection --disable-component-extensions-with-background-pages --disable-component-update --no-default-browser-check --disable-default-apps --disable-dev-shm-usage --disable-extensions --disable-features=ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate --allow-pre-commit-input --disable-hang-monitor --disable-ipc-flooding-protection --disable-popup-blocking --disable-prompt-on-repost --disable-renderer-backgrounding --force-color-profile=srgb --metrics-recording-only --no-first-run --enable-automation --password-store=basic --use-mock-keychain --no-service-autorun --export-tagged-pdf --no-sandbox --user-data-dir=/tmp/playwright_chromiumdev_profile-HGcVzM --remote-debugging-pipe --no-startup-window
<launched> pid=1450209
[pid=1450209][err] [1450209:1450209:0511/230553.785499:ERROR:ozone_platform_x11.cc(240)] Missing X server or $DISPLAY
[pid=1450209][err] [1450209:1450209:0511/230553.785532:ERROR:env.cc(255)] The platform failed to initialize.  Exiting.
============================================================