In [0]:
import requests
import time
import json
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession
import html
import csv
import os
import re

# Define parameters for the notebook
# NOTE(review): `dbutils` is not imported anywhere in this notebook; presumably
# it is injected by the Databricks runtime. The widget defaults to "".
dbutils.widgets.text('environment',"")

# Environment name; it is interpolated into the output volume paths below.
var_environment = dbutils.widgets.get('environment')

# NOTE(review): `spark` is not referenced by any other cell visible here.
spark = SparkSession.builder.getOrCreate()
# Transaction results: endpoints of the information/market API.
BASE_URL = "https://www.whtdsc.com/api/information/informationMarketNetwork"
LIST_API = BASE_URL + "/listPageOfPublish"
# NOTE(review): DETAIL_API is not referenced by the functions visible in this
# notebook; fetch_detail requests the ".../getById" endpoint instead. Confirm
# which endpoint is intended.
DETAIL_API = BASE_URL + "/getInfo"
# Output directories: one for the raw detail payloads, one for parsed tables.
OUTPUT_RAW = f"/Volumes/land_market_{var_environment}/10_rawdata/transaction_info_raw/"
OUTPUT_PARSED = f"/Volumes/land_market_{var_environment}/10_rawdata/transaction_info_parsed/"

# HTTP headers (a browser-like User-Agent) for outgoing requests.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
}

获取总页数

In [0]:
def get_total_pages(timeout=30):
    """
    Ask the list API how many result pages exist.

    Requests page 1 of the TRANSACTION_INFORMATION column and returns the
    ``pages`` field of the response's ``data`` object.

    Args:
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block forever on a stalled connection.

    Returns:
        The value of ``data["pages"]`` from the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        # Must stay equal to the page size used by fetch_list_page; the page
        # count reported by the API depends on it.
        "size": 5,
        "current": 1,
        "title": "",
        "columnType": "TRANSACTION_INFORMATION",
    }
    r = requests.get(LIST_API, params=params, headers=HEADERS, timeout=timeout)
    # Surface HTTP errors here instead of as an opaque JSON/KeyError below.
    r.raise_for_status()
    return r.json()["data"]["pages"]

获取列表页

In [0]:
def fetch_list_page(page, timeout=30):
    """
    Fetch one page of the TRANSACTION_INFORMATION list.

    Args:
        page: 1-based page number, sent as the ``current`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block forever on a stalled connection.

    Returns:
        The ``data["records"]`` value of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        # Must stay equal to the page size used by get_total_pages.
        "size": 5,
        "current": page,
        "title": "",
        "columnType": "TRANSACTION_INFORMATION",
    }
    r = requests.get(LIST_API, params=params, headers=HEADERS, timeout=timeout)
    # Surface HTTP errors here instead of as an opaque JSON/KeyError below.
    r.raise_for_status()
    return r.json()["data"]["records"]

获取单条详情: title, publish_time, html_content

In [0]:
def fetch_detail(record_id, timeout=30):
    """
    Fetch the detail payload of a single record from the ``getById`` endpoint.

    Args:
        record_id: Record identifier, sent as the ``id`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block forever on a stalled connection.

    Returns:
        The ``data["informationMarketNetwork"]`` object of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    resp = requests.get(
        f"{BASE_URL}/getById",
        # Let requests build (and URL-encode) the query string.
        params={"id": record_id},
        # Send the same headers as the list requests.
        headers=HEADERS,
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp.json()["data"]["informationMarketNetwork"]

解析HTML

In [0]:
def parse_html_table(data):
    """
    Extract the rows of the first HTML table found in a detail record.

    Args:
        data: Detail record dict. Its ``"content"`` value is expected to hold
            HTML-escaped markup; it may be missing, ``None`` or empty.

    Returns:
        ``None`` when there is no content or the content has no ``<table>``.
        Otherwise a list of rows, each a list of stripped ``<td>`` texts.
        Leading empty cells are dropped from every row, rows that end up with
        fewer than three cells are discarded, and rows are NOT padded or
        truncated to a fixed width.
    """
    skip_message = "⚠️ 非表格类型详情页，跳过处理。可能为 PDF 或图片。"

    # A missing or empty body cannot contain a table. Checking this first also
    # keeps html.unescape from raising TypeError on None.
    raw_content = data.get("content")
    if not raw_content:
        print(skip_message)
        return None

    soup = BeautifulSoup(html.unescape(raw_content), "html.parser")
    table = soup.find("table")

    # No table means this is not a data page, so skip it.
    if not table:
        print(skip_message)
        return None

    parsed_rows = []
    for row in table.find_all("tr"):
        # get_text(strip=True) already strips whitespace, so an empty string
        # is the only form an empty cell can take.
        cols = [c.get_text(strip=True) for c in row.find_all("td")]

        # Shift left: find the first non-empty cell. None also covers rows
        # with no <td> cells and rows whose cells are all empty.
        first_non_empty = next((i for i, v in enumerate(cols) if v), None)
        if first_non_empty is None:
            continue
        cols = cols[first_non_empty:]

        # Drop title-like rows that have only one or two cells.
        if len(cols) < 3:
            continue

        parsed_rows.append(cols)

    return parsed_rows

保存为 原始json格式

In [0]:
def save_raw_to_json(data, output_dir=None):
    """
    Write the raw detail record to ``<id>-<timestamp>.json``.

    The timestamp is ``data["publishTime"]`` with dashes, colons and
    whitespace removed, e.g. "2025-09-26 16:27:53" -> "20250926162753".
    If the target file already exists it is left untouched (incremental run).

    Args:
        data: Detail record dict; must contain ``"publishTime"`` and ``"id"``.
        output_dir: Directory to write into. Defaults to ``OUTPUT_RAW``.

    Returns:
        None.
    """
    target_dir = OUTPUT_RAW if output_dir is None else output_dir

    publish_time = data["publishTime"]
    record_id = data["id"]

    ts = re.sub(r"[-:\s]", "", publish_time)
    filename = f"{record_id}-{ts}.json"
    full_path = os.path.join(target_dir, filename)

    # Incremental: skip records that were already saved.
    if os.path.exists(full_path):
        print(f"已存在，跳过：{filename}")
        return

    # open() does not create missing parent directories.
    os.makedirs(target_dir, exist_ok=True)

    # Plain UTF-8 without a BOM: a BOM makes strict JSON parsers reject the
    # file, and save_parsed_to_json writes plain UTF-8 too.
    with open(full_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [0]:
def save_parsed_to_json(parsed_rows, data, output_dir=None):
    """
    Write the parsed table rows of one record to ``<title>-<yyyymmdd>.json``.

    The file holds an object with the keys ``transaction_id``, ``title``,
    ``publish_time``, ``url`` and ``rows``. If the target file already exists
    it is left untouched (incremental run).

    NOTE(review): the file name is built from the title and the publish date
    only, so two records sharing both map to the same file and the later one
    is skipped. Confirm this is intended.

    Args:
        parsed_rows: Rows to store under ``"rows"`` (as produced by
            parse_html_table: a list of lists of cell strings).
        data: Detail record dict; must contain ``"publishTime"``, ``"title"``
            and ``"id"``.
        output_dir: Directory to write into. Defaults to ``OUTPUT_PARSED``.

    Returns:
        None.
    """
    target_dir = OUTPUT_PARSED if output_dir is None else output_dir

    publish_time = data["publishTime"]
    title = data["title"]
    transaction_id = data["id"]

    # Keep only the date part (first 8 digits) of the compacted timestamp.
    ts = re.sub(r"[-:\s]", "", publish_time)[:8]

    # Replace characters that are not allowed in file names.
    safe_title = re.sub(r"[\\/:*?\"<>|]", "_", title)
    filename = f"{safe_title}-{ts}.json"
    full_path = os.path.join(target_dir, filename)

    # Incremental: skip before building the payload if the file exists.
    if os.path.exists(full_path):
        print(f"已存在，跳过：{filename}")
        return

    url = f"https://www.whtdsc.com/transaction/detail?id={transaction_id}&columnType=TRANSACTION_INFORMATION"
    out = {
        "transaction_id": transaction_id,
        "title": title,
        "publish_time": publish_time,
        "url": url,           # source page of the record
        "rows": parsed_rows   # one list of cell texts per row; width varies
    }

    # open() does not create missing parent directories.
    os.makedirs(target_dir, exist_ok=True)

    with open(full_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=4)

    print(f"Generated json: {filename}")


In [0]:
def crawl_all(max_pages=None):
    """
    Crawl the list pages and persist every record's raw and parsed data.

    For each record on each page: fetch the detail payload, save it raw,
    parse its HTML table and, when a table was found, save the parsed rows.
    The loop pauses one second after every record.

    Args:
        max_pages: Optional upper bound on the number of pages to crawl.
            ``None`` crawls every page reported by get_total_pages().

    Returns:
        None.
    """
    total_pages = get_total_pages()
    print(f"共 {total_pages} 页")

    # When max_pages is given, crawl at most that many pages.
    if max_pages is not None:
        total_pages = min(total_pages, max_pages)

    for page in range(1, total_pages + 1):
        print(f"\n=== 第 {page} 页 ===")

        for rec in fetch_list_page(page):
            record_id = rec["id"]

            data = fetch_detail(record_id)
            save_raw_to_json(data)
            parsed_rows = parse_html_table(data)

            if parsed_rows is None:
                # Not a table page: nothing to write for the parsed output.
                print(f"跳过非表格详情页 ID={record_id}")
            else:
                save_parsed_to_json(parsed_rows, data)

            # Throttle after EVERY detail request, including skipped records,
            # so non-table pages do not bypass the rate limit.
            time.sleep(1)

In [0]:
# Run the crawler; crawl only the first page (5 records): crawl_all(1)
crawl_all(1)