#### House Good Job 自動爬蟲測試

### 資料來源

- 不動產成交案件 Open Data: https://plvr.land.moi.gov.tw/DownloadOpenData


In [40]:
import os
import requests
import zipfile
import re
import csv
import pandas as pd

# Taiwanese city/county name -> three-letter ISO 3166-2:TW subdivision code
# (the part after the "TW-" prefix). Used to build the renamed output files.
CITY_TO_ISO = {
    '臺北市': 'TPE',
    '臺中市': 'TXG',
    '基隆市': 'KEE',
    '臺南市': 'TNN',
    '高雄市': 'KHH',
    '新北市': 'NWT',
    '宜蘭縣': 'ILA',
    '桃園市': 'TAO',
    '嘉義市': 'CYI',
    '新竹縣': 'HSQ',
    '苗栗縣': 'MIA',
    '南投縣': 'NAN',
    '彰化縣': 'CHA',
    '新竹市': 'HSZ',
    '雲林縣': 'YUN',
    '嘉義縣': 'CYQ',
    '屏東縣': 'PIF',
    '花蓮縣': 'HUA',
    '臺東縣': 'TTT',
    '金門縣': 'KIN',
    '澎湖縣': 'PEN',
    '連江縣': 'LIE',
}

# Transaction category as it appears in a manifest description -> file-name
# suffix used for the renamed CSV (sale, pre-sale, rental).
PURPOSE_TO_SUFFIX = dict([
    ('不動產買賣', 'trade'),
    ('預售屋買賣', 'presale'),
    ('不動產租賃', 'rental'),
])

def _match_city_and_purpose(description):
    """Return the ``(iso_code, suffix)`` pair named in a manifest description.

    Either element is ``None`` when no known city (``CITY_TO_ISO``) or purpose
    (``PURPOSE_TO_SUFFIX``) occurs as a substring of *description*.
    """
    # pandas reads a blank cell as NaN (a float); a substring test on it
    # would raise TypeError, so treat any non-string as "no match".
    if not isinstance(description, str):
        return None, None
    iso_code = next(
        (code for city_name, code in CITY_TO_ISO.items() if city_name in description),
        None,
    )
    suffix = next(
        (value for purpose, value in PURPOSE_TO_SUFFIX.items() if purpose in description),
        None,
    )
    return iso_code, suffix


def download_and_extract(taiwan_year, quarter, data_type, timeout=60):
    """
    Download one season of real estate data and extract its CSV files.

    The archive is saved under ``./tmp/real_price/<taiwan_year>_<quarter>/``
    and left on disk. Only ``manifest.csv`` and members named like
    ``a_lvr_land_a.csv`` are extracted. When the manifest is present, each
    extracted data file whose description names a known city and purpose is
    renamed to ``<ISO>_lvr_land_<suffix>.csv``; other files keep their names.

    Args:
        taiwan_year (int): Taiwan calendar year (CE year - 1911).
        quarter (str): Quarter (S1, S2, S3, S4).
        data_type (str): Data type (xml, txt, csv, xls); inserted into the
            requested archive name. Only ``.csv`` members are extracted, so
            other types leave the folder with just the downloaded zip.
        timeout (float): Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not respond within *timeout*.
        KeyError: ``manifest.csv`` lacks the 'name' or 'description' column.
    """
    # Construct the URL
    url = f"https://plvr.land.moi.gov.tw//DownloadSeason?season={taiwan_year}{quarter}&type=zip&fileName=lvr_land{data_type}.zip"

    # Create the temporary directory and subfolder
    base_dir = './tmp/real_price'
    sub_dir = os.path.join(base_dir, f"{taiwan_year}_{quarter}")
    os.makedirs(sub_dir, exist_ok=True)

    # Define the zip file path
    zip_path = os.path.join(sub_dir, f'{taiwan_year}_{quarter}_real_price.zip')

    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error for bad status codes

    # Save the zip file; it is intentionally kept after extraction.
    with open(zip_path, 'wb') as f:
        f.write(response.content)

    # Extract manifest.csv plus the per-city data files ([a-z]_lvr_land_[a-z].csv).
    data_file_pattern = re.compile(r'^[a-z]_lvr_land_[a-z]\.csv$')
    extracted_files = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            if re.match(r'^manifest\.csv$', file_name) or data_file_pattern.match(file_name):
                zip_ref.extract(file_name, sub_dir)
                extracted_files.append(file_name)

    # Without a manifest there is nothing to map file names to, so stop here.
    manifest_path = os.path.join(sub_dir, 'manifest.csv')
    if not os.path.exists(manifest_path):
        return

    manifest_df = pd.read_csv(manifest_path, encoding='utf-8')
    if 'name' not in manifest_df.columns or 'description' not in manifest_df.columns:
        raise KeyError("Required columns 'name' or 'description' are missing in manifest.csv")
    file_city_map = dict(zip(manifest_df['name'], manifest_df['description']))

    # Rename extracted data files based on city ISO codes and data usage purposes
    for file_name in extracted_files:
        if not data_file_pattern.match(file_name):
            continue  # manifest.csv keeps its name
        iso_code, suffix = _match_city_and_purpose(file_city_map.get(file_name))
        if iso_code and suffix:
            new_file_name = f'{iso_code}_lvr_land_{suffix}.csv'
            # os.replace overwrites an existing target on every platform, so
            # re-running the same season does not fail on Windows.
            os.replace(
                os.path.join(sub_dir, file_name),
                os.path.join(sub_dir, new_file_name)
            )

In [41]:
# Fetch Taiwan calendar year 113 (113 + 1911 = 2024 CE), quarter S3, CSV package.
download_and_extract(113, 'S3', 'csv')