In [4]:
#  相關套件

import time
import random
import json
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
from collections import deque
from concurrent.futures import ThreadPoolExecutor

# Site label; used as a prefix/component of the output file names written by this notebook.
WEB_NAME = '104_人力銀行'
# HTTP headers passed to the requests.get calls in this notebook
# (a browser-like User-Agent plus a Referer pointing at the 104 search page).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
    'Referer': 'https://www.104.com.tw/jobs/search',
    }


In [45]:
## Fetch the site's full job-category overview

# 1. Download the JSON data
# Name of the local file the category JSON is dumped to
file_jobcat_json = f"{WEB_NAME}_jobcat_json.txt"
url_JobCat = "https://static.104.com.tw/category-tool/json/JobCat.json"

response_jobcat = requests.get(url_JobCat, headers=HEADERS, timeout=10)
# Fail fast on HTTP errors instead of trying to parse an error page as JSON
response_jobcat.raise_for_status()
jobcat_data = response_jobcat.json()
# ensure_ascii=False keeps non-ASCII text readable in the saved file
with open(file_jobcat_json, "w", encoding="utf-8") as f:
    json.dump(jobcat_data, f, ensure_ascii=False, indent=4)
print(f"職業總覽資料已儲存為 {file_jobcat_json}")


# 2. 直接將 requests 取得的資料傳入遞迴函式
def flatten_jobcat_recursive(node_list, parent_des=None, parent_no=None):
    """
    Flatten a nested category tree into a list of row dicts (pre-order).

    Each node is a dict read through the keys "no" (code), "des" (name) and,
    optionally, "n" (list of child nodes). Every node yields one row that
    also records the code and name of its direct parent.

    Args:
        node_list: Iterable of node dicts at the current tree level.
        parent_des: Name of the parent node (None for the top level).
        parent_no: Code of the parent node (None for the top level).

    Returns:
        list[dict]: Rows with the keys "parent_code", "parent_name",
        "job_code" and "job_name"; a node's row always precedes the rows of
        its descendants.
    """
    rows = []
    for entry in node_list:
        code = entry.get("no")
        name = entry.get("des")
        rows.append(
            {
                "parent_code": parent_no,
                "parent_name": parent_des,
                "job_code": code,
                "job_name": name,
            }
        )
        # A missing or empty "n" means the node has no children to descend into.
        children = entry.get("n")
        if children:
            rows += flatten_jobcat_recursive(
                node_list=children,
                parent_des=name,
                parent_no=code,
            )
    return rows


# 3. Convert the flattened result into a DataFrame
flattened_data = flatten_jobcat_recursive(jobcat_data)
df_jobcat = pd.DataFrame(flattened_data)
# Keep only rows that have a parent, i.e. drop the top-level nodes
df_jobcat = df_jobcat[df_jobcat["parent_code"].notnull()]
df_jobcat_sorted = df_jobcat.sort_values(by="job_code")
df_jobcat_sorted.to_excel(f"{WEB_NAME}_category.xlsx", index=False)
print(f"職業總覽資料已轉換為 '{WEB_NAME}_category.xlsx'")

# Select the IT-related jobs (per the original note):
# keep the rows whose job_code starts with '2007'
mask = df_jobcat_sorted["job_code"].astype(str).str.startswith("2007")
df_it_jobs = df_jobcat_sorted[mask]
# df_it_jobs

職業總覽資料已儲存為 104_人力銀行_jobcat_json.txt
職業總覽資料已轉換為 '104_人力銀行_category.xlsx'


In [46]:
# 產生 104 人力銀行網址 https://www.104.com.tw 根據提供的 (關鍵字和職缺類別) 轉換為職缺網址

def catch_104_url (KEYWORDS, CATEGORY, ORDER=None):
    """
    Build a 104 job-search URL from a keyword and/or a job-category code.

    Only the parameters that are actually provided are added to the query
    string. The returned URL ends with "&page=" so the caller can append a
    page number.

    Args:
        KEYWORDS (str): Search keyword; skipped when empty.
        CATEGORY (str): Job-category code; skipped when empty.
        ORDER (int, optional): Sort-order code; skipped when None.

    Returns:
        str: The search URL, ending in "&page=".
    """

    BASE_URL = "https://www.104.com.tw/jobs/search/?jobsource=joblist_search&mode=s"

    # BASE_URL already carries a query string, so every extra parameter must
    # be joined with "&". Appending the fragments directly used to produce
    # "...mode=sorder=16...", corrupting both the mode and the order values.
    url_parts = [BASE_URL]
    if ORDER is not None:
        url_parts.append(f"order={ORDER}")
    if KEYWORDS:
        # NOTE(review): the value is inserted as-is (not percent-encoded),
        # matching the example URLs that accompany this function.
        url_parts.append(f"keyword={KEYWORDS}")
    if CATEGORY:
        url_parts.append(f"jobcat={CATEGORY}")
    url_parts.append("page=")

    return "&".join(url_parts)


KEYWORDS_STR = "雲端工程師"
JOBCAT_CODE = "2007000000"
ORDER_SETTING = 16     # 15 = best match, 16 = most recently updated


# # Usage examples (intended output shown after each print)
# url_1 = catch_104_url (KEYWORDS_STR, JOBCAT_CODE, ORDER_SETTING)
# print (url_1)  # https://www.104.com.tw/jobs/search/?jobsource=joblist_search&mode=s&order=16&keyword=雲端工程師&jobcat=2007000000&page=

# url_2 = catch_104_url (KEYWORDS_STR, "")
# print (url_2)  # https://www.104.com.tw/jobs/search/?jobsource=joblist_search&mode=s&keyword=雲端工程師&page=

# url_3 = catch_104_url ("", JOBCAT_CODE, ORDER_SETTING)
# print (url_3)  # https://www.104.com.tw/jobs/search/?jobsource=joblist_search&mode=s&order=16&jobcat=2007000000&page=

# url_4 = catch_104_url ("","")
# print (url_4)  # https://www.104.com.tw/jobs/search/?jobsource=joblist_search&mode=s&page=

In [47]:
#  從 api 網址獲取工作職缺的網址

def fetch_104_job_url (_CODE, KEYWORD):
    """
    Page through the 104 job-search API and collect job-detail URLs.

    Pages are requested one after another. The job link of every result is
    stored in a set so duplicates are dropped. Paging stops once the running
    total of unique links has been identical for the last MAX_LENGTH pages.
    A tqdm progress bar is shown and a random delay is inserted between
    requests to avoid hitting the server too frequently.

    Args:
        _CODE (str): Job-category code sent as the "jobcat" parameter.
        KEYWORD (str): Search keyword sent as the "keyword" parameter.

    Returns:
        list: Unique URLs of the form
        "https://www.104.com.tw/job/ajax/content/<id>", where <id> is the
        last path segment of each collected job link.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """

    BASE_URL = "https://www.104.com.tw/jobs/search/api/jobs"

    PAGE = 1
    MAX_PAGE = 10        # initial progress-bar total; grows as paging continues
    PAGE_SIZE = 30
    ORDER_SETTING = 15   # 15 = best match, 16 = most recently updated

    MAX_LENGTH = 4
    # Rolling window holding the unique-link totals of the last MAX_LENGTH pages.
    recent_counts = deque(maxlen=MAX_LENGTH)
    job_url_set = set()  # unique job links collected so far

    with requests.Session() as session, tqdm(total=MAX_PAGE, desc="104 職缺列表", unit="PAGE", leave=True) as pbar:
        while True:
            params = {
                'jobsource': 'm_joblist_search',
                'page': PAGE,
                'pagesize': PAGE_SIZE,
                'order': ORDER_SETTING,
                'jobcat': _CODE,
                'keyword': KEYWORD,
            }

            # Go through the session (it was previously opened but bypassed by
            # a plain requests.get) so the connection is reused across pages.
            response = session.get(BASE_URL, headers=HEADERS, params=params, timeout=20)
            # Surface HTTP failures here rather than as an opaque JSON/KeyError below.
            response.raise_for_status()
            for job in response.json()['data']:
                job_url_set.add(job['link']['job'])

            # Stop when the total has not changed across the whole window,
            # i.e. the most recent pages contributed no new links.
            recent_counts.append(len(job_url_set))
            if len(recent_counts) == MAX_LENGTH and len(set(recent_counts)) == 1:
                print(f"連續 {MAX_LENGTH} 次沒有新資料，提前結束。")
                break

            time.sleep(random.uniform(0.5, 1.5))

            pbar.set_postfix_str(f"目前頁面 {PAGE}, 最大頁數: {MAX_PAGE}")
            pbar.update(1)

            PAGE = PAGE + 1
            # The real page count is unknown up front, so keep extending the
            # progress-bar total whenever the current page reaches it.
            if PAGE >= MAX_PAGE:
                MAX_PAGE = PAGE + 1
                pbar.total = MAX_PAGE

    # Rewrite every job link into the matching ajax "content" endpoint URL.
    modified_job_url_set = {
        f"https://www.104.com.tw/job/ajax/content/{url.split('/')[-1]}"
        for url in job_url_set
    }
    print(f"共獲取到 {len(job_url_set)} 筆職缺資料。")
    return list(modified_job_url_set)


# 測試範例
# JOBCAT_CODE = "2007000000"
# KEYWORDS = "雲端工程師"
# jobs_url = fetch_104_job_url(JOBCAT_CODE, KEYWORDS)
# jobs_url [0]

In [48]:
# 從指定的職缺網址獲取職缺的相關數據

def fetch_104_job_data(job_url):
    """
    Download one job's detail JSON and flatten it into a DataFrame.

    Sends a GET request to the given URL, takes the "data" member of the
    JSON response and flattens it with pandas.json_normalize (nested keys
    become dot-separated column names).

    Args:
        job_url (str): URL that returns the job details as JSON, e.g. one of
            the URLs produced by fetch_104_job_url.

    Returns:
        pd.DataFrame: The flattened "data" payload of the response.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # A timeout keeps a single stalled request from hanging a long crawl loop.
    response = requests.get(job_url, headers=HEADERS, timeout=20)
    # Fail with a clear HTTP error instead of a JSON/KeyError on error pages.
    response.raise_for_status()
    jobMetaData = response.json()["data"]
    return pd.json_normalize(jobMetaData)


# 測試範例
# job_url = "https://www.104.com.tw/job/ajax/content/8k4lp"   # jobs_url [0]
# job_data = fetch_104_job_data(job_url)
# job_data

In [49]:
# Fetch the details of every job matching the keyword and the job category

SEARCH_TIMESTAMP = time.strftime('%Y-%m-%d')  # today's date in local time
JOBCAT_CODE = "2007000000"
KEYWORDS = "雲端工程師"
FILE_NAME = f"({SEARCH_TIMESTAMP})_{WEB_NAME}_{KEYWORDS}_{JOBCAT_CODE}"

print(f"開始執行 {FILE_NAME}")
job_urls = fetch_104_job_url(JOBCAT_CODE, KEYWORDS)        # job-detail URLs from the listing API

# Collect the per-job frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies all accumulated rows on every
# iteration (quadratic cost).
job_data_list = []
for url in tqdm(job_urls, desc="Fetching job data", unit="job"):
    df_job_data = fetch_104_job_data(url)
    job_data_list.append(df_job_data)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when the search returned no jobs.
all_jobs_df = pd.concat(job_data_list, ignore_index=True) if job_data_list else pd.DataFrame()


print(all_jobs_df.shape)

all_jobs_df.head(1)

開始執行 (2025-06-16)_104_人力銀行_雲端工程師_2007000000


104 職缺列表:  95%|█████████▌| 38/40 [01:00<00:03,  1.59s/PAGE, 目前頁面 38, 最大頁數: 39]


連續 4 次沒有新資料，提前結束。
共獲取到 1034 筆職缺資料。


Fetching job data: 100%|██████████| 1034/1034 [03:30<00:00,  4.91job/s]

(1034, 79)





Unnamed: 0,switch,custLogo,postalCode,closeDate,industry,custNo,reportUrl,industryNo,employees,chinaCorp,...,jobDetail.startWorkingDay,jobDetail.hireType,jobDetail.delegatedRecruit,jobDetail.needEmp,jobDetail.landmark,jobDetail.remoteWork,interactionRecord.lastProcessedResumeAtTime,interactionRecord.nowTimestamp,jobDetail.remoteWork.type,jobDetail.remoteWork.description
0,on,https://static.104.com.tw/b_profile/cust_pictu...,105,,人力仲介代徵,11111119000,https://www.104.com.tw/feedback?category=2&cus...,1009001001,820人,False,...,不限,0,,1人,,,,1750026676,,


In [50]:
# Alternative CSV export (disabled):
# all_jobs_df.to_csv (f"{FILE_NAME}.csv", index=False, encoding='utf-8-sig')
# print (f"已將所有職缺資料儲存到 {FILE_NAME}.csv")

# Write every collected job to an Excel workbook named after the search.
output_xlsx = f"{FILE_NAME}.xlsx"
all_jobs_df.to_excel(output_xlsx, index=False)
print(f"已將所有職缺資料儲存到 {output_xlsx}")

已將所有職缺資料儲存到 (2025-06-16)_104_人力銀行_雲端工程師_2007000000.xlsx


In [51]:
# Show the column labels of the combined job DataFrame
all_jobs_df.columns

Index(['switch', 'custLogo', 'postalCode', 'closeDate', 'industry', 'custNo',
       'reportUrl', 'industryNo', 'employees', 'chinaCorp',
       'corpImageRight.corpImageRight.imageUrl',
       'corpImageRight.corpImageRight.link', 'header.corpImageTop.imageUrl',
       'header.corpImageTop.link', 'header.jobName', 'header.appearDate',
       'header.custName', 'header.custUrl', 'header.analysisType',
       'header.analysisUrl', 'header.isSaved', 'header.isApplied',
       'header.applyDate', 'header.userApplyCount', 'header.isActivelyHiring',
       'contact.hrName', 'contact.email', 'contact.visit', 'contact.phone',
       'contact.other', 'contact.reply', 'environmentPic.environmentPic',
       'environmentPic.corpImageBottom.imageUrl',
       'environmentPic.corpImageBottom.link', 'condition.acceptRole.role',
       'condition.acceptRole.disRole.needHandicapCompendium',
       'condition.acceptRole.disRole.disability', 'condition.workExp',
       'condition.edu', 'condition.major'

In [52]:
# Glossary of the flattened 104 job-detail columns: (column name, Chinese label),
# listed in sequence order. The sequence number is derived from the position.
_COLUMN_LABEL_PAIRS = [
    ("switch", "內部切換/開關"),
    ("custLogo", "公司Logo"),
    ("postalCode", "郵遞區號"),
    ("closeDate", "截止日期"),
    ("industry", "產業類別"),
    ("custNo", "公司代號"),
    ("reportUrl", "檢舉職務網址"),
    ("industryNo", "產業代號"),
    ("employees", "員工人數"),
    ("chinaCorp", "中國大陸關係企業"),
    ("corpImageRight.corpImageRight.imageUrl", "右側公司圖片網址"),
    ("corpImageRight.corpImageRight.link", "右側公司圖片連結"),
    ("header.corpImageTop.imageUrl", "頂部公司圖片網址"),
    ("header.corpImageTop.link", "頂部公司圖片連結"),
    ("header.jobName", "職務名稱"),
    ("header.appearDate", "更新日期"),
    ("header.custName", "公司名稱"),
    ("header.custUrl", "公司頁面網址"),
    ("header.analysisType", "應徵分析類型"),
    ("header.analysisUrl", "應徵分析網址"),
    ("header.isSaved", "是否已儲存"),
    ("header.isApplied", "是否已應徵"),
    ("header.applyDate", "應徵日期"),
    ("header.userApplyCount", "使用者應徵次數"),
    ("header.isActivelyHiring", "是否為積極徵才"),
    ("contact.hrName", "聯絡人"),
    ("contact.email", "聯絡E-mail"),
    ("contact.visit", "親洽地址"),
    ("contact.phone", "聯絡電話"),
    ("contact.other", "其他聯絡方式"),
    ("contact.reply", "應徵回覆率/時間"),
    ("environmentPic.environmentPic", "公司環境照片"),
    ("environmentPic.corpImageBottom.imageUrl", "底部公司圖片網址"),
    ("environmentPic.corpImageBottom.link", "底部公司圖片連結"),
    ("condition.acceptRole.role", "接受身份"),
    ("condition.acceptRole.disRole.needHandicapCompendium", "需附身心障礙證明"),
    ("condition.acceptRole.disRole.disability", "身心障礙類別"),
    ("condition.workExp", "工作經歷"),
    ("condition.edu", "學歷要求"),
    ("condition.major", "科系要求"),
    ("condition.language", "語文條件"),
    ("condition.localLanguage", "本國語言條件"),
    ("condition.specialty", "工作技能"),
    ("condition.skill", "擅長工具"),
    ("condition.certificate", "具備證照"),
    ("condition.driverLicense", "具備駕照"),
    ("condition.other", "其他條件"),
    ("welfare.tag", "福利標籤"),
    ("welfare.welfare", "公司福利(詳細說明)"),
    ("welfare.legalTag", "法定福利標籤"),
    ("jobDetail.jobDescription", "工作內容"),
    ("jobDetail.jobCategory", "職務類別"),
    ("jobDetail.salary", "薪資待遇(文字描述)"),
    ("jobDetail.salaryMin", "最低薪資"),
    ("jobDetail.salaryMax", "最高薪資"),
    ("jobDetail.salaryType", "薪資類型(月薪/面議)"),
    ("jobDetail.jobType", "工作性質(全職/兼職)"),
    ("jobDetail.workType", "工作型態"),
    ("jobDetail.addressNo", "地址郵遞區號"),
    ("jobDetail.addressRegion", "上班地點(縣市)"),
    ("jobDetail.addressArea", "上班地點(鄉鎮市區)"),
    ("jobDetail.addressDetail", "上班地點(詳細地址)"),
    ("jobDetail.industryArea", "工作地點/工業區"),
    ("jobDetail.longitude", "經度"),
    ("jobDetail.latitude", "緯度"),
    ("jobDetail.manageResp", "管理責任"),
    ("jobDetail.businessTrip", "出差外派"),
    ("jobDetail.workPeriod", "上班時段"),
    ("jobDetail.vacationPolicy", "休假制度"),
    ("jobDetail.startWorkingDay", "可上班日"),
    ("jobDetail.hireType", "聘僱類型"),
    ("jobDetail.delegatedRecruit", "是否為派遣工作"),
    ("jobDetail.needEmp", "需求人數"),
    ("jobDetail.landmark", "地標"),
    ("jobDetail.remoteWork.type", "遠端工作類型"),
    ("jobDetail.remoteWork.description", "遠端工作描述"),
    ("interactionRecord.lastProcessedResumeAtTime", "上次處理履歷時間"),
    ("interactionRecord.nowTimestamp", "當前時間戳"),
    ("jobDetail.remoteWork", "遠端工作(物件)"),
]

# One dict per column, with the keys in the order: sequence number, English name, Chinese label.
column_names = [
    {"序號": seq, "英文": english, "中文": chinese}
    for seq, (english, chinese) in enumerate(_COLUMN_LABEL_PAIRS, start=1)
]

In [53]:
# Load the column glossary into a DataFrame and prefix the two name columns with "104_".
df_new = pd.json_normalize(column_names).rename(
    columns={"英文": "104_英文", "中文": "104_中文"}
)
df_new

Unnamed: 0,序號,104_英文,104_中文
0,1,switch,內部切換/開關
1,2,custLogo,公司Logo
2,3,postalCode,郵遞區號
3,4,closeDate,截止日期
4,5,industry,產業類別
...,...,...,...
74,75,jobDetail.remoteWork.type,遠端工作類型
75,76,jobDetail.remoteWork.description,遠端工作描述
76,77,interactionRecord.lastProcessedResumeAtTime,上次處理履歷時間
77,78,interactionRecord.nowTimestamp,當前時間戳
