In [2]:
#  相關套件

import time
import random
import json
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
from collections import deque
from concurrent.futures import ThreadPoolExecutor

# Short site label; used as a component of the output file names built in later cells.
WEB_NAME = 'Cake_me'
# Page requested when extracting the job-category (sector) translations.
WEB_URL = "https://www.cake.me/job"

In [3]:
# Fetch the job-category (sector) labels from the site.


# 1. Fetch the JSON data
# File name used for the saved job-category JSON
file_jobcat_json = f"{WEB_NAME}_jobcat_json.txt"
# NOTE(review): no timeout is passed, so this call can block indefinitely if the server stalls.
jobtt = requests.get(WEB_URL)
jobtt_soup = BeautifulSoup(jobtt.text, 'html.parser')
scripts = jobtt_soup.find_all('script')
# Assumes the last <script> tag holds the JSON payload containing
# props.pageProps._nextI18Next — TODO confirm against the live page.
jobcat_data = json.loads(scripts[-1].string)['props']['pageProps']['_nextI18Next']

# 2. Save the JSON data
with open(file_jobcat_json, "w", encoding="utf-8") as f:
    json.dump(jobcat_data, f, ensure_ascii=False, indent=4)
print(f"職業總覽資料已儲存為 {file_jobcat_json}")

# Flatten the nested JSON into a frame whose column names are dotted key paths.
df = pd.json_normalize(jobcat_data)

sector_prefix = 'initialI18nStore.zh-TW.sector.sectors'
# Not used below; presumably an alternative value for `prefix` (sector groups).
sector_groups_prefix = 'initialI18nStore.zh-TW.sector.sector_groups.'
prefix = sector_prefix

# Keep the columns whose name contains the prefix, then strip the prefix from those names.
filtered_df = (
            df.filter(like=prefix)
              .rename(columns=lambda col: col.replace(prefix, ''))
        )

filtered_df

職業總覽資料已儲存為 Cake_me_jobcat_json.txt


Unnamed: 0,.advertising-marketing-agency_adtech-martech,.advertising-marketing-agency_advertising,.advertising-marketing-agency_design,.advertising-marketing-agency_digital,.advertising-marketing-agency_event-management,.advertising-marketing-agency_marketing-communications,.advertising-marketing-agency_public-relations,.agriculture_agricultural-technology,.agriculture_dairy,.agriculture_farming,...,.tech_gambling-casinos,.tech_games,.tech_hardware,.tech_information-services,.tech_internet,.tech_mobile-apps,.tech_robotics,.tech_saas-cloud-services,.tech_semiconductor,.tech_software
0,廣告技術 / 行銷技術,廣告,設計,數位,事件管理,行銷 / 溝通,公共關係,農業科技,乳製品 / 酪農,農業,...,博弈 / 賭場,遊戲,硬體,資訊服務,網際網路,手機應用程式,機器人科學,軟體即服務 / 雲服務,半導體,軟體


In [4]:
# 產生cake.me網址 https://www.cake.me 根據提供的(關鍵字和職缺類別) 轉換為職缺網址

def cake_me_url(KEYWORDS, CATEGORY, ORDER=None):
    """
    Build a cake.me job-search URL that ends with an open ``page=`` parameter.

    The caller appends the page number to the returned string. The path and
    query depend on which arguments are non-empty:

    * keywords and category: ``/jobs/<keywords>?profession[0]=<category>&page=``
    * keywords only:         ``/jobs/<keywords>?page=``
    * category only:         ``/jobs/categories/<category>?page=``
    * neither:               ``/jobs?page=``

    When ORDER is given, ``order=<ORDER>`` is placed first in the query string
    in every one of the four cases.

    Parameters:
    KEYWORDS (str): Search keywords; an empty string means "no keywords".
    CATEGORY (str): Job category slug; an empty string means "no category".
    ORDER (str, optional): Sort order (e.g. "latest"). Defaults to None (omitted).

    Returns:
    str: The search URL, ending in ``page=``.

    Examples:
    cake_me_url("雲端工程師", "it", "latest")
    # https://www.cake.me/jobs/雲端工程師?order=latest&profession[0]=it&page=

    cake_me_url("雲端工程師", "")
    # https://www.cake.me/jobs/雲端工程師?page=

    cake_me_url("", "it", "latest")
    # https://www.cake.me/jobs/categories/it?order=latest&page=

    cake_me_url("", "")
    # https://www.cake.me/jobs?page=
    """

    BASE_URL = "https://www.cake.me/jobs"

    # Path part: keywords take precedence over the category listing.
    if KEYWORDS:
        path = f"{BASE_URL}/{KEYWORDS}"
    elif CATEGORY:
        path = f"{BASE_URL}/categories/{CATEGORY}"
    else:
        path = BASE_URL

    # Query part, assembled from pieces so ORDER is kept in every case.
    # (Splicing ORDER in by replacing "?page=" silently dropped it whenever the
    # query already started with "profession[0]=".)
    query_parts = []
    if ORDER:
        query_parts.append(f"order={ORDER}")
    if KEYWORDS and CATEGORY:
        # The category is a query filter only when it is not already in the path.
        query_parts.append(f"profession[0]={CATEGORY}")
    query_parts.append("page=")  # left open; the page number is appended by the caller

    return f"{path}?{'&'.join(query_parts)}"


# # # 測試範例  類別: {軟體, it}
# url_1 = cake_me_url("雲端工程師", "it", "latest")    
# print(url_1)  # https://www.cake.me/jobs/雲端工程師?order=latest&profession[0]=it&page=

# url_2 = cake_me_url("雲端工程師", "")      
# print(url_2)  # https://www.cake.me/jobs/雲端工程師?page=

# url_3 = cake_me_url("", "it", "latest")             
# print(url_3)  # https://www.cake.me/jobs/categories/it?order=latest&page=

# url_4 = cake_me_url("", "")               
# print(url_4)  # https://www.cake.me/jobs?page=

In [5]:
#  從指定的職缺網址獲取工作職缺的網址

def fetch_job_url(joburl):
    """
    Collect the unique job-posting URLs listed under a search URL.

    Listing pages are requested one after another by appending the page number
    (starting at 0) to ``joburl``. Every job-title link found is stored in a
    set, so duplicates across pages are dropped. A tqdm progress bar tracks the
    pages, and a random 0.5-1.5 s pause follows each request.

    The loop stops when either
    * the number of unique URLs has not changed over MAX_LENGTH consecutive
      pages, or
    * the page counter has moved past MAX_PAGE, which is refreshed on every
      page from the last pagination link (plus one).

    Parameters:
    joburl (str): Listing URL that ends with an open ``page=`` parameter.

    Returns:
    list: All unique job-posting URLs found (order is not defined).

    Raises:
    requests.exceptions.RequestException: Propagated from ``requests.get``,
        including a timeout when the server does not answer in time.
    """

    PAGE = 0       # page number appended to joburl
    MAX_PAGE = 1   # provisional; replaced once pagination links are seen

    MAX_LENGTH = 4         # number of stagnant pages that triggers the early stop
    REQUEST_TIMEOUT = 15   # seconds; without a timeout a stalled server blocks forever
    recent_counts = deque(maxlen=MAX_LENGTH)

    job_url_set = set()  # a set, so repeated links are stored once
    with tqdm(total=MAX_PAGE, desc="cake.me職缺列表 ", unit="PAGE", leave=True) as pbar:
        while True:
            # Collect the job links on the current listing page
            response = requests.get(f"{joburl}{PAGE}", timeout=REQUEST_TIMEOUT)
            response_soup = BeautifulSoup(response.text, 'html.parser')
            job_urls = response_soup.find_all('a', class_='JobSearchItem_jobTitle__bu6yO')
            for job_url in job_urls:
                job_url_set.add("https://www.cake.me" + job_url['href'])

            # Early stop: the running total was identical for the last MAX_LENGTH pages
            total_jobs = len(job_url_set)
            recent_counts.append(total_jobs)
            if len(recent_counts) == MAX_LENGTH and len(set(recent_counts)) == 1:
                print(f"連續{MAX_LENGTH}次沒有新資料，提前結束。")
                print(f"Total unique job URLs fetched: {len(job_url_set)}")
                break

            # Pause, then refresh the page limit from the last pagination link
            time.sleep(random.uniform(0.5, 1.5))
            pagination_items = response_soup.find_all('a', class_='Pagination_itemNumber___enNq')
            if pagination_items:
                MAX_PAGE = int(pagination_items[-1].text) + 1
                pbar.total = MAX_PAGE  # keep the progress bar total in sync
            pbar.set_postfix_str(f"目前頁面 {PAGE}, 最大頁數: {MAX_PAGE}")
            pbar.update(1)

            if PAGE <= MAX_PAGE:
                PAGE = PAGE + 1
            else:
                break

        return list(job_url_set)  # convert the set to a list for the caller
    
    
# # 測試範例
# joburl = "https://www.cake.me/jobs/雲端工程師?order=latest&profession[0]=it&page="
# job_urls = fetch_job_url(joburl)
# job_urls[0]

In [None]:
# 從指定的職缺網址獲取職缺的相關數據

import requests
import time
import random
import pandas as pd
import json
from bs4 import BeautifulSoup

def clean_html_if_string(value):
    """
    Strip HTML markup from ``value`` when it is a string.

    Any non-string input (numbers, lists, None, ...) is handed back untouched.
    Strings are parsed as HTML and reduced to their text content: pieces of
    text are joined with a single space and surrounding whitespace is trimmed.
    """
    if not isinstance(value, str):
        return value

    parsed = BeautifulSoup(value, "html.parser")
    return parsed.get_text(separator=' ', strip=True)
    

def fetch_job_data(job_url: str) -> pd.DataFrame:
    """
    Download one job-posting page and return its data as a DataFrame.

    After a short random pause, the page is requested with a browser-like
    User-Agent. The JSON inside ``<script id="__NEXT_DATA__">`` is parsed, the
    ``props.pageProps.job`` object is flattened with ``pd.json_normalize``, and
    HTML markup is stripped from the free-text columns that are present.

    Parameters:
    job_url (str): URL of the job posting.

    Returns:
    pd.DataFrame: The flattened job data. An empty DataFrame is returned when
        the request fails, the data script or the 'job' entry is missing, or
        the JSON does not have the expected structure.
    """
    # Free-text columns that may contain HTML. Each name needs its own comma:
    # adjacent string literals are concatenated into one (non-existent) column name.
    columns_to_clean = [
        'description',
        'job_responsibilities',
        'job_requirements',
        'requirements',
        'job_preferred_requirements',
        'company.description',
    ]

    try:
        time.sleep(random.uniform(0.3, 0.8))

        # Send a User-Agent so the request looks like it comes from a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(job_url, headers=headers, timeout=15)
        response.raise_for_status()

        response_soup = BeautifulSoup(response.text, 'html.parser')

        data_script = response_soup.find('script', id='__NEXT_DATA__')

        if not data_script:
            print("錯誤：在頁面中找不到職缺資料 (script#__NEXT_DATA__) 。")
            return pd.DataFrame()

        page_props = json.loads(data_script.string)['props']['pageProps']

        # The 'job' entry (not 'company') is read; the posting is the core of this page
        job_details = page_props.get('job')

        if not job_details:
            print("錯誤：無法從 JSON 中解析出職缺資料 ('job' key not found)。")
            return pd.DataFrame()

        df = pd.json_normalize([job_details])

        # Only touch the columns that actually exist in this posting
        for col in columns_to_clean:
            if col in df.columns:
                df[col] = df[col].apply(clean_html_if_string)

        return df

    except requests.exceptions.RequestException as e:
        print(f"網路請求失敗: {e}")
        return pd.DataFrame()
    except (KeyError, TypeError, AttributeError, json.JSONDecodeError) as e:
        # AttributeError covers a pageProps value that is not a dict (no .get)
        print(f"解析頁面資料失敗，頁面結構可能已變更: {e}")
        return pd.DataFrame()


# # --- 測試範例 ---
# # 使用您的範例網址
# job_url = 'https://www.cakeresume.com/companies/qdm/jobs/2f3db0'
# job_data = fetch_job_data(job_url)
# job_data

In [None]:
# 根據關鍵字與職業類別 獲取所有工作職位的資料

# Search settings; FILE_NAME labels this run and is reused when exporting.
SEARCH_TIMESTAMP = time.strftime('%Y-%m-%d')  # local date of this run
KEYWORDS = "雲端工程師"
CATAGORY = "it"
FILE_NAME = f"({SEARCH_TIMESTAMP})_{WEB_NAME}_{KEYWORDS}_{CATAGORY}"

print(f"開始執行 {FILE_NAME}")
job_data_list = []
SEARCH_PAGE_URL = cake_me_url(KEYWORDS, CATAGORY)       # build the search URL
job_urls = fetch_job_url(SEARCH_PAGE_URL)               # list the posting URLs
with ThreadPoolExecutor(max_workers=3) as executor:     # fetch each posting's details
    futures = [executor.submit(fetch_job_data, url) for url in job_urls]
    # futures are in the same order as job_urls, so each result can be paired with its URL
    for url, future in tqdm(zip(job_urls, futures), total=len(futures)):
        result = future.result()
        if result is not None and not result.empty:  # skip postings that failed to load
            # Record the URL the row was fetched from. Building it from a fixed
            # company slug would mislabel postings of every other company.
            job_data_list.append(result.assign(job_url=url))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
if job_data_list:
    all_jobs_df = pd.concat(job_data_list, axis=0, ignore_index=True)
else:
    all_jobs_df = pd.DataFrame()

all_jobs_df.shape

In [None]:
# The `job_url` column is already filled in by the collection cell above.
# Re-deriving it here from a fixed company slug only repeated that step
# (and mutated the frame in place a second time), so this cell just displays the result.
all_jobs_df

In [None]:
# An earlier variant exported to CSV instead (to_csv with index=False and
# encoding='utf-8-sig'); it referred to a `merged_df` that is not defined in the visible cells.

# Write all collected postings to an Excel file named after this search run,
# report the file name, then preview the first row.
all_jobs_df.to_excel(f"{FILE_NAME}.xlsx", index=False)
print(f"已將所有職缺資料儲存到 {FILE_NAME}.xlsx")
all_jobs_df.head(1)

In [None]:
# Show the column labels of the collected job DataFrame.
all_jobs_df.columns

In [None]:
# Reference table of the job fields: running number, field name as it appears
# in the data, and a Chinese label. The records are generated from
# (field, label) pairs; the running number is the 1-based position in the list.
column_names = [
    {"序號": seq, "英文": english, "中文": chinese}
    for seq, (english, chinese) in enumerate(
        [
            ("path", "職缺頁面路徑"),
            ("title", "職缺標題"),
            ("profession_v2", "職業類別"),
            ("job_type", "工作類型"),  # e.g. full-time, part-time
            ("seniority_level", "資歷級別"),  # e.g. Entry, Mid, Senior
            ("min_work_exp_year", "最少工作經驗年資"),
            ("locations", "工作地點"),
            ("remote", "是否可遠端工作"),
            ("number_of_management", "管理職責人數"),
            ("number_of_openings", "招聘人數"),
            ("salary_min", "最低薪資"),
            ("salary_max", "最高薪資"),
            ("salary_currency", "薪資貨幣"),  # e.g. TWD, USD
            ("salary_type", "薪資類型"),  # e.g. monthly, yearly
            ("inclusivity_traits", "多元共融特質"),
            ("tags", "技能標籤"),
            ("description", "職缺描述"),
            ("requirements", "應徵條件"),
            ("category", "職缺類別(總)"),
            ("hide_salary_completely", "完全隱藏薪資"),
            ("hide_salary_max", "隱藏最高薪資"),
            ("year_of_seniority", "資歷年限(另一種)"),
            ("lang", "刊登語言"),
            ("signing_bonus", "簽約獎金"),
            ("unique_impression_count", "不重複瀏覽次數"),
            ("cached_votes_up", "讚數(快取)"),
            ("content_updated_at", "內容更新時間"),
            ("sponsored", "是否為贊助職缺"),
            ("external_url", "外部應徵連結"),
            ("impression_count", "總瀏覽次數"),
            ("interview_process", "面試流程"),
            ("created_at", "建立時間"),
            ("updated_at", "更新時間"),
            ("liked", "是否已收藏"),  # per-user state
            ("job_recruiters", "負責招募者"),
            ("job_questions", "應徵問題"),
            ("noindex", "禁止搜尋引擎索引"),
            ("processed_description", "處理過的職缺描述"),
            ("impression_token", "瀏覽令牌"),  # technical field
            ("common_applied_jobs", "常見相關應徵職缺"),
            ("aasm_state", "職缺狀態"),  # technical field (e.g. published, archived)
            ("metadata.images", "相關圖片"),
        ],
        start=1,
    )
]

# One row per field; the three columns are then relabelled for display.
df_new = pd.json_normalize(column_names)
df_new.columns = ["序號", "cake.me_英文", "cake.me_中文"]
df_new

In [None]:
#  前一版本
# column_names = [
#     {"序號": 1, "英文": "name", "中文": "公司名稱"},
#     {"序號": 2, "英文": "description", "中文": "公司簡介"},
#     {"序號": 3, "英文": "path", "中文": "公司頁面路徑"},
#     {"序號": 4, "英文": "unique_impression_count", "中文": "不重複曝光次數"},
#     {"序號": 5, "英文": "address", "中文": "公司地址"},
#     {"序號": 6, "英文": "country", "中文": "國家"},
#     {"序號": 7, "英文": "contact_name", "中文": "聯絡人姓名"},
#     {"序號": 8, "英文": "contact_phone", "中文": "聯絡電話"},
#     {"序號": 9, "英文": "aasm_state", "中文": "工作流程狀態"},
#     {"序號": 10, "英文": "content_updated_at", "中文": "內容更新時間"},
#     {"序號": 11, "英文": "twitter_handle", "中文": "Twitter 帳號"},
#     {"序號": 12, "英文": "email", "中文": "電子郵件"},
#     {"序號": 13, "英文": "phone", "中文": "電話"},
#     {"序號": 14, "英文": "work_environment", "中文": "工作環境"},
#     {"序號": 15, "英文": "employee_benefits", "中文": "員工福利"},
#     {"序號": 16, "英文": "products_or_services", "中文": "產品或服務"},
#     {"序號": 17, "英文": "mission", "中文": "公司使命"},
#     {"序號": 18, "英文": "media_coverage", "中文": "媒體報導"},
#     {"序號": 19, "英文": "cover_image", "中文": "封面圖片"},
#     {"序號": 20, "英文": "youtube_video_url", "中文": "YouTube 影片網址"},
#     {"序號": 21, "英文": "recruiter_contact_info", "中文": "招募人員聯絡資訊"},
#     {"序號": 22, "英文": "recruiting_website_url", "中文": "招募網站網址"},
#     {"序號": 23, "英文": "founded_year", "中文": "創立年份"},
#     {"序號": 24, "英文": "created_at", "中文": "建立時間"},
#     {"序號": 25, "英文": "updated_at", "中文": "更新時間"},
#     {"序號": 26, "英文": "company", "中文": "公司"},
#     {"序號": 27, "英文": "website_url", "中文": "公司網站網址"},
#     {"序號": 28, "英文": "geo_city", "中文": "地理位置-城市"},
#     {"序號": 29, "英文": "geo_formatted_address", "中文": "地理位置-格式化地址"},
#     {"序號": 30, "英文": "geo_state_code", "中文": "地理位置-州/省代碼"},
#     {"序號": 31, "英文": "geo_country_code", "中文": "地理位置-國家代碼"},
#     {"序號": 32, "英文": "geo_zip", "中文": "地理位置-郵遞區號"},
#     {"序號": 33, "英文": "featured", "中文": "精選狀態"},
#     {"序號": 34, "英文": "geo_state_name", "中文": "地理位置-州/省名稱"},
#     {"序號": 35, "英文": "geo_city_l", "中文": "地理位置-城市 (本地化)"},
#     {"序號": 36, "英文": "geo_formatted_address_l", "中文": "地理位置-格式化地址 (本地化)"},
#     {"序號": 37, "英文": "geo_state_name_l", "中文": "地理位置-州/省名稱 (本地化)"},
#     {"序號": 38, "英文": "geo_street_address_l", "中文": "地理位置-街道地址 (本地化)"},
#     {"序號": 39, "英文": "geo_l_locale", "中文": "地理位置-本地化語系"},
#     {"序號": 40, "英文": "sector", "中文": "產業類別"},
#     {"序號": 41, "英文": "facebook_url", "中文": "Facebook 網址"},
#     {"序號": 42, "英文": "linkedin_url", "中文": "LinkedIn 網址"},
#     {"序號": 43, "英文": "instagram_url", "中文": "Instagram 網址"},
#     {"序號": 44, "英文": "medium_url", "中文": "Medium 網址"},
#     {"序號": 45, "英文": "whatsapp_id", "中文": "WhatsApp ID"},
#     {"序號": 46, "英文": "geo_country_l", "中文": "地理位置-國家 (本地化)"},
#     {"序號": 47, "英文": "application_read_rate", "中文": "應徵讀取率"},
#     {"序號": 48, "英文": "application_read_time", "中文": "應徵讀取時間"},
#     {"序號": 49, "英文": "candidate_read_rate", "中文": "應徵者資料讀取率"},
#     {"序號": 50, "英文": "candidate_read_time", "中文": "應徵者資料讀取時間"},
#     {"序號": 51, "英文": "published_at", "中文": "發布時間"},
#     {"序號": 52, "英文": "logo", "中文": "公司 Logo"},
#     {"序號": 53, "英文": "company_overview", "中文": "公司概覽"},
#     {"序號": 54, "英文": "amount_of_capital", "中文": "資本額"},
#     {"序號": 55, "英文": "number_of_employees", "中文": "員工人數"},
#     {"序號": 56, "英文": "og_image", "中文": "OG 圖片 (社群分享預覽圖)"},
#     {"序號": 57, "英文": "ga_tracking_code", "中文": "Google Analytics 追蹤碼"},
#     {"序號": 58, "英文": "tax_id_number", "中文": "統一編號 / 稅務識別碼"},
#     {"序號": 59, "英文": "last_active_at", "中文": "最後活躍時間"},
#     {"序號": 60, "英文": "labels", "中文": "標籤"},
#     {"序號": 61, "英文": "teches", "中文": "使用技術"},
#     {"序號": 62, "英文": "faq_items", "中文": "常見問題項目"},
#     {"序號": 63, "英文": "work_environment_images", "中文": "工作環境圖片"},
#     {"序號": 64, "英文": "currency_symbol", "中文": "貨幣符號"},
#     {"序號": 65, "英文": "followed", "中文": "已追蹤"},
#     {"序號": 66, "英文": "followed_job_notification_type", "中文": "追蹤職缺通知類型"},
#     {"序號": 67, "英文": "total_followers", "中文": "總追蹤人數"},
#     {"序號": 68, "英文": "currency_code", "中文": "貨幣代碼"},
#     {"序號": 69, "英文": "seems_spam", "中文": "疑似垃圾訊息"},
#     {"序號": 70, "英文": "noindex", "中文": "禁止搜尋引擎索引"},
#     {"序號": 71, "英文": "impression_token", "中文": "曝光權杖 (Token)"},
#     {"序號": 72, "英文": "profession_job_counts", "中文": "各職業類別職缺數"},
#     {"序號": 73, "英文": "listed_job_count", "中文": "上架中職缺數"},
#     {"序號": 74, "英文": "sanitized_description", "中文": "純文字(淨化後)的公司簡介"},
#     {"序號": 75, "英文": "sanitized_products_or_services", "中文": "純文字(淨化後)的產品或服務"},
#     {"序號": 76, "英文": "sanitized_mission", "中文": "純文字(淨化後)的公司使命"},
#     {"序號": 77, "英文": "sanitized_media_coverage", "中文": "純文字(淨化後)的媒體報導"},
#     {"序號": 78, "英文": "sanitized_employee_benefits", "中文": "純文字(淨化後)的員工福利"},
#     {"序號": 79, "英文": "sanitized_work_environment", "中文": "純文字(淨化後)的工作環境"}
# ]

# df_new = pd.json_normalize(column_names)
# df_new.columns = ["序號", "cake.me_英文", "cake.me_中文"]
# df_new

In [None]:
# Supplement: inspecting the structure of a search-result page

url = "https://www.cake.me/jobs/雲端工程師"

# NOTE(review): no timeout is passed, so this call can block indefinitely if the server stalls.
response = requests.get(url)
response_soup = BeautifulSoup(response.text, 'html.parser')
# Rebinds the notebook-level name `scripts` (also assigned in the job-category cell)
# to the <script> tags of this page.
scripts = response_soup.find_all('script')


# 查看篩選欄位選項
# options = []
# catagorys = 1
# DropdownButton_contents = response_soup.find_all('div', 'JobSearchPage_searchFilter__ts_A0')
# for JobSearchPage_searchFilter in DropdownButton_contents:
#     title  = JobSearchPage_searchFilter.find('div', 'DropdownButton_content__XZwFf').text
#     # print(title)
#     Checkbox_texts = JobSearchPage_searchFilter.find_all('div', 'Checkbox_text__g6TLq')
#     for Checkbox in Checkbox_texts:
#         # print(Checkbox.text)
#         option_text = Checkbox.text.strip()
#         options.append({'catagorys':catagorys, 'title': title, 'option_text': option_text})
#     catagorys = catagorys +1
# df_searchFilter = pd.DataFrame(options)
# df_searchFilter
# df_searchFilter['option_text'][(df_searchFilter['catagorys']==1)]



# 提取資料結構 使用遞歸函數提取子結構
def extract_keys_with_branch_structure(value_in, current_path=''):
    """
    Map every key of a nested dict to its positional path.

    A path is the chain of 1-based key positions joined by "-": the second key
    of the dict stored under the first top-level key is "1-2". Only dict values
    are descended into; lists and scalars are recorded by key but not explored.
    A non-dict ``value_in`` produces an empty mapping.

    Parameters:
    value_in: The object to inspect (normally a dict).
    current_path (str): Path of the parent key; empty at the top level.

    Returns:
    dict: ``{positional path: key}``, parents inserted before their children.
    """
    if not isinstance(value_in, dict):
        return {}

    numbered_keys = {}
    for position, key in enumerate(value_in, start=1):
        label = f"{current_path}-{position}" if current_path else f"{position}"
        numbered_keys[label] = key  # record the key at this level

        # Descend into nested dicts and merge their numbered keys
        child = value_in[key]
        if isinstance(child, dict):
            numbered_keys.update(extract_keys_with_branch_structure(child, label))

    return numbered_keys

# Parse the JSON text of the last <script> tag — presumably the page's data payload; TODO confirm.
data = json.loads(scripts[-1].string)
# Map each nested dict key of the payload to its positional path (e.g. "1-2-1").
branch_structure = extract_keys_with_branch_structure(data)

# Display the value stored under ...['Job']['state']['query'] (presumably the search query).
# NOTE(review): this parses the same script text a second time instead of reusing `data`.
json.loads(scripts[-1].string)['props']['pageProps']['serverState']['initialResults']['Job']['state']['query']
