In [None]:
#  相關套件

import os
import random
import re
import time
from urllib.parse import quote, urlencode, urljoin, urlparse

from selenium import webdriver
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

WEB_NAME = "linkedin"

In [None]:
# selenium 模擬器


def linkedin_login(email, password):
    """
    Open a Chrome session and submit the LinkedIn login form.

    The function fills the ``username`` and ``password`` fields on
    https://www.linkedin.com/login and clicks the submit button. It does not
    verify that the login actually succeeded (e.g. it does not detect a
    captcha or a wrong-password page).

    Args:
        email (str): LinkedIn account e-mail address.
        password (str): LinkedIn account password.

    Returns:
        webdriver.Chrome: The browser driver used to submit the form, for use
        by subsequent scraping calls.

    Raises:
        Exception: Any error raised by Selenium while loading the page or
        locating the form elements is re-raised after the browser is closed.
    """
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.linkedin.com/login")

        driver.find_element(By.ID, "username").send_keys(email)
        driver.find_element(By.ID, "password").send_keys(password)
        driver.find_element(By.XPATH, '//button[@type="submit"]').click()
    except Exception:
        # The caller never receives the driver on failure, so close the
        # browser here instead of leaking a Chrome process.
        driver.quit()
        raise

    return driver  # handed back so later steps can reuse the session


# Test accounts.
# NOTE(review): credentials are committed in plain text. Prefer supplying
# LINKEDIN_EMAIL / LINKEDIN_PASSWORD through the environment; the literals
# below are kept only as fallbacks so existing runs behave as before.
# Alternate account: "josop77094@daxiake.com"  (Bronze No. 1)
LINKEDIN_EMAIL = os.environ.get(
    "LINKEDIN_EMAIL", "venofo6451@nomrista.com"  # Bronze No. 2
)
LINKEDIN_PASSWORD = os.environ.get("LINKEDIN_PASSWORD", "!!Qwerasdf1234")

driver = linkedin_login(LINKEDIN_EMAIL, LINKEDIN_PASSWORD)

In [None]:
# 產生linkedin網址 https://www.linkedin.com 根據提供的(關鍵字和職缺類別) 轉換為職缺網址


def catch_linkedin_url(KEYWORDS):
    """
    Build the LinkedIn job-search URL for a keyword.

    The location is fixed to Taiwan (``location=台灣``, ``geoId=104187078``).
    Every query value is percent-encoded, so keywords containing spaces,
    ``&``, ``#`` or non-ASCII characters cannot break the query string.

    Args:
        KEYWORDS (str): Search keyword, e.g. "雲端工程師".

    Returns:
        str: The search URL. Further parameters can be appended with ``&``.
    """
    query = urlencode(
        {
            "keywords": KEYWORDS,
            "location": "台灣",
            "geoId": "104187078",
        },
        # quote (not the default quote_plus) encodes spaces as %20.
        quote_via=quote,
    )
    return f"https://www.linkedin.com/jobs/search?{query}"


# 測試範例
# KEYWORDS = "雲端工程師"
# KEYWORDS_url = catch_linkedin_url(KEYWORDS)
# KEYWORDS_url

In [None]:
# 從 linkedin 職缺網址 獲取工作職缺的網址


def fetch_jobs_url(driver, KEYWORDS_url):
    """
    Collect the job postings listed on each result page of a LinkedIn search.

    Result pages are requested by appending ``&start=<offset>`` to
    ``KEYWORDS_url`` in steps of 25. On every page the result list is scrolled
    so that its cards are present in the page source, the page count is
    refreshed from the pager text, and each job card is recorded once.

    Args:
        driver (WebDriver): Selenium driver used to load the pages
            (presumably already logged in — verify against caller).
        KEYWORDS_url (str): Search URL that already contains a query string,
            such as the one returned by ``catch_linkedin_url``.

    Returns:
        list[dict]: One dictionary per distinct job with the keys
            - "Job_id": the job identifier.
            - "Aria-label": the ``aria-label`` of the card's link.
            - "Href": ``https://www.linkedin.com/jobs/view/<Job_id>/``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the scrollable
        results container cannot be found on a page.
    """

    job_data = []
    job_id_set = set()
    # Start with a single page; the real count is read from the pager below.
    # If no pager is found the loop therefore stops after the first page.
    total_pages = 1
    # tqdm renders the progress bar
    with tqdm(total=total_pages, desc="處理中", unit="頁") as pbar:
        PAGE = 0
        while PAGE < total_pages:
            start_index = PAGE * 25
            search_url = f"{KEYWORDS_url}&start={start_index}"

            driver.get(search_url)
            time.sleep(random.uniform(0.8, 1.2))

            # NOTE(review): this class name looks machine-generated and may
            # change between LinkedIn releases — confirm it is still valid.
            target_div = driver.find_element(
                By.CLASS_NAME, "SBrGWlVKspoSNUDUmljgjbmnJKzmLbaCHZeg"
            )
            scroll_to_bottom(driver, target_div)
            time.sleep(random.uniform(0.5, 0.9))

            html_content_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Total page count: the last number in the pager text. Keep the
            # previously known value when the pager is missing or unreadable.
            pagination = html_content_soup.find(
                "p", "jobs-search-pagination__page-state"
            )
            if pagination is not None:
                page_numbers = re.findall(r"\d+", pagination.get_text(strip=True))
                if page_numbers:
                    total_pages = int(page_numbers[-1])

            for job_card in html_content_soup.find_all("div", class_="job-card-list"):
                job_link = job_card.find("a", class_="job-card-container__link")
                if job_link is None:
                    continue  # card without a link: nothing to record

                job_id = job_card.get("data-job-id")
                if job_id == "search":
                    # Some cards carry the placeholder "search"; the real id
                    # is then the currentJobId query parameter of the link.
                    id_match = re.search(
                        r"currentJobId=([^&]+)", job_link.get("href") or ""
                    )
                    job_id = id_match.group(1) if id_match else None

                if not job_id or job_id in job_id_set:
                    continue

                job_id_set.add(job_id)
                job_data.append(
                    {
                        "Job_id": job_id,
                        "Aria-label": job_link.get("aria-label"),
                        "Href": f"https://www.linkedin.com/jobs/view/{job_id}/",
                    }
                )

            PAGE += 1
            pbar.total = total_pages
            pbar.update(1)

    return job_data


# 滾動到頁面底部的函數
def scroll_to_bottom(driver, target_div):
    """
    Scroll an element down in ten equal steps with a short pause after each.

    The element's ``scrollHeight`` is read once up front and a tenth of it is
    added to ``scrollTop`` on every step.

    Args:
        driver (WebDriver): Selenium driver used to run the scroll scripts.
        target_div (WebElement): The scrollable element.
    """
    full_height = driver.execute_script(
        "return arguments[0].scrollHeight", target_div
    )
    step = full_height / 10

    steps_left = 10
    while steps_left > 0:
        driver.execute_script(
            "arguments[0].scrollTop += arguments[1]", target_div, step
        )
        # Randomised pause between scroll steps.
        time.sleep(random.uniform(0.5, 0.9))
        steps_left -= 1


# 測試網址_關鍵字

# KEYWORDS = "雲端工程師"
# KEYWORDS_url = catch_linkedin_url(KEYWORDS)

# job_data = fetch_jobs_url(driver, KEYWORDS_url)
# df_job_data = pd.DataFrame(job_data)
# df_job_data.shape

In [None]:
# 從指定的職缺網址獲取職缺的相關數據


def fetch_job_data(driver, url):
    """
    Load one job posting page and extract its main fields.

    Fields whose element is not present in the page source are returned as
    ``None`` instead of raising.

    Args:
        driver (WebDriver): Selenium driver used to load the page.
        url (str): Job posting URL whose last path segment is the job id,
            e.g. ``https://www.linkedin.com/jobs/view/<id>/``.

    Returns:
        pandas.DataFrame: A single-row frame with the columns "Job ID",
        "Title", "Company", "Description", "Job Insight", "Job Description",
        "Company Category" and "Company Website".
    """

    def extract_text(soup, selector, class_name):
        """Return the stripped text of the first matching tag, or None."""
        element = soup.find(selector, class_=class_name)
        return element.get_text(strip=True) if element is not None else None

    # Job id = last path segment, with or without a trailing slash; any query
    # string or fragment is ignored.
    Job_id = urlparse(url).path.rstrip("/").rsplit("/", 1)[-1]
    driver.get(url)

    job_content_soup = BeautifulSoup(driver.page_source, "html.parser")

    # Header fields of the posting
    title = extract_text(job_content_soup, "h1", "t-24")
    company = extract_text(
        job_content_soup, "div", "job-details-jobs-unified-top-card__company-name"
    )
    description = extract_text(
        job_content_soup,
        "div",
        "job-details-jobs-unified-top-card__tertiary-description-container",
    )
    job_insight = extract_text(
        job_content_soup, "li", "job-details-jobs-unified-top-card__job-insight"
    )

    # Full job description: the "mt4" div inside the description container.
    # Either level may be missing, so guard both lookups.
    job_desc = job_content_soup.find("div", "jobs-description__content")
    job_desc_body = job_desc.find("div", "mt4") if job_desc is not None else None
    job_desc_text = (
        job_desc_body.get_text(strip=True) if job_desc_body is not None else None
    )

    # Company category
    compan_catagoly = extract_text(job_content_soup, "div", "t-14 mt5")

    # Company page link; urljoin handles both relative and absolute hrefs.
    compan_website = None
    website_element = job_content_soup.find("a", class_="pv3")
    if website_element is not None:
        website_href = website_element.get("href")
        if website_href is not None:
            compan_website = urljoin("https://www.linkedin.com", website_href)

    company_job_data = {
        "Job ID": Job_id,
        "Title": title,
        "Company": company,
        "Description": description,
        "Job Insight": job_insight,
        "Job Description": job_desc_text,
        "Company Category": compan_catagoly,
        "Company Website": compan_website,
    }

    return pd.json_normalize(company_job_data)


# 測試範例  輸出 Json
# job_url = df_job_data['Href'][0]
# job_data = fetch_job_data(driver, job_url)
# job_data

In [None]:
# Run the full pipeline for one keyword: build the search URL, collect the
# job links from the result pages, scrape every posting and save one CSV.

SEARCH_TIMESTAMP = time.strftime("%Y-%m-%d")  # local date of this run

KEYWORDS = "雲端工程師"
FILE_NAME = f"({SEARCH_TIMESTAMP})_{WEB_NAME}_{KEYWORDS}"

print(f"開始執行 {FILE_NAME}")
KEYWORDS_url = catch_linkedin_url(KEYWORDS)
# fetch_jobs_url needs the logged-in driver and the search URL (not the raw
# keyword); it returns dictionaries whose "Href" key holds the posting URL.
job_urls = fetch_jobs_url(driver, KEYWORDS_url)

# Scrape each posting into its own single-row DataFrame.
job_data_list = [
    fetch_job_data(driver, job["Href"])
    for job in tqdm(job_urls, desc="Fetching job data", unit="job")
]

# Concatenate once at the end; pd.concat rejects an empty list, so fall back
# to an empty frame when no jobs were found.
all_jobs_df = (
    pd.concat(job_data_list, ignore_index=True) if job_data_list else pd.DataFrame()
)

# Save
all_jobs_df.to_csv(f"{FILE_NAME}.csv", encoding="utf-8-sig")
print(f"存檔完成 : {FILE_NAME}.csv")

In [None]:
# Scratch cell: load one search-result page and print what can be parsed
# from it (job count, page count and the job cards).

search_url = "https://www.linkedin.com/jobs/search?keywords=雲端工程師&location=台灣&geoId=104187078&start=125"
driver.get(search_url)
time.sleep(random.uniform(2.5, 6))

html_content = driver.page_source
html_content_soup = BeautifulSoup(html_content, "html.parser")


# Total number of postings: first number in the results subtitle (0 when the
# subtitle is missing or contains no digits).
# NOTE(review): a count rendered with thousands separators ("1,234") would
# be read as 1 — confirm the format LinkedIn actually uses.
subtitle_element = html_content_soup.find(
    "div", class_="jobs-search-results-list__subtitle"
)
jobs_search_results = (
    subtitle_element.get_text(strip=True) if subtitle_element is not None else ""
)
jobs = re.findall(r"(\d+)", jobs_search_results)
total_jobs = int(jobs[0]) if jobs else 0


# Total number of pages: last number in the pager text. Without a readable
# pager, fall back to a single page (assumption — confirm).
pagination_element = html_content_soup.find(
    "p", "jobs-search-pagination__page-state"
)
jobs_search_pagination = (
    pagination_element.get_text(strip=True) if pagination_element is not None else ""
)
pages = re.findall(r"\d+", jobs_search_pagination)
final_page = int(pages[-1]) if pages else 1

job_cards = html_content_soup.find_all("div", class_="job-card-container")
for job_card in job_cards:
    job_id = job_card.get("data-job-id")
    if job_id is None:
        continue  # no id means no link can be built for this card
    print(f"Job ID: {job_id}")

    href = "https://www.linkedin.com/jobs/view/" + job_id
    print(f"Job Link: {href}")

    job_link = job_card.find("a", class_="job-card-container__link")
    aria_label = job_link.get("aria-label") if job_link is not None else None

    print(f"Aria Label: {aria_label}")
    print("-" * 40)

print(f"總職缺數: {total_jobs}, 總頁數: {final_page}")