### 安裝模組

在終端機輸入以下指令：

```
pip install selenium requests
```

### 匯入模組

In [1]:
# Standard library
import base64
import json
import os
import re
from time import sleep

# Type annotations
from typing import *

# Web scraping
import requests
from requests.exceptions import InvalidSchema
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

### 定義 Class Crewling_Meme_Template

In [2]:
class Crewling_Meme_Template:
    """Scrape meme templates from an imgflip listing page with Selenium/Chrome.

    For every template on a listing page the crawler opens its "Add Caption"
    page, records the canvas size and the relative position of each text box,
    generates the image and saves it as ``<output_dir>/<template name>.png``.
    The collected metadata is kept in ``template_details``, keyed by the
    (file-name safe) template name.
    """

    # Characters removed from template names before they are used as file names.
    _INVALID_FILENAME_CHARS = re.compile(r'[\\/:"*?<>|]')

    # Matches "<property>: <number>px" pairs inside an inline style attribute.
    _PX_STYLE_PATTERN = re.compile(r"([a-zA-Z-]+)\s*:\s*(-?\d+(?:\.\d+)?)px")

    def __init__(self, url: str, output_dir: str = "./meme_templates") -> None:
        """Store the start URL and prepare the Chrome options.

        Args:
            url: Listing page the browser opens in ``open_webdriver``.
            output_dir: Folder the generated images are written to; it is
                created on demand.
        """
        self.url = url
        self.output_dir = output_dir

        # Metadata collected so far, keyed by template name.
        self.template_details = dict()

        # Set by open_webdriver(); None while no browser session is open.
        self.driver = None

        self.chrome_options = Options()
        self.chrome_options.add_argument("--disable-gpu")

        # Headless mode (disabled by default):
        # self.chrome_options.add_argument("--headless")

    @staticmethod
    def _sanitize_filename(name: str) -> str:
        """Return *name* with characters that are invalid in file names removed."""
        return Crewling_Meme_Template._INVALID_FILENAME_CHARS.sub('', name)

    @staticmethod
    def _parse_px_style(style: str) -> Dict[str, float]:
        """Parse an inline style such as ``"left: 254px; top: 254px;"``.

        Only ``<property>: <number>px`` pairs are extracted, so unrelated
        declarations and fractional pixel values do not break the parsing.
        """
        return {
            prop: float(value)
            for prop, value in Crewling_Meme_Template._PX_STYLE_PATTERN.findall(style)
        }

    def get_details(self) -> Dict[str, Dict[str, Any]]:
        """Return the metadata collected so far, keyed by template name."""
        return self.template_details

    def change_url(self, url: str) -> None:
        """Replace the URL that the next ``open_webdriver`` call will load."""
        self.url = url

    def open_webdriver(self) -> None:
        """Start Chrome, enable implicit waits and load ``self.url``."""
        # The driver path is not passed positionally: recent Selenium 4
        # releases no longer accept it, and leaving it out lets Selenium
        # locate chromedriver by itself.
        self.driver = webdriver.Chrome(options=self.chrome_options)

        # Warning: do not make the implicit wait too short.
        self.driver.implicitly_wait(20)

        # Load the page and pin the window size.
        self.driver.get(self.url)
        self.driver.set_window_size(1200, 800)
        sleep(0.5)

    def crewling(self, total_page: int) -> None:
        """Crawl ``total_page`` consecutive listing pages."""
        for _ in range(total_page):
            self.crewling_one_page()

    def crewling_one_page(self) -> None:
        """Crawl every template on the current listing page, then click "Next"."""
        # Count the templates shown on this page.
        boxes = self.driver.find_element(By.CLASS_NAME, "mt-boxes")
        total_template_num = len(boxes.find_elements(By.CLASS_NAME, "mt-box"))

        for index in range(total_template_num):
            self.crewling_one_template(index)

        self.driver.find_element(By.LINK_TEXT, "Next ›").click()
        sleep(0.5)

    def crewling_one_template(self, index: int) -> None:
        """Record and download the template at position ``index`` on the page.

        GIF templates are skipped. The browser is sent back to the listing
        page before returning.
        """
        sleep(2)

        # Look the template boxes up again on every call (presumably because
        # references obtained before navigating away are no longer usable).
        meme_parent = self.driver.find_element(By.CLASS_NAME, "mt-boxes")
        memes = meme_parent.find_elements(By.CLASS_NAME, "mt-box")
        meme = memes[index]

        # Template name, stripped of characters that cannot appear in file names.
        meme_name = self._sanitize_filename(meme.find_element(By.TAG_NAME, "a").text)
        sleep(0.5)

        # Open the "Add Caption" page of this template.
        meme.find_element(By.LINK_TEXT, "Add Caption").click()
        sleep(0.5)

        # Do not download GIF templates.
        if "GIF" in self.driver.find_element(By.TAG_NAME, "h1").text:
            self._go_back()
            return

        # Relative position of every text box on the canvas.
        canvas_size, positions = self.get_textareas_pos()

        # Record the metadata of this template.
        text_inputs = self.driver.find_element(By.CLASS_NAME, "mm-boxes")
        num_of_textareas = len(text_inputs.find_elements(By.CLASS_NAME, "mm-text"))
        self.template_details[meme_name] = {
            "url": self.driver.current_url,
            "canvas_size": canvas_size,
            "num_of_textareas": num_of_textareas,
            "positions": positions,
        }

        # Click the "Generate" button (absolute XPath, tied to the page layout).
        sleep(0.5)
        download_btn = self.driver.find_element(
            By.XPATH, r"/html/body/div[3]/div[2]/div[2]/div[10]/div[5]/button[1]")
        download_btn.click()

        # Locate the generated image and read its source.
        sleep(1)
        img_url = self.driver.find_element(By.ID, "done-img").get_attribute("src")
        sleep(0.5)

        self._save_image(img_url, meme_name)

        # Return to the listing page.
        sleep(0.5)
        self._go_back()

    def _save_image(self, img_url: str, meme_name: str) -> None:
        """Write the image behind ``img_url`` to ``<output_dir>/<meme_name>.png``.

        ``img_url`` is either a regular URL, fetched with ``requests``, or a
        Base64 ``data:`` URL, decoded locally.

        Raises:
            requests.RequestException: If the download fails or times out.
        """
        if img_url.startswith("data:"):
            # "data:image/png;base64,<payload>": keep everything after the comma.
            image_bytes = base64.b64decode(img_url.split(",", 1)[1])
        else:
            response = requests.get(img_url, timeout=30)
            # Do not save an HTTP error page as if it were the image.
            response.raise_for_status()
            image_bytes = response.content

        # open() does not create missing folders.
        os.makedirs(self.output_dir, exist_ok=True)
        with open(os.path.join(self.output_dir, f"{meme_name}.png"), "wb") as file:
            file.write(image_bytes)

    def _go_back(self) -> None:
        """Navigate back one page; a browser error is reported, not raised."""
        try:
            self.driver.back()
        except WebDriverException as error:
            print(f"Could not navigate back: {error}")
        sleep(0.5)

    def get_textareas_pos(self) -> Tuple[Dict[str, float], Dict[int, Dict[str, float]]]:
        """Return the canvas size and the relative centre of every text box.

        Returns:
            ``(canvas_size, positions)``. ``canvas_size`` holds the ``width``
            and ``height`` attributes of the canvas. ``positions`` maps a key
            to ``{"x": ..., "y": ...}``, the centre of a text box divided by
            the canvas width/height and rounded to two decimals. Keys count
            down from the number of boxes to 1 in the order the boxes are
            found.
        """
        canvas_box = self.driver.find_element(By.CLASS_NAME, "mm-preview")

        # Image size, taken from the canvas attributes.
        canvas = self.driver.find_element(By.TAG_NAME, "canvas")
        width = int(canvas.get_attribute("width"))
        height = int(canvas.get_attribute("height"))
        canvas_size = {"width": width, "height": height}

        # Hover over the image first (because the class name changes).
        ActionChains(self.driver).move_to_element(canvas_box).perform()
        sleep(0.5)

        text_boxes = canvas_box.find_elements(By.CLASS_NAME, "drag-box")
        total = len(text_boxes)

        positions = {}
        for index, text_box in enumerate(text_boxes):
            # e.g. "left: 254px; top: 254px; width: 239px; height: 239px;"
            styles = self._parse_px_style(text_box.get_attribute("style") or "")

            # Centre of the box relative to the canvas size.
            x_ratio = round((styles["left"] + styles["width"] / 2) / width, 2)
            y_ratio = round((styles["top"] + styles["height"] / 2) / height, 2)

            positions[total - index] = {"x": x_ratio, "y": y_ratio}

        return canvas_size, positions

    def close(self) -> None:
        """Quit the browser session; does nothing when no session is open."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        finally:
            self.driver = None

### 主程式

In [3]:
# 要執行網路爬蟲時，可直接用此函式
def start_crewling(crewling_class: "Crewling_Meme_Template", total_page: int) -> None:
    """Open the browser, crawl ``total_page`` listing pages, then close it.

    Args:
        crewling_class: Crawler instance to run.
        total_page: Number of consecutive listing pages to crawl.

    The browser is closed even when crawling raises, so a failed run does not
    leave a browser session behind; the exception still propagates.
    """
    crewling_class.open_webdriver()
    try:
        crewling_class.crewling(total_page)
    finally:
        crewling_class.close()

In [4]:
# Listing page the crawl starts from
url = "https://imgflip.com/memetemplates"

# Number of listing pages to crawl
total_page = 2

# Create the crawler and run it
crewling_meme = Crewling_Meme_Template(url)
start_crewling(crewling_meme, total_page)

##### 爬蟲時間越久越不穩定，所以建議執行一段時間後，更換網址再繼續

In [9]:
# New start URL: continue from page 2
url = "https://imgflip.com/memetemplates?page=2"

# Number of listing pages to crawl
total_page = 1

# Point the existing crawler at the new URL
crewling_meme.change_url(url)

# Run the crawler again
start_crewling(crewling_meme, total_page)

WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=113.0.5672.127)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x010E8893+48451]
	(No symbol) [0x0107B8A1]
	(No symbol) [0x00F85058]
	(No symbol) [0x00F77BB6]
	(No symbol) [0x00F778D9]
	(No symbol) [0x00F86460]
	(No symbol) [0x00FDC038]
	(No symbol) [0x00FCA0B6]
	(No symbol) [0x00FA7E08]
	(No symbol) [0x00FA8F2D]
	GetHandleVerifier [0x01348E3A+2540266]
	GetHandleVerifier [0x01388959+2801161]
	GetHandleVerifier [0x0138295C+2776588]
	GetHandleVerifier [0x01172280+612144]
	(No symbol) [0x01084F6C]
	(No symbol) [0x010811D8]
	(No symbol) [0x010812BB]
	(No symbol) [0x01074857]
	BaseThreadInitThunk [0x75A800C9+25]
	RtlGetAppContainerNamedObjectPath [0x77AD7B4E+286]
	RtlGetAppContainerNamedObjectPath [0x77AD7B1E+238]


##### 儲存詳細資訊

In [5]:
# Save the metadata collected by the crawler as a JSON file.
details = crewling_meme.get_details()

details_dir = "./template_details"

# Create the folder first: open() does not create missing directories.
os.makedirs(details_dir, exist_ok=True)

# UTF-8 with ensure_ascii=False keeps non-ASCII template names readable.
with open(os.path.join(details_dir, "template_details-1.json"), "w", encoding="utf-8") as json_file:
    json.dump(details, json_file, ensure_ascii=False)

### 如果上面那些看不懂，可以看看下方的程式碼，有稍微簡化一些了

In [None]:
# Simplified single-cell crawler: for every listing page, open each template's
# "Add Caption" page, generate the image and save it as a PNG file.
url = "https://imgflip.com/memetemplates"
page_num = 3
output_dir = "./meme_templates"

chrome_options = Options()
chrome_options.add_argument("--disable-gpu")  # disable the GPU

# Start the browser and enable implicit waits.
# Warning: do not make the implicit wait too short.
# The driver path is not passed positionally: recent Selenium 4 releases no
# longer accept it, and leaving it out lets Selenium locate chromedriver.
driver = webdriver.Chrome(options=chrome_options)
driver.implicitly_wait(20)

try:
    # open() does not create missing folders.
    os.makedirs(output_dir, exist_ok=True)

    # Load the page and pin the window size.
    driver.get(url)
    driver.set_window_size(1200, 800)

    for _ in range(page_num):
        num = len(driver.find_element(By.CLASS_NAME, "mt-boxes").find_elements(By.CLASS_NAME, "mt-box"))

        for i in range(num):
            # Parent div of the template list
            meme_parent = driver.find_element(By.CLASS_NAME, "mt-boxes")

            # All template divs below the parent
            memes = meme_parent.find_elements(By.CLASS_NAME, "mt-box")

            # Template name, without characters that are invalid in file names
            meme_name = memes[i].find_element(By.TAG_NAME, "a").text
            meme_name = re.sub(r'[\\/:"*?<>|]', '', meme_name)

            # Click the "Add Caption" link
            memes[i].find_element(By.LINK_TEXT, "Add Caption").click()
            sleep(0.5)

            # Do not download GIF templates
            if "GIF" in driver.find_element(By.TAG_NAME, "h1").text:
                driver.back()
                sleep(0.5)
                continue

            # Click the generate/download button (absolute XPath, tied to the page layout)
            download_btn = driver.find_element(By.XPATH, r"/html/body/div[3]/div[2]/div[2]/div[10]/div[5]/button[1]")
            download_btn.click()

            # Locate the generated image and read its source
            img = driver.find_element(By.ID, "done-img")
            img_url = img.get_attribute("src")
            sleep(0.5)

            # img_url comes in two forms:
            # 1. a regular URL      => fetch it with requests
            # 2. a Base64 data URL  => decode the payload after the comma
            if img_url.startswith("data:"):
                image_bytes = base64.b64decode(img_url.split(",", 1)[1])
            else:
                response = requests.get(img_url, timeout=30)
                # Do not save an HTTP error page as if it were the image.
                response.raise_for_status()
                image_bytes = response.content

            # Write the image
            with open(os.path.join(output_dir, f"{meme_name}.png"), "wb") as file:
                file.write(image_bytes)
            sleep(0.5)

            # Go back to the listing page
            driver.back()
            sleep(0.5)

        btn_next_page = driver.find_element(By.LINK_TEXT, "Next ›")
        btn_next_page.click()
        sleep(0.5)
finally:
    # quit() ends the whole browser session, not only the current window,
    # and runs even when the crawl fails part-way.
    driver.quit()