In [2]:
import header
import logging

from selenium import webdriver 
from selenium.webdriver.support.ui import WebDriverWait
import SeleniumUtil

import datetime
import pandas as pd
import traceback
from bs4 import BeautifulSoup

import os
import requests

In [3]:
# Directory handed to the Chrome download helper as its temporary download path.
TempPath = "./Temp/"  # browser file
# Directory handed to the download helper as its final path; the result CSVs are also written here.
FinalPath = "./Result/" # project file
# Directory (not a file path) and base file name, without ".csv", under which
# the title list of the latest crawl is saved.
lastResultPath = "./CrawlList/"
lastResultName = "lastResult"

In [4]:
def _extractLabelledField(text, label):
    """Return the raw text that follows `label`, or None when `label` is absent.

    The value is cut at the next full-width colon minus 4 characters; those 4
    characters are presumably the label of the following field (e.g. "裁判日期")
    -- TODO confirm against the real page layout. When no further colon exists
    the slice is empty, so an empty string is returned.
    """
    labelPos = text.find(label)
    if labelPos == -1:
        return None
    valueStart = labelPos + len(label)
    valueEnd = valueStart + text[valueStart:].find("：") - 4
    return text[valueStart:valueEnd]


def _relatedLawsFromTable(soup):
    """Return the lines of the first <td> element, or an empty list when there is none."""
    cells = soup.select("td")
    if not cells:
        return []
    return cells[0].text.split("\n")


def getDetailFromContent(soup, tempMap, tabNumber, subColName):
    """Fill `tempMap` with the serial number, date and related laws of a detail page.

    Parameters
    ----------
    soup : parsed detail page; only `select(css)` and the `.text` of the
        returned elements are used.
    tempMap : dict that receives the extracted values (mutated and returned).
    tabNumber : 19 selects the "order" layout (serial number between "日" and
        "號令", date from `#ctl00_cphMain_lblndate`); any other value selects
        the labelled layout ("<subColName>字號：", "<subColName>日期：",
        "相關法條：") inside the first <pre>.
    subColName : column prefix ("發文", "裁判", "提案"); used both for the labels
        searched in the text and for the keys written to `tempMap`.

    Returns
    -------
    dict : `tempMap` with keys `subColName + "字號"`, `subColName + "日期"` and
        "相關法條".

    Raises
    ------
    IndexError : when the page has no <pre> element (or, for tab 19, no date
        element).
    """
    temp = soup.select("pre:nth-of-type(1)")[0].text.strip()

    if tabNumber == 19:
        # Serial number: the text between the first "日" and the last "號令".
        serialNumber = temp.replace("\n", "").strip()
        strPos = serialNumber.find("日")
        endPos = serialNumber.rfind("號令")
        # `and`, not `&`: the bitwise operator binds tighter than `!=`, which
        # made the original test ignore a missing "日".
        if strPos != -1 and endPos != -1:
            serialNumber = serialNumber[strPos + 1:endPos + 2]
        else:
            serialNumber = ""

        # Issue date
        date = soup.select("#ctl00_cphMain_lblndate")[0].text.strip()

        # Related laws
        relatedLaws = _relatedLawsFromTable(soup)
    else:
        # Serial number
        serialNumber = (_extractLabelledField(temp, subColName + "字號：") or "").strip()

        # Date
        date = (_extractLabelledField(temp, subColName + "日期：") or "").strip()

        # Related laws: prefer the labelled section, fall back to the first <td>.
        lawsText = _extractLabelledField(temp, "相關法條：")
        if lawsText is not None:
            relatedLaws = lawsText.split("\n")
        else:
            relatedLaws = _relatedLawsFromTable(soup)

    # Drop blank lines so the joined string has no empty ", " entries.
    relatedLaw = ", ".join(law.strip() for law in relatedLaws if law.strip())

    # Keys follow subColName so they match the columns the caller declares.
    tempMap[subColName + "字號"] = serialNumber
    tempMap[subColName + "日期"] = date
    tempMap["相關法條"] = relatedLaw

    return tempMap

In [5]:
def request2soup(url, timeout = 30):
    """Download `url` and return it parsed with BeautifulSoup's "html.parser".

    Parameters
    ----------
    url : address of the page to fetch.
    timeout : seconds to wait for the server before giving up, so that one
        unresponsive page cannot block the whole crawl.

    Raises
    ------
    requests.RequestException : on connection problems, timeouts or an HTTP
        error status (instead of silently parsing an error page).
    """
    res = requests.get(url, timeout = timeout)
    res.raise_for_status()
    res.encoding = "utf-8"
    # res.text is already decoded, so no from_encoding is passed: BeautifulSoup
    # ignores that argument for str input and only emits a warning.
    return BeautifulSoup(res.text, "html.parser")

In [6]:
def parsingDetail(df, FinalPath, tabNumber):
    """Fetch every page listed in df["網頁連結"] and collect its details.

    Parameters
    ----------
    df : DataFrame with a "網頁連結" column holding the detail page URLs.
    FinalPath : accepted for interface compatibility; not used by this function.
    tabNumber : 20 names the serial/date columns "裁判...", 22 "提案...", any
        other value "發文..."; it is also forwarded to getDetailFromContent.

    Returns
    -------
    DataFrame : one row per successfully parsed page with the columns "標題",
        "全文內容", "<prefix>字號", "<prefix>日期", "相關法條", "附件" (plus any
        further keys the detail parser produced). Pages that fail are logged
        and skipped.
    """
    if tabNumber == 20:
        subColName = "裁判"
    elif tabNumber == 22:
        subColName = "提案"
    else:
        subColName = "發文"

    columns = ["標題", "全文內容", subColName + "字號", subColName + "日期", "相關法條", "附件"]

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    records = []

    for link in df["網頁連結"]:
        try:
            print("擷取網址：" + link)
            soup = request2soup(link)

            # Subject
            title = soup.select("#ctl00_cphMain_lblSubject")[0].text.strip()

            # Full text
            content = soup.select("#pageNews")[0].text.strip()

            tempMap = {"標題" : title,
                       "全文內容" : content,
                       "附件" : ""}

            records.append(getDetailFromContent(soup, tempMap, tabNumber, subColName))

            print("爬取成功")
        except Exception:
            # Best effort: report the failing link and continue with the next one.
            print("爬取內文失敗")
            print("失敗連結：" + link)
            logging.error("爬取內文失敗")
            logging.error("失敗連結：" + link)
            traceback.print_exc()

        print("\n")

    df_detail = pd.DataFrame(records)
    # Declared columns first; keys outside that list are kept at the end, as
    # the former row-wise append did.
    extraColumns = [c for c in df_detail.columns if c not in columns]
    return df_detail.reindex(columns = columns + extraColumns)

In [7]:
def outputCsv(df, fileName, path):
    """Write `df` to "<path>/<fileName>.csv" (UTF-8 with BOM, no index column).

    Parameters
    ----------
    df : DataFrame to save.
    fileName : file name without the ".csv" extension.
    path : target directory, with or without a trailing separator; it is
        created, including missing parent directories, when it does not exist.
    """
    # makedirs handles nested directories and an already existing directory.
    if path:
        os.makedirs(path, exist_ok = True)
    # os.path.join keeps the result correct when `path` has no trailing slash.
    df.to_csv(os.path.join(path, fileName + ".csv"), index = False, encoding = "utf_8_sig")

In [8]:
def parsingTitle(driver, checkRange):
    """Collect the titles published within the last `checkRange` days.

    Walks the listing page by page, keeps the leading rows whose date is not
    older than today minus `checkRange` days, saves that list as the new "last
    result" CSV and returns only the rows that were not in the previous one.

    Parameters
    ----------
    driver : Selenium WebDriver already showing the listing page.
    checkRange : number of days to look back from today.

    Returns
    -------
    DataFrame : columns "爬文日期", "發文日期", "標題", "網頁連結" with a 0..n-1
        index; empty when nothing is new or when the crawl failed.
    """
    columns = ["爬文日期", "發文日期", "標題", "網頁連結"]
    cssSelector = "css selector"  # value of selenium's By.CSS_SELECTOR

    try:
        # Load the previous crawl result. lastResultPath is only the directory,
        # so the file name has to be appended before testing for the file.
        lastResultFile = os.path.join(lastResultPath, lastResultName + ".csv")
        if os.path.isfile(lastResultFile):
            lastResult = pd.read_csv(lastResultFile, dtype = str, encoding = "utf_8_sig")
        else:
            lastResult = pd.DataFrame()

        # Crawl window: from today back `checkRange` days.
        endDate = datetime.date.today()
        strDate = (endDate - datetime.timedelta(days = checkRange)).isoformat()
        crawlDate = str(endDate.year) + "/" + str(endDate.month) + "/" + str(endDate.day)

        pages = []
        pageNo = 1
        while True:
            try:
                # find_elements(by, value) exists in Selenium 3 and 4; the
                # find_elements_by_* shortcuts were removed in Selenium 4.3.
                dates = [x.text for x in driver.find_elements(cssSelector, ".tdDate")]
                titles = [x.text for x in driver.find_elements(cssSelector, ".tdSubject")]
                links = [x.get_attribute("href") for x in driver.find_elements(cssSelector, ".tdSubject a")]

                # Count the leading rows that are still inside the window.
                # NOTE(review): this is a plain string comparison against an
                # ISO "YYYY-MM-DD" date; it is only correct if the listing
                # uses the same format -- confirm against the live page.
                keepCount = 0
                ending = False
                for date in dates:
                    if date < strDate:  # older than the window: stop collecting
                        ending = True
                        break
                    keepCount += 1

                d = {"爬文日期" : [crawlDate] * len(dates), "發文日期" : dates, "標題" : titles, "網頁連結" : links}
                pageRows = pd.DataFrame(data = d).iloc[:keepCount]
                if not pageRows.empty:
                    pages.append(pageRows)

                # An out-of-window row was reached: skip the remaining pages.
                if ending:
                    break

                # Next page; on the last page the button has no href.
                nextButtons = driver.find_elements(cssSelector, "#ctl00_cphMain_PagerTop_butNext")
                if not nextButtons or nextButtons[0].get_attribute("href") is None:
                    break
                nextButtons[0].click()
                pageNo += 1
            except Exception:
                print("爬取第 %s 頁主旨發生錯誤" %str(pageNo))
                logging.error("爬取第 %s 頁主旨發生錯誤" %str(pageNo))
                traceback.print_exc()
                # Stop instead of retrying the same page forever.
                break

        if pages:
            df = pd.concat(pages, ignore_index = True)
        else:
            df = pd.DataFrame(columns = columns)
        outputCsv(df, lastResultName, lastResultPath)

        if not lastResult.empty and not df.empty:
            # Drop rows whose date and title already appeared in the last crawl.
            seen = set(zip(lastResult["發文日期"], lastResult["標題"]))
            isNew = pd.Series([(d, t) not in seen for d, t in zip(df["發文日期"], df["標題"])],
                              index = df.index, dtype = bool)
            df = df.loc[isNew].reset_index(drop = True)

        if len(df) == 0:
            print("%s 至 %s 間無資料更新" %(strDate, endDate))
            logging.critical("%s 至 %s 間無資料更新" %(strDate, endDate))

        return df

    except Exception:
        print("爬取主旨列表失敗")
        logging.error("爬取主旨列表失敗")
        traceback.print_exc()
        return pd.DataFrame(columns = columns)
  

In [11]:
def main(url, tabNumber, checkRange = 7):
    """Run one crawl: title list first, then the detail page of every new title.

    Parameters
    ----------
    url : base search URL; `tabNumber` is appended to it.
    tabNumber : integer from 19 to 22 selecting the tab to crawl.
    checkRange : number of days to look back (default 7).

    The title list is written to "第一層結果.csv" and the details to
    "第二層結果.csv" under FinalPath. Failures during the crawl are logged, not
    raised; start time, end time, duration and row count are logged at
    CRITICAL level.
    """
    logging.critical("\n")
    logging.critical("爬網開始......")
    logging.critical("目標網址：" + url)

    strTime = datetime.datetime.now()
    logging.critical("開始時間：" + strTime.strftime("%Y/%m/%d %H:%M:%S"))

    DownloadTool = SeleniumUtil.ChromeDownload()
    DownloadTool.setDownLoadTempPath(TempPath)
    DownloadTool.setDownLoadFinalPath(FinalPath)
    chrome_options = DownloadTool.getChromeOptions()
    # `options=` replaces the deprecated `chrome_options=` keyword.
    driver = webdriver.Chrome(options = chrome_options) # open chrome browser with Options

    # Bound up front so the summary log below works even when the crawl fails early.
    df_1 = pd.DataFrame()
    try:
        # Type check first: comparing a non-number with 19 would raise a
        # TypeError before the intended ValueError could be reported.
        if isinstance(tabNumber, int) and 19 <= tabNumber <= 22:
            url = url + str(tabNumber)
        else:
            raise ValueError("tabNumber 必須為 19 到 22 的整數")

        driver.get(url)
        df_1 = parsingTitle(driver, checkRange)
        if len(df_1) != 0:
            outputCsv(df_1, "第一層結果", FinalPath)

            # Argument order follows parsingDetail(df, FinalPath, tabNumber).
            df_2 = parsingDetail(df_1, FinalPath, tabNumber)
            outputCsv(df_2, "第二層結果", FinalPath)
    except Exception:
        print("執行爬網作業失敗")
        logging.error("執行爬網作業失敗")
        traceback.print_exc()
    finally:
        # Always close the browser, otherwise every run leaves a Chrome process behind.
        driver.quit()

    endTime = datetime.datetime.now()
    logging.critical("結束時間：" + endTime.strftime("%Y/%m/%d %H:%M:%S"))
    # total_seconds() also counts whole days, which `.seconds` drops.
    logging.critical("執行時間：" + str(int((endTime - strTime).total_seconds())) + " 秒")
    logging.critical("輸出筆數：" + str(len(df_1)) + " 筆")
    logging.critical("爬網結束......\n")

In [15]:
if __name__ == "__main__":
    # Base search URL; main() appends the tab number after "TY=".
    url = "http://www.lawbank.com.tw/news/NewsSearch.aspx?TY="
    # Crawl tab 22 with main()'s default look-back of 7 days.
    main(url, 22)

  


擷取網址：https://www.lawbank.com.tw/news/NewsContent.aspx?AID=1&NID=157082.00&kw=&TY=22&sd=&ed=&total=18575&NCLID=&lsid=




爬取成功


擷取網址：https://www.lawbank.com.tw/news/NewsContent.aspx?AID=2&NID=157078.00&kw=&TY=22&sd=&ed=&total=18575&NCLID=&lsid=
爬取成功


擷取網址：https://www.lawbank.com.tw/news/NewsContent.aspx?AID=3&NID=157077.00&kw=&TY=22&sd=&ed=&total=18575&NCLID=&lsid=
爬取成功


擷取網址：https://www.lawbank.com.tw/news/NewsContent.aspx?AID=4&NID=157076.00&kw=&TY=22&sd=&ed=&total=18575&NCLID=&lsid=
爬取成功


擷取網址：https://www.lawbank.com.tw/news/NewsContent.aspx?AID=5&NID=157075.00&kw=&TY=22&sd=&ed=&total=18575&NCLID=&lsid=
爬取成功


擷取網址：https://www.lawbank.com.tw/news/NewsContent.aspx?AID=6&NID=157074.00&kw=&TY=22&sd=&ed=&total=18575&NCLID=&lsid=
爬取成功


擷取網址：https://www.lawbank.com.tw/news/NewsContent.aspx?AID=7&NID=157065.00&kw=&TY=22&sd=&ed=&total=18575&NCLID=&lsid=
爬取成功


擷取網址：https://www.lawbank.com.tw/news/NewsContent.aspx?AID=8&NID=157060.00&kw=&TY=22&sd=&ed=&total=18575&NCLID=&lsid=
爬取成功


擷取網址：https://www.lawbank.com.tw/news/NewsContent.aspx?AID=9&NID=157040.00&kw=&TY=22&sd=&ed=&total=18575&NCLID=&lsid=
爬取成功


擷