In [27]:

"""
公開資訊觀測站-重大訊息爬蟲程式碼
程式碼撰寫: 蘇彥庭
日期: 20210108
"""

# 載入套件
import datetime
import requests
import pandas as pd
import time
import os
from bs4 import BeautifulSoup
import re


# 確認是否有正常連線
def CheckConnect(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        if '查詢過於頻繁' in soup:  # 查詢過於頻繁視為下載失敗
            print('查詢過於頻繁!')
            soup = None
            checkSuccess = False
        else:
            checkSuccess = True
        return soup, checkSuccess
    except Exception as e:
        print('下載失敗!')
        soup = None
        checkSuccess = False
        return soup, checkSuccess


# 將ROC日期轉換為西元日期
def ConvertDate(idate):
    if '年' in idate:
        return str((int(re.findall('民國(\\d+)年', idate)[0]) + 1911) * 10000 + \
                   int(re.findall('年(\\d+)月', idate)[0]) * 100 + \
                   int(re.findall('月(\\d+)日', idate)[0]))
    elif '/' in idate:
        idate = idate.split('/')
        return str((int(idate[0]) + 1911) * 10000 + int(idate[1]) * 100 + int(idate[2]))
    # else:
    #     return str((int(idate[0:3]) + 1911) * 10000 + int(idate[3:5]) * 100 + int(idate[5:7]))


# 整理url參數名稱與值之函數
def CombineParam(elem):
    if (elem.get('name') is not None) and (elem.get('value') is not None):
        return str(elem.get('name')) + '=' + str(elem.get('value'))


# 產生爬蟲目標url
def MakeURL(i_param, url_param):
    if i_param:
        url_param = url_param + '&' + i_param.replace('document.t59sb01_form.', '').replace('.value', ''). \
            replace(";openWindow(this.form ,'');", '').replace('\'', '').replace(';', '&')
        target_url = 'https://mops.twse.com.tw/mops/web/ajax_t59sb01?' + url_param
        return target_url


# 設定程式執行路徑
runProgramPath = 'C:/Users/ASUS/stock_intern/stock/沛錡/爬蟲andAPI/爬蟲/'
os.chdir(runProgramPath)

# 確認當前目錄是否有資料儲存資料夾 若沒有則建立
if 'material_info' not in os.listdir():
    os.mkdir((runProgramPath + 'material_info'))

# # 產生近7個實際日期
# todayDate = datetime.datetime.now()
# dateList = []
# for i in range(1):
#     iDate = todayDate - datetime.timedelta(days=i)
#     dateList.append(iDate.strftime('%Y%m%d'))

# 設定爬蟲日期區間
# 起始日
file = os.listdir((runProgramPath + 'material_info'))
if len(file) > 0:
    downloadStartDate = max(file).replace('.csv', '')
    downloadStartDate = downloadStartDate[0:4] + '-' + downloadStartDate[4:6] + '-' + downloadStartDate[6:8]
else:
    downloadStartDate = '2015-01-01'
# 結束日
downloadEndDate = datetime.datetime.now()
# 產生日期序列
dateList = pd.date_range(start=downloadStartDate, end=downloadEndDate).strftime('%Y%m%d')

# 每次只爬200個交易日
dateList = dateList[0:200]
# 計步器: 爬50個交易日後休息2小時
downloadDayNums = 0

# 迴圈日期下載重大訊息資訊資料
for iDate in dateList:

    print('目前程式正在下載日期: ' + iDate + ' 上市櫃重大訊息資料')

    # 建立儲存表
    materialInfoData = pd.DataFrame()

    # 年月日
    iYear = str(int(iDate[0:4]) - 1911)
    iMonth = iDate[4:6]
    iDay = iDate[6:8]

    # 下載公司當日重大訊息資料
    url = 'https://mops.twse.com.tw/mops/web/ajax_t05st02?' \
          'encodeURIComponent=1&step=1&step00=0&firstin=1&off=1&' \
          'TYPEK=all&year=' + iYear + '&month=' + iMonth + '&day=' + iDay

    # 防呆機制
    checkSuccess = False
    tryNums = 0
    while not checkSuccess:
        soup, checkSuccess = CheckConnect(url)
        if not checkSuccess:   # 若爬取失敗 則暫停120秒
            if tryNums == 5:   # 若已重新爬取累計5次 則放棄此次程式執行
                break
            tryNums += 1
            print('本次下載失敗 程式暫停120秒')
            time.sleep(120)

    # 防呆機制: 若累積爬取資料失敗 則終止此次程式
    if tryNums == 5:
        print('下載失敗次數累積5次 結束程式')
        break

    # 防呆機制: 若頁面出現"查無[日期]之重大訊息資料" 則進行下一個迴圈
    if '查無' in str(soup):
        print('該日期無資料 進行下一個日期資料下載')
        continue

    # 整理資料
    rowDatas = soup.find_all('table')[2].find_all('tr')
    rows = list()
    for row in rowDatas:
        rows.append([elem.get('value') for elem in row.find_all('input')])
    rows = [elem[:-1] for elem in rows if elem]
    columnNames = ['name', 'code', 'announce_date', 'time', 'subject',
                   'number', 'rule', 'actual_date', 'content']
    df = pd.DataFrame(data=rows, columns=columnNames)

    # 儲存重大訊息資訊資料
    materialInfoData = pd.concat([materialInfoData, df])
    time.sleep(5)

    # 下載DR公司當日重大訊息
    print('目前程式正在下載日期: ' + iDate + ' DR公司當日重大訊息資料')

    # 由於DR公司和一般公司的重大訊息架構不一樣 需要額外處理
    # 整理基本資訊
    rowDatas = soup.find_all('table')[3].find_all('tr')
    simpleInfoRows = list()
    for row in rowDatas:
        simpleInfoRows.append([elem.getText().replace('\xa0', '') for elem in row.find_all('td')])
    simpleInfoRows = [elem for elem in simpleInfoRows if elem]

    # 整理詳細資料資訊
    # 整理詳細資料url網址的共用參數
    urlParamRaw = soup.find_all('form')[1]
    urlParam = list()
    for i in urlParamRaw:
        urlParam.append([CombineParam(elem) for elem in urlParamRaw.find_all('input')])
    urlParam = [elem for elem in urlParam[0] if elem]
    urlParam = '&'.join(urlParam)

    # 整理各家DR公司重訊詳細資料url
    rawUrl = soup.find_all('table')[3].find_all('tr')
    #print(rawUrl)
    urlList = list()
    for i in rawUrl:
        urlList.append([MakeURL(elem.get('onclick'), urlParam) for elem in i.find_all('input')])
        
    urlList = [elem for elem in urlList if elem]

    # 執行迴圈爬蟲
    for idx, iUrl in enumerate(urlList):

        # 取得DR公司重訊資訊
        url = iUrl[0]

        # 防呆機制
        checkSuccess = False
        tryNums = 0
        while not checkSuccess:
            soup2, checkSuccess = CheckConnect(url)
            if not checkSuccess:  # 若爬取失敗 則暫停120秒
                if tryNums == 5:  # 若已重新爬取累計5次 則放棄此次程式執行
                    break
                tryNums += 1
                print('本次下載失敗 程式暫停120秒')
                time.sleep(120)

        # 防呆機制: 若累積爬取資料失敗 則終止此次程式
        if tryNums == 5:
            print('下載失敗次數累積5次 結束程式')
            break

        # 整理資料
        rowDatas = soup2.find_all('table')[1].find_all('tr')
        detailInfoRows = list()
        for row in rowDatas:
            detailInfoRows.append([elem.getText() for elem in row.find_all('td')])

        iRow = [[simpleInfoRows[idx][3],                   # 股票名稱(name)
                 simpleInfoRows[idx][2],                   # 股票代碼(code)
                 ConvertDate(simpleInfoRows[idx][0]),      # 公告日期(announce_date)
                 simpleInfoRows[idx][1].replace(':', ''),  # 公告時間(time)
                 simpleInfoRows[idx][4],                   # 主旨(subject)
                 detailInfoRows[1][0],                     # 公告序號(number)
                 '',                                       # 條款(rule): 內容有提供 但因每家寫的格式不一樣很難處理 故直接以缺值取代
                 ConvertDate(detailInfoRows[2][0]),        # 事實發生日(actual_date)
                 detailInfoRows[5][0]]]                    # 內容(content)

        # 儲存資料
        df = pd.DataFrame(data=iRow, columns=columnNames)
        materialInfoData = pd.concat([materialInfoData, df])
        time.sleep(5)

    # 將本日重大訊息資料以csv檔案儲存
    saveFilePath = runProgramPath + 'material_info/' + iDate + '.csv'
    materialInfoData.to_csv(saveFilePath, index=False)

    # 計步器: 爬50個交易日後休息2小時
    downloadDayNums += 1
    if downloadDayNums % 50 == 0:
        print('目前已爬50個交易日 程式自動休息2小時!')
        time.sleep(60*60*2)


目前程式正在下載日期: 20150104 上市櫃重大訊息資料
目前程式正在下載日期: 20150104 DR公司當日重大訊息資料
[]
目前程式正在下載日期: 20150105 上市櫃重大訊息資料
目前程式正在下載日期: 20150105 DR公司當日重大訊息資料
[<tr><th class="tblHead" nowrap="">公告日期</th><th class="tblHead" nowrap="">公告時間</th><th class="tblHead" nowrap="">公司代號</th><th class="tblHead" nowrap="">公司名稱</th><th class="tblHead" nowrap="">主旨</th><th class="tblHead" nowrap=""></th></tr>, <tr class="even">
<td align="center"> 104/01/05</td><td align="center"> 08:00:10</td><td align="center"><pre> 911609</pre></td><td align="center"> 揚子江</td><td style="text-align:left !important;"> 新聞稿</td><td><input onclick="document.t59sb01_form.SKEY.value='1';document.t59sb01_form.DATE1.value='20141231';co_id.value='911609';openWindow(this.form ,'');" type="button" value="詳細資料"/>
</td>
</tr>, <tr class="odd">
<td align="center"> 104/01/05</td><td align="center"> 08:00:18</td><td align="center"><pre> 911609</pre></td><td align="center"> 揚子江</td><td style="text-align:left !important;"> 集團訂單更新</td><td><input onclick="docu

KeyboardInterrupt: 

In [1]:
import requests
import pandas as pd

In [2]:
url = 'https://mops.twse.com.tw/mops/web/t05st01'

In [3]:
payload = {
        "encodeURIComponent": "1",
        "step": "1",
        "firstin": "1",
        "off": "1",
        "queryName": "co_id",
        "inpuType": "co_id",
        "TYPEK": "all",
        "co_id": "2498",
        "year": "111",
   }

In [4]:
response = requests.post(url, data = payload)
raw_data=response.text
print(raw_data)


<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">

<title>公開資訊觀測站</title>
<link href="css/print_css1.css" rel="stylesheet" type="text/css" Media="Print"/>



<link href="css/css2.css" rel="stylesheet" type="text/css" />
<link href="css/clickmenu1.css"rel="stylesheet" type="text/css"/>





<!--
<link href="css/tag.css" rel="stylesheet" type="text/css" />
<link href="css/tablea.css" rel="stylesheet" type="text/css" />
<script type="text/javascript" language="JavaScript1.2" src="js/menu/stmenu.js"></script>
-->
<!-- 2021.09.07 start 拿掉舊插件  -->
<!-- <script type="text/javascript" src="js/jquery-1.1.4.pack.js"></script>
<script type="text/javascript" src="js/jquery.clickmenu.js"></script> -->
<!-- 2021.09.07 end 拿掉舊插件  -->
<!-- 2021.09.07 start -->
<script type="tex

In [5]:
from bs4 import BeautifulSoup

In [6]:
soup = BeautifulSoup(response.text, 'html.parser')
rawUrl = soup.find_all('table',class_="hasBorder")[0].find_all('tr')
for i in rawUrl:
    print(i,'\n')

<tr class="tblHead"><th>公司代號</th><th>公司名稱</th><th>發言日期</th><th>發言時間</th><th>主旨</th><th> </th></tr> 

<tr class="even">
<td style="text-align:left !important;"> 2498</td><td style="text-align:left !important;"> 宏達電</td><td style="text-align:left !important;"> 111/01/07</td><td style="text-align:left !important;"> 15:00:21</td><td style="text-align:left !important;"><pre style="font-family:0�;"><font size="3"> 代子公司威宏電子(上海)有限公司公告參與投資基金</font></pre></td><td align="center"><input onclick="document.t05st01_fm.action='ajax_t05st01';document.t05st01_fm.seq_no.value='1';document.t05st01_fm.spoke_time.value='150021';document.t05st01_fm.spoke_date.value='20220107';document.t05st01_fm.co_id.value='2498';document.t05st01_fm.TYPEK.value='sii';openWindow(this.form ,'');" type="button" value="詳細資料"/>
</td>
</tr> 

<tr class="odd">
<td style="text-align:left !important;"> 2498</td><td style="text-align:left !important;"> 宏達電</td><td style="text-align:left !important;"> 111/03/01</td><td style="text-al

In [7]:
def MakeURL(i_param, url_param):
    if i_param:
        url_param = url_param + '&' + i_param.replace('document.t05st01_form.', '').replace('.value', ''). \
            replace(";openWindow(this.form ,'');", '').replace('\'', '').replace(';', '&')
        target_url = 'https://mops.twse.com.tw/mops/web/ajax_t59sb01?' + url_param
        return target_url

In [8]:
urlList = list()
urlParam="firstin=true&TYPEK=all&YEAR=111&step=1&off=true&coid=2498"
for i in rawUrl:
    urlList.append([MakeURL(elem.get('onclick'), urlParam) for elem in i.find_all('input')])

urlList = [elem for elem in urlList if elem]
print(urlList[0])

['https://mops.twse.com.tw/mops/web/ajax_t59sb01?firstin=true&TYPEK=all&YEAR=111&step=1&off=true&coid=2498&document.t05st01_fm.action=ajax_t05st01&document.t05st01_fm.seq_no=1&document.t05st01_fm.spoke_time=150021&document.t05st01_fm.spoke_date=20220107&document.t05st01_fm.co_id=2498&document.t05st01_fm.TYPEK=sii']


In [10]:
urlt=urlList[0]
response = requests.post(urlt)
raw_data=response.text
print(raw_data)

InvalidSchema: No connection adapters were found for "['https://mops.twse.com.tw/mops/web/ajax_t59sb01?firstin=true&TYPEK=all&YEAR=111&step=1&off=true&coid=2498&document.t05st01_fm.action=ajax_t05st01&document.t05st01_fm.seq_no=1&document.t05st01_fm.spoke_time=150021&document.t05st01_fm.spoke_date=20220107&document.t05st01_fm.co_id=2498&document.t05st01_fm.TYPEK=sii']"

In [9]:
#urlParam
#firstin=true&TYPEK=all&YEAR=104&MONTH=01&SDAY=01&EDAY=01&step=2b

#MakeURL(i_param, url_param)
#https://mops.twse.com.tw/mops/web/ajax_t59sb01?firstin=true&TYPEK=all&YEAR=104&MONTH=01&SDAY=05&EDAY=05&step=2b&SKEY=1&DATE1=20141231&co_id=911609
"""
encodeURIComponent: 1
step: 1
firstin: 1
off: 1
keyword4: 
code1: 
TYPEK2: 
checkbtn: 
queryName: co_id
inpuType: co_id
TYPEK: all
co_id: 2498
year: 111
month: 
b_date: 
e_date: 
"""

'\nencodeURIComponent: 1\nstep: 1\nfirstin: 1\noff: 1\nkeyword4: \ncode1: \nTYPEK2: \ncheckbtn: \nqueryName: co_id\ninpuType: co_id\nTYPEK: all\nco_id: 2498\nyear: 111\nmonth: \nb_date: \ne_date: \n'