In [5]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import pandas as pd
import time

# Module-level offset/counter: 連接到資料庫 skips the first `count` stock codes
# when building the list to process, and the main guard passes this value to
# 更新月營收 as the initial progress counter.
count = 0

def 連接到資料庫(db):
    """Open the SQLite database and load the tables the updater works from.

    The connection and its cursor are published as the module-level globals
    ``conn`` and ``cursor`` because the other functions in this file use them
    directly.

    Args:
        db: Path of the SQLite database file to open.

    Returns:
        A ``(dfdatabase, ids)`` tuple. ``dfdatabase`` holds every row of the
        ``revenue`` table; ``ids`` is the ``code`` column of the ``stock``
        table with the first ``count`` (module-level) entries skipped.
    """
    global conn, cursor
    # Open the database the caller asked for rather than a hard-coded file.
    conn = sqlite3.connect(db)
    cursor = conn.cursor()

    # Existing revenue records, used by the caller to decide what to scrape.
    dfdatabase = pd.read_sql('''select * from revenue t''', conn)

    # Stock list; read_sql already returns a DataFrame, no re-wrapping needed.
    dfStock = pd.read_sql('''select * from stock t''', conn)
    ids = dfStock['code'][count:]
    return dfdatabase, ids
    
import random

def 導入資料庫(df3, LastMonth):
    """Insert the rows of ``df3`` dated ``LastMonth`` into the ``revenue`` table.

    Uses the module-level ``cursor`` and ``conn`` set up by 連接到資料庫.
    Rows are written with ``INSERT OR IGNORE``. A row that SQLite rejects is
    reported and skipped so the remaining rows are still written.

    Args:
        df3: Scraped data containing at least the columns listed in
            ``columns`` below.
        LastMonth: Value of the ``date`` column identifying the rows to insert.

    Raises:
        KeyError: If ``df3`` lacks one of the required columns.
    """
    # The scrape produced nothing for the target month: nothing to do.
    if LastMonth not in df3['date'].values:
        return

    columns = ['code', 'date', 'open', 'close', 'high', 'low', 'updownYuan',
               'updown', 'revenue', 'mon', 'yoy', 'revenueSum', 'yoySum']
    sql = """INSERT OR IGNORE INTO revenue
                (code,date,open,close,high,low,updownYuan,updown,revenue,mon,yoy,revenueSum,yoySum)
                values(?,?,?,?,?,?,?,?,?,?,?,?,?)"""

    new_rows = df3.loc[df3['date'] == LastMonth, columns]
    for values in new_rows.itertuples(index=False, name=None):
        try:
            cursor.execute(sql, values)
        except sqlite3.Error as exc:
            # Best effort per row, but never silently: say which row failed.
            print(f'{values[0]} {values[1]} 寫入資料庫失敗: {exc}')
    # One commit for the whole batch instead of one per row.
    conn.commit()

def 爬蟲(count, id, LastMonth):
    """Scrape one stock's monthly revenue page and store the target month.

    Downloads the goodinfo.tw monthly sales page for ``id``, reshapes the
    detail table into the column layout used by the ``revenue`` table, and
    hands the result to 導入資料庫. Any failure is reported and followed by a
    10 second pause; it is not re-raised, so the caller's loop keeps going.

    Args:
        count: Running counter, used only in the failure message.
        id: Stock code substituted into the page URL and stored as ``code``.
        LastMonth: Month string passed through to 導入資料庫.
    """
    # Local import: read_html is fed a file-like object (see below).
    from io import StringIO

    try:
        url = f'https://goodinfo.tw/tw/ShowSaleMonChart.asp?STOCK_ID={id}'
        # goodinfo blocks obvious bots; a browser-like User-Agent gets through.
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39'}
        res = requests.get(url, headers=headers, timeout=25)
        res.encoding = "utf-8"  # so the Chinese text decodes correctly

        soup = BeautifulSoup(res.text, "lxml")
        # "#..." selects by element id.
        data = soup.select_one("#divSaleMonChartDetail")

        # Fixed pause between requests.
        time.sleep(10)

        # Passing literal HTML to read_html is deprecated; wrap it in StringIO.
        dfs = pd.read_html(StringIO(data.prettify()))
        df = dfs[1]
        # The header is multi-level; keep only the third level as column names.
        df.columns = df.columns.get_level_values(2)
        # Drop the header rows that are repeated inside the table body.
        header_rows = df.index[df["月別"] == "月別"]
        df2 = df.drop(header_rows)
        # Rename to the database column names; the five columns left in
        # Chinese are discarded just below.
        df2.columns = ['date','open','close','high','low','updownYuan','updown','月營收(億)','月月增(%)','月年增','累月營收(億)','累月年增','revenue','mon','yoy','revenueSum','yoySum']

        df3 = df2.drop(columns=['月營收(億)','月月增(%)','月年增','累月營收(億)','累月年增'])
        # Put the stock code in front as the first column.
        df3.insert(0, 'code', id)

        導入資料庫(df3, LastMonth)
    except Exception as exc:
        # `Exception`, not a bare except, so Ctrl-C / SystemExit still stop
        # the run; include the cause so failures can be diagnosed.
        print(f'{count}没有找到{id}资料 ({type(exc).__name__}: {exc})')
        time.sleep(10)

from tqdm import tqdm

def 更新月營收(count):
    """Scrape last month's revenue for every stock that does not have it yet.

    Loads the existing data through 連接到資料庫, works out the previous
    calendar month as ``YYYY/MM``, and calls 爬蟲 for each stock code with no
    ``revenue`` row for that month, pausing 15-25 seconds between stocks.

    Args:
        count: Starting value of the counter shown in the progress bar
            description and passed to 爬蟲.
    """
    dfdatabase, ids = 連接到資料庫('goodinfoRevenue.db')

    # Previous calendar month, formatted like the `date` column ("YYYY/MM").
    LastMonth = (pd.Timestamp.today() - pd.DateOffset(months=1)).strftime("%Y/%m")

    # Skip stocks that already have a row for last month.
    check = dfdatabase[dfdatabase['date'] == LastMonth]['code']
    ids = ids[~ids.isin(check)]
    already_done = set(check)

    progress_bar = tqdm(ids, desc="正在执行第{}次".format(0))

    for id in progress_bar:
        # Defensive re-check; the filter above normally removes these already.
        needs_update = id not in already_done
        if needs_update:
            爬蟲(count, id, LastMonth)

        # Iterating the tqdm object advances the bar by itself, so there is
        # no manual update(1) here (that would count every stock twice).
        progress_bar.set_description("正在执行第{}次".format(count + 1))
        count += 1

        if needs_update:
            # Random pause between stocks.
            time.sleep(random.randint(15, 25))

    progress_bar.close()
    
if __name__ == "__main__":
    # Script entry point: run the update, then release the database.
    try:
        更新月營收(count)
        print('爬蟲完畢')
    finally:
        # Close the connection even when the update fails part-way. `conn`
        # only exists once 連接到資料庫 has run, hence the guard.
        if 'conn' in globals():
            conn.close()

正在执行第0次: 0it [00:00, ?it/s]

爬蟲完畢



