In [9]:
import datetime
import json
import typing

import pandas as pd
import requests

URL = "https://www.twse.com.tw/exchangeReport/MI_INDEX?response=json&date={}&type=ALLBUT0999&_={}"
# 網頁瀏覽時, 所帶的 request header 參數, 模仿瀏覽器發送 request
HEADER = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    "Connection": "keep-alive",
    "Host": "www.twse.com.tw",
    "Referer": "https://www.twse.com.tw/zh/page/trading/exchange/MI_INDEX.html",
    "sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}


def crawler(parameters:typing.Dict[str, str]) -> pd.DataFrame:
    crawler_date = parameters.get("crawler_date", "")
    crawler_date = crawler_date.replace("-", "")
    crawler_timestamp = int(datetime.datetime.now().timestamp())

    resp = requests.get(
        url=URL.format(crawler_date, crawler_timestamp), headers=HEADER
    )
    
    columns = [
        "stock_id",  # 證券代號
        "stock_name",  # 證券名稱
        "trading_volume",  # 成交股數
        "volume",  # 成交量
        "total_amount",  # 成交總金額
        "open",  # 開盤價
        "max",  # 最高價
        "min",  # 最低價
        "close",  # 收盤價
        "pe_ratio",  # 本益比
    ]

    if resp.ok:
        resp_data = json.loads(resp.text)
        data = pd.DataFrame(resp_data["data9"])
        data = data[[0, 1, 2, 3, 4, 5, 6, 7, 8, 15]]
        data.columns = columns
        data["date"] = parameters.get("crawler_date", "")
    else:
        data = pd.DataFrame()
    return data


if __name__ == "__main__":
    parameters = {
        "crawler_date": "2023-04-19",
    }
    data = crawler(parameters)
    print(data)


     stock_id stock_name trading_volume  volume   total_amount    open  \
0        0050     元大台灣50      8,700,172  22,007  1,037,538,973  120.00   
1        0051    元大中型100         73,416     173      4,291,980   58.50   
2        0052       富邦科技        349,045     765     37,163,057  107.15   
3        0053       元大電子         10,094   1,005        603,696   60.10   
4        0055   元大MSCI金融        165,794     281      3,717,174   22.42   
...       ...        ...            ...     ...            ...     ...   
1184     9944         新麗        118,699     137      2,416,253   20.40   
1185     9945        潤泰新      6,070,529   3,588    215,079,673   35.45   
1186     9946       三發地產      1,767,847     705     24,830,573   13.65   
1187     9955         佳龍        768,142     576     19,875,633   26.15   
1188     9958        世紀鋼      6,700,967   4,803    786,414,414  115.50   

         max     min   close pe_ratio        date  
0     120.05  118.80  119.05     0.00  2023-04-19  
1      

In [4]:
import sys
import datetime

def gen_task_paramter_list(start_date: str, end_date: str):
    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d").date()
    end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d").date()
    days = (end_date - start_date).days + 1
    date_list = [
        dict(crawler_date=str(start_date + datetime.timedelta(days=day)))
        for day in range(days)
    ]
    return date_list

parameter_list = gen_task_paramter_list(
    start_date="2023-4-1",
    end_date="2023-4-20",
)

for parameters in parameter_list:
    print(f"{parameters}")

{'crawler_date': '2023-04-01'}
{'crawler_date': '2023-04-02'}
{'crawler_date': '2023-04-03'}
{'crawler_date': '2023-04-04'}
{'crawler_date': '2023-04-05'}
{'crawler_date': '2023-04-06'}
{'crawler_date': '2023-04-07'}
{'crawler_date': '2023-04-08'}
{'crawler_date': '2023-04-09'}
{'crawler_date': '2023-04-10'}
{'crawler_date': '2023-04-11'}
{'crawler_date': '2023-04-12'}
{'crawler_date': '2023-04-13'}
{'crawler_date': '2023-04-14'}
{'crawler_date': '2023-04-15'}
{'crawler_date': '2023-04-16'}
{'crawler_date': '2023-04-17'}
{'crawler_date': '2023-04-18'}
{'crawler_date': '2023-04-19'}
{'crawler_date': '2023-04-20'}


In [5]:
from celery import Celery

app = Celery(
    "task",
    # 只包含 tasks.py 裡面的程式, 才會成功執行
    include=["financialdata.tasks"],
    # 連線到 rabbitmq,
    # pyamqp://user:password@localhost:5672/
    # 本書的帳號密碼都是 worker
    broker="pyamqp://worker:worker@localhost:5672/",
)


@app()
def crawler(
    dataset: str,
    parameters: typing.Dict[str, str],
):
    # 使用 getattr, importlib,
    # 根據不同 dataset, 使用相對應的 crawler 收集資料
    # 爬蟲
    # df = getattr(
    #     importlib.import_module(f"financialdata.crawler.{dataset}"),
    #     "crawler",
    # )(parameters=parameters)
    print('df')
    print("upload db")
    
crawler('taiwan','2023-04-20')

TypeError: 'Celery' object is not callable