In [1]:
from pathlib import Path

# Ensure the output folder is present; this is a no-op when it already exists.
data_dir = Path("data")
data_dir.mkdir(exist_ok=True, parents=True)

In [7]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from bs4 import BeautifulSoup
import unicodedata as ucd

In [18]:
from pathlib import Path
import pandas as pd
import requests
from bs4 import BeautifulSoup
import unicodedata as ucd

# Create data folder if it doesn't exist
Path("data").mkdir(parents=True, exist_ok=True)

# TWSE ISIN listing page used as the source of stock IDs and names
url = "https://isin.twse.com.tw/isin/C_public.jsp?strMode=2"

# Fetch the page. The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() stops an HTTP error page from being parsed as stock data.
response = requests.get(url, timeout=30)
response.raise_for_status()
response.encoding = 'big5'  # Decode the body as Big5 so the Chinese names are readable

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table containing the stock data
table = soup.find("table", {"class": "h4"})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(f"Stock table (class 'h4') not found in response from {url}")

# Collect [stock_id, stock_name] pairs from the table body
rows = []
for row in table.find_all('tr')[1:]:  # Skip the header row
    cols = row.find_all('td')
    if len(cols) < 2:
        continue
    # The first cell carries the ID and the name separated by whitespace.
    parts = cols[0].text.split()
    if len(parts) < 2:
        # A cell without both an ID and a name cannot be turned into a record.
        continue
    # NFKC folds compatibility (e.g. full-width) characters to their plain forms.
    stock_id = ucd.normalize('NFKC', parts[0])
    stock_name = ucd.normalize('NFKC', parts[1])
    rows.append([stock_id, stock_name])

# Convert the list of rows to a DataFrame. The columns are named here directly,
# so no later rename from the Chinese headers is needed.
df = pd.DataFrame(rows, columns=["STOCK_ID", "STOCK_NAME"])

# Save the DataFrame to a CSV file
df.to_csv("stock_id.csv", index=False, header=True)

# Show the first few rows
print(df.head())

  STOCK_ID STOCK_NAME
0     1101         台泥
1     1102         亞泥
2     1103         嘉泥
3     1104         環泥
4     1108         幸福


In [28]:
# Request the holdings page through a session that retries failed requests
retry_strategy = Retry(total=3)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)
response = http.get(
    "https://www.moneydj.com/ETF/X/Basic/Basic0007a.xdjhtm?etfid=0050.TW",
    timeout=30,  # never wait forever on a stalled connection
)
response.raise_for_status()  # do not parse an HTTP error page as holdings data

# Parse the html
soup = BeautifulSoup(response.content, "html.parser")

# Column names, in the order the values appear in each group of four <td> cells
HOLDING_COLUMNS = ["STOCK_NAME", "持股(千股)", "比例", "增減"]


def _holding_records(table_tag):
    """Turn the flat list of <td> cells of one holdings table into row dicts.

    The stripped cell texts are read in consecutive groups of
    len(HOLDING_COLUMNS); each complete group becomes one dict keyed by
    HOLDING_COLUMNS. An incomplete trailing group is dropped rather than
    raising IndexError.
    """
    cells = [td.text.strip() for td in table_tag.find_all("td")]
    width = len(HOLDING_COLUMNS)
    usable = len(cells) - len(cells) % width
    return [
        dict(zip(HOLDING_COLUMNS, cells[start:start + width]))
        for start in range(0, usable, width)
    ]


# Locate the tables relative to the element carrying a known id: the first
# table is that element's next sibling, the second is the sibling after it.
anchor = soup.find(id="ctl00_ctl00_MainContent_MainContent_sdate3")
if anchor is None:
    raise RuntimeError("Holdings anchor element not found; the page layout may have changed")
first_table = anchor.find_next_sibling()
if first_table is None:
    raise RuntimeError("First holdings table not found after the anchor element")
second_table = first_table.find_next_sibling()
if second_table is None:
    raise RuntimeError("Second holdings table not found after the first one")

# Build the frame once from all records instead of growing it cell by cell
records = _holding_records(first_table) + _holding_records(second_table)
df = pd.DataFrame(records, columns=HOLDING_COLUMNS)

# Combine with Stock ID. Reading STOCK_ID as text keeps the IDs exactly as
# written in the file (no numeric inference, so leading zeros are preserved).
stock_df = pd.read_csv("stock_id.csv", dtype={"STOCK_ID": str})
result_df = pd.merge(df, stock_df, how="left", on=["STOCK_NAME"])
result_df = result_df[["STOCK_ID", "STOCK_NAME", "持股(千股)", "比例", "增減"]]

In [29]:
# Preview the first five merged rows (five is also the default for head()).
result_df.head(n=5)

Unnamed: 0,STOCK_ID,STOCK_NAME,持股(千股),比例,增減
0,2330,台積電,224205.0,54.29,+0.14%
1,2317,鴻海,105975.0,5.44,-0.21%
2,2454,聯發科,13785.0,4.36,-0.47%
3,2308,台達電,20100.0,2.16,+0.21%
4,2382,廣達,24913.0,1.76,-0.17%


In [30]:
# Persist the merged holdings with a header row and without the row index.
result_df.to_csv("data/ETF50.csv", header=True, index=False)

In [22]:
import pandas as pd
import yfinance as yf


In [23]:
# Exploratory look at several attributes of a yfinance Ticker object.
# Only the final bare expression in the cell (TSMC.financials) is rendered as
# the cell output; the earlier expressions are evaluated but not displayed.

# TSMC (2330.TW), popularly nicknamed Taiwan's "nation-protecting stock"
TSMC = yf.Ticker("2330.TW")

# Dividends
TSMC.dividends

# Dividends & StockSplits
TSMC.actions

# Row labels of the balance sheet
TSMC.balance_sheet.index

TSMC.calendar

TSMC.cashflow

TSMC.earnings

# Last expression of the cell: this is the table shown in the output
TSMC.financials



Unnamed: 0,2023-12-31,2022-12-31,2021-12-31,2020-12-31,2019-12-31
Tax Effect Of Unusual Items,439276610.819893,450946515.219931,612244570.460729,804033135.053612,
Tax Rate For Calcs,0.130998,0.13179,0.105809,0.126103,
Normalized EBITDA,1520153500000.0,1589654800000.0,1085058800000.0,912176400000.0,
Total Unusual Items,3353300000.0,3421700000.0,5786300000.0,6376000000.0,
Total Unusual Items Excluding Goodwill,3353300000.0,3421700000.0,5786300000.0,6376000000.0,
Net Income From Continuing Operation Net Minority Interest,851740000000.0,992923400000.0,592359200000.0,510744000000.0,
Reconciled Depreciation,532190900000.0,437254300000.0,422394900000.0,331724600000.0,
Reconciled Cost Of Revenue,986625200000.0,915536500000.0,767877700000.0,628124700000.0,
EBITDA,1523506800000.0,1593076500000.0,1090845100000.0,918552400000.0,
EBIT,991315900000.0,1155822200000.0,668450200000.0,586827800000.0,


In [31]:
# Build the space-separated ticker string ("<id>.TW <id>.TW ...") from the
# saved ETF50 holdings; it is passed to yf.download in a later cell.
etf50_df = pd.read_csv("data/ETF50.csv")
etf50_id = " ".join(
    f"{stock_id}.TW" for stock_id in etf50_df["STOCK_ID"].astype(str)
)
print(etf50_id)

2330.TW 2317.TW 2454.TW 2308.TW 2382.TW 2881.TW 2891.TW 2303.TW 2882.TW 3711.TW 2886.TW 2412.TW 2884.TW 1216.TW 2885.TW 2357.TW 2892.TW 2890.TW 2327.TW 3034.TW 5880.TW 3008.TW 2880.TW 2002.TW 3231.TW 2345.TW 1303.TW 2883.TW 2379.TW 2887.TW 3037.TW 1101.TW 1301.TW 4938.TW 2207.TW 2301.TW 3661.TW 3017.TW 2603.TW 6669.TW 1326.TW 2395.TW 2912.TW 3045.TW 4904.TW 5876.TW 5871.TW 1590.TW 6505.TW 2408.TW


In [32]:
# Download the most recent year of daily price data for all ETF50 tickers.
# auto_adjust=False is passed explicitly so the result always contains both
# "Close" and "Adj Close": the column selection below requires "Adj Close",
# and the library default for auto_adjust differs between yfinance versions.
prices_wide = yf.download(
    etf50_id,
    group_by="Ticker",
    period="1y",
    interval="1d",
    auto_adjust=False,
)

# With group_by="Ticker" the columns are a (ticker, price field) MultiIndex.
# Stacking level 0 moves the ticker into the row index, giving one row per
# (Date, Ticker); reset_index() then turns both index levels into columns.
df = (
    prices_wide
    .stack(level=0)
    .rename_axis(["Date", "Ticker"])
    .reset_index()
    .rename(columns={"Ticker": "STOCK_ID"})
)

# Fix the column order of the long-format table
df = df[["STOCK_ID", "Date", "Adj Close", "Close", "High", "Low", "Open", "Volume"]]

[*********************100%%**********************]  50 of 50 completed


In [33]:
# Preview the first five rows of the long-format price table.
df.head(n=5)

Price,STOCK_ID,Date,Adj Close,Close,High,Low,Open,Volume
0,1101.TW,2023-08-28,34.170761,35.200001,35.299999,35.099998,35.150002,8257478.0
1,1216.TW,2023-08-28,68.557686,71.099998,71.5,70.599998,71.0,4257934.0
2,1301.TW,2023-08-28,78.27774,79.599998,79.800003,79.0,79.0,6219358.0
3,1303.TW,2023-08-28,65.25235,66.199997,66.199997,65.099998,65.199997,2803972.0
4,1326.TW,2023-08-28,59.330414,60.900002,61.200001,60.5,60.700001,1938340.0


In [34]:
# Write the long-format price table with a header row and without the row index.
# NOTE(review): the file name says "10years", but the download cell requests
# period="1y" -- confirm which is intended before renaming either one.
df.to_csv("data/ETF50_10years.csv", header=True, index=False)

In [35]:
import pandas as pd
from datetime import date, timedelta
from pathlib import Path
from tqdm import tqdm 
import requests
from io import StringIO

# Folder that the per-day download loop writes its CSV files into.
tii_dir = Path("data/TII")
tii_dir.mkdir(exist_ok=True, parents=True)

In [38]:
from datetime import date, timedelta
import pandas as pd
import requests
from io import StringIO
from tqdm import tqdm
import os

# Walk backwards one calendar day at a time, starting from today, and save the
# TWSE T86 CSV report of each day that has data to data/TII/<YYYYMMDD>.csv.
hack_date = date.today()

# Create the directory if it doesn't exist
os.makedirs("data/TII", exist_ok=True)

# Trailing rows removed from every report before saving.
# NOTE(review): presumably these are explanatory footer lines -- confirm.
FOOTER_ROWS = 8

# One session for the whole loop so connections are reused across requests.
# NOTE(review): there is no pause between requests; add one if the server
# starts rejecting rapid calls.
session = requests.Session()

for _ in tqdm(range(365)):
    hack_date_str = hack_date.strftime("%Y%m%d")
    out_path = f"data/TII/{hack_date_str}.csv"
    # Step to the previous day right away so every `continue` below still
    # advances the date.
    hack_date -= timedelta(days=1)

    # A day saved by an earlier run is not downloaded again, which makes
    # re-running this cell cheap.
    if os.path.exists(out_path):
        continue

    url = f"https://www.twse.com.tw/fund/T86?response=csv&date={hack_date_str}&selectType=ALL"
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as e:
        # A single failed day must not abort the remaining days.
        print(f"Request failed for {hack_date_str}: {e}")
        continue
    response = resp.text

    # Bodies of at most two characters are treated as "no report for this day".
    if len(response) <= 2:
        print(f"No data for {hack_date_str}")
        continue

    try:
        # The first line is skipped, so the second line supplies the header.
        df = pd.read_csv(StringIO(response), sep=",", skiprows=[0], on_bad_lines='skip')
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        # EmptyDataError is raised when nothing is left to parse after the
        # skipped first line; handle it like any other unparsable body.
        print(f"ParserError on {hack_date_str}: {e}")
        continue

    # Drop the trailing rows and the last column before saving.
    df = df.drop(df.tail(FOOTER_ROWS).index).drop(columns=df.columns[-1])
    df.to_csv(out_path, index=False, header=True)

100%|██████████| 365/365 [01:02<00:00,  5.85it/s]
