## 環境設定及整合


### Python 套件導入

In [68]:
import os
import numpy as np
import pandas as pd
import chardet
import datetime
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, f_regression, chi2

### 資料集下載及讀取

In [69]:
!gdown 1jJGJjXcQqRaAWBoHB4hf0PT-OH4vRvvM

Downloading...
From (original): https://drive.google.com/uc?id=1jJGJjXcQqRaAWBoHB4hf0PT-OH4vRvvM
From (redirected): https://drive.google.com/uc?id=1jJGJjXcQqRaAWBoHB4hf0PT-OH4vRvvM&confirm=t&uuid=88313b08-cc01-47d5-9645-b75a34c1699c
To: /content/BDA.zip
100% 470M/470M [00:05<00:00, 84.2MB/s]


In [70]:
!unzip -P bda2024 ./BDA.zip
!mkdir ./data
!mv ./*.csv ./data

Archive:  ./BDA.zip
  inflating: bda2024_202203-202402_內容數據_新聞1.csv  
  inflating: bda2024_202203-202402_內容數據_新聞2.csv  
  inflating: bda2024_202203-202402_內容數據_新聞3.csv  
  inflating: bda2024_202203-202402_討論數據_dcard.csv  
  inflating: bda2024_202203-202402_討論數據_mobile01-1.csv  
  inflating: bda2024_202203-202402_討論數據_mobile01-2.csv  
  inflating: bda2024_202203-202402_討論數據_ptt.csv  
  inflating: bda2024_微股力_個股交易數據-2年.csv  
  inflating: bda2024_微股力_社群PKTD-2年.csv  
  inflating: bda2024_微股力_籌碼數據-2年.csv  
  inflating: bda2024_微股力_財報數據-2年.csv  
mkdir: cannot create directory ‘./data’: File exists


In [71]:
root = '/content/data'
data = {}
for filename in os.listdir(root):
    words = filename.split('_')
    readable_filename = ''.join(words[-2:])
    filepath = f'{root}/{filename}'
    print(f'正在載入 {readable_filename} ...')
    df = pd.read_csv(filepath, encoding='utf-8')
    print(df)
    data[readable_filename[:-4]] = df
    print(f'成功載入 {readable_filename}。')

正在載入 內容數據新聞1.csv ...
                       id p_type                   s_name s_area_name  \
0       1646075372873_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
1       1646075374896_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
2       1646075377238_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
3       1646084374652_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
4       1646084376722_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
...                   ...    ...                      ...         ...   
179444  1669821279154_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
179445  1669821281498_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
179446  1669821283632_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
179447  1669821286206_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
179448  1669822133851_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   

                      post_time                                 title  author  \
0       2022-03-01 02

  df = pd.read_csv(filepath, encoding='utf-8')


         stock_name stock_symbol  foreign_investor_bought  \
0                信大         1109                     2000   
1                信大         1109                        0   
2                信大         1109                    11000   
3                信大         1109                    13000   
4                信大         1109                    68000   
...             ...          ...                      ...   
998026          現觀科         6906                    28000   
998027        金萬林-創         6645                    14000   
998028  群益ESG投等債20+       00937B                  1511400   
998029         玖鼎電力         4588                   389000   
998030      世界健身-KY         2762                    10000   

        foreign_investor_sold  investment_trust_bought  investment_trust_sold  \
0                        3000                        0                      0   
1                       11000                        0                      0   
2                       

  df = pd.read_csv(filepath, encoding='utf-8')


        stock_name stock_symbol   open   high    low  close  volume  \
0               日馳         1526  47.55  48.45  47.55  48.30     138   
1               日馳         1526  48.30  48.30  47.40  47.95     153   
2               日馳         1526  48.45  48.70  47.80  48.10     120   
3               日馳         1526  47.95  47.95  47.55  47.60     165   
4               日馳         1526  47.65  47.65  45.30  45.65     514   
...            ...          ...    ...    ...    ...    ...     ...   
1154220         統一         1216  76.20  78.60  76.00  77.20   20607   
1154221         統一         1216  76.80  78.80  76.40  78.80   16466   
1154222         統一         1216  78.00  78.70  77.10  78.20   16015   
1154223         統一         1216  78.00  78.00  76.30  77.40    8524   
1154224         統一         1216  76.10  77.20  75.80  76.20    8347   

                        date  
0        2022-03-01 00:00:00  
1        2022-03-02 00:00:00  
2        2022-03-03 00:00:00  
3        2022-03-04 00:

In [72]:
data.keys()

dict_keys(['內容數據新聞1', '微股力財報數據-2年', '內容數據新聞2', '微股力籌碼數據-2年', '微股力個股交易數據-2年', '討論數據ptt', '微股力社群PKTD-2年', '討論數據dcard', '內容數據新聞3', '討論數據mobile01-1', '討論數據mobile01-2'])

In [73]:
# 類股代號皆轉為 string 型態
data['微股力個股交易數據-2年']['stock_symbol'] = data['微股力個股交易數據-2年']['stock_symbol'].astype(str)
data['微股力社群PKTD-2年']['stock_symbol'] = data['微股力社群PKTD-2年']['stock_symbol'].astype(str)
data['微股力籌碼數據-2年']['stock_symbol'] = data['微股力籌碼數據-2年']['stock_symbol'].astype(str)
data['微股力財報數據-2年']['stock_symbol'] = data['微股力財報數據-2年']['stock_symbol'].astype(str)


# 呼叫次數整理
data['微股力社群PKTD-2年']['actionP'] = data['微股力社群PKTD-2年']['actionP'] // 123
data['微股力社群PKTD-2年']['actionK'] = data['微股力社群PKTD-2年']['actionK'] // 123
data['微股力社群PKTD-2年']['actionT'] = data['微股力社群PKTD-2年']['actionT'] // 123
data['微股力社群PKTD-2年']['actionD'] = data['微股力社群PKTD-2年']['actionD'] // 123

### 中文文本處理

In [74]:
import jieba
!git clone https://github.com/fxsjy/jieba.git
jieba.set_dictionary('jieba/extra_dict/dict.txt.big')

fatal: destination path 'jieba' already exists and is not an empty directory.


#### 中文文本斷詞

In [75]:
data['討論數據dcard'] = data['討論數據dcard'][['post_time', 'title', 'content']]
data['討論數據dcard']

Unnamed: 0,post_time,title,content
0,2022-03-01 00:00:18.000,#分享 投資股票個人經驗分享,定股美股ETF長期去抓報酬也是適合的投資工具方式
1,2022-03-01 00:00:22.000,#分享 投資股票個人經驗分享,最近剛申辦覺得定期定額投資美股很方便
2,2022-03-01 00:07:48.000,#標的 請問現在 台積電 是時候買入嗎？,我三百買的 給你參考
3,2022-03-01 00:14:04.000,#分享 明天燦坤有機會漲停,中鋼呢
4,2022-03-01 00:17:39.000,#分享 當沖 六個月的心情😂,有100時候怎麼沒有選擇減碼落袋為安\n現在用什麼心態在做當沖呢？？
...,...,...,...
231315,2024-02-29 23:57:06.000,#分享 2/29 也是速戰速決,我的好朋友封心小y陪到脫褲子了
231316,2024-02-29 23:58:11.000,#請益 找台新證券的營業員,凱基
231317,2024-02-29 23:58:31.000,#其他 2024台股看漲or看跌,已經刪除的內容就像 Dcard 一樣，錯過是無法再相見的！
231318,2024-02-29 23:58:42.000,#分享 「大綜亮燈，亞力被洗掉了」,想請問紫大，前幾天討論的4939和8096，我有買了～這幾天感覺沒什麼波動，我應該這樣繼續放...


In [76]:
data['討論數據dcard']['content'][0]

'定股美股ETF長期去抓報酬也是適合的投資工具方式'

In [77]:
list(jieba.cut_for_search(data['討論數據dcard']['content'][0]))

Building prefix dict from /content/jieba/extra_dict/dict.txt.big ...
DEBUG:jieba:Building prefix dict from /content/jieba/extra_dict/dict.txt.big ...
Loading model from cache /tmp/jieba.u8edc9e38654a5763420eb5a1a47d89e3.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.u8edc9e38654a5763420eb5a1a47d89e3.cache
Loading model cost 1.640 seconds.
DEBUG:jieba:Loading model cost 1.640 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


['定股',
 '美股',
 'ETF',
 '長期',
 '去',
 '抓',
 '報酬',
 '也',
 '是',
 '適合',
 '的',
 '投資',
 '工具',
 '方式']

#### 語料庫中搜尋文本

In [78]:
start_date = '2024-01-01'
end_date = '2024-03-01'
dcard_stock_corpus = data['討論數據dcard'][np.logical_and(start_date <= data['討論數據dcard']['post_time'], data['討論數據dcard']['post_time'] <= end_date)]['content'].reset_index()['content'].astype(str)
dcard_stock_corpus

0                                     1.51%…這種勝率，賠的時候都很慘吧…
1        2023年成果結算，總算在Q4跟上車了\nhttps://megapx-assets.dca...
2        https://megapx-assets.dcard.tw/images/57727790...
3                                   哥 這樣按一開始的本金算 今年總獲利是幾%？
4        同為大二\nhttps://megapx-assets.dcard.tw/images/07...
                               ...                        
18528                                      我的好朋友封心小y陪到脫褲子了
18529                                                   凱基
18530                        已經刪除的內容就像 Dcard 一樣，錯過是無法再相見的！
18531    想請問紫大，前幾天討論的4939和8096，我有買了～這幾天感覺沒什麼波動，我應該這樣繼續放...
18532                                    為什麼上馬一直打一樣的英文\nb0
Name: content, Length: 18533, dtype: object

#### 移除 Stopwords

In [79]:
!git clone https://github.com/goto456/stopwords.git
stopwords = []
def remove_stopwords(tokens):
    return [token for token in tokens if token and token not in '，。：；「」『』（）、《》〈〉——＠＃＄％＾＆＊＋“”']

fatal: destination path 'stopwords' already exists and is not an empty directory.


### 訓練資料集生成

In [80]:
def retrieve_dcard_articles(data_source, stock_num: str, keywords: list[str], from_date, to_date):
    data[data_source]['post_time'] = pd.to_datetime(data[data_source]['post_time'])
    df = data[data_source][np.logical_and(start_date <= data[data_source]['post_time'], data[data_source]['post_time'] <= end_date)]
    result = pd.DataFrame()
    for keyword in keywords:
      temp = df[df['title'].str.contains(keyword) | df['content'].str.contains(keyword)]
      result = pd.concat([result, temp], ignore_index=True)
    return result

In [81]:
retrieve_dcard_articles('內容數據新聞1', '2363', ['台積電', '半導體', 'AI', '蘋果', 'Apple', '台灣積體電路', '臺灣積體電路', '張忠謀', '晶片', '晶圓', '護國神山', '製程技術', '國家晶片計畫', '南科', '晶圓代工', '芯片設計', '高科技產業', '積體電路', '聯電', '力積電', '中科院', 'AI 晶片', '華邦電', '記憶體', '台積公司', '中芯國際', 'ASIC', '封測', '微電子', '電子設計自動化', '光刻機', '異質整合', 'AIoT', '矽智財', '模擬晶片', 'EDA', '製造服務', '研發創新', '系統封裝', '先進封裝', '半導體設備', '量產', '材料科學', '矽晶圓', '半導體材料', '機器學習', '深度學習', '5G', '物聯網', '自動駕駛'], '2022-03-01', '2024-04-01')

Unnamed: 0,id,p_type,s_name,s_area_name,post_time,title,author,content,page_url


In [82]:
def where_key_words_in(df, keywords):
  res = pd.DataFrame()
  for kw in keywords:
    temp = df[df['title'].str.contains(kw) | df['content'].str.contains(kw)]
    res = pd.concat([res, temp], ignore_index=True)
  return res

In [83]:
def tokenize(text):
    return list(jieba.cut(text, cut_all=True))

FORBIDDEN_FEATURES = ['閱讀', '全文', 'br']
K_BEST = 32
def make_dataset(data_source, stock_symbol, keywords):
    exchange_data = data[data_source][data[data_source]['stock_symbol'].isin([stock_symbol])]
    exchange_data['delta'] = exchange_data['close'] - exchange_data['open']
    exchange_data['gain_rate'] = exchange_data['delta'] / exchange_data['open']
    article_df = where_key_words_in(data['內容數據新聞1'], keywords)
    article_df['date'] = pd.to_datetime(article_df['post_time']).dt.date.shift(5)
    exchange_data['date'] = pd.to_datetime(exchange_data['date'])
    article_df = article_df[5:].reset_index(drop=True)
    article_df['date'] = pd.to_datetime(article_df['date'])
    article_df['gain_rate'] = 0
    for date in exchange_data['date']:
        article_df.loc[article_df['date'].dt.strftime('%Y-%m-%d') == date.strftime('%Y-%m-%d'), 'gain_rate'] = exchange_data[exchange_data['date'] == date]['gain_rate'].values[0]
        # print(f'{date} gain rate set to', exchange_data[exchange_data['date'] == date]['gain_rate'].values[0])
    article_df['content'] = article_df['title'] + ' ' + article_df['content']

    article_df['content'] = article_df['content'].apply(tokenize)
    article_df['label'] = article_df['gain_rate'].apply(lambda x: 1 if x >= 0 else 0)

    corpus = article_df['content'].apply(' '.join)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    anova_selector = SelectKBest(chi2, k=K_BEST)
    X_kbest = anova_selector.fit_transform(X, article_df['label'])
    feature_names = vectorizer.get_feature_names_out()
    selected_features = feature_names[anova_selector.get_support()]
    selected_features_set = set(selected_features)
    for forbidden_feature in FORBIDDEN_FEATURES:
        selected_features_set.discard(forbidden_feature)
    print(f'語料庫中最顯著影響漲跌的關鍵字:', np.array(selected_features_set))
    article_df['content'] = article_df['content'].apply(lambda l: ' '.join([t for t in l if t in selected_features_set]))

    return article_df[['content', 'gain_rate']], article_df['date']

# keywords = ['台積電', '半導體', 'AI', '蘋果', 'Apple', '台灣積體電路', '臺灣積體電路', '張忠謀', '晶片', '晶圓', '護國神山']
# keywords_2330 = ['台積電', '半導體', 'AI', '台灣積體電路', '臺灣積體電路', '張忠謀', '護國神山']
keywords_2363 = ['2363', '矽統', '半導體', 'AI']
dataset_2363, dates = make_dataset('微股力個股交易數據-2年', '2363', keywords_2363)
dataset_2363

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exchange_data['delta'] = exchange_data['close'] - exchange_data['open']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exchange_data['gain_rate'] = exchange_data['delta'] / exchange_data['open']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exchange_data['date'] = pd.to_datetime(exchange_data['dat

語料庫中最顯著影響漲跌的關鍵字: ['有限公司', 'computex', '上午', '董事', '公共電視', '文化', '股份', '二季', '聯貸', '財團', '報名', '製作', '股份有限', '半年', '庫存', '單位', '第二季', '走低', '上半年', '三季', '上半', '報名單', 'genio', '光寶', '純益', '不適', '有限', '回饋', '財團法人', 'cpi']


Unnamed: 0,content,gain_rate
0,董事 董事,0.038202
1,董事 董事 董事 董事 股份 股份 董事 董事,-0.010965
2,單位,-0.010965
3,走低,-0.010965
4,,0.026316
...,...,...
24054,,0.037791
24055,第二季 二季,0.037791
24056,,0.037791
24057,董事 董事,0.037791


In [84]:
# REMOVE THIS LINE IF YOU HAVE GOOGLE COLAB PRO
dataset_2363 = dataset_2363.loc[:1000].reset_index(drop=True)

In [85]:
total_length = sum(len(s) for s in dataset_2363["content"])
document_length_mean = total_length / len(dataset_2363["content"])
document_length_mean

8.426573426573427

#### Encoding text with BERT

##### Load Dependencies

In [86]:
!pip install -U "tensorflow-text==2.13.*"
!pip install "tf-models-official==2.13.*"



In [87]:
import os
import shutil
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

##### BERT configuration

In [88]:
map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}


# print(f'BERT model selected           : {tfhub_handle_encoder}')
# print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

##### Encoding

In [89]:
def make_bert_preprocess_model(tfhub_handle_preprocess, input_names, seq_length=128):
    """Returns Model mapping string features to BERT inputs.

    Args:
    sentence_features: a list with the names of string-valued features.
    seq_length: an integer that defines the sequence length of BERT inputs.

    Returns:
    A Keras Model that can be called on a list or dict of string Tensors
    (with the order or names, resp., given by sentence_features) and
    returns a dict of tensors for input to BERT.
    """

    input_segments = [
        tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft)
        for ft in input_names]

    # Tokenize the text to word pieces.
    bert_preprocess = hub.load(tfhub_handle_preprocess)
    tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
    segments = [tokenizer(s) for s in input_segments]

    # Optional: Trim segments in a smart way to fit seq_length.
    # Simple cases (like this example) can skip this step and let
    # the next step apply a default truncation to approximately equal lengths.
    truncated_segments = segments

    # Pack inputs. The details (start/end token ids, dict of output tensors)
    # are model-dependent, so this gets loaded from the SavedModel.
    packer = hub.KerasLayer(bert_preprocess.bert_pack_inputs,
                            arguments=dict(seq_length=seq_length),
                            name='packer')
    model_inputs = packer(truncated_segments)
    return tf.keras.Model(input_segments, model_inputs)

def get_embbedings_bert(tfhub_handle_preprocess, tfhub_handle_encoder, seq_length = 256, raw_text_list=[]):
    # text preprocessing
    test_preprocess_model = make_bert_preprocess_model(tfhub_handle_preprocess, ['my_input'], seq_length)
    test_text = [np.array(raw_text_list)]
    text_preprocessed = test_preprocess_model(test_text)
    # print('Preprocessing')
    # print('Keys           : ', list(text_preprocessed.keys()))
    # print('Shape Word Ids : ', text_preprocessed['input_word_ids'].shape)
    # print('Word Ids       : ', text_preprocessed['input_word_ids'])
    # print('Shape Mask     : ', text_preprocessed['input_mask'].shape)
    # print('Input Mask     : ', text_preprocessed['input_mask'])
    # print('Shape Type Ids : ', text_preprocessed['input_type_ids'].shape)
    # print('Type Ids       : ', text_preprocessed['input_type_ids'])


    # using the bert model
    # print('Using Bert Model')
    bert_model = hub.KerasLayer(tfhub_handle_encoder)
    bert_results = bert_model(text_preprocessed)
    print(f'Loaded BERT: {tfhub_handle_encoder}')
    print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
    print(f'Pooled Outputs Values:{bert_results["pooled_output"]}')
    # print("\n")
    # print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
    # print(f'Sequence Outputs Values:{bert_results["sequence_output"]}')

    return bert_results

In [91]:
bert_model_name = "bert_multi_cased_L-12_H-768_A-12"
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]
seq_length = int(document_length_mean)

embeddings = get_embbedings_bert(tfhub_handle_preprocess, tfhub_handle_encoder, seq_length, dataset_2363['content'])

Loaded BERT: https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3
Pooled Outputs Shape:(1001, 768)
Pooled Outputs Values:[[ 0.18309645  0.08064764  0.21475442 ... -0.30761114  0.12122031
   0.30289584]
 [ 0.21329546  0.09726274  0.22383195 ... -0.31341183  0.08392922
   0.17273737]
 [ 0.18345778 -0.24800119  0.17865479 ... -0.2588909   0.21953095
   0.255037  ]
 ...
 [ 0.2546832  -0.00992665  0.18233944 ... -0.18043229  0.12816739
   0.19316058]
 [ 0.18429932  0.10678127  0.15581837 ... -0.15855749  0.12668696
   0.19964336]
 [ 0.3810632  -0.02448309  0.13080055 ... -0.24282624  0.18496262
   0.17372748]]


In [None]:
# def get_embbedings_bert(tfhub_handle_preprocess, tfhub_handle_encoder, raw_text_list=[]):
#     # text preprocessing
#     bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
#     text_preprocessed = bert_preprocess_model(raw_text_list)
#     print('Preprocessing!')
#     print('Keys           : ', list(text_preprocessed.keys()))
#     print('Shape Word Ids : ', text_preprocessed['input_word_ids'].shape)
#     print('Word Ids       : ', text_preprocessed['input_word_ids'])
#     print('Shape Mask     : ', text_preprocessed['input_mask'].shape)
#     print('Input Mask     : ', text_preprocessed['input_mask'])
#     print('Shape Type Ids : ', text_preprocessed['input_type_ids'].shape)
#     print('Type Ids       : ', text_preprocessed['input_type_ids'])


#     # using the bert model
#     print('Using Bert Model!')
#     bert_model = hub.KerasLayer(tfhub_handle_encoder)
#     bert_results = bert_model(text_preprocessed)
#     print(f'Loaded BERT: {tfhub_handle_encoder}')
#     print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
#     print(f'Pooled Outputs Values:{bert_results["pooled_output"]}')
#     print("\n")
#     # print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
#     # print(f'Sequence Outputs Values:{bert_results["sequence_output"]}')

#     return bert_results

In [None]:
# bert_model_name = "bert_multi_cased_L-12_H-768_A-12"
# tfhub_handle_encoder = map_name_to_handle[bert_model_name]
# tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]
# raw_text_list = dataset_2363['content']
# get_embbedings_bert(tfhub_handle_preprocess, tfhub_handle_encoder, raw_text_list)

### Model Training

#### Load Data

In [92]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class LabeledDataset(Dataset):
    def __init__(self, dataframe):
        """
        Initialize the dataset.
        Args:
            dataframe (pd.DataFrame): Input DataFrame with columns 'date', 'embeddings', 'label'.
        """
        self.dataframe = dataframe
    def __len__(self):
        """
        Return the total number of samples in the dataset.
        """
        return len(self.dataframe)

    def __getitem__(self, idx):
        """
        Retrieve the ith sample from the dataset.
        Args:
            idx (int): The index of the sample to retrieve.
        """
        # Extract embeddings and label from the DataFrame
        embeddings = torch.tensor(self.dataframe.loc[idx, 'embeddings'], dtype=torch.float32)
        label = 1 if self.dataframe.iloc[idx]['gain_rate'] >= 0 else 0

        return embeddings, label

# Example usage:
# Assuming 'df' is a pandas DataFrame with the specified columns
# dataset = CustomDataset(df)
# loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

In [93]:
dataset_2363['embeddings'] = pd.Series(embeddings['sequence_output'].numpy().tolist())
dataset_2363

Unnamed: 0,content,gain_rate,embeddings
0,董事 董事,0.038202,"[[0.16009031236171722, -0.18202456831932068, 0..."
1,董事 董事 董事 董事 股份 股份 董事 董事,-0.010965,"[[0.08386759459972382, -0.14728020131587982, 0..."
2,單位,-0.010965,"[[0.3588416576385498, -0.26375919580459595, 0...."
3,走低,-0.010965,"[[-0.2404404878616333, -0.16392701864242554, 0..."
4,,0.026316,"[[0.03273423761129379, -0.005390644073486328, ..."
...,...,...,...
996,,-0.006316,"[[0.03273423761129379, -0.005390644073486328, ..."
997,半年,-0.006316,"[[0.3706050217151642, 0.000394187867641449, 0...."
998,董事 半年,-0.006316,"[[0.14302104711532593, -0.09625136852264404, 0..."
999,,-0.006316,"[[0.03273423761129379, -0.005390644073486328, ..."


#### Train Model

In [94]:
# Assuming X contains your features and y contains your labels
train_percentage = 0.8
split_point = int(dataset_2363.shape[0]*train_percentage)
train_dataset = dataset_2363.iloc[:split_point].reset_index(drop=True)
test_dataset = dataset_2363.iloc[split_point:].reset_index(drop=True)

In [95]:
labeled_train = LabeledDataset(train_dataset)
labeled_train[0]

(tensor([[ 0.1601, -0.1820,  0.2827,  ...,  0.2833, -0.3383,  0.1182],
         [-0.0190, -0.3687,  0.8042,  ...,  0.1948, -0.6305,  0.0225],
         [-0.1239, -0.6466,  0.6440,  ...,  0.3929, -0.7878,  0.1110],
         ...,
         [ 0.1550, -0.2246,  0.8612,  ...,  0.1700, -0.5110,  0.2102],
         [ 0.2062, -0.2927,  0.6719,  ...,  0.1975, -0.4702,  0.2815],
         [ 0.2382, -0.3394,  0.7260,  ...,  0.3332, -0.5280,  0.2999]]),
 1)

In [96]:
labeled_test = LabeledDataset(test_dataset)
labeled_test[0]

(tensor([[ 0.0514,  0.0071,  0.6427,  ...,  0.4163,  0.1918, -0.1783],
         [ 0.1776, -0.3374,  0.5966,  ...,  0.5032,  0.5538, -0.2103],
         [ 0.4603, -0.3755,  1.5502,  ...,  0.6710,  0.5048, -0.5997],
         ...,
         [ 0.0268, -0.6706,  0.8885,  ...,  0.7294,  0.5048, -0.1331],
         [ 0.1864, -0.1586,  1.2895,  ...,  1.0080,  0.5532, -0.3978],
         [ 0.1506, -0.1976,  0.9179,  ...,  0.2490,  0.3431, -0.2163]]),
 1)

In [97]:
train_loader = torch.utils.data.DataLoader(labeled_train, batch_size=32)
test_loader = torch.utils.data.DataLoader(labeled_test, batch_size=32)

In [110]:
"""
Example code of a simple RNN, GRU, LSTM on the MNIST dataset.

Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
*    2020-05-09 Initial coding
*    2022-12-16 Updated with more detailed comments, docstrings to functions, and checked code still functions as intended.

"""

# Imports
import torch
import torch.nn.functional as F  # Parameterless functions, like (some) activation functions
import torchvision.datasets as datasets  # Standard datasets
import torchvision.transforms as transforms  # Transformations we can perform on our dataset for augmentation
from torch import optim  # For optimizers like SGD, Adam, etc.
from torch import nn  # All neural network modules
from torch.utils.data import (
    DataLoader,
)  # Gives easier dataset managment by creating mini batches etc.
from tqdm import tqdm  # For a nice progress bar!

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"



# Recurrent neural network (many-to-one)
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * sequence_length, num_classes)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        # Forward propagate LSTM
        out, _ = self.rnn(x, h0)
        out = out.reshape(out.shape[0], -1)

        # Decode the hidden state of the last time step
        out = self.fc(out)
        return out


# Recurrent neural network with GRU (many-to-one)
class RNN_GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN_GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        # Forward propagate LSTM
        out, _ = self.gru(x, h0)
        # out = out.reshape(out.shape[0], -1)
        # out = out.contiguous().view(out.size(0), -1)
        out = out[:, -1, :]

        # Decode the hidden state of the last time step
        out = self.fc(out)
        return out


# Recurrent neural network with LSTM (many-to-one)
class RNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * sequence_length, num_classes)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        # Forward propagate LSTM
        out, _ = self.lstm(
            x, (h0, c0)
        )  # out: tensor of shape (batch_size, seq_length, hidden_size)
        out = out.reshape(out.shape[0], -1)

        # Decode the hidden state of the last time step
        out = self.fc(out)
        return out

# Train Network
sequence_length = document_length_mean
learning_rate = 0.005
def train(model, data_loader, num_epochs=1):
  # Loss and optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  for epoch in range(num_epochs):
      for batch_idx, (data, targets) in enumerate(tqdm(data_loader)):
          # Get data to cuda if possible
          print(data.shape)
          print(targets.shape)
          data = data.to(device=device)
          targets = targets.to(device=device)

          # forward
          scores = model(data)
          loss = criterion(scores, targets)

          # backward
          optimizer.zero_grad()
          loss.backward()

          # gradient descent update step/adam step
          optimizer.step()

# Check accuracy on training & test to see how good our model
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0

    # Set model to eval
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)

            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum().item()
            num_samples += predictions.size(0)

    # Toggle model back to train
    model.train()
    return num_correct / num_samples

# Initialize network (try out just using simple RNN, or GRU, and then compare with LSTM)

In [114]:
device

'cuda'

In [116]:
# Hyperparameters
input_size = 768
hidden_size = 128
num_layers = 2
num_classes = 2

model = RNN_GRU(input_size, hidden_size, num_layers, num_classes).to(device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [106]:
train(model, train_loader, 1)
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

  0%|          | 0/25 [00:00<?, ?it/s]


tensor([[[ 1.6009e-01, -1.8202e-01,  2.8267e-01,  ...,  2.8330e-01,
          -3.3826e-01,  1.1822e-01],
         [-1.8994e-02, -3.6866e-01,  8.0418e-01,  ...,  1.9483e-01,
          -6.3049e-01,  2.2532e-02],
         [-1.2385e-01, -6.4656e-01,  6.4396e-01,  ...,  3.9285e-01,
          -7.8780e-01,  1.1095e-01],
         ...,
         [ 1.5499e-01, -2.2463e-01,  8.6117e-01,  ...,  1.7001e-01,
          -5.1098e-01,  2.1020e-01],
         [ 2.0618e-01, -2.9266e-01,  6.7191e-01,  ...,  1.9750e-01,
          -4.7021e-01,  2.8151e-01],
         [ 2.3820e-01, -3.3936e-01,  7.2602e-01,  ...,  3.3321e-01,
          -5.2804e-01,  2.9992e-01]],

        [[ 8.3868e-02, -1.4728e-01,  2.9947e-01,  ...,  3.7794e-01,
          -3.3389e-01,  9.9703e-02],
         [-2.1619e-01, -3.4411e-01,  7.9906e-01,  ...,  1.1519e-01,
          -5.0157e-01,  1.5102e-01],
         [-2.3093e-01, -5.7077e-01,  6.8481e-01,  ...,  6.0698e-01,
          -8.1978e-01,  4.7740e-02],
         ...,
         [-7.1060e-02, -4

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
