## 環境設定及整合


### Python 套件導入

In [1]:
import os
import numpy as np
import pandas as pd
import chardet
import datetime
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_regression, chi2

In [2]:
!pip install TCSP

Collecting TCSP
  Downloading TCSP-0.0.9-py3-none-any.whl.metadata (1.3 kB)
Downloading TCSP-0.0.9-py3-none-any.whl (5.7 kB)
Installing collected packages: TCSP
Successfully installed TCSP-0.0.9


### 資料集下載及讀取

In [3]:
root = r'C:\Users\tony\Desktop\bda_project\BDA'
data = {}
for filename in os.listdir(root):
    if '.ipynb_checkpoints' in filename:
        continue  # 忽略 .ipynb_checkpoints 目录
    words = filename.split('_')
    readable_filename = ''.join(words[-2:])
    filepath = f'{root}/{filename}'
    print(f'正在載入 {readable_filename} ...')
    df = pd.read_csv(filepath, encoding='utf-8')
    print(df)
    data[readable_filename[:-4]] = df
    print(f'成功載入 {readable_filename}。')

正在載入 內容數據新聞1.csv ...
                       id p_type                   s_name s_area_name  \
0       1646075372873_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
1       1646075374896_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
2       1646075377238_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
3       1646084374652_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
4       1646084376722_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
...                   ...    ...                      ...         ...   
179444  1669821279154_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
179445  1669821281498_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
179446  1669821283632_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
179447  1669821286206_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   
179448  1669822133851_N01   news  Yahoo股市(收錄2022/12/1前資料)        財經新聞   

                      post_time                                 title  author  \
0       2022-03-01 02

  df = pd.read_csv(filepath, encoding='utf-8')


        stock_name stock_symbol   open   high    low  close  volume  \
0               日馳         1526  47.55  48.45  47.55  48.30     138   
1               日馳         1526  48.30  48.30  47.40  47.95     153   
2               日馳         1526  48.45  48.70  47.80  48.10     120   
3               日馳         1526  47.95  47.95  47.55  47.60     165   
4               日馳         1526  47.65  47.65  45.30  45.65     514   
...            ...          ...    ...    ...    ...    ...     ...   
1154220         統一         1216  76.20  78.60  76.00  77.20   20607   
1154221         統一         1216  76.80  78.80  76.40  78.80   16466   
1154222         統一         1216  78.00  78.70  77.10  78.20   16015   
1154223         統一         1216  78.00  78.00  76.30  77.40    8524   
1154224         統一         1216  76.10  77.20  75.80  76.20    8347   

                        date  
0        2022-03-01 00:00:00  
1        2022-03-02 00:00:00  
2        2022-03-03 00:00:00  
3        2022-03-04 00:

  df = pd.read_csv(filepath, encoding='utf-8')


In [4]:
data.keys()

dict_keys(['內容數據新聞1', '內容數據新聞2', '內容數據新聞3', '討論數據dcard', '討論數據mobile01-1', '討論數據mobile01-2', '討論數據ptt', '微股力個股交易數據-2年', '微股力社群PKTD-2年', '微股力籌碼數據-2年', '微股力財報數據-2年'])

In [5]:
# 類股代號皆轉為 string 型態
data['微股力個股交易數據-2年']['stock_symbol'] = data['微股力個股交易數據-2年']['stock_symbol'].astype(str)
data['微股力社群PKTD-2年']['stock_symbol'] = data['微股力社群PKTD-2年']['stock_symbol'].astype(str)
data['微股力籌碼數據-2年']['stock_symbol'] = data['微股力籌碼數據-2年']['stock_symbol'].astype(str)
data['微股力財報數據-2年']['stock_symbol'] = data['微股力財報數據-2年']['stock_symbol'].astype(str)


# 呼叫次數整理
data['微股力社群PKTD-2年']['actionP'] = data['微股力社群PKTD-2年']['actionP'] // 123
data['微股力社群PKTD-2年']['actionK'] = data['微股力社群PKTD-2年']['actionK'] // 123
data['微股力社群PKTD-2年']['actionT'] = data['微股力社群PKTD-2年']['actionT'] // 123
data['微股力社群PKTD-2年']['actionD'] = data['微股力社群PKTD-2年']['actionD'] // 123

### 中文文本處理

In [6]:
import jieba
!git clone https://github.com/fxsjy/jieba.git
jieba.set_dictionary('jieba/extra_dict/dict.txt.big')

Cloning into 'jieba'...


#### 中文文本斷詞

In [7]:
data['討論數據dcard'] = data['討論數據dcard'][['post_time', 'title', 'content']]
data['討論數據dcard']

Unnamed: 0,post_time,title,content
0,2022-03-01 00:00:18.000,#分享 投資股票個人經驗分享,定股美股ETF長期去抓報酬也是適合的投資工具方式
1,2022-03-01 00:00:22.000,#分享 投資股票個人經驗分享,最近剛申辦覺得定期定額投資美股很方便
2,2022-03-01 00:07:48.000,#標的 請問現在 台積電 是時候買入嗎？,我三百買的 給你參考
3,2022-03-01 00:14:04.000,#分享 明天燦坤有機會漲停,中鋼呢
4,2022-03-01 00:17:39.000,#分享 當沖 六個月的心情😂,有100時候怎麼沒有選擇減碼落袋為安\n現在用什麼心態在做當沖呢？？
...,...,...,...
231315,2024-02-29 23:57:06.000,#分享 2/29 也是速戰速決,我的好朋友封心小y陪到脫褲子了
231316,2024-02-29 23:58:11.000,#請益 找台新證券的營業員,凱基
231317,2024-02-29 23:58:31.000,#其他 2024台股看漲or看跌,已經刪除的內容就像 Dcard 一樣，錯過是無法再相見的！
231318,2024-02-29 23:58:42.000,#分享 「大綜亮燈，亞力被洗掉了」,想請問紫大，前幾天討論的4939和8096，我有買了～這幾天感覺沒什麼波動，我應該這樣繼續放...


In [8]:
data['討論數據dcard']['content'][0]

'定股美股ETF長期去抓報酬也是適合的投資工具方式'

In [9]:
list(jieba.cut_for_search(data['討論數據dcard']['content'][0]))

Building prefix dict from c:\Users\tony\Desktop\bda_project\jieba\extra_dict\dict.txt.big ...
Dumping model to file cache C:\Users\tony\AppData\Local\Temp\jieba.u8dce6070ba8a632911c3d41cfe05e95c.cache
Loading model cost 1.467 seconds.
Prefix dict has been built successfully.


['定股',
 '美股',
 'ETF',
 '長期',
 '去',
 '抓',
 '報酬',
 '也',
 '是',
 '適合',
 '的',
 '投資',
 '工具',
 '方式']

#### 語料庫中搜尋文本

In [10]:
start_date = '2024-01-01'
end_date = '2024-03-01'
dcard_stock_corpus = data['討論數據dcard'][np.logical_and(start_date <= data['討論數據dcard']['post_time'], data['討論數據dcard']['post_time'] <= end_date)]['content'].reset_index()['content'].astype(str)
dcard_stock_corpus

0                                     1.51%…這種勝率，賠的時候都很慘吧…
1        2023年成果結算，總算在Q4跟上車了\nhttps://megapx-assets.dca...
2        https://megapx-assets.dcard.tw/images/57727790...
3                                   哥 這樣按一開始的本金算 今年總獲利是幾%？
4        同為大二\nhttps://megapx-assets.dcard.tw/images/07...
                               ...                        
18528                                      我的好朋友封心小y陪到脫褲子了
18529                                                   凱基
18530                        已經刪除的內容就像 Dcard 一樣，錯過是無法再相見的！
18531    想請問紫大，前幾天討論的4939和8096，我有買了～這幾天感覺沒什麼波動，我應該這樣繼續放...
18532                                    為什麼上馬一直打一樣的英文\nb0
Name: content, Length: 18533, dtype: object

#### 移除 Stopwords

In [11]:
!git clone https://github.com/goto456/stopwords.git
stopwords = []
def remove_stopwords(tokens):
    return [token for token in tokens if token and token not in '，。：；「」『』（）、《》〈〉——＠＃＄％＾＆＊＋“”']

Cloning into 'stopwords'...


In [12]:
from TCSP import read_stopwords_list

with open('stopwords_zh.txt', 'r', encoding = 'utf-8') as file:
    stopwords = file.read().splitlines()
file.close()

stopwords += read_stopwords_list()
stopwords += ['記者', '報導']
stopwords += ['lex', '①①', '①②', '①③', '①④', '①⑤', '①⑥', '①⑦', '①⑧', '①⑨', '①ａ', '①ｂ', '①ｃ', '①ｄ', '①ｅ', '①ｆ', '①ｇ', '①ｈ', '①ｉ', '①ｏ', '②①', '②②', '②③', '②④', '②⑤', '②⑥', '②⑦', '②⑧', '②⑩', '②ａ', '②ｂ', '②ｄ', '②ｅ', '②ｆ', '②ｇ', '②ｈ', '②ｉ', '②ｊ', '③①', '③⑩', '③ａ', '③ｂ', '③ｃ', '③ｄ', '③ｅ', '③ｆ', '③ｇ', '③ｈ', '④ａ', '④ｂ', '④ｃ', '④ｄ', '④ｅ', '⑤ａ', '⑤ｂ', '⑤ｄ', '⑤ｅ', '⑤ｆ', '１２', 'ｌｉ', 'ｚｘｆｉｔｌ']
print(len(stopwords))

3001


### 訓練資料集生成


In [15]:
def where_key_words_in(df, keywords):
  res = pd.DataFrame()
  for kw in keywords:
    temp = df[df['title'].str.contains(kw) | df['content'].str.contains(kw)]
    res = pd.concat([res, temp], ignore_index=True)
  return res

def tokenize(text):
    return list(jieba.cut(text, cut_all=True))

In [48]:
FORBIDDEN_FEATURES = ['閱讀', '全文', 'br', ' ', '  '] + stopwords
K_BEST = 100

In [49]:
def make_dataset_tfidf(stock_data_source, content_data_source, stock_symbol, keywords):
    # 算出漲跌幅
    exchange_data = data[stock_data_source][data[stock_data_source]['stock_symbol'].isin([stock_symbol])]
    exchange_data['delta'] = exchange_data['close'] - exchange_data['open']
    exchange_data['gain_rate'] = exchange_data['delta'] / exchange_data['open']
    article_df = where_key_words_in(data[content_data_source], keywords)
    article_df['date'] = pd.to_datetime(article_df['post_time']).dt.date.shift(5)
    exchange_data['date'] = pd.to_datetime(exchange_data['date'])
    article_df = article_df[5:].reset_index(drop=True)
    article_df['date'] = pd.to_datetime(article_df['date'])
    article_df['gain_rate'] = 0
    # 把漲跌幅和文章時間對齊
    for date in exchange_data['date']:
        article_df.loc[article_df['date'].dt.strftime('%Y-%m-%d') == date.strftime('%Y-%m-%d'), 'gain_rate'] = exchange_data[exchange_data['date'] == date]['gain_rate'].values[0]
        # print(f'{date} gain rate set to', exchange_data[exchange_data['date'] == date]['gain_rate'].values[0])
    article_df['content'] = article_df['title'] + ' ' + article_df['content']

    article_df['label'] = article_df['gain_rate'].apply(lambda x: 1 if x >= 0 else 0)

    # corpus = article_df['content'].apply(' '.join)
    TFIDF_vectorizer = TfidfVectorizer(stop_words=FORBIDDEN_FEATURES, ngram_range = (1,1),  max_features= 200)
    TFIDF_vectors = TFIDF_vectorizer.fit_transform(article_df['content'].values)

    # Store TF-IDF vectors in a dictionary with article IDs as keys
    tfidf_dict = {}
    for i, tfidf_vector in enumerate(TFIDF_vectors):
        article_id = article_df.index[i]  # Assuming index is the unique identifier
        tfidf_dict[article_id] = tfidf_vector

    # 選擇前 K_BEST 高關聯性的特徵，以 anova 方式
    anova_selector = SelectKBest(chi2, k=K_BEST)
    X_kbest = anova_selector.fit_transform(TFIDF_vectors, article_df['label'])

    feature_names = TFIDF_vectorizer.get_feature_names_out()
    selected_features = feature_names[anova_selector.get_support()]
    selected_features_set = set(selected_features)

    print(f'語料庫中最顯著影響漲跌的關鍵字:', np.array(selected_features_set))
    article_df['content'] = article_df['content'].apply(lambda text: ' '.join([word for word in jieba.lcut(text) if word in selected_features_set]))
    print(f'tfidf_vectors:', tfidf_vector)
    return article_df[['content', 'gain_rate']], article_df['date'], tfidf_dict

In [53]:
keywords_2330 = ['台積電', '半導體', 'AI', '蘋果', 'Apple', '台灣積體電路', '臺灣積體電路', '張忠謀', '晶片', '晶圓', '護國神山']
# keywords_2363 = ['2363', '矽統', '半導體', 'AI']

dataset_2330, dates, tfidf_dict = make_dataset_tfidf('微股力個股交易數據-2年', '內容數據新聞1', '2330', keywords_2330)
# dataset_2363, dates = make_dataset_tfidf('微股力個股交易數據-2年', '內容數據新聞1', '2363', keywords_2363)
dataset_2330

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exchange_data['delta'] = exchange_data['close'] - exchange_data['open']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exchange_data['gain_rate'] = exchange_data['delta'] / exchange_data['open']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exchange_data['date'] = pd.to_datetime(exchange_data['dat

語料庫中最顯著影響漲跌的關鍵字: {'台北股市', '台積電', '11', '75', '50', '據了解', '35', '他說', '2303', '5元', '收2', 'ky', '半導體', '萬海', 'ai', '下跌2', '整體來看', 'nvidia', '台北電', '33', '68', 'inc', '48', '財訊快報', '81', '2603', '譜瑞', '84', '82', '72', '陳孟朔', '5億元', '42', '39', '2330', '91', '2618', '各報要聞', '27', '49', '長榮航', '2615', '編輯', '22', '5g', '時報記者林資傑台北報導', '86', 'fed', '38', '08', '法人指出', '長榮', '88', '73', '52', '74', '矽力', '工商時報一涂志豪', '時報', 'cpi', '陽明', '14', '2454', '62', '25', 'pro', '67', '02', '51', '57', 'amd', '買進', 'moneydj新聞', '5274', '2609', '32', '或0', '台北報導', '64', '31', '45', '04', '01', '記者李純君報導', '47', '新聞來源', '16', '時報資訊', '05', '98', '23', '07', '工商時報', '評等', 'iphone', '大立光', '76', '03', '83', '2022'}


Unnamed: 0,content,gain_rate
0,2022 台積電,0.008347
1,16 25 台積電,0.008347
2,台積電 2022 03 01 33 半導體 11 45 50 半導體 台積電 2330 23...,0.008347
3,買進 台積電 2330 2454 長榮 2603 長榮 2618 半導體 長榮,0.008347
4,半導體 台積電 2330,0.008347
...,...,...
59660,2022 11 25 25 31 25 14,0.008097
59661,2022 11 01 台積電 半導體 台積電 半導體 台積電 台積電 台積電 台積電 台積電...,0.000000
59662,31 時報 31 2022 11 38 時報 31 25 14 31 31 11 11 新聞...,0.000000
59663,2022 11 14 11 台積電 半導體 11 11,-0.011317


In [54]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

dataset_2330['gain_rate_tri'] = dataset_2330['gain_rate'].apply(lambda x: 'raise' if x > 0.01 else 'fall' if x < -0.01 else 'no_change')

X = [tfidf_dict[index].toarray()[0] for index in tfidf_dict.keys()] 
y = dataset_2330['gain_rate_tri']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8024805162155367
