In [1]:
import pandas as pd
import numpy as np

import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_excel(r"TSE50半年新聞.xlsx")
labels = pd.read_csv(r"Labels.csv")

all_df = df.merge(labels, on='ID')
all_df

Unnamed: 0,ID,證券代碼,年月日,則次,新聞標題,新聞內容,label
0,1,1216 統一,20250610,1,統一普通股定 114/07/31除息交易， 114/08/01為最後過戶日，每股配發現金股利...,,0
1,2,1216 統一,20250624,1,家樂福西班牙進口「HELIOS」美乃滋防腐劑超標 破百公斤遭退運銷毀,【記者賴昀岫／台北報導】家樂福西班牙進口「HELIOS」美乃滋防腐劑含量超標，衛福部食藥署本...,-1
2,3,1216 統一,20250625,1,節能標章有期限，過期仍刊登廣告觸法！,公平會在114年6月25日第1757次委員會議通過，香港商雅虎資訊股份有限公司台灣分公司與神...,0
3,4,1216 統一,20250709,1,統一斥資逾10億元打造DREAM PLAZA，7/25開幕，主打都會區上班族,市場傳出統一(1216)集團斥資超過10億元打造DREAM PLAZA商場將在7月25日開幕...,0
4,5,1216 統一,20250718,1,"統一子公司統一國際開發 113/10/18處分台積電 297 千股, 交易金額...",,0
...,...,...,...,...,...,...,...
2044,2045,6919 康霈*,20240806,1,"康霈* 7月份合併營收計 0千元, 累計113 年 1月至 7月月...",,-1
2045,2046,6919 康霈*,20240809,1,康霈* 財務公告未經會計師核閱數，累計 113/01 至 113 /06合併稅前盈餘 -3...,,-1
2046,2047,6919 康霈*,20240813,1,"康霈* 財務公告累計 113/01 至 113 /06合併稅前盈餘 -39,512...",,-1
2047,2048,6919 康霈*,20240829,2,康霈上市前業績發表 首度證實 CBL-514 為天然物開發新藥,生技股后康霈*（6919）29日舉行上市前業績發表會，執行長凌玉芳首度公開證實，該公司開發醫...,0


In [3]:
# 計算每篇新聞的字元長度
lengths = df["新聞內容"].astype(str).str.len()

# 最長長度
max_len = lengths.max()
print("最長字元數:", max_len)

最長字元數: 3675


In [4]:
def normalize_text(text: str) -> str:
    if not isinstance(text, str):
        return ""

    # yyyy/mm/dd
    text = re.sub(r"\d{4}[/-]\d{1,2}[/-]\d{1,2}", " 日期 ", text)

    # mm/dd（ 不用 \b）
    text = re.sub(r"\d{1,2}[/-]\d{1,2}", " 日期 ", text)

    # 民國年
    text = re.sub(r"\d{2,3}年\d{1,2}月\d{1,2}日", " 日期", text)

    # 年月
    text = re.sub(r"\d{2,4}年\d{1,2}月", " 日期 ", text)

    # 月份
    text = re.sub(r"\d{1,2}月(份)?", " 日期 ", text)

    # 季度
    text = re.sub(r"Q[1-4]|第[一二三四]季", " 季 ", text)

    # 百分比
    text = re.sub(r"\d+(\.\d+)?\s*%", " 百分比 ", text)

    # 金額
    text = re.sub(r"\d+(\.\d+)?\s*(兆|億|萬|元|股|張|點|口|筆)", " 量詞 ", text)

    # 其他數字
    text = re.sub(r"\d+(\.\d+)?", " 數字 ", text)

    return re.sub(r"\s+", " ", text).strip()


In [5]:
df = all_df.copy()

df["title"] = df["新聞標題"].fillna("")
df["content"] = df["新聞內容"].fillna("")
df["text_raw"] = df["title"] + " " + df["content"]
df["text"] = df["text_raw"]

df["label"] = df["label"].astype(int)

X_values = df["text"].values
y = df["label"].values

In [6]:
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(model_name)

token_lens = df["新聞內容"].astype(str).apply(
    lambda x: len(tokenizer.encode(x, add_special_tokens=True))
)

print("最長 token 長度:", token_lens.max())
print(token_lens.describe())



model = AutoModel.from_pretrained(model_name)
model.to(device)
model.eval()

CUDA available: True
GPU: NVIDIA GeForce RTX 4070 Ti
最長 token 長度: 3386
count    2049.000000
mean      309.697901
std       420.782523
min         4.000000
25%         4.000000
50%         4.000000
75%       640.000000
max      3386.000000
Name: 新聞內容, dtype: float64


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [7]:
print(model.config.model_type)
print(type(tokenizer))
print(tokenizer.special_tokens_map)
print(model.config)

bert
<class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>
{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "directionality": "bidi",
  "dtype": "float32",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.3",
  "type_vocab_size": 2,
 

In [8]:
from tqdm import tqdm

def encode_texts(texts, batch_size=32, max_length=256):
    all_embeddings = []

    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]

            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )

            encoded = {k: v.to(device) for k, v in encoded.items()}

            outputs = model(**encoded)
            # outputs.last_hidden_state shape: (batch, seq_len, hidden_size)

            cls_embeddings = outputs.last_hidden_state[:, 0, :]  
            # shape: (batch, hidden_size=768)

            all_embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(all_embeddings)

In [9]:
texts = df["text"].astype(str).tolist()

embeddings = encode_texts(
    texts,
    batch_size=32, 
    max_length=384
)

print("Embedding shape:", embeddings.shape)

Embedding shape: (2049, 768)


In [10]:
X = embeddings
y = df['label']

# SVM

In [88]:
X = embeddings
y = df['label']

# 1) Train/Test split（保持類別比例）
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=20,
    stratify=y
)

# 2) 建模（class_weight="balanced" 建議先開，避免 0 類別太多）
svm_clf = LinearSVC(
    C=1.0,
    class_weight="balanced"
)

# 3) 訓練
svm_clf.fit(X_train, y_train)

# 4) 預測
y_pred = svm_clf.predict(X_test)

# 5) 評估
print("Train label counts:")
print(pd.Series(y_train).value_counts().reindex([-1, 0, 1], fill_value=0))

print("\nTest label counts:")
print(pd.Series(y_test).value_counts().reindex([-1, 0, 1], fill_value=0))

print("\nClassification report (labels order = [-1, 0, 1]):")
print(classification_report(y_test, y_pred, labels=[-1, 0, 1], digits=4))

print("Confusion matrix (rows=true, cols=pred) order=[-1,0,1]:")
print(confusion_matrix(y_test, y_pred, labels=[-1, 0, 1]))

Train label counts:
label
-1    200
 0    853
 1    586
Name: count, dtype: int64

Test label counts:
label
-1     50
 0    213
 1    147
Name: count, dtype: int64

Classification report (labels order = [-1, 0, 1]):
              precision    recall  f1-score   support

          -1     0.6935    0.8600    0.7679        50
           0     0.8274    0.7653    0.7951       213
           1     0.7815    0.8027    0.7919       147

    accuracy                         0.7902       410
   macro avg     0.7675    0.8093    0.7850       410
weighted avg     0.7946    0.7902    0.7907       410

Confusion matrix (rows=true, cols=pred) order=[-1,0,1]:
[[ 43   6   1]
 [ 18 163  32]
 [  1  28 118]]


In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

X = embeddings
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
pipe = Pipeline([
    ("svm", LinearSVC())
])

param_grid = {
    "svm__C": [0.1, 0.5, 1.0, 2.0, 3.0, 5.0],
    "svm__class_weight": [None, "balanced"],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

print("\nBest params:")
print(grid.best_params_)
print("Best CV macro-F1:", grid.best_score_)

best_model = grid.best_estimator_

y_pred = best_model.predict(X_test)
print("\nOut-of-sample Report:")
print(classification_report(y_test, y_pred, labels=[-1, 0, 1], digits=4))

Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best params:
{'svm__C': 0.5, 'svm__class_weight': None}
Best CV macro-F1: 0.7738029682755972

Out-of-sample Report:
              precision    recall  f1-score   support

          -1     0.7647    0.7800    0.7723        50
           0     0.8128    0.7746    0.7933       213
           1     0.7244    0.7687    0.7459       147

    accuracy                         0.7732       410
   macro avg     0.7673    0.7745    0.7705       410
weighted avg     0.7752    0.7732    0.7737       410



# RBF SVM

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV


X = embeddings
y = df['label']

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(kernel="rbf"))
])

param_grid = {
    "svm__C": [0.5, 1, 2, 5, 10, 20],
    "svm__gamma": ["scale", 0.01, 0.03, 0.1, 0.3, 1.0],
    "svm__class_weight": [None, "balanced"]
}


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV macro-F1:", grid.best_score_)
best_model = grid.best_estimator_

y_pred = best_model.predict(X_test)

# 5) 評估
print("Train label counts:")
print(pd.Series(y_train).value_counts().reindex([-1, 0, 1], fill_value=0))

print("\nTest label counts:")
print(pd.Series(y_test).value_counts().reindex([-1, 0, 1], fill_value=0))

print("\nClassification report (labels order = [-1, 0, 1]):")
print(classification_report(y_test, y_pred, labels=[-1, 0, 1], digits=4))

print("Confusion matrix (rows=true, cols=pred) order=[-1,0,1]:")
print(confusion_matrix(y_test, y_pred, labels=[-1, 0, 1]))

Fitting 5 folds for each of 72 candidates, totalling 360 fits


KeyboardInterrupt: 

# KNN

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score

X = embeddings.astype(np.float32)
y = df["label"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

pipe_cos = Pipeline([
    ("norm", Normalizer()),  # L2 normalize，讓 cosine 更穩
    ("knn", KNeighborsClassifier(metric="cosine"))
])

param_grid_cos = {
    "knn__n_neighbors": [3,5,7,9,11,15,21,31,41,51],
    "knn__weights": ["uniform", "distance"]
}

grid_cos = GridSearchCV(
    pipe_cos,
    param_grid=param_grid_cos,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,  
    verbose=2
)

grid_cos.fit(X_train, y_train)

best_cos = grid_cos.best_estimator_
pred = best_cos.predict(X_test)

print("Best params (cosine):", grid_cos.best_params_)
print("Test macro-F1:", f1_score(y_test, pred, average="macro"))
print(classification_report(y_test, pred, digits=4))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params (cosine): {'knn__n_neighbors': 9, 'knn__weights': 'distance'}
Test macro-F1: 0.8070222154994836
              precision    recall  f1-score   support

          -1     0.9286    0.7800    0.8478        50
           0     0.7978    0.7100    0.7513       100
           1     0.7563    0.9000    0.8219       100

    accuracy                         0.8000       250
   macro avg     0.8275    0.7967    0.8070       250
weighted avg     0.8073    0.8000    0.7989       250



# Downsample 穩定度檢測

In [12]:
def downsample(df, label_col="label", random_state=42):
    """
    Downsample to ratio 2:2:1 for labels (0 : 1 : -1)
    """
    # 取得各類別
    df_neg = df[df[label_col] == -1]
    df_pos = df[df[label_col] == 1]
    df_neu = df[df[label_col] == 0]

    # 以 -1 為基準
    n_neg = len(df_neg)
    n_pos = min(len(df_pos), 2 * n_neg)
    n_neu = min(len(df_neu), 3 * n_neg)

    df_neg_ds = df_neg.sample(n=n_neg, random_state=random_state)
    df_pos_ds = df_pos.sample(n=n_pos, random_state=random_state)
    df_neu_ds = df_neu.sample(n=n_neu, random_state=random_state)

    df_ds = pd.concat([df_neg_ds, df_pos_ds, df_neu_ds], axis=0)
    df_ds = df_ds.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df_ds

### 1. Linear SVM

In [97]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report
)


N_RUNS = 100

# 存每次結果
records = []

best_idx = None
best_score = -np.inf
best_report = None
best_seed = None

worst_idx = None
worst_score = np.inf
worst_seed = None

for i in tqdm(range(N_RUNS)):
    df_ds = downsample(df, random_state=i)   # <- 用 downsample 後的資料

    y = df_ds["label"].values

    texts = df_ds["text"].astype(str).tolist()

    embeddings = encode_texts(
        texts,
        batch_size=32, 
        max_length=384
    )

    X = embeddings

    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    svm_clf = LinearSVC(
        C=0.5,
        class_weight= None,
        random_state=42
    )

    svm_clf.fit(X_train, y_train)
    y_pred = svm_clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec_macro = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec_macro  = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1_macro   = f1_score(y_test, y_pred, average="macro", zero_division=0)

    score_for_selection = f1_macro   # <- 用它當 best/worst 的依據

    records.append({
        "seed": i,
        "accuracy": acc,
        "precision_macro": prec_macro,
        "recall_macro": rec_macro,
        "f1_macro": f1_macro
    })

    if score_for_selection > best_score:
        best_score = score_for_selection
        best_seed = i
        best_report = classification_report(y_test, y_pred, labels=[-1, 0, 1], digits=4)
    if score_for_selection < worst_score:
        worst_score = score_for_selection
        worst_seed = i

results_df = pd.DataFrame(records)

print("BERT embedding + Linear SVM Random Seed穩定度測試")
# ===== 平均與標準差 =====
print("==== Averages (mean ± std) over runs ====")
for col in ["accuracy", "precision_macro", "recall_macro", "f1_macro"]:
    print(f"{col:>16}: {results_df[col].mean():.4f} ± {results_df[col].std():.4f}")

# ===== 最佳 / 最差（以 f1_macro 為準）=====
print("\n==== Best / Worst (by f1_macro) ====")
print(f"Best seed : {best_seed} | best f1_macro : {results_df.loc[results_df['seed']==best_seed,'f1_macro'].values[0]:.4f}")
print(f"Worst seed: {worst_seed} | worst f1_macro: {results_df.loc[results_df['seed']==worst_seed,'f1_macro'].values[0]:.4f}")

# ===== 印最佳那次 report =====
print("\n==== Classification report of BEST run ====")
print(best_report)

100%|██████████| 100/100 [12:54<00:00,  7.74s/it]

BERT embedding + Linear SVM Random Seed穩定度測試
==== Averages (mean ± std) over runs ====
        accuracy: 0.7682 ± 0.0241
 precision_macro: 0.7657 ± 0.0255
    recall_macro: 0.7706 ± 0.0253
        f1_macro: 0.7670 ± 0.0247

==== Best / Worst (by f1_macro) ====
Best seed : 63 | best f1_macro : 0.8352
Worst seed: 55 | worst f1_macro: 0.7094

==== Classification report of BEST run ====
              precision    recall  f1-score   support

          -1     0.8039    0.8200    0.8119        50
           0     0.8552    0.8267    0.8407       150
           1     0.8365    0.8700    0.8529       100

    accuracy                         0.8400       300
   macro avg     0.8319    0.8389    0.8352       300
weighted avg     0.8404    0.8400    0.8400       300






### 2. KNN

In [95]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report
)

N_RUNS = 100
records = []

best_score = -np.inf
best_report = None
best_seed = None

worst_score = np.inf
worst_seed = None

for i in tqdm(range(N_RUNS)):
    df_ds = downsample(df, random_state=i)

    y = df_ds["label"].values
    texts = df_ds["text"].astype(str).tolist()

    embeddings = encode_texts(
        texts,
        batch_size=32,
        max_length=384
    )
    
    X = embeddings.astype(np.float32)

    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    # ===== KNN + Normalizer =====
    knn_clf = Pipeline([
        ("norm", Normalizer()), 
        ("knn", KNeighborsClassifier(
            n_neighbors=9,    
            metric="cosine",
            weights="distance"   
        ))
    ])

    knn_clf.fit(X_train, y_train)
    y_pred = knn_clf.predict(X_test)

    # metrics
    acc = accuracy_score(y_test, y_pred)
    prec_macro = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec_macro  = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1_macro   = f1_score(y_test, y_pred, average="macro", zero_division=0)

    score_for_selection = f1_macro

    records.append({
        "seed": i,
        "accuracy": acc,
        "precision_macro": prec_macro,
        "recall_macro": rec_macro,
        "f1_macro": f1_macro
    })

    if score_for_selection > best_score:
        best_score = score_for_selection
        best_seed = i
        best_report = classification_report(y_test, y_pred, labels=[-1, 0, 1], digits=4)

    if score_for_selection < worst_score:
        worst_score = score_for_selection
        worst_seed = i

results_df = pd.DataFrame(records)

print("BERT embedding + KNN Random Seed 穩定度測試")
print("==== Averages (mean ± std) over runs ====")
for col in ["accuracy", "precision_macro", "recall_macro", "f1_macro"]:
    print(f"{col:>16}: {results_df[col].mean():.4f} ± {results_df[col].std():.4f}")

print("\n==== Best / Worst (by f1_macro) ====")
print(f"Best seed : {best_seed} | best f1_macro : {results_df.loc[results_df['seed']==best_seed,'f1_macro'].values[0]:.4f}")
print(f"Worst seed: {worst_seed} | worst f1_macro: {results_df.loc[results_df['seed']==worst_seed,'f1_macro'].values[0]:.4f}")

print("\n==== Classification report of BEST run ====")
print(best_report)


100%|██████████| 100/100 [12:02<00:00,  7.23s/it]

BERT embedding + KNN Random Seed 穩定度測試
==== Averages (mean ± std) over runs ====
        accuracy: 0.7669 ± 0.0205
 precision_macro: 0.7746 ± 0.0220
    recall_macro: 0.7678 ± 0.0232
        f1_macro: 0.7683 ± 0.0215

==== Best / Worst (by f1_macro) ====
Best seed : 89 | best f1_macro : 0.8145
Worst seed: 81 | worst f1_macro: 0.7128

==== Classification report of BEST run ====
              precision    recall  f1-score   support

          -1     0.8667    0.7800    0.8211        50
           0     0.8389    0.8333    0.8361       150
           1     0.7642    0.8100    0.7864       100

    accuracy                         0.8167       300
   macro avg     0.8232    0.8078    0.8145       300
weighted avg     0.8186    0.8167    0.8170       300






### 3. Rbf SVM

In [108]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report
)


N_RUNS = 100

# 存每次結果
records = []

best_idx = None
best_score = -np.inf
best_report = None
best_seed = None

worst_idx = None
worst_score = np.inf
worst_seed = None

for i in tqdm(range(N_RUNS)):
    df_ds = downsample(df, random_state=i)   # <- 用 downsample 後的資料

    y = df_ds["label"].values

    texts = df_ds["text"].astype(str).tolist()

    embeddings = encode_texts(
        texts,
        batch_size=32, 
        max_length=384
    )

    X = embeddings

    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm", SVC(
            kernel="rbf",
            C= 20, 
            class_weight= 'balanced',
            gamma='scale'))
    ])

    svm_clf.fit(X_train, y_train)
    y_pred = svm_clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec_macro = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec_macro  = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1_macro   = f1_score(y_test, y_pred, average="macro", zero_division=0)

    score_for_selection = f1_macro   # <- 用它當 best/worst 的依據

    records.append({
        "seed": i,
        "accuracy": acc,
        "precision_macro": prec_macro,
        "recall_macro": rec_macro,
        "f1_macro": f1_macro
    })

    if score_for_selection > best_score:
        best_score = score_for_selection
        best_seed = i
        best_report = classification_report(y_test, y_pred, labels=[-1, 0, 1], digits=4)
    if score_for_selection < worst_score:
        worst_score = score_for_selection
        worst_seed = i

results_df = pd.DataFrame(records)

print("BERT embedding + Rbf SVM Random Seed穩定度測試")
# ===== 平均與標準差 =====
print("==== Averages (mean ± std) over runs ====")
for col in ["accuracy", "precision_macro", "recall_macro", "f1_macro"]:
    print(f"{col:>16}: {results_df[col].mean():.4f} ± {results_df[col].std():.4f}")

# ===== 最佳 / 最差（以 f1_macro 為準）=====
print("\n==== Best / Worst (by f1_macro) ====")
print(f"Best seed : {best_seed} | best f1_macro : {results_df.loc[results_df['seed']==best_seed,'f1_macro'].values[0]:.4f}")
print(f"Worst seed: {worst_seed} | worst f1_macro: {results_df.loc[results_df['seed']==worst_seed,'f1_macro'].values[0]:.4f}")

# ===== 印最佳那次 report =====
print("\n==== Classification report of BEST run ====")
print(best_report)

100%|██████████| 100/100 [12:27<00:00,  7.48s/it]

BERT embedding + Rbf SVM Random Seed穩定度測試
==== Averages (mean ± std) over runs ====
        accuracy: 0.7888 ± 0.0206
 precision_macro: 0.7901 ± 0.0239
    recall_macro: 0.7877 ± 0.0225
        f1_macro: 0.7874 ± 0.0214

==== Best / Worst (by f1_macro) ====
Best seed : 63 | best f1_macro : 0.8333
Worst seed: 84 | worst f1_macro: 0.7312

==== Classification report of BEST run ====
              precision    recall  f1-score   support

          -1     0.8913    0.8200    0.8542        50
           0     0.8389    0.8333    0.8361       150
           1     0.7905    0.8300    0.8098       100

    accuracy                         0.8300       300
   macro avg     0.8402    0.8278    0.8333       300
weighted avg     0.8315    0.8300    0.8303       300






# 回測建構因子用文本預測

In [45]:
from pathlib import Path

def read_news_sep_txt(file_path):
    file_path = Path(file_path)

    raw = file_path.read_bytes()
    text = raw.decode("big5", errors="ignore")

    tmp_path = file_path.with_suffix(".utf8.txt")
    tmp_path.write_text(text, encoding="utf-8")

    cols = ["證券代碼", "年月日", "則次", "新聞標題", "新聞內容"]

    df = pd.read_csv(
        tmp_path,
        sep=r"\[SEP\]",
        engine="python",
        names=cols
    )

    return df

In [46]:
#all_docs = pd.read_csv('all新聞.csv', encoding='ANSI')
all_docs = read_news_sep_txt(r"因子新聞.txt")
#print(all_docs[1:1000])


In [47]:
df_pred = all_docs.copy()

df_pred["title"] = df_pred["新聞標題"].fillna("")
df_pred["content"] = df_pred["新聞內容"].fillna("")
df_pred["text_raw"] = df_pred["title"] + " " + df_pred["content"]
df_pred["text"] = df_pred["text_raw"].apply(normalize_text)

In [49]:
df_pred = df_pred[1:]

In [51]:
texts = df_pred["text"].astype(str).tolist()

pred_embeddings = encode_texts(
    texts,
    batch_size=32, 
    max_length=384
)

print("Embedding shape:", pred_embeddings.shape)
X_pred = pred_embeddings

Embedding shape: (462300, 768)


In [None]:
np.save("bert_embeddings.npy", X_pred)

### 第一種預測: Rbf SVM

In [52]:
# 用訓練資料最好參數訓練模型
df_ds = downsample(df, random_state=63)
texts = df_ds["text"].astype(str).tolist()

embeddings = encode_texts(
    texts,
    batch_size=32, 
    max_length=384
)

print("Embedding shape:", embeddings.shape)

Embedding shape: (1500, 768)


In [57]:
X_train = embeddings
y_train = df_ds['label']

svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(
        kernel="rbf",
        C= 20, 
        class_weight= 'balanced',
        gamma='scale'))
])

svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_pred)
df_pred['factor_1'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred['factor_1'] = y_pred


### 第二種預測: KNN

In [59]:
# 用訓練資料最好參數訓練模型
df_ds = downsample(df, random_state=89)
texts = df_ds["text"].astype(str).tolist()

embeddings = encode_texts(
    texts,
    batch_size=32, 
    max_length=384
)

print("Embedding shape:", embeddings.shape)

Embedding shape: (1500, 768)


In [62]:
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

X_train = embeddings
y_train = df_ds['label']

knn_clf = Pipeline([
        ("norm", Normalizer()), 
        ("knn", KNeighborsClassifier(
            n_neighbors=9,    
            metric="cosine",
            weights="distance"   
        ))
    ])

knn_clf.fit(X_train, y_train)
y_pred = knn_clf.predict(X_pred)
df_pred['factor_2'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred['factor_2'] = y_pred


In [63]:
# 兩個模型估出來的值的相關係數
print(df_pred["factor_1"].corr(df_pred["factor_2"], method="pearson"))

0.7503396044665851


In [64]:
df_pred['factor'] = (df_pred["factor_1"] + df_pred["factor_2"])/2
df_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred['factor'] = (df_pred["factor_1"] + df_pred["factor_2"])/2


Unnamed: 0,證券代碼,年月日,則次,新聞標題,新聞內容,title,content,text_raw,text,factor_1,factor_2,factor
1,1101 台泥,20140110,1,台泥國際大賺 獲利年增1.7 倍,由於大陸水泥市場去年下半年價格飆漲，加上產能增加，台泥國際去年營運獲利大幅提升，香港聯交所9...,台泥國際大賺 獲利年增1.7 倍,由於大陸水泥市場去年下半年價格飆漲，加上產能增加，台泥國際去年營運獲利大幅提升，香港聯交所9...,台泥國際大賺 獲利年增1.7 倍 由於大陸水泥市場去年下半年價格飆漲，加上產能增加，台泥國...,台泥國際大賺 獲利年增 數字 倍 由於大陸水泥市場去年下半年價格飆漲，加上產能增加，台泥國際...,1,-1,0.0
2,1101 台泥,20140111,1,台泥+16%,台泥10日公告轉投資事業台泥國際去年獲利將較前年的6.1 億港元( 約合新台幣23.61 億...,台泥+16%,台泥10日公告轉投資事業台泥國際去年獲利將較前年的6.1 億港元( 約合新台幣23.61 億...,台泥+16% 台泥10日公告轉投資事業台泥國際去年獲利將較前年的6.1 億港元( 約合新台幣...,台泥+ 百分比 台泥 數字 日公告轉投資事業台泥國際去年獲利將較前年的 量詞 港元( 約合新...,0,0,0.0
3,1101 台泥,20140113,1,台泥董事本人富品投資於 103/01/13解除 500 千股於中信銀城東分行設...,�,台泥董事本人富品投資於 103/01/13解除 500 千股於中信銀城東分行設...,�,台泥董事本人富品投資於 103/01/13解除 500 千股於中信銀城東分行設...,台泥董事本人富品投資於 數字 日期 / 數字 解除 數字 千股於中信銀城東分行設質專戶，累計...,0,0,0.0
4,1101 台泥,20140121,1,據報導國泰證券對台泥的評等為買進，目標價為 53.00。,�,據報導國泰證券對台泥的評等為買進，目標價為 53.00。,�,據報導國泰證券對台泥的評等為買進，目標價為 53.00。 �,據報導國泰證券對台泥的評等為買進，目標價為 數字 。 �,0,0,0.0
5,1101 台泥,20140128,1,台泥吃台泥國際股權 准了,台商投資大陸回暖！經濟部投審會27日通過台泥以約7.2 億美元( 約新台幣21億元) 購入台...,台泥吃台泥國際股權 准了,台商投資大陸回暖！經濟部投審會27日通過台泥以約7.2 億美元( 約新台幣21億元) 購入台...,台泥吃台泥國際股權 准了 台商投資大陸回暖！經濟部投審會27日通過台泥以約7.2 億美元(...,台泥吃台泥國際股權 准了 台商投資大陸回暖！經濟部投審會 數字 日通過台泥以約 量詞 美元(...,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
462296,9962 有益,20250314,2,"有益財務公告累計 113/01 至 113 /12稅前盈餘 7,677 萬元，...",�,"有益財務公告累計 113/01 至 113 /12稅前盈餘 7,677 萬元，...",�,"有益財務公告累計 113/01 至 113 /12稅前盈餘 7,677 萬元，...","有益財務公告累計 數字 日期 至 數字 / 數字 稅前盈餘 數字 , 量詞 元，比去年同期衰...",0,-1,-0.5
462297,9962 有益,20250318,1,有益普通股定 114/04/17除息交易， 114/04/18為最後過戶日，每股配發現金股利...,�,有益普通股定 114/04/17除息交易， 114/04/18為最後過戶日，每股配發現金股利...,�,有益普通股定 114/04/17除息交易， 114/04/18為最後過戶日，每股配發現金股利...,有益普通股定 數字 日期 / 數字 除息交易， 數字 日期 / 數字 為最後過戶日，每股配發...,0,0,0.0
462298,9962 有益,20250417,1,有益普通股定 114/04/17除息交易，除息參考價 13.20元， 114/04/18...,�,有益普通股定 114/04/17除息交易，除息參考價 13.20元， 114/04/18...,�,有益普通股定 114/04/17除息交易，除息參考價 13.20元， 114/04/18...,有益普通股定 數字 日期 / 數字 除息交易，除息參考價 量詞 ， 數字 日期 / 數字 為...,0,0,0.0
462299,9962 有益,20250507,1,"有益財務公告未經會計師核閱數，累計 114/01 至 114 /03合併稅前盈餘 1,39...",�,"有益財務公告未經會計師核閱數，累計 114/01 至 114 /03合併稅前盈餘 1,39...",�,"有益財務公告未經會計師核閱數，累計 114/01 至 114 /03合併稅前盈餘 1,39...","有益財務公告未經會計師核閱數，累計 數字 日期 至 數字 / 數字 合併稅前盈餘 數字 , ...",0,1,0.5


In [None]:
# 輸出情緒因子
output = df_pred[['證券代碼', '年月日', '則次', 'factor_1', 'factor_2', 'factor']]
print(f"Rbf SVM預測值分布{output['factor_1'].value_counts()}")
print(f"KNN預測值分布{output['factor_2'].value_counts()}")
print(f"因子值分布{output['factor'].value_counts()}")
pd.to_pickle(output, 'sentiment_factor.pkl')

Rbf SVM預測值分布factor_1
 0    3797
 1    1786
-1     544
Name: count, dtype: int64
KNN預測值分布factor_2
 0    3031
 1    2319
-1     777
Name: count, dtype: int64
因子值分布factor
 0.0    2774
 1.0    1468
 0.5    1090
-1.0     447
-0.5     348
Name: count, dtype: int64
