In [1]:
import pandas as pd
import numpy as np
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(torch.__version__) 
print(device)

from transformers import BertTokenizer, BertModel

1.3.0
cuda


To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [2]:
# 讀取tsv # \t tab做區隔
df_news = pd.read_csv("data/2015_Company.tsv", sep="\t", encoding="utf-8")
df_news['text'] = df_news['text'].astype(str)
df_news

Unnamed: 0,id,label,text
0,0,財務營運風險,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...
1,1,財務營運風險,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...
2,2,法律風險,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...
3,3,法律風險,大哥大，透過4G漫遊方式共網爭議，戰火延燒，威寶電信（台灣之星）及中華電信、遠傳電信等業者，...
4,4,法律風險,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...
...,...,...,...
15887,15887,資安風險,"感覺華碩客服有點推託,所謂的個資法是企業有義務妥善保管蒐集來的個人資料不\n感覺華碩客服有點..."
15888,15888,勞動風險,上市公司玉晶光（3406）今天傳出裁員，對此，公司發言人趙志強表示，並非裁員而\n上市公司玉...
15889,15889,勞動風險,繼台企銀（2834）工會因合併問題而罷工，創金融史首例後，大眾銀行（2847）工\n繼台企銀...
15890,15890,勞動風險,6億元賣給英業達，獲利21億元，預計明年首季入帳，活化資產。外界認為這將有機會帶動宏達電明年...


In [3]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [24]:
# Tokenize input
text = "很好看的動作片，不會浪費錢跟時間。很久沒有這樣的探險片。可說是女版的印第安那瓊。"
tokenized_text = tokenizer.tokenize(text)      # 每個字切詞成一個list
print(type(tokenized_text))                 # list
np.array(tokenized_text)                    # 轉成numpy

<class 'list'>


array(['很', '好', '看', '的', '動', '作', '片', '，', '不', '會', '浪', '費', '錢',
       '跟', '時', '間', '。', '很', '久', '沒', '有', '這', '樣', '的', '探', '險',
       '片', '。', '可', '說', '是', '女', '版', '的', '印', '第', '安', '那', '瓊',
       '。'], dtype='<U1')

In [25]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)   # 每個字轉成id
print(type(indexed_tokens))                         # list
np.array(indexed_tokens)

<class 'list'>


array([2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298, 3857,
       6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300, 6857,
       3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957, 4276,
       4638, 1313, 5018, 2128, 6929, 4475,  511])

In [26]:
indexed_tokens2 = tokenizer.build_inputs_with_special_tokens(indexed_tokens)    # 句子前後加上 CLS SEP
print(type(indexed_tokens2))
np.array(indexed_tokens2)

<class 'list'>


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300,
       6857, 3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957,
       4276, 4638, 1313, 5018, 2128, 6929, 4475,  511,  102])

In [27]:
n = 512 - len(indexed_tokens2)
indexed_tokens3 = np.pad(indexed_tokens2, (0, n), mode ='constant', constant_values=(0))  # array右邊append n 個 0  補長度到512
print(len(indexed_tokens3))
indexed_tokens3

512


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300,
       6857, 3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957,
       4276, 4638, 1313, 5018, 2128, 6929, 4475,  511,  102,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [29]:
segments_ids = tokenizer.create_token_type_ids_from_sequences(indexed_tokens2)
print(type(segments_ids))                                # list
np.array(segments_ids)

<class 'list'>


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

#### np.pad doc

https://docs.scipy.org/doc/numpy/reference/generated/numpy.pad.html 

In [30]:
def tokenize_all(text):
#    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    tokenized_text = tokenizer.tokenize(text)    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    indexed_tokens2 = tokenizer.build_inputs_with_special_tokens(indexed_tokens)
    indexed_tokens2 = np.array(indexed_tokens2)          # list 轉 numpy
    if len(indexed_tokens2) < 512:
        n = 512 - len(indexed_tokens2)
        indexed_tokens3 = np.pad(indexed_tokens2, (0, n), mode ='constant', constant_values=(0))  # array右邊append n 個 0  補長度到512
    return indexed_tokens3

In [31]:
def segments_all(text):
    tokenized_text = tokenizer.tokenize(text)    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    indexed_tokens2 = tokenizer.build_inputs_with_special_tokens(indexed_tokens)
    indexed_tokens2 = np.array(indexed_tokens2)          # list 轉 numpy
    segments_ids = tokenizer.create_token_type_ids_from_sequences(indexed_tokens2)
    segments_ids = np.array(segments_ids)              # list 轉numpy
    if len(segments_ids) < 512:
        n = 512 - len(segments_ids)
        segments_ids2 = np.pad(segments_ids, (0, n), mode ='constant', constant_values=(0))  # array右邊append n 個 0  補長度到512    
    return segments_ids2

In [32]:
def masks_all(text):
    tokenized_text = tokenizer.tokenize(text)    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    indexed_tokens2 = tokenizer.build_inputs_with_special_tokens(indexed_tokens)
    indexed_tokens2 = np.array(indexed_tokens2)          # list 轉 numpy
    segments_ids = tokenizer.create_token_type_ids_from_sequences(indexed_tokens2)
    segments_ids = np.array(segments_ids)              # list 轉numpy
    mask_token = np.array([1,1])
    mask_token = np.pad(mask_token, (0, len( segments_ids - 2 )), mode ='constant', constant_values=(1)) 
    # array右邊append 1 到跟segment一樣長
    if len(mask_token) < 512:
        n = 512 - len(mask_token)
        mask_token2 = np.pad(mask_token, (0, n), mode ='constant', constant_values=(0))  # array右邊append n 個 0  補長度到512
    return mask_token2

In [33]:
# df['text'] = df.comment.apply(lambda x: " ".join(jieba.cut(x)))
# df_news['tokenized_text'] = df_news['text'].apply(lambda x:  tokenizer.tokenize(x) )
df_news['indexed_tokens'] = df_news['text'].apply(tokenize_all)
df_news['segments_ids'] = df_news['text'].apply(segments_all)
df_news['mask_token'] = df_news['text'].apply(masks_all)
df_news

Unnamed: 0,id,label,text,indexed_tokens,segments_ids,mask_token
0,0,財務營運風險,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...,"[101, 6512, 3124, 6956, 1041, 1146, 2203, 7028...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,財務營運風險,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...,"[101, 5635, 676, 6640, 809, 677, 8024, 519, 17...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2,法律風險,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...,"[101, 3947, 1841, 1440, 704, 2900, 1139, 8024,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,3,法律風險,大哥大，透過4G漫遊方式共網爭議，戰火延燒，威寶電信（台灣之星）及中華電信、遠傳電信等業者，...,"[101, 1920, 1520, 1920, 8024, 6851, 6882, 100,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,4,法律風險,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...,"[101, 3947, 1841, 1440, 704, 2900, 1139, 8024,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...
15887,15887,資安風險,"感覺華碩客服有點推託,所謂的個資法是企業有義務妥善保管蒐集來的個人資料不\n感覺華碩客服有點...","[101, 2697, 6221, 5836, 4820, 2145, 3302, 3300...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15888,15888,勞動風險,上市公司玉晶光（3406）今天傳出裁員，對此，公司發言人趙志強表示，並非裁員而\n上市公司玉...,"[101, 677, 2356, 1062, 1385, 4373, 3253, 1045,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15889,15889,勞動風險,繼台企銀（2834）工會因合併問題而罷工，創金融史首例後，大眾銀行（2847）工\n繼台企銀...,"[101, 5262, 1378, 821, 7065, 8020, 11152, 8159...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15890,15890,勞動風險,6億元賣給英業達，獲利21億元，預計明年首季入帳，活化資產。外界認為這將有機會帶動宏達電明年...,"[101, 127, 1023, 1039, 6546, 5183, 5739, 3511,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [34]:
print(len(df_news['indexed_tokens'][0]))
df_news['indexed_tokens'][0] 

512


array([ 101, 6512, 3124, 6956, 1041, 1146, 2203, 7028, 7065, 6121, 1757,
       4825,  924, 1002, 3609, 4638, 2553, 6206,  868, 4158, 8024, 7521,
       6243, 1039, 3190, 2527, 2218, 3298, 6313, 3791, 7368, 2179, 3177,
        969, 2807, 2852, 8024, 1963,  676, 7028, 1759, 1765, 7518, 1164,
       2864, 6546, 8024, 7515, 4372, 6917, 3621, 2200,  679, 2512, 7513,
       1456, 1059,  127, 1283, 1399, 1519, 2339, 4638, 4495, 6243,  511,
       1042, 6493, 7065, 6121, 6134, 4850, 8024, 7515, 4372, 7274, 4634,
       5474, 6526, 2234, 3309, 6889, 5147, 3428, 8024, 5195, 5474, 6526,
       7065, 6121, 1757, 3748, 6359, 4634, 1139,  998, 1440, 2100, 6349,
        928, 1141, 8024, 6313,  102,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [35]:
print(len(df_news['segments_ids'][0]))
df_news['segments_ids'][0]

512


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [36]:
print(len(df_news['mask_token'][0]))
df_news['mask_token'][0]

512


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,