In [1]:
# # google colab tesla P100
# ! pip install numpy==1.17.4 scipy==1.3.1 pandas==0.25.3 tensorflow-gpu==2.0.0 torch==1.3.1 torchvision==0.4.2 scikit-learn==0.21.3
# ! pip install transformers==2.2.1
# ! pip install git+https://github.com/huggingface/transformers.git

In [2]:
# Optional: print GPU / CPU / memory information.
# These are Linux shell commands and will probably not work on Windows.
# Notebook shell equivalents:
# ! nvidia-smi
# ! lscpu
# ! free -h
import os

for command in ('nvidia-smi', 'lscpu', 'free -h'):
    try:
        # The context manager closes the pipe once its output has been read.
        with os.popen(command) as pipe:
            print(pipe.read())
    except OSError as error:
        # Best effort only: report the failure and continue with the next command
        # instead of silently hiding every possible exception.
        print('could not run {!r}: {}'.format(command, error))

Fri Dec  6 21:29:48 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2080    On   | 00000000:01:00.0  On |                  N/A |
| 30%   48C    P0    48W / 215W |     90MiB /  7951MiB |      2%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [3]:
import pandas as pd
import numpy as np

import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, TFBertForSequenceClassification, TFBertMainLayer, AdamW, BertConfig

In [4]:
# Read the TSV dataset; columns are separated by tabs ("\t").
df_news = pd.read_csv("https://github.com/roccqqck/news_bert/raw/master/data/2015_Company.tsv", sep="\t", encoding="utf-8")
# Cast the text column to str (remove_510 below calls len() on it and slices it).
df_news['text'] = df_news['text'].astype(str)
df_news

Unnamed: 0,id,label,text
0,1,0,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...
1,2,0,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...
2,3,1,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...
3,4,1,大哥大，透過4G漫遊方式共網爭議，戰火延燒，威寶電信（台灣之星）及中華電信、遠傳電信等業者，...
4,5,1,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...
...,...,...,...
16713,16714,3,上市公司玉晶光（3406）今天傳出裁員，對此，公司發言人趙志強表示，並非裁員而 上市公司玉晶...
16714,16715,3,繼台企銀（2834）工會因合併問題而罷工，創金融史首例後，大眾銀行（2847）工 繼台企銀（...
16715,16716,3,6億元賣給英業達，獲利21億元，預計明年首季入帳，活化資產。外界認為這將有機會帶動宏達電明年...
16716,16717,3,智慧手機大廠宏達電，要迎接新的一年之前，卻傳出讓人遺憾的消息！員工爆料，有兩位中階主管，傳出...


In [5]:
df_news['label'].value_counts()

4    5623
0    4877
1    4546
3    1316
2     356
Name: label, dtype: int64

In [6]:
# Articles longer than 510 characters lose their tail.
# The limit is 512 - 2 because [CLS] and [SEP] still have to be added.
def remove_510(text):
    """Return ``text`` limited to its first 510 characters (unchanged when shorter)."""
    return text[:510]

In [7]:
# Overwrite the text column with the version truncated by remove_510
# (at most 510 characters per article), then preview two rows.
df_news["text"] = df_news["text"].apply(remove_510)
df_news.head(2)

Unnamed: 0,id,label,text
0,1,0,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...
1,2,0,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...


In [8]:
# Load the pre-trained 'bert-base-chinese' tokenizer, which converts text into
# tokens that correspond to BERT's vocabulary. Every helper below reads this
# notebook-level `tokenizer`.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

![avatar](https://github.com/roccqqck/news_bert/raw/master/bert_input_encoding.jpg)

https://github.com/roccqqck/news_bert/raw/master/bert_input_encoding.jpg

bert input features 有3個

input_ids: 代表識別每個 token 的索引值，用 tokenizer 轉換即可

token_type_ids: 用來識別句子界限。第一句為 0，第二句則為 1。另外注意句子間的 [SEP] 為 0     (optional) 輸入有1句非必要 輸入有2句則必要

attention_mask: 用來界定自注意力機制範圍。1 讓 BERT 關注該位置，0 則代表是 padding 不需關注
(optional)

https://huggingface.co/transformers/model_doc/bert.html#tfbertforsequenceclassification

In [9]:
# Tokenize a sample sentence
text = "很好看的動作片，不會浪費錢跟時間。很久沒有這樣的探險片。可說是女版的印第安那瓊。"
tokens = tokenizer.tokenize(text)      # split the text into a list of tokens
print(type(tokens))                 # list
np.array(tokens)                    # convert to numpy (shown as the cell output)

<class 'list'>


array(['很', '好', '看', '的', '動', '作', '片', '，', '不', '會', '浪', '費', '錢',
       '跟', '時', '間', '。', '很', '久', '沒', '有', '這', '樣', '的', '探', '險',
       '片', '。', '可', '說', '是', '女', '版', '的', '印', '第', '安', '那', '瓊',
       '。'], dtype='<U1')

In [10]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)   # map every token to its vocabulary id
print(type(input_ids))                         # list
print(len(input_ids))
np.array(input_ids)

<class 'list'>
40


array([2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298, 3857,
       6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300, 6857,
       3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957, 4276,
       4638, 1313, 5018, 2128, 6929, 4475,  511])

In [11]:
token_type_ids = tokenizer.create_token_type_ids_from_sequences(input_ids) # the ids passed in must NOT yet contain [CLS]/[SEP]
print(type(token_type_ids))                                # list
print(len(token_type_ids)) 
np.array(token_type_ids)

<class 'list'>
42


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
# NOTE: this cell overwrites input_ids, so re-running it wraps the ids in [CLS]/[SEP] again.
input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # add the [CLS] and [SEP] ids around the sentence
print(type(input_ids))
print(len(input_ids))
np.array(input_ids)

<class 'list'>
42


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300,
       6857, 3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957,
       4276, 4638, 1313, 5018, 2128, 6929, 4475,  511,  102])

In [13]:
# Number of padding positions needed to reach BERT's 512-token input length.
n = 512 - len(input_ids)
input_ids2 = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0))  
# appends n zeros on the right so the array is padded to length 512
print(len(input_ids2))
input_ids2

512


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300,
       6857, 3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957,
       4276, 4638, 1313, 5018, 2128, 6929, 4475,  511,  102,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [14]:
# When the input consists of two sentences: tokenize the first sentence

text = "很好看的動作片"
tokens = tokenizer.tokenize(text)      # split the text into a list of tokens
print(type(tokens))                 # list
np.array(tokens)                    # convert to numpy

<class 'list'>


array(['很', '好', '看', '的', '動', '作', '片'], dtype='<U1')

In [15]:
# Tokenize the second sentence of the pair
text2 = "不會浪費錢跟時間"
tokens2 = tokenizer.tokenize(text2)      # split the text into a list of tokens
print(type(tokens2))                 # list
np.array(tokens2)                    # convert to numpy

<class 'list'>


array(['不', '會', '浪', '費', '錢', '跟', '時', '間'], dtype='<U1')

In [16]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)   # map every token of sentence 1 to its id
print(type(input_ids))                         # list
print(len(input_ids))
np.array(input_ids)

<class 'list'>
7


array([2523, 1962, 4692, 4638, 1240,  868, 4275])

In [17]:
input_ids2 = tokenizer.convert_tokens_to_ids(tokens2)   # map every token of sentence 2 to its id
print(type(input_ids2))                         # list
print(len(input_ids2))
np.array(input_ids2)

<class 'list'>
8


array([ 679, 3298, 3857, 6527, 7092, 6656, 3229, 7279])

In [18]:
token_type_ids = tokenizer.create_token_type_ids_from_sequences(input_ids, input_ids2) # both id lists must NOT yet contain [CLS]/[SEP]
print(type(token_type_ids))                                # list
print(len(token_type_ids)) 
np.array(token_type_ids)

<class 'list'>
18


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [19]:
input_ids3 = tokenizer.build_inputs_with_special_tokens(input_ids, input_ids2)    # add the [CLS]/[SEP] ids around and between the two sentences
print(type(input_ids3))
print(len(input_ids3))
np.array(input_ids3)

<class 'list'>
18


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275,  102,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  102])

numpy.pad   補0到某長度

https://docs.scipy.org/doc/numpy/reference/generated/numpy.pad.html 

也可以使用

```from keras.preprocessing.sequence import pad_sequences```


In [20]:
def input_ids_all(text, max_len=512):
    """Encode ``text`` as a fixed-length vector of BERT input ids.

    The text is tokenized with the notebook-level ``tokenizer``, wrapped in the
    [CLS]/[SEP] special tokens and right-padded with 0 up to ``max_len``.

    Parameters
    ----------
    text : str
        Text to encode.
    max_len : int, default 512
        Length of the returned vector. Token sequences that would not fit are
        truncated so the result is never longer than ``max_len``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``max_len``.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2 (no room for [CLS] and [SEP]).
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    # Keep two positions free for the special tokens added on the next line.
    token_ids = token_ids[:max_len - 2]
    input_ids = np.array(tokenizer.build_inputs_with_special_tokens(token_ids))
    # Append zeros on the right until the vector is exactly max_len long.
    return np.pad(input_ids, (0, max_len - len(input_ids)), mode='constant', constant_values=0)

In [21]:
# Sample sentence reused by the helper demonstrations in the following cells.
text = "很好看的動作片，不會浪費錢跟時間。很久沒有這樣的探險片。可說是女版的印第安那瓊。"
input_ids_all(text)

array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300,
       6857, 3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957,
       4276, 4638, 1313, 5018, 2128, 6929, 4475,  511,  102,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [22]:
def attention_mask_all(text, max_len=512):
    """Build the fixed-length attention mask for ``text``.

    The mask holds 1 for every real position ([CLS], the tokens, [SEP]) and 0
    for every padding position, so BERT ignores the padding.

    Parameters
    ----------
    text : str
        Text to encode.
    max_len : int, default 512
        Length of the returned mask. Token sequences that would not fit are
        truncated so the result is never longer than ``max_len``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``max_len``.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2 (no room for [CLS] and [SEP]).
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    # Keep two positions free for the special tokens.
    token_ids = token_ids[:max_len - 2]
    n_real = len(tokenizer.build_inputs_with_special_tokens(token_ids))
    # Ones for the real positions, then zeros on the right up to max_len.
    attention_mask = np.ones(n_real, dtype=int)
    return np.pad(attention_mask, (0, max_len - n_real), mode='constant', constant_values=0)

In [23]:
attention_mask_all(text)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [24]:
# For a single-sentence input every segment id is 0, so this step could be skipped.

def token_type_ids_all(text, max_len=512):
    """Build the fixed-length token-type (segment) ids for a single ``text``.

    Parameters
    ----------
    text : str
        Text to encode.
    max_len : int, default 512
        Length of the returned vector. Token sequences that would not fit are
        truncated so the result is never longer than ``max_len``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``max_len``.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2 (no room for [CLS] and [SEP]).
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    # The tokenizer expects ids WITHOUT [CLS]/[SEP]; the list it returns is two
    # entries longer than its input (see the demo cell earlier in the notebook),
    # so two positions are kept free here.
    token_ids = token_ids[:max_len - 2]
    token_type_ids = np.array(tokenizer.create_token_type_ids_from_sequences(token_ids))
    # Append zeros on the right until the vector is exactly max_len long.
    return np.pad(token_type_ids, (0, max_len - len(token_type_ids)), mode='constant', constant_values=0)

In [25]:
token_type_ids_all(text)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

最後我決定用pandas的apply 比較好視覺化理解

In [26]:
# Encode every article with the three helpers defined above; each new column
# stores one numpy array per row.
# Earlier experiments, kept for reference:
# df['text2'] = df['text']].apply(lambda x: " ".join(jieba.cut(x)))
# df_news['tokens'] = df_news['text'].apply(lambda x:  tokenizer.tokenize(x) )
df_news['input_ids'] = df_news['text'].apply(input_ids_all)
df_news['attention_mask'] = df_news['text'].apply(attention_mask_all)
df_news['token_type_ids'] = df_news['text'].apply(token_type_ids_all)
df_news.head(2)

Unnamed: 0,id,label,text,input_ids,attention_mask,token_type_ids
0,1,0,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...,"[101, 6512, 3124, 6956, 1041, 1146, 2203, 7028...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,0,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...,"[101, 5635, 676, 6640, 809, 677, 8024, 519, 17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [27]:
# df_news['input_ids'].to_numpy()   # 提出來 竟然不是2d numpy 不能這樣做

In [28]:
print(len(df_news['input_ids'][0]))
df_news['input_ids'][0] 

512


array([ 101, 6512, 3124, 6956, 1041, 1146, 2203, 7028, 7065, 6121, 1757,
       4825,  924, 1002, 3609, 4638, 2553, 6206,  868, 4158, 8024, 7521,
       6243, 1039, 3190, 2527, 2218, 3298, 6313, 3791, 7368, 2179, 3177,
        969, 2807, 2852, 8024, 1963,  676, 7028, 1759, 1765, 7518, 1164,
       2864, 6546, 8024, 7515, 4372, 6917, 3621, 2200,  679, 2512, 7513,
       1456, 1059,  127, 1283, 1399, 1519, 2339, 4638, 4495, 6243,  511,
       1042, 6493, 7065, 6121, 6134, 4850, 8024, 7515, 4372, 7274, 4634,
       5474, 6526, 2234, 3309, 6889, 5147, 3428, 8024, 5195, 5474, 6526,
       7065, 6121, 1757, 3748, 6359, 4634, 1139,  998, 1440, 2100, 6349,
        928, 1141, 8024, 6313,  102,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [29]:
print(len(df_news['token_type_ids'][0]))
df_news['token_type_ids'][0]

512


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [30]:
print(len(df_news['attention_mask'][0]))
df_news['attention_mask'][0]

512


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [31]:
# # df_news['input_ids'].to_numpy() 出來不是一個2d numpy 
# # 只好用for loop一個一個拿出來合併
# # 用np vstack超級慢 不知道為何  改用最外層是list append

# input_ids = np.zeros((1, 512)).astype(int) #宣吿一個都是0的1*512 numpy # np.zeros預設是float 改成int 不然bert餵不進去
# for index, row in df_news.iterrows():  
#     element = df_news.loc[index,'input_ids']
#     input_ids = np.vstack((input_ids, np.array([element])))   # 2維 合併

# input_ids = np.delete(input_ids, 0, 0)              # 刪掉一開始都是0的那一個宣告  
# input_ids

https://www.quora.com/Is-it-better-to-use-np-append-or-list-append

In [32]:
# df_news['input_ids'].to_numpy() does not produce a 2-D array (each cell holds
# its own 1-D array), so the per-row vectors are stacked explicitly.
# A single np.stack call replaces the row-by-row iterrows() loop and avoids the
# repeated np.vstack that was found to be very slow. It also raises an error if
# the rows do not all have the same length.
input_ids = np.stack(df_news['input_ids'].tolist())   # 2-D array, one row per article
input_ids

array([[ 101, 6512, 3124, ...,    0,    0,    0],
       [ 101, 5635,  676, ...,    0,    0,    0],
       [ 101, 3947, 1841, ...,    0,    0,    0],
       ...,
       [ 101,  127, 1023, ...,    0,    0,    0],
       [ 101, 3255, 2716, ...,    0,    0,    0],
       [ 101, 3229, 3149, ...,    0,    0,    0]])

In [33]:
input_ids.shape

(16718, 512)

In [34]:
# torch.tensor(input_ids)    # numpy 轉 torch tensor

In [35]:
tf.convert_to_tensor(input_ids)   # numpy 轉 tf tensor

<tf.Tensor: id=2, shape=(16718, 512), dtype=int64, numpy=
array([[ 101, 6512, 3124, ...,    0,    0,    0],
       [ 101, 5635,  676, ...,    0,    0,    0],
       [ 101, 3947, 1841, ...,    0,    0,    0],
       ...,
       [ 101,  127, 1023, ...,    0,    0,    0],
       [ 101, 3255, 2716, ...,    0,    0,    0],
       [ 101, 3229, 3149, ...,    0,    0,    0]])>

In [36]:
# Stack the per-row attention masks into one 2-D array (one row per article).
# np.stack replaces the row-by-row iterrows() loop and raises an error if the
# rows do not all have the same length.
attention_mask = np.stack(df_news['attention_mask'].tolist())
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [37]:
attention_mask.shape

(16718, 512)

In [38]:
# Stack the per-row token-type ids into one 2-D array (one row per article).
# np.stack replaces the row-by-row iterrows() loop and raises an error if the
# rows do not all have the same length.
token_type_ids = np.stack(df_news['token_type_ids'].tolist())
token_type_ids

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [39]:
token_type_ids.shape

(16718, 512)

In [40]:
# Class labels as a 1-D numpy array, in the same row order as the feature arrays.
label = df_news['label'].to_numpy()
label

array([0, 0, 1, ..., 3, 3, 3])

In [41]:
# Split input_ids, attention_mask, token_type_ids and label into training and
# validation data.

from sklearn.model_selection import train_test_split

# One call splits all four arrays with the same row selection, so the features
# and labels stay aligned without relying on a repeated random_state.
# stratify=label keeps the class proportions equal in both halves.
(train_input_ids, validation_input_ids,
 train_attention_mask, validation_attention_mask,
 train_token_type_ids, validation_token_type_ids,
 train_label, validation_label) = train_test_split(
    input_ids, attention_mask, token_type_ids, label,
    random_state=2018, test_size=0.5, stratify=label)

In [42]:
# # input 可用 numpy 或 tf tensor 下面是numpy轉tf tensor

# train_input_ids = tf.convert_to_tensor(train_input_ids)
# validation_input_ids = tf.convert_to_tensor(validation_input_ids)
# train_label = tf.convert_to_tensor(train_label)
# validation_label = tf.convert_to_tensor(validation_label)
# train_attention_mask = tf.convert_to_tensor(train_attention_mask)
# validation_attention_mask = tf.convert_to_tensor(validation_attention_mask)
# train_token_type_ids = tf.convert_to_tensor(train_token_type_ids)
# validation_token_type_ids = tf.convert_to_tensor(validation_token_type_ids)

In [43]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, Dropout, Activation, Flatten, InputLayer

In [44]:
# Prepare training: build a tf.keras model on top of BERT and compile it with
# optimizer, loss and metric.
# Ready-made alternative from the library:
# model = TFBertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=5)
# The model below serves the same purpose.
# NOTE(review): it is probably not identical - the library model is believed to
# classify from the pooled output and return logits; confirm in the transformers docs.

input_layer = Input(shape=(512,), dtype='int64')   # Keras inputs default to float; token ids are int64
bert = TFBertModel.from_pretrained('bert-base-chinese')(input_layer)
bert = bert[0]     # keep the first output, shape (None, 512, 768) in the summary; workaround for a bug, may become unnecessary once it is fixed
dropout = Dropout(0.1)(bert)
flat = Flatten()(dropout)
classifier = Dense(units=5, activation="softmax")(flat)   # 5 classes; softmax instead of sigmoid, so the model outputs probabilities
model = Model(inputs=input_layer, outputs=classifier)
model.summary()

# With softmax only a learning rate of 5e-6 worked; with sigmoid 3e-5 is usual.
# NOTE(review): that observation was made with the double softmax described
# below, so the learning rate may be worth re-tuning.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6, epsilon=1e-08, clipnorm=1.0)
# The Dense layer already applies softmax, so the loss must treat the model
# output as probabilities. from_logits=True would apply softmax a second time
# and flatten the gradients.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 512)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  ((None, 512, 768), (None, 102267648 
_________________________________________________________________
dropout_37 (Dropout)         (None, 512, 768)          0         
_________________________________________________________________
flatten (Flatten)            (None, 393216)            0         
_________________________________________________________________
dense (Dense)                (None, 5)                 1966085   
Total params: 104,233,733
Trainable params: 104,233,733
Non-trainable params: 0
_________________________________________________________________


In [57]:
%%time
# Train and evaluate using tf.keras.Model.fit()  # batch size 8 already raises an error: the GPU runs out of memory
model_fit = model.fit(train_input_ids, train_label, 
                      batch_size=4, epochs=1, 
                      validation_data=(validation_input_ids, validation_label)
#                      steps_per_epoch=115,
#                      validation_steps=7)
                   )

https://huggingface.co/transformers/model_doc/bert.html#tfbertforsequenceclassification

attention_mask 或 token_type_ids 不一定要放輸入 可選擇

要放的話要加```[ ]```

```model.fit([train_input_ids, train_attention_mask, train_token_type_ids], train_label)```

就是```model.fit(X_train, Y_train)```

```[train_input_ids, train_attention_mask, train_token_type_ids]``` 就是 ```X_train```

```train_label``` 就是 ```Y_train```

In [46]:
# model_fit = model.fit([train_input_ids, train_attention_mask, train_token_type_ids], train_label, 
#                        batch_size=4, epochs=1, 
#                     validation_data=([validation_input_ids, validation_attention_mask, validation_token_type_ids], validation_label)
#                    )

In [47]:
# # 如果train到一半 想要重新train 在jupyter裡面interrupt kernel
# # 這時候 model還是存在在記憶體裡面 只是train到一半 要重新train要釋放model的記憶體
# del model

In [48]:
# model.evaluate(validation_input_ids, validation_label, verbose=1)

https://blog.csdn.net/zds13257177985/article/details/80638384

```predictions = model.predict(test)```預測的是數值，而且輸出的是 n*5 的 array（每一列是 5 個類別的機率）

要經過```predictions = np.argmax(predictions, axis=1)```才是類別

In [58]:
%%time
predictions = model.predict(validation_input_ids)   # output is an n*5 array: one row per sample, one column per class
print(predictions.shape)
predictions

In [59]:
# NOTE: this overwrites `predictions`; re-running the cell without re-running the
# predict cell fails because the array is then 1-D.
predictions = np.argmax(predictions, axis=1)         # axis=1: index of the largest value within each row, i.e. one class id per sample
predictions

In [None]:
# Evaluate the class predictions against the validation labels:
# overall accuracy, confusion matrix and per-class report.
from sklearn.metrics import accuracy_score
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score
# from sklearn.metrics import f1_score
# from sklearn.metrics import cohen_kappa_score
# from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(accuracy_score(validation_label, predictions))
# print(precision_score(validation_label, predictions))
# print(recall_score(validation_label, predictions))
# print(f1_score(validation_label, predictions))
print(confusion_matrix(validation_label, predictions))
print(classification_report(validation_label, predictions))

In [52]:
# Save the entire model, including the optimizer state.
# The path has no '.h5' extension, so the model is written as a directory
# (see the "Assets written to: model/model_bert_eland_softmax_2/assets" log
# below) rather than as a single HDF5 file.

tf.keras.models.save_model(
    model,
    "model/model_bert_eland_softmax_2",
    overwrite=True,
    include_optimizer=True,
) 

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: model/model_bert_eland_softmax_2/assets


In [56]:
# # Save the entire model to a HDF5 file.
# # The '.h5' extension indicates that the model shuold be saved to HDF5.
# # model 存起來

# model.save('model/my_model.h5') # 失敗

In [55]:
# Recreate the exact same model, including its weights and the optimizer:
# load the saved model and wrap it behind a fresh Input layer.
# NOTE(review): this loads 'model/model_bert_eland_softmax_1.model', while the save
# cell in this notebook writes 'model/model_bert_eland_softmax_2' - confirm which
# saved model is intended.
input_layer = Input(shape = (512,), dtype='int64')  # Keras inputs default to float; token ids are int64
load_model = tf.keras.models.load_model('model/model_bert_eland_softmax_1.model')(input_layer)
new_model = Model(inputs=input_layer, outputs=load_model)

# Show the model architecture
new_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 512)]             0         
_________________________________________________________________
model (Model)                (None, 5)                 104233733 
Total params: 104,233,733
Trainable params: 104,233,733
Non-trainable params: 0
_________________________________________________________________


In [None]:
%%time
predictions = new_model.predict(validation_input_ids)   # output is an n*5 array: one row per sample, one column per class
print(predictions.shape)
predictions

In [None]:
# NOTE: this overwrites `predictions`; re-running the cell without re-running the
# predict cell fails because the array is then 1-D.
predictions = np.argmax(predictions, axis=1)         # axis=1: index of the largest value within each row, i.e. one class id per sample
predictions

In [None]:
# Evaluate the reloaded model's class predictions against the validation labels:
# overall accuracy, confusion matrix and per-class report.
from sklearn.metrics import accuracy_score
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score
# from sklearn.metrics import f1_score
# from sklearn.metrics import cohen_kappa_score
# from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(accuracy_score(validation_label, predictions))
# print(precision_score(validation_label, predictions))
# print(recall_score(validation_label, predictions))
# print(f1_score(validation_label, predictions))
print(confusion_matrix(validation_label, predictions))
print(classification_report(validation_label, predictions))