In [1]:
import pandas as pd
import numpy as np
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(torch.__version__) 
print(device)
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

import tensorflow as tf
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, TFBertForSequenceClassification, AdamW

1.3.0
cuda
1
GeForce RTX 2080 Ti


In [2]:
# 讀取tsv # \t tab做區隔
df_news = pd.read_csv("https://github.com/roccqqck/news_bert/raw/master/data/2015_Company.tsv", sep="\t", encoding="utf-8")
df_news['text'] = df_news['text'].astype(str)
df_news

Unnamed: 0,id,label,text
0,0,0,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...
1,1,0,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...
2,2,1,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...
3,3,1,大哥大，透過4G漫遊方式共網爭議，戰火延燒，威寶電信（台灣之星）及中華電信、遠傳電信等業者，...
4,4,1,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...
...,...,...,...
15887,15887,2,"感覺華碩客服有點推託,所謂的個資法是企業有義務妥善保管蒐集來的個人資料不 感覺華碩客服有點推..."
15888,15888,3,上市公司玉晶光（3406）今天傳出裁員，對此，公司發言人趙志強表示，並非裁員而 上市公司玉晶...
15889,15889,3,繼台企銀（2834）工會因合併問題而罷工，創金融史首例後，大眾銀行（2847）工 繼台企銀（...
15890,15890,3,6億元賣給英業達，獲利21億元，預計明年首季入帳，活化資產。外界認為這將有機會帶動宏達電明年...


In [3]:
# Load pre-trained model tokenizer, to convert our text into tokens that correspond to BERT’s vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

![avatar](https://github.com/roccqqck/news_bert/raw/master/bert_input_encoding.jpg)

https://github.com/roccqqck/news_bert/raw/master/bert_input_encoding.jpg

bert input features 有3個

tokens_tensor (input_ids)：代表識別每個 token 的索引值，用 tokenizer 轉換即可

segments_tensor (token_type_ids)：用來識別句子界限。第一句為 0，第二句則為 1。另外注意句子間的 [SEP] 為 0     (optional) 輸入有1句非必要 輸入有2句則必要

masks_tensor (attention_mask)：用來界定自注意力機制範圍。1 讓 BERT 關注該位置，0 則代表是 padding 不需關注
(optional)

https://huggingface.co/transformers/model_doc/bert.html#tfbertforsequenceclassification

In [4]:
# Tokenize input
text = "很好看的動作片，不會浪費錢跟時間。很久沒有這樣的探險片。可說是女版的印第安那瓊。"
tokens = tokenizer.tokenize(text)      # 每個字切詞成一個list
print(type(tokens))                 # list
np.array(tokens)                    # 轉成numpy

<class 'list'>


array(['很', '好', '看', '的', '動', '作', '片', '，', '不', '會', '浪', '費', '錢',
       '跟', '時', '間', '。', '很', '久', '沒', '有', '這', '樣', '的', '探', '險',
       '片', '。', '可', '說', '是', '女', '版', '的', '印', '第', '安', '那', '瓊',
       '。'], dtype='<U1')

In [5]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)   # 每個字轉成id
print(type(input_ids))                         # list
print(len(input_ids))
np.array(input_ids)

<class 'list'>
40


array([2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298, 3857,
       6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300, 6857,
       3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957, 4276,
       4638, 1313, 5018, 2128, 6929, 4475,  511])

In [6]:
token_type_ids = tokenizer.create_token_type_ids_from_sequences(input_ids) # token_type_ids 必須input還沒加CLS SEP
print(type(token_type_ids))                                # list
print(len(token_type_ids)) 
np.array(token_type_ids)

<class 'list'>
42


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [7]:
input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # 句子前後加上 CLS SEP 的 id
print(type(input_ids))
print(len(input_ids))
np.array(input_ids)

<class 'list'>
42


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300,
       6857, 3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957,
       4276, 4638, 1313, 5018, 2128, 6929, 4475,  511,  102])

In [8]:
n = 512 - len(input_ids)
input_ids2 = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0))  
# array右邊append n 個 0  補長度到512
print(len(input_ids2))
input_ids2

512


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300,
       6857, 3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957,
       4276, 4638, 1313, 5018, 2128, 6929, 4475,  511,  102,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [9]:
# input如果是兩個句子

text = "很好看的動作片"
tokens = tokenizer.tokenize(text)      # 每個字切詞成一個list
print(type(tokens))                 # list
np.array(tokens)                    # 轉成numpy

<class 'list'>


array(['很', '好', '看', '的', '動', '作', '片'], dtype='<U1')

In [10]:
text2 = "不會浪費錢跟時間"
tokens2 = tokenizer.tokenize(text2)      # 每個字切詞成一個list
print(type(tokens2))                 # list
np.array(tokens2)                    # 轉成numpy

<class 'list'>


array(['不', '會', '浪', '費', '錢', '跟', '時', '間'], dtype='<U1')

In [11]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)   # 每個字轉成id
print(type(input_ids))                         # list
print(len(input_ids))
np.array(input_ids)

<class 'list'>
7


array([2523, 1962, 4692, 4638, 1240,  868, 4275])

In [12]:
input_ids2 = tokenizer.convert_tokens_to_ids(tokens2)   # 每個字轉成id
print(type(input_ids2))                         # list
print(len(input_ids2))
np.array(input_ids2)

<class 'list'>
8


array([ 679, 3298, 3857, 6527, 7092, 6656, 3229, 7279])

In [13]:
token_type_ids = tokenizer.create_token_type_ids_from_sequences(input_ids, input_ids2) # token_type_ids 必須input還沒加CLS SEP
print(type(token_type_ids))                                # list
print(len(token_type_ids)) 
np.array(token_type_ids)

<class 'list'>
18


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [14]:
input_ids3 = tokenizer.build_inputs_with_special_tokens(input_ids, input_ids2)    # 句子前後加上 CLS SEP 的 id
print(type(input_ids3))
print(len(input_ids3))
np.array(input_ids3)

<class 'list'>
18


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275,  102,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  102])

numpy.pad

https://docs.scipy.org/doc/numpy/reference/generated/numpy.pad.html 

也可以使用

```from keras.preprocessing.sequence import pad_sequences```


In [15]:
def input_ids_all(text):
#    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    tokens = tokenizer.tokenize(text)        # 每個字切詞成一個list
    input_ids = tokenizer.convert_tokens_to_ids(tokens)  # 每個字轉成id
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # 句子前後加上 CLS SEP 的 id
    input_ids = np.array(input_ids)          # list 轉 numpy
    if len(input_ids) < 512:
        n = 512 - len(input_ids)
        input_ids = np.pad(input_ids, (0, n), mode ='constant', constant_values=(0))  
        # array右邊append n 個 0  補長度到512
    return input_ids

In [16]:
def attention_mask_all(text):
    tokens = tokenizer.tokenize(text)       # 每個字切詞成一個list
    input_ids = tokenizer.convert_tokens_to_ids(tokens)  # 每個字轉成id
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)    # 句子前後加上 CLS SEP 的 id
    input_ids = np.array(input_ids)          # list 轉 numpy
    attention_mask = np.array([1,1])
    attention_mask = np.pad(attention_mask, (0, len(input_ids)-2 ), mode ='constant', constant_values=(1)) 
    # array右邊append 1 到跟segment一樣長
    if len(attention_mask) < 512:
        n = 512 - len(attention_mask)
        attention_mask = np.pad(attention_mask, (0, n), mode ='constant', constant_values=(0))  # array右邊append n 個 0  補長度到512
    return attention_mask

In [17]:
def token_type_ids_all(text):
    tokens = tokenizer.tokenize(text)       # 每個字切詞成一個list
    input_ids = tokenizer.convert_tokens_to_ids(tokens)  # 每個字轉成id
    input_ids = np.array(input_ids)          # list 轉 numpy
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(input_ids)   # token_type_ids 必須input還沒加CLS SEP
    token_type_ids = np.array(token_type_ids)              # list 轉numpy
    if len(token_type_ids) < 512:
        n = 512 - len(token_type_ids)
        token_type_ids = np.pad(token_type_ids, (0, n), mode ='constant', constant_values=(0))  
        # array右邊append n 個 0  補長度到512    
    return token_type_ids

In [18]:
# df['text2'] = df['text']].apply(lambda x: " ".join(jieba.cut(x)))
# df_news['tokens'] = df_news['text'].apply(lambda x:  tokenizer.tokenize(x) )
df_news['input_ids'] = df_news['text'].apply(input_ids_all)
df_news['attention_mask'] = df_news['text'].apply(attention_mask_all)
df_news['token_type_ids'] = df_news['text'].apply(token_type_ids_all)
df_news.head(2)

Unnamed: 0,id,label,text,input_ids,attention_mask,token_type_ids
0,0,0,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...,"[101, 6512, 3124, 6956, 1041, 1146, 2203, 7028...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,0,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...,"[101, 5635, 676, 6640, 809, 677, 8024, 519, 17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [19]:
print(len(df_news['input_ids'][0]))
df_news['input_ids'][0] 

512


array([ 101, 6512, 3124, 6956, 1041, 1146, 2203, 7028, 7065, 6121, 1757,
       4825,  924, 1002, 3609, 4638, 2553, 6206,  868, 4158, 8024, 7521,
       6243, 1039, 3190, 2527, 2218, 3298, 6313, 3791, 7368, 2179, 3177,
        969, 2807, 2852, 8024, 1963,  676, 7028, 1759, 1765, 7518, 1164,
       2864, 6546, 8024, 7515, 4372, 6917, 3621, 2200,  679, 2512, 7513,
       1456, 1059,  127, 1283, 1399, 1519, 2339, 4638, 4495, 6243,  511,
       1042, 6493, 7065, 6121, 6134, 4850, 8024, 7515, 4372, 7274, 4634,
       5474, 6526, 2234, 3309, 6889, 5147, 3428, 8024, 5195, 5474, 6526,
       7065, 6121, 1757, 3748, 6359, 4634, 1139,  998, 1440, 2100, 6349,
        928, 1141, 8024, 6313,  102,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [20]:
print(len(df_news['token_type_ids'][0]))
df_news['token_type_ids'][0]

512


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [21]:
print(len(df_news['attention_mask'][0]))
df_news['attention_mask'][0]

512


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [22]:
# 2個column numpy相加
def add_2_column(input_ids, attention_mask):
    add = np.array(input_ids) + np.array(attention_mask)
    return add

In [23]:
# pandas apply 用2個column # 2個column numpy相加
df_news['add'] = df_news.apply(lambda row: add_2_column(row['input_ids'], row['attention_mask']), axis=1)   
df_news.head(2)

Unnamed: 0,id,label,text,input_ids,attention_mask,token_type_ids,add
0,0,0,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...,"[101, 6512, 3124, 6956, 1041, 1146, 2203, 7028...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[102, 6513, 3125, 6957, 1042, 1147, 2204, 7029..."
1,1,0,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...,"[101, 5635, 676, 6640, 809, 677, 8024, 519, 17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[102, 5636, 677, 6641, 810, 678, 8025, 520, 17..."


In [24]:
# input是2個句子才真的需要的token_type_ids

def token_type_ids_all(text, text2):
    tokens = tokenizer.tokenize(text)       # 每個字切詞成一個list
    tokens2 = tokenizer.tokenize(text2)       # 每個字切詞成一個list
    input_ids = tokenizer.convert_tokens_to_ids(tokens)  # 每個字轉成id
    input_ids2 = tokenizer.convert_tokens_to_ids(tokens2)  # 每個字轉成id
    input_ids = np.array(input_ids)          # list 轉 numpy
    input_ids2 = np.array(input_ids2)          # list 轉 numpy
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(input_ids, input_ids2)   # token_type_ids 必須input還沒加CLS SEP
    token_type_ids = np.array(token_type_ids)              # list 轉numpy
    if len(token_type_ids) < 512:
        n = 512 - len(token_type_ids)
        token_type_ids = np.pad(token_type_ids, (0, n), mode ='constant', constant_values=(0))  
        # array右邊append n 個 0  補長度到512    
    return token_type_ids

In [25]:
# pandas apply 用2個column 做token_type_ids

df_news['text2'] = df_news['text']
df_news['token_type_ids2'] = df_news.apply(lambda row: token_type_ids_all(row['text'], row['text2']), axis=1)   
df_news.head(2)

Unnamed: 0,id,label,text,input_ids,attention_mask,token_type_ids,add,text2,token_type_ids2
0,0,0,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...,"[101, 6512, 3124, 6956, 1041, 1146, 2203, 7028...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[102, 6513, 3125, 6957, 1042, 1147, 2204, 7029...",財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,0,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...,"[101, 5635, 676, 6640, 809, 677, 8024, 519, 17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[102, 5636, 677, 6641, 810, 678, 8025, 520, 17...",至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [26]:
df_news['token_type_ids2'][0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [27]:
df_news = df_news.drop(columns=["add", "text2", "token_type_ids2" ])

In [28]:
# df_news['input_ids'].to_numpy() 出來不是一個2d numpy 
# 只好用for loop一個一個拿出來合併

input_ids = np.zeros((1, 512)).astype(int) #宣吿一個都是0的1*512 numpy # np.zeros預設是float 改成int 不然bert餵不進去
for index, row in df_news.iterrows():  
    element = df_news.loc[index,'input_ids']
    input_ids = np.vstack((input_ids, np.array([element])))   # 2維 合併

input_ids = np.delete(input_ids, 0, 0)              # 刪掉一開始都是0的那一個宣告  
input_ids

array([[ 101, 6512, 3124, ...,    0,    0,    0],
       [ 101, 5635,  676, ...,    0,    0,    0],
       [ 101, 3947, 1841, ...,    0,    0,    0],
       ...,
       [ 101, 5262, 1378, ...,    0,    0,    0],
       [ 101,  127, 1023, ...,    0,    0,    0],
       [ 101, 3229, 3149, ...,    0,    0,    0]])

In [29]:
input_ids.shape

(15892, 512)

In [30]:
torch.tensor(input_ids)    # numpy 轉 torch tensor

tensor([[ 101, 6512, 3124,  ...,    0,    0,    0],
        [ 101, 5635,  676,  ...,    0,    0,    0],
        [ 101, 3947, 1841,  ...,    0,    0,    0],
        ...,
        [ 101, 5262, 1378,  ...,    0,    0,    0],
        [ 101,  127, 1023,  ...,    0,    0,    0],
        [ 101, 3229, 3149,  ...,    0,    0,    0]])

In [31]:
tf.convert_to_tensor(input_ids)   # numpy 轉 tf tensor

<tf.Tensor: id=2, shape=(15892, 512), dtype=int64, numpy=
array([[ 101, 6512, 3124, ...,    0,    0,    0],
       [ 101, 5635,  676, ...,    0,    0,    0],
       [ 101, 3947, 1841, ...,    0,    0,    0],
       ...,
       [ 101, 5262, 1378, ...,    0,    0,    0],
       [ 101,  127, 1023, ...,    0,    0,    0],
       [ 101, 3229, 3149, ...,    0,    0,    0]])>

In [32]:
attention_mask = np.zeros((1, 512)).astype(int) #宣吿一個都是0的1*512 numpy # np.zeros預設是float 改成int 不然bert餵不進去
for index, row in df_news.iterrows():  
    element = df_news.loc[index,'attention_mask']
    attention_mask = np.vstack((attention_mask, np.array([element])))   # 2維 合併

attention_mask = np.delete(attention_mask, 0, 0)              # 刪掉一開始都是0的那一個宣告  
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [33]:
attention_mask.shape

(15892, 512)

In [34]:
token_type_ids = np.zeros((1, 512)).astype(int) #宣吿一個都是0的1*512 numpy # np.zeros預設是float 改成int 不然bert餵不進去
for index, row in df_news.iterrows():  
    element = df_news.loc[index,'token_type_ids']
    token_type_ids = np.vstack((token_type_ids, np.array([element])))   # 2維 合併

token_type_ids = np.delete(token_type_ids, 0, 0)              # 刪掉一開始都是0的那一個宣告  
token_type_ids

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [35]:
token_type_ids.shape

(15892, 512)

In [36]:
label = df_news['label'].to_numpy()
label

array([0, 0, 1, ..., 3, 3, 3])

In [37]:
# 把2個input_ids, attention_mask , token_type_ids 還有label 切成training data, validation data

from sklearn.model_selection import train_test_split
# Use train_test_split to split our data into train and validation sets for training

train_input_ids, validation_input_ids, train_label, validation_label = train_test_split(input_ids, label, 
                                                            random_state=2018, test_size=0.5)
train_attention_mask, validation_attention_mask, _, _ = train_test_split(attention_mask, input_ids,
                                             random_state=2018, test_size=0.5)
train_token_type_ids, validation_token_type_ids, _, _ = train_test_split(token_type_ids, input_ids,
                                             random_state=2018, test_size=0.5)

In [38]:
# # input 可用 numpy 或 tf tensor 下面是numpy轉tf tensor

# train_input_ids = tf.convert_to_tensor(train_input_ids)
# validation_input_ids = tf.convert_to_tensor(validation_input_ids)
# train_label = tf.convert_to_tensor(train_label)
# validation_label = tf.convert_to_tensor(validation_label)
# train_attention_mask = tf.convert_to_tensor(train_attention_mask)
# validation_attention_mask = tf.convert_to_tensor(validation_attention_mask)
# train_token_type_ids = tf.convert_to_tensor(train_token_type_ids)
# validation_token_type_ids = tf.convert_to_tensor(validation_token_type_ids)

In [39]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
# num_labels=5 分5類
model = TFBertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=5)
model.summary()


optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  102267648 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  3845      
Total params: 102,271,493
Trainable params: 102,271,493
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Train and evaluate using tf.keras.Model.fit()
model_fit = model.fit(train_input_ids, train_label, batch_size=4, epochs=1, 
                    validation_data=(validation_input_ids, validation_label)
#                    steps_per_epoch=115,
#                    validation_steps=7)
                   )

Train on 7946 samples, validate on 7946 samples


https://huggingface.co/transformers/model_doc/bert.html#tfbertforsequenceclassification

attention_mask 或 token_type_ids 不一定要放輸入 可選擇

要放了話要加```[ ]```

```model.fit([train_input_ids, train_attention_mask, train_token_type_ids], train_label)```

就是```model.fit(X_train, Y_train)```

```[train_input_ids, train_attention_mask, train_token_type_ids]``` 就是 ```X_train```

```train_label``` 就是 ```Y_train```

In [41]:
# model_fit = model.fit([train_input_ids, train_attention_mask, train_token_type_ids], train_label, batch_size=4, epochs=1, 
#                     validation_data=([validation_input_ids, validation_attention_mask, validation_token_type_ids], validation_label)
#                    )

In [42]:
# # 重新train 要把model記憶體釋放 在jupyter裡面interrupt kernel
# # model 還是存在 只是train到一半
# del model