In [1]:
import pandas as pd
import numpy as np
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(torch.__version__) 
print(device)
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

import tensorflow as tf
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW

1.3.0
cuda
1
GeForce RTX 2080 Ti


In [2]:
# 讀取tsv # \t tab做區隔
df_news = pd.read_csv("https://github.com/roccqqck/news_bert/raw/master/data/2015_Company.tsv", sep="\t", encoding="utf-8")
df_news['text'] = df_news['text'].astype(str)
df_news

Unnamed: 0,id,label,text
0,0,0,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...
1,1,0,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...
2,2,1,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...
3,3,1,大哥大，透過4G漫遊方式共網爭議，戰火延燒，威寶電信（台灣之星）及中華電信、遠傳電信等業者，...
4,4,1,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...
...,...,...,...
15887,15887,2,"感覺華碩客服有點推託,所謂的個資法是企業有義務妥善保管蒐集來的個人資料不 感覺華碩客服有點推..."
15888,15888,3,上市公司玉晶光（3406）今天傳出裁員，對此，公司發言人趙志強表示，並非裁員而 上市公司玉晶...
15889,15889,3,繼台企銀（2834）工會因合併問題而罷工，創金融史首例後，大眾銀行（2847）工 繼台企銀（...
15890,15890,3,6億元賣給英業達，獲利21億元，預計明年首季入帳，活化資產。外界認為這將有機會帶動宏達電明年...


In [3]:
# Load pre-trained model tokenizer, to convert our text into tokens that correspond to BERT’s vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [4]:
# Tokenize input
text = "很好看的動作片，不會浪費錢跟時間。很久沒有這樣的探險片。可說是女版的印第安那瓊。"
tokenized_text = tokenizer.tokenize(text)      # 每個字切詞成一個list
print(type(tokenized_text))                 # list
np.array(tokenized_text)                    # 轉成numpy

<class 'list'>


array(['很', '好', '看', '的', '動', '作', '片', '，', '不', '會', '浪', '費', '錢',
       '跟', '時', '間', '。', '很', '久', '沒', '有', '這', '樣', '的', '探', '險',
       '片', '。', '可', '說', '是', '女', '版', '的', '印', '第', '安', '那', '瓊',
       '。'], dtype='<U1')

In [5]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)   # 每個字轉成id
print(type(indexed_tokens))                         # list
np.array(indexed_tokens)

<class 'list'>


array([2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298, 3857,
       6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300, 6857,
       3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957, 4276,
       4638, 1313, 5018, 2128, 6929, 4475,  511])

In [6]:
indexed_tokens2 = tokenizer.build_inputs_with_special_tokens(indexed_tokens)    # 句子前後加上 CLS SEP
print(type(indexed_tokens2))
np.array(indexed_tokens2)

<class 'list'>


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300,
       6857, 3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957,
       4276, 4638, 1313, 5018, 2128, 6929, 4475,  511,  102])

In [7]:
n = 512 - len(indexed_tokens2)
indexed_tokens3 = np.pad(indexed_tokens2, (0, n), mode ='constant', constant_values=(0))  
# array右邊append n 個 0  補長度到512
print(len(indexed_tokens3))
indexed_tokens3

512


array([ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298,
       3857, 6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300,
       6857, 3564, 4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957,
       4276, 4638, 1313, 5018, 2128, 6929, 4475,  511,  102,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [8]:
segments_ids = tokenizer.create_token_type_ids_from_sequences(indexed_tokens2)
print(type(segments_ids))                                # list
np.array(segments_ids)

<class 'list'>


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

#### np.pad doc

https://docs.scipy.org/doc/numpy/reference/generated/numpy.pad.html 

也可以使用
from keras.preprocessing.sequence import pad_sequences

In [9]:
def tokenize_all(text):
#    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    tokenized_text = tokenizer.tokenize(text)    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    indexed_tokens2 = tokenizer.build_inputs_with_special_tokens(indexed_tokens)
    indexed_tokens2 = np.array(indexed_tokens2)          # list 轉 numpy
    if len(indexed_tokens2) < 512:
        n = 512 - len(indexed_tokens2)
        indexed_tokens3 = np.pad(indexed_tokens2, (0, n), mode ='constant', constant_values=(0))  # array右邊append n 個 0  補長度到512
    return indexed_tokens3

In [10]:
def segments_all(text):
    tokenized_text = tokenizer.tokenize(text)    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    indexed_tokens2 = tokenizer.build_inputs_with_special_tokens(indexed_tokens)
    indexed_tokens2 = np.array(indexed_tokens2)          # list 轉 numpy
    segments_ids = tokenizer.create_token_type_ids_from_sequences(indexed_tokens2)
    segments_ids = np.array(segments_ids)              # list 轉numpy
    if len(segments_ids) < 512:
        n = 512 - len(segments_ids)
        segments_ids2 = np.pad(segments_ids, (0, n), mode ='constant', constant_values=(0))  # array右邊append n 個 0  補長度到512    
    return segments_ids2

In [11]:
def masks_all(text):
    tokenized_text = tokenizer.tokenize(text)    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    indexed_tokens2 = tokenizer.build_inputs_with_special_tokens(indexed_tokens)
    indexed_tokens2 = np.array(indexed_tokens2)          # list 轉 numpy
    segments_ids = tokenizer.create_token_type_ids_from_sequences(indexed_tokens2)
    segments_ids = np.array(segments_ids)              # list 轉numpy
    mask_token = np.array([1,1])
    mask_token = np.pad(mask_token, (0, len( segments_ids - 2 )), mode ='constant', constant_values=(1)) 
    # array右邊append 1 到跟segment一樣長
    if len(mask_token) < 512:
        n = 512 - len(mask_token)
        mask_token2 = np.pad(mask_token, (0, n), mode ='constant', constant_values=(0))  # array右邊append n 個 0  補長度到512
    return mask_token2

In [12]:
# df['text'] = df.comment.apply(lambda x: " ".join(jieba.cut(x)))
# df_news['tokenized_text'] = df_news['text'].apply(lambda x:  tokenizer.tokenize(x) )
df_news['indexed_tokens'] = df_news['text'].apply(tokenize_all)
df_news['segments_ids'] = df_news['text'].apply(segments_all)
df_news['mask_token'] = df_news['text'].apply(masks_all)
df_news

Unnamed: 0,id,label,text,indexed_tokens,segments_ids,mask_token
0,0,0,財政部充分尊重銀行團確保債權的必要作為，預計元旦後就會請法院實施假扣押，如三重土地順利拍賣，...,"[101, 6512, 3124, 6956, 1041, 1146, 2203, 7028...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,0,至三趴以上，「在假扣押前，仍歡迎頂新還錢。」頂新：將持續協商財政部次長吳當傑尊重並同意銀行團...,"[101, 5635, 676, 6640, 809, 677, 8024, 519, 17...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2,1,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...,"[101, 3947, 1841, 1440, 704, 2900, 1139, 8024,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,3,1,大哥大，透過4G漫遊方式共網爭議，戰火延燒，威寶電信（台灣之星）及中華電信、遠傳電信等業者，...,"[101, 1920, 1520, 1920, 8024, 6851, 6882, 100,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,4,1,測報告中指出，各款手機在裝上SIM卡前後開機連線時，傳送之資料皆未涉及第一類的敏感資訊，僅為...,"[101, 3947, 1841, 1440, 704, 2900, 1139, 8024,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...
15887,15887,2,"感覺華碩客服有點推託,所謂的個資法是企業有義務妥善保管蒐集來的個人資料不 感覺華碩客服有點推...","[101, 2697, 6221, 5836, 4820, 2145, 3302, 3300...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15888,15888,3,上市公司玉晶光（3406）今天傳出裁員，對此，公司發言人趙志強表示，並非裁員而 上市公司玉晶...,"[101, 677, 2356, 1062, 1385, 4373, 3253, 1045,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15889,15889,3,繼台企銀（2834）工會因合併問題而罷工，創金融史首例後，大眾銀行（2847）工 繼台企銀（...,"[101, 5262, 1378, 821, 7065, 8020, 11152, 8159...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15890,15890,3,6億元賣給英業達，獲利21億元，預計明年首季入帳，活化資產。外界認為這將有機會帶動宏達電明年...,"[101, 127, 1023, 1039, 6546, 5183, 5739, 3511,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [13]:
print(len(df_news['indexed_tokens'][0]))
df_news['indexed_tokens'][0] 

512


array([ 101, 6512, 3124, 6956, 1041, 1146, 2203, 7028, 7065, 6121, 1757,
       4825,  924, 1002, 3609, 4638, 2553, 6206,  868, 4158, 8024, 7521,
       6243, 1039, 3190, 2527, 2218, 3298, 6313, 3791, 7368, 2179, 3177,
        969, 2807, 2852, 8024, 1963,  676, 7028, 1759, 1765, 7518, 1164,
       2864, 6546, 8024, 7515, 4372, 6917, 3621, 2200,  679, 2512, 7513,
       1456, 1059,  127, 1283, 1399, 1519, 2339, 4638, 4495, 6243,  511,
       1042, 6493, 7065, 6121, 6134, 4850, 8024, 7515, 4372, 7274, 4634,
       5474, 6526, 2234, 3309, 6889, 5147, 3428, 8024, 5195, 5474, 6526,
       7065, 6121, 1757, 3748, 6359, 4634, 1139,  998, 1440, 2100, 6349,
        928, 1141, 8024, 6313,  102,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [14]:
print(len(df_news['mask_token'][0]))
df_news['mask_token'][0]

512


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [15]:
print(len(df_news['segments_ids'][0]))
df_news['segments_ids'][0]

512


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [16]:
# df_news['indexed_tokens'].to_numpy() 出來不是一個2d numpy 
# 只好用for loop一個一個拿出來合併

indexed_tokens = np.zeros((1, 512)).astype(int) #宣吿一個都是0的1*512 numpy # np.zeros預設是float 改成int 不然bert餵不進去
for index, row in df_news.iterrows():  
    element = df_news.loc[index,'indexed_tokens']
    indexed_tokens = np.vstack((indexed_tokens, np.array([element])))   # 2維 合併

indexed_tokens = np.delete(indexed_tokens, 0, 0)              # 刪掉一開始都是0的那一個宣告  
indexed_tokens

array([[ 101., 6512., 3124., ...,    0.,    0.,    0.],
       [ 101., 5635.,  676., ...,    0.,    0.,    0.],
       [ 101., 3947., 1841., ...,    0.,    0.,    0.],
       ...,
       [ 101., 5262., 1378., ...,    0.,    0.,    0.],
       [ 101.,  127., 1023., ...,    0.,    0.,    0.],
       [ 101., 3229., 3149., ...,    0.,    0.,    0.]])

In [17]:
indexed_tokens.shape

(15892, 512)

In [18]:
torch.tensor(indexed_tokens)

tensor([[ 101., 6512., 3124.,  ...,    0.,    0.,    0.],
        [ 101., 5635.,  676.,  ...,    0.,    0.,    0.],
        [ 101., 3947., 1841.,  ...,    0.,    0.,    0.],
        ...,
        [ 101., 5262., 1378.,  ...,    0.,    0.,    0.],
        [ 101.,  127., 1023.,  ...,    0.,    0.,    0.],
        [ 101., 3229., 3149.,  ...,    0.,    0.,    0.]], dtype=torch.float64)

In [19]:
tf.convert_to_tensor(indexed_tokens)

<tf.Tensor: id=2, shape=(15892, 512), dtype=float64, numpy=
array([[ 101., 6512., 3124., ...,    0.,    0.,    0.],
       [ 101., 5635.,  676., ...,    0.,    0.,    0.],
       [ 101., 3947., 1841., ...,    0.,    0.,    0.],
       ...,
       [ 101., 5262., 1378., ...,    0.,    0.,    0.],
       [ 101.,  127., 1023., ...,    0.,    0.,    0.],
       [ 101., 3229., 3149., ...,    0.,    0.,    0.]])>

In [20]:
mask_token = np.zeros((1, 512)).astype(int) #宣吿一個都是0的1*512 numpy # np.zeros預設是float 改成int 不然bert餵不進去
for index, row in df_news.iterrows():  
    element = df_news.loc[index,'mask_token']
    mask_token = np.vstack((mask_token, np.array([element])))   # 2維 合併

mask_token = np.delete(mask_token, 0, 0)              # 刪掉一開始都是0的那一個宣告  
mask_token

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [21]:
mask_token.shape

(15892, 512)

In [22]:
label = df_news["label"].to_numpy()
label

array([0, 0, 1, ..., 3, 3, 3])

https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca
https://github.com/ThilinaRajapakse/pytorch-transformers-classification

https://leemeng.tw/attack_on_bert_transfer_learning_in_nlp.html

https://mccormickml.com/2019/07/22/BERT-fine-tuning/



In [23]:
# 把2個input indexed_tokens , mask_token 還有label 切成training validation

from sklearn.model_selection import train_test_split
# Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(indexed_tokens, label, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(mask_token, indexed_tokens,
                                             random_state=2018, test_size=0.1)

In [24]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_masks = torch.tensor(validation_masks)
train_masks = torch.tensor(train_masks)
validation_labels = torch.tensor(validation_labels)

In [25]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# validation_sampler = SequentialSampler(validation_data)
# validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [26]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.
# num_labels=5 分5類
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=5)
model.cuda()





# Batch size: 16, 32
# Learning rate (Adam): 5e-5, 3e-5, 2e-5
# Number of epochs: 2, 3, 4
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]






# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=2e-5,)

In [27]:
outputs = model(train_inputs, train_masks, train_labels)
# loss, logits = outputs[:2]

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.DoubleTensor instead (while checking arguments for embedding)

In [27]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [28]:
from tqdm import tqdm, trange

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
          # Forward pass, calculate logit predictions
          logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

NameError: name 't' is not defined

In [21]:
input_ids = torch.tensor(tokenizer.encode("很好看的動作片，不會浪費錢跟時間。很久沒有這樣的探險片。可說是女版的印第安那瓊。")).unsqueeze(0)  # Batch size 1
input_ids

tensor([[ 101, 2523, 1962, 4692, 4638, 1240,  868, 4275, 8024,  679, 3298, 3857,
         6527, 7092, 6656, 3229, 7279,  511, 2523,  719, 3760, 3300, 6857, 3564,
         4638, 2968, 7402, 4275,  511, 1377, 6303, 3221, 1957, 4276, 4638, 1313,
         5018, 2128, 6929, 4475,  511,  102]])

In [22]:
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
labels

tensor([[1]])

In [23]:
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]

In [24]:
outputs

(tensor(0.7320, grad_fn=<NllLossBackward>),
 tensor([[0.1374, 0.0612]], grad_fn=<AddmmBackward>))

In [25]:
loss, logits

(tensor(0.7320, grad_fn=<NllLossBackward>),
 tensor([[0.1374, 0.0612]], grad_fn=<AddmmBackward>))