In [1]:
from IPython.display import clear_output 

!pip install transformers==4.8.2
!pip install sentencepiece==0.1.96
!pip install tensorflow_addons

clear_output() # clear the output after the installation

In [2]:
import sklearn
import tensorflow
import transformers
import tensorflow_addons

print(sklearn.__version__) # 1.0.2
print(tensorflow.__version__) # 2.8.0
print(transformers.__version__) # 4.8.2
print(tensorflow_addons.__version__) # 0.16.1

import sentencepiece # 이번 실습에서 추가되었습니다

1.0.2
2.8.0
4.8.2
0.17.0


In [3]:
import pandas as pd
import numpy as np

import os
import re
import pickle 
import dill # for saving a function as a file(.pkl)
import logging # for changing the tf's logging level
import urllib.request
from tqdm import tqdm

from sklearn import model_selection
from sklearn.metrics import accuracy_score

import tensorflow as tf
import tensorflow_addons as tfa # for using Rectified-Adam optimizer (instead of Adam optimizer) 
from tensorflow.keras import layers, initializers, losses, optimizers, metrics, callbacks 

import transformers
from transformers import TFBertModel # BertTokenizer 제외

import sentencepiece as spm # 이번 실습에서 추가되었습니다

In [4]:
# Random seed 고정
tf.random.set_seed(1234)
np.random.seed(1234)

# Transformers logging level 변경 (WARNING -> ERROR) @ https://huggingface.co/transformers/main_classes/logging.html
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Tensorflow logging level 변경 
tf.get_logger().setLevel(logging.ERROR)

# 1. load data

In [5]:
from google.colab import files
file_uploaded = files.upload()

Saving text_data.xlsx to text_data (1).xlsx


In [6]:
data = pd.read_excel('text_data.xlsx')
data.head()

Unnamed: 0,Sentence,Emotion
0,어깨라도넓엇으면 옷빨이라도 잘살텐데..,슬픔
1,그저 보고싶어죽겟어요,슬픔
2,아 진짜 저한테는 악운만 붙는거같네요ㅠㅠ........0,슬픔
3,지금 계단에서울고있어요,슬픔
4,저때문에 택배하나 배달하려다 변을 당하신건 아니겠죠0,슬픔


In [7]:
#data['Emotion'].value_counts()

In [8]:
data.loc[(data['Emotion']=='중립'),'Emotion'] = 0
data.loc[(data['Emotion']=='행복'),'Emotion'] = 1
data.loc[(data['Emotion']=='슬픔'),'Emotion'] = 2

In [9]:
#data['Emotion'].value_counts()

In [10]:
data.to_csv('num_labeled_df.csv', encoding='utf-8')

In [11]:
# 별도로 stratify 파라미터 값을 지정하지 않아도 자동으로 클래스 비율이 유지됨
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Sentence'], data['Emotion'], 
                                                                    test_size=0.3, 
                                                                    random_state=42) 

# 2. pre-processing
tokens > sequence numbers with padding

2.1. load the pre-trained tokenizer

In [12]:
#KoBERT 불러오기
!git clone https://github.com/monologg/KoBERT-Transformers.git
!mv KoBERT-Transformers/kobert_transformers/tokenization_kobert.py /content
clear_output() # clear the output

#토크나이저 생성
from tokenization_kobert import KoBertTokenizer 

tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert') # sentencepiece 라이브러리가 먼저 import 되어있어야 합니다.
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', cache_dir='bert_ckpt', do_lower_case=False) 

2.2. browse the usage of BertTokenizer

In [13]:
# tokenizer.tokenize(text_data) #토큰화
# tokenizer.encode(text_data, max_length=None, padding='max_length') #토큰화 + 토큰별 시퀀스번호 부여
# tokenizer.convert_ids_to_tokens(tokenizer.encode(text_data)) #토큰화 + 토큰별 시퀀스번호 부여 + 패딩
# encode() 적용결과가 tokenize() 보다 2만큼 더 큼 <- '[CLS]'와 '[SEP]' 토큰이 자동으로 추가되었기 때문

2.3. determine the max_length of SEQUENCE

In [14]:
#가장 길이가 긴 Sentence 찾기
data['Sentence'].str.len().sort_values(ascending=False)[:5]
#인덱스번호, 글자 갯수

15232    298
14708    181
15823    174
8354     155
10345    142
Name: Sentence, dtype: int64

In [15]:
data['Sentence'][15232]

'그냥 내비둬 웃음을 주잖아ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ'

In [16]:
#토큰 갯수
len(tokenizer.tokenize(data['Sentence'][15232]))

292

In [17]:
# 최대 토큰 갯수인 292개 이상의 암의값으로 max_length 설정
SEQ_LEN = 300

#확인
len(tokenizer.encode("그냥 내비둬 웃음을 주잖아ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ",
                     max_length=SEQ_LEN, padding='max_length'))

300

# 2.4. KoBERT의 input data 형식으로 변환 (tokens_tensor, segments_tensor, masks_tensor) by BertTokenizer.encode_plus(sentence)
return: 각각에 해당하는 'input_ids', 'token_type_ids', 'attention_mask' 3가지 key를 가진 dict

In [18]:
token_ids = []
token_segments = []
token_masks = []

train_labels = []

for idx in tqdm(range(len(train_x))):

  # 'Sentence'칼럼에서 0번째 행~ 꺼내오기
  train_sentence = train_x.iloc[idx] 

  # 특수문자 제거
  cleaned_sentence = re.sub("[^\s0-9a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]", "", train_sentence)

  # 토큰화 + 토큰별 시퀀스번호 부여 + 패딩
  encoded_dict = tokenizer.encode_plus(text = cleaned_sentence,
                                       padding='max_length',
                                       truncation=True,
                                       max_length=SEQ_LEN
                                       )
  
  #'input_ids', 'token_type_ids', 'attention_mask'
  token_ids.append(encoded_dict['input_ids'])
  token_masks.append(encoded_dict['attention_mask'])
  token_segments.append(encoded_dict['token_type_ids'])

  train_labels.append(train_y.iloc[idx])


train_inputs = (np.array(token_ids), np.array(token_masks), np.array(token_segments))
train_labels = np.array(train_labels)


100%|██████████| 11293/11293 [00:02<00:00, 4092.86it/s]


# 3. Modeling

3.1. Build a pretrained BERT model

In [19]:
# #load pretrained model
# bert_base_model = TFBertModel.from_pretrained("monologg/kobert", cache_dir='bert_ckpt', from_pt=True)

# input_token_ids   = layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_token_ids')   # tokens_tensor
# input_masks       = layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')       # masks_tensor
# input_segments    = layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segments')    # segments_tensor  

# bert_outputs = bert_base_model([input_token_ids, input_masks, input_segments])

# bert_outputs = bert_outputs[1] #'pooler_output'
# bert_outputs = layers.Dropout(0.2)(bert_outputs)
# final_output = layers.Dense(units=3, activation='softmax', kernel_initializer=initializers.TruncatedNormal(stddev=0.02), name="classifier")(bert_outputs)

# model = tf.keras.Model(inputs=[input_token_ids, input_masks, input_segments], 
#                        outputs=final_output)


In [20]:
# model.summary()

In [21]:
#tf.keras.utils.plot_model(model, "model.png")

In [22]:
def create_model(max_length=300):

    bert_base_model = TFBertModel.from_pretrained("monologg/kobert", from_pt=True) 

    input_token_ids   = layers.Input((max_length,), dtype=tf.int32, name='input_token_ids')   # tokens_tensor
    input_masks       = layers.Input((max_length,), dtype=tf.int32, name='input_masks')       # masks_tensor
    input_segments    = layers.Input((max_length,), dtype=tf.int32, name='input_segments')    # segments_tensor  

    bert_outputs = bert_base_model([input_token_ids, input_masks, input_segments]) 
    

    bert_outputs = bert_outputs[1] # ('pooler_output', <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_bert_model')>)
    bert_outputs = layers.Dropout(0.2)(bert_outputs)
    final_output = layers.Dense(units=3, activation='softmax', kernel_initializer=initializers.TruncatedNormal(stddev=0.02), name="classifier")(bert_outputs)

    model = tf.keras.Model(inputs=[input_token_ids, input_masks, input_segments], 
                        outputs=final_output)

    model.compile(optimizer=tfa.optimizers.RectifiedAdam(learning_rate=1e-5, weight_decay=0.0025, warmup_proportion=0.05),
                  loss=losses.SparseCategoricalCrossentropy(), 
                  metrics=[metrics.SparseCategoricalAccuracy()])
    
    return model

# 3.2 TPU setting

In [23]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("\nAll devices: ", tf.config.list_logical_devices('TPU'))

strategy = tf.distribute.TPUStrategy(resolver) # Obsolete : tf.distribute.experimental.TPUStrategy()

with strategy.scope(): 
    model = create_model(max_length=300)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0



All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


# 3.3. Train

In [24]:
# Checkpoint setting for saving the best model
from google.colab import drive
drive.mount('/gdrive')

checkpoint_path = '/gdrive/MyDrive/colab_data/temp_data/saved_models/'

if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)


# For custom models, we have to use "save_weights_only = True" (or we should implement a "get_config" method @ https://j.mp/3ltUibd) 
callback_checkpoint = callbacks.ModelCheckpoint(filepath=checkpoint_path + 'best_bert_weights.h5', 
                                                monitor='val_sparse_categorical_accuracy',
                                                save_best_only=True, 
                                                save_weights_only = True, #이렇게 무거운 모델은 .h5인 하나의 파일로 저장 못하므로 theta만 저장
                                                verbose=1) 
                                                
# Early-stopping for preventing the overfitting
callback_earlystop = callbacks.EarlyStopping(monitor='val_sparse_categorical_accuracy', 
                                             min_delta=0.0001, # the threshold that triggers the termination (acc should at least improve 0.0001)
                                             patience=5) #  Number of epochs with no improvement after which training will be stopped

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [25]:
history = model.fit(train_inputs, train_labels, validation_split=0.2,
                    epochs=10, batch_size=100,
                    verbose=1,
                    callbacks=[callback_checkpoint, callback_earlystop])

Epoch 1/10


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_24:0' shape=(None,) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_24:0' shape=(None,) dtype=int64>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_24:0' shape=(None,) dtype=int64>]



Epoch 1: val_sparse_categorical_accuracy improved from -inf to 0.42231, saving model to /gdrive/MyDrive/colab_data/temp_data/saved_models/best_bert_weights.h5
Epoch 2/10
Epoch 2: val_sparse_categorical_accuracy improved from 0.42231 to 0.58035, saving model to /gdrive/MyDrive/colab_data/temp_data/saved_models/best_bert_weights.h5
Epoch 3/10
Epoch 3: val_sparse_categorical_accuracy improved from 0.58035 to 0.69854, saving model to /gdrive/MyDrive/colab_data/temp_data/saved_models/best_bert_weights.h5
Epoch 4/10
Epoch 4: val_sparse_categorical_accuracy improved from 0.69854 to 0.72864, saving model to /gdrive/MyDrive/colab_data/temp_data/saved_models/best_bert_weights.h5
Epoch 5/10
Epoch 5: val_sparse_categorical_accuracy improved from 0.72864 to 0.75077, saving model to /gdrive/MyDrive/colab_data/temp_data/saved_models/best_bert_weights.h5
Epoch 6/10
Epoch 6: val_sparse_categorical_accuracy improved from 0.75077 to 0.76538, saving model to /gdrive/MyDrive/colab_data/temp_data/saved_mod

# 3.4. Evaluate

In [26]:
SEQ_LEN = 300 # 최대 token 개수 이상의 값으로 임의로 설정

token_ids =[]
token_masks =[]
token_segments =[]

test_labels = []


for idx in tqdm(range(len(test_x))):

    test_sentence = test_x.iloc[idx]
    
    # 특수문자 제거
    cleaned_sentence = re.sub("[^\s0-9a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]", "", test_sentence) 
    
    # Tokenizing / Tokens to sequence numbers / Padding
    encoded_dict = tokenizer.encode_plus(text=cleaned_sentence,
                                         padding='max_length', 
                                         truncation=True,
                                         max_length=SEQ_LEN)
    
    token_ids.append(encoded_dict['input_ids']) # tokens_tensor
    token_masks.append(encoded_dict['attention_mask']) # masks_tensor
    token_segments.append(encoded_dict['token_type_ids']) # segments_tensor

    test_labels.append(test_y.iloc[idx])


test_inputs = (np.array(token_ids), np.array(token_masks), np.array(token_segments))
test_labels = np.array(test_labels)

100%|██████████| 4841/4841 [00:01<00:00, 3805.88it/s]


In [27]:
test_inputs

(array([[   2, 3760, 6116, ...,    1,    1,    1],
        [   2, 4368, 3514, ...,    1,    1,    1],
        [   2, 4720, 6855, ...,    1,    1,    1],
        ...,
        [   2,  517, 7546, ...,    1,    1,    1],
        [   2, 1788, 5947, ...,    1,    1,    1],
        [   2, 1861, 1407, ...,    1,    1,    1]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [28]:
test_labels

array([0, 2, 1, ..., 0, 1, 2])

In [29]:
# results = model.evaluate(test_inputs, test_labels, batch_size=100)
# print("test loss, test acc: ", results)

In [30]:
model.load_weights(filepath=checkpoint_path + 'best_bert_weights.h5') # Load the best model's weights from checkpoint file

preds = model.predict(test_inputs)
preds = tf.argmax(preds, axis=1)

print(accuracy_score(preds, test_labels))

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 300) dtype=int64>]


0.7866143358810164


# 3.5. Save the Model-Builder & Bert-Tokenizer

In [31]:
from google.colab import drive
drive.mount('/gdrive')


data_path = '/gdrive/MyDrive/colab_data/temp_data/' 

if not os.path.exists(data_path): 
    os.makedirs(data_path)


with open(data_path + 'model_BERTfunction_v1.pkl', 'wb') as f:
    pickle.dump(dill.dumps(create_model), f) # use dill to pickle a function (https://j.mp/3CeSIzP & https://j.mp/3AaXxYW)

with open(data_path + 'tokenizer-bert.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)     

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
