In [None]:
import os
import re
import numpy as np
from tqdm import tqdm

import tensorflow as tf
from transformers import *

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import pandas as pd
import matplotlib.pyplot as plt


In [None]:
pip install transformers



In [None]:
from transformers import *
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# Use the BertTokenizer of the multilingual cased checkpoint for multilingual
# support; per the original author's note it was trained with a WordPiece model.

In [None]:
print(tokenizer.all_special_tokens,"\n", tokenizer.all_special_ids)
# Special tokens used by the BertTokenizer and their vocabulary indices

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'] 
 [100, 102, 0, 101, 103]


In [None]:
# Round-trip one Korean sentence through the tokenizer (text -> ids -> text)
# and print the original next to the encoded / decoded forms.
sample_sentence = "안녕하세요. 반갑습니다."
kor_encode = tokenizer.encode(sample_sentence)
kor_decode = tokenizer.decode(kor_encode)
print(sample_sentence)
print(kor_encode," / ", kor_decode)

안녕하세요. 반갑습니다.
[101, 9521, 118741, 35506, 24982, 48549, 119, 9321, 118610, 119081, 48345, 119, 102]  /  [CLS] 안녕하세요. 반갑습니다. [SEP]


In [None]:
# Fix the random seeds (TensorFlow and NumPy)
tf.random.set_seed(1234)
np.random.seed(1234)

# Training configuration used by the `fit` call and the tokenization cells.
BATCH_SIZE = 32      # mini-batch size passed to `fit`
NUM_EPOCHS = 3       # number of training epochs passed to `fit`
VALID_SPLIT = 0.2    # fraction of the training data passed as `validation_split`
MAX_LEN = 39 # max length extracted during EDA
# Input data root and output root on the mounted Google Drive.
DATA_IN_PATH = "/content/drive/MyDrive/Colab Notebooks/nlp book/data_in/KOR"
DATA_OUT_PATH = "/content/drive/MyDrive/Colab Notebooks/nlp book"

# Korean Movie Review Classification

In [None]:
# Prepare the data for preprocessing
# train_data: DataFrame holding the 150k Naver movie reviews
DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, "naver_movie", "ratings_train.txt")

# Tab-separated file with a header row; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside reviews are kept as ordinary text.
train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\t', quoting = 3)
print(len(train_data))
# Drop rows with missing values before tokenization.
train_data = train_data.dropna()
train_data.head()

150000


Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


### 세가지 유형의 input값

-input_ids : 문장을 토크나이즈해서 인덱스 값으로 변환한 값

-attention_mask : 패딩된 부분이 학습에 영향을 주지 않도록 처리해주는 입력값. 실질적인 값이 존재하는 위치는 1, 패딩된 부분은 0

-token_type_ids : 두 개의 시퀀스를 입력으로 활용할 때 0과 1로 각 문장의 토큰을 구분하는 값

In [1]:
# Bert Tokenizer 
def bert_tokenizer(sent, MAX_LEN):
    """Encode one sentence into fixed-length BERT inputs.

    Uses the module-level ``tokenizer`` to add the special tokens, then pads or
    truncates the encoding so that every sequence has exactly ``MAX_LEN`` entries.

    Args:
        sent: The sentence (string) to encode.
        MAX_LEN: Target sequence length, special tokens included.

    Returns:
        A tuple ``(input_id, attention_mask, token_type_id)`` taken from the
        ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'`` entries
        of the ``encode_plus`` result.
    """
    encoded_dict = tokenizer.encode_plus(
        text = sent,
        add_special_tokens = True,     # Add '[CLS]' and '[SEP]'
        max_length = MAX_LEN,          # Target length for padding & truncation
        padding = 'max_length',        # Replaces the deprecated pad_to_max_length=True
        truncation = True,             # Cut longer sentences explicitly instead of relying on the implicit fallback
        return_attention_mask = True   # Construct attn. masks.
    )

    input_id = encoded_dict['input_ids']             # token indices of the encoded sentence
    attention_mask = encoded_dict['attention_mask']  # differentiates padding from non-padding
    token_type_id = encoded_dict['token_type_ids']   # differentiates two sentences

    return input_id, attention_mask, token_type_id

In [2]:
# Regex-based cleaner: every character that is neither Hangul (syllables or
# standalone jamo) nor whitespace is replaced by a single space.
def clean_text(sent) :
  """Return `sent` with each non-Hangul, non-whitespace character turned into a space."""
  return re.sub("[^가-힣 ㄱ-ㅎ ㅏ-ㅣ\\s]", ' ', sent)

In [3]:
# Accumulators for the tokenized training set: one entry per successfully
# encoded review in each list.
input_ids, attention_masks, token_type_ids, train_data_labels = [], [], [], []

for train_set, train_label in zip(train_data['document'], train_data['label']) :
  try:
    # Clean the review, then encode it to a fixed length (MAX_LEN=39).
    input_id, attention_mask, token_type_id = bert_tokenizer(clean_text(train_set), MAX_LEN)

    # Store the three model inputs together with the matching label.
    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)
    train_data_labels.append(train_label)
  except Exception as e :
    # Best effort: report the failing review and continue with the next one.
    print(e)
    print(train_set)

NameError: ignored

In [None]:
#위의 리스트들을 넘파이로 바꿔주기
train_movie_input_ids = np.array(input_ids, dtype=int)
train_movie_attention_masks = np.array(attention_masks, dtype=int)
train_movie_type_ids = np.array(token_type_ids, dtype=int)

train_movie_inputs = (train_movie_input_ids, train_movie_attention_masks, train_movie_type_ids)
train_data_labels = np.asarray(train_data_labels, dtype=np.int32) 

print("# sents: {}, # labels: {}".format(len(train_movie_input_ids), len(train_data_labels)))

# sents: 149995, # labels: 149995


In [None]:
# Max length: 39
# Pick the encoded inputs at position 1 to inspect them.
input_id = train_movie_input_ids[1]
attention_mask = train_movie_attention_masks[1]
token_type_id = train_movie_type_ids[1]

# Show the raw review next to the text decoded back from its token ids.
print("문장 ", train_data.iloc[1]['document'])
print(tokenizer.decode(input_id))

문장  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
[CLS] [UNK] 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [None]:
# Token ids of the inspected review (taken from train_movie_input_ids[1])
print("input_id", input_id)

input_id [   101    100   9928  58823  30005  11664   9757 118823  30858  18227
 119219   9580  41605  25486  12310  20626  23466   8843 118986  12508
   9523  17196  16439    102      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0]


In [None]:
# Attention mask of the inspected review (taken from train_movie_attention_masks[1])
print("attention_mask", attention_mask)

attention_mask [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


In [None]:
# Token type ids of the inspected review (taken from train_movie_type_ids[1])
print("token_type_id", token_type_id)

token_type_id [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


In [None]:
import keras

In [None]:
# Directory on the mounted Drive that is passed as `cache_dir` when the pretrained BERT is loaded
drive_dir_path ='/content/drive/MyDrive/Colab Notebooks/nlp book/bert_ckpt'

# Korean text classification model built on top of BERT
class TFBertClassifier(tf.keras.Model) :
  """Pretrained BERT encoder followed by dropout and a single dense classification head."""

  def __init__ (self, model_name, dir_path, num_class):
    """Build the classifier.

    Args:
      model_name: Name of the pretrained checkpoint handed to `TFBertModel.from_pretrained`.
      dir_path: Directory passed as `cache_dir` for the pretrained files.
      num_class: Number of output classes (units of the final dense layer).
    """
    super(TFBertClassifier,self).__init__()

    # Loads the weights of the already pretrained model.
    self.bert = TFBertModel.from_pretrained(model_name, cache_dir= dir_path )

    # Dropout rate is taken from the BERT config
    # (hidden_dropout_prob = 0.1 per the original author's note).
    self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)

    # `num_class` sets how many answers the head can predict.
    # TruncatedNormal samples from a normal distribution but redraws values that
    # fall more than two standard deviations from the mean.
    # The range must be passed as `stddev`: the first positional argument of
    # TruncatedNormal is `mean`, so passing it positionally would shift the mean
    # and leave the default stddev in place
    # (initializer_range = 0.02 per the original author's note).
    self.classifier = tf.keras.layers.Dense(num_class, kernel_initializer=tf.keras.initializers.TruncatedNormal(
      stddev=self.bert.config.initializer_range), name='classifier')


  def call (self, inputs, attention_mask = None, token_type_ids = None, training = False ):
    """Run BERT, apply dropout to the pooled output and return the class logits.

    Args:
      inputs: Input forwarded to BERT as its first argument.
        NOTE(review): when training via `fit` with a tuple of arrays, the whole
        tuple presumably arrives here -- confirm.
      attention_mask: Optional attention mask forwarded to BERT.
      token_type_ids: Optional segment ids forwarded to BERT.
      training: Whether dropout on the pooled output is active.

    Returns:
      Output of the dense head (no activation applied, i.e. logits).
    """
    outputs= self.bert(inputs, attention_mask=attention_mask, token_type_ids = token_type_ids)
    # outputs holds: sequence_output, pooled_output, (hidden_states), (attentions)
    pooled_output = outputs[1]
    pooled_output = self.dropout(pooled_output, training=training)
    # One fully connected layer on top of the final output; more layers can be added if needed.
    logits = self.classifier(pooled_output)
    return logits

# Instantiate a 2-class classifier on top of the multilingual cased BERT checkpoint.
cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',
                                  dir_path=drive_dir_path,
                                  num_class=2)

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Load a separate, standalone TFBertModel (independent of `cls_model`) from the same
# checkpoint and cache directory; it is used below to call BERT directly.
drive_dir_path ='/content/drive/MyDrive/Colab Notebooks/nlp book/bert_ckpt'
bert = TFBertModel.from_pretrained('bert-base-multilingual-cased',
                                  cache_dir=drive_dir_path,)


Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Run the standalone BERT on the first five training examples.
# NOTE(review): the mask and type ids are passed positionally -- confirm this matches
# the argument order of the installed transformers version.
output = bert(train_movie_input_ids[:5], train_movie_attention_masks[:5], train_movie_type_ids[:5])

In [None]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(3e-5)
# from_logits=True because the classifier head is a Dense layer without an activation.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Named 'accuracy', so the validation metric is reported as 'val_accuracy'.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
model_name = "tf2_bert_naver_movie"
# Stop early when 'val_accuracy' fails to improve by at least min_delta for `patience` epochs.
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)

checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create the checkpoint directory if it does not exist yet
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))

# Save only the weights, and only when 'val_accuracy' improves.
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

# Start training and validation
history = cls_model.fit(train_movie_inputs, train_data_labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
                    validation_split = VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])

#steps_for_epoch

print(history.history)

# Even with a GPU on Colab, 3 epochs took almost 3 hours.

/content/drive/MyDrive/Colab Notebooks/nlp book/tf2_bert_naver_movie -- Folder create complete 

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f2284217d90> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f2284217d90> is not a module, class, method, function, traceback, frame, or code object


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).


Cause: while/else statement not yet supported


The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Cause: while/else statement not yet supported


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.



Epoch 00001: val_accuracy improved from -inf to 0.83799, saving model to /content/drive/MyDrive/Colab Notebooks/nlp book/tf2_bert_naver_movie/weights.h5
Epoch 2/3
  63/3750 [..............................] - ETA: 18:33 - loss: 0.3399 - accuracy: 0.8393

In [None]:
# Load the test split the same way as the training split: tab-separated,
# header row, quoting=3 (csv.QUOTE_NONE), rows with missing values dropped.
DATA_TEST_PATH = os.path.join(DATA_IN_PATH, "naver_movie", "ratings_test.txt")
test_data = pd.read_csv(DATA_TEST_PATH, header = 0, delimiter = '\t', quoting = 3)
test_data = test_data.dropna()
test_data.head()

In [None]:
# Tokenize the test set. The accumulators are reset here, so each list only
# holds test examples after this cell has run.
input_ids = []
attention_masks = []
token_type_ids = []
test_data_labels = []

# `total` lets tqdm show a real progress bar; a bare zip has no length.
for test_sent, test_label in tqdm(zip(test_data["document"], test_data["label"]), total=len(test_data)):
    try:
        # Apply the same `clean_text` preprocessing that the training reviews
        # received, so the model is evaluated on text prepared like its training data.
        input_id, attention_mask, token_type_id = bert_tokenizer(clean_text(test_sent), MAX_LEN)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        test_data_labels.append(test_label)
    except Exception as e:
        # Best effort: report the failing review and continue with the next one.
        print(e)
        print(test_sent)



In [None]:
# Convert the test lists into NumPy arrays and bundle them in the same
# (input_ids, attention_masks, type_ids) order used for the training inputs.
test_movie_input_ids = np.array(input_ids, dtype=int)
test_movie_attention_masks = np.array(attention_masks, dtype=int)
test_movie_type_ids = np.array(token_type_ids, dtype=int)
test_movie_inputs = (test_movie_input_ids, test_movie_attention_masks, test_movie_type_ids)

test_data_labels = np.asarray(test_data_labels, dtype=np.int32) # array of test labels

# Sanity check: the number of encoded sentences should match the number of labels.
print("num sents, labels {}, {}".format(len(test_movie_input_ids), len(test_data_labels)))


In [None]:
# Evaluate the fine-tuned classifier on the test set; `results` holds the loss
# followed by the compiled accuracy metric.
results = cls_model.evaluate(test_movie_inputs, test_data_labels, batch_size=1024)
print("test loss, test acc: ", results)

추가로 성능을 올리기 위한 방법으로는 전처리 방식과 다양한 변수(하이퍼파라미터)를 바꿔가며 실험하는 방법,

다국어가 아닌 한국어로 사전학습한 모델을 활용해 미세 조정하는 방법 등이 있다.