In [1]:
%pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install sacremoses

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import re
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import pandas as pd
import matplotlib.pyplot as plt


In [4]:
from transformers import TFBertModel, BertTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string], '')
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()


In [6]:
tf.random.set_seed(1111)
np.random.seed(1111)


In [7]:
CLASS_NUMBER = 2
BATCH_SIZE = 32
NUM_EPOCHS = 2
VALID_SPLIT = 0.2
MAX_LEN = 40
BERT_CKPT = 'c:\\pytest\\data\\KOR\\BERT\\bert_ckpt\\'
DATA_IN_PATH = 'c:\\pytest\\data\\KOR\\naver_movie\\data_in\\'
DATA_OUT_PATH = "c:\\pytest\\data\\KOR\\BERT\\data_out\\"


In [10]:
def listToString(listdata):
    result = 'id\tdocument\tlabel\n'
    for data_each in listdata:
        if data_each:
            result += data_each[0]+"\t"+data_each[1]+"\t"+data_each[2]+"\n"
    return result


def read_data(filename, encoding='cp949', start=0):
    with open(filename, 'r', encoding=encoding) as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[start:]
    return data


def write_data(data, filename, encoding='cp949'):
    with open(filename, 'w', encoding=encoding) as f:
        f.write(data)


data_ratings = read_data(os.path.join(DATA_IN_PATH, "ratings_utf8_small.txt"), encoding='utf-8', start=1)


In [11]:
from sklearn.model_selection import train_test_split
ratings_train, ratings_test = train_test_split(data_ratings)

ratings_train = listToString(ratings_train)
ratings_test = listToString(ratings_test)

write_data(ratings_train, os.path.join(DATA_IN_PATH, "ratings_train.txt"), encoding='utf-8')
write_data(ratings_test, os.path.join(DATA_IN_PATH, "ratings_test.txt"), encoding='utf-8')


In [12]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", cache_dir=os.path.join(BERT_CKPT, "tokenizer"), do_lower_case=False)


Downloading: 100%|██████████| 996k/996k [00:03<00:00, 274kB/s]  
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 14.6kB/s]
Downloading: 100%|██████████| 625/625 [00:00<00:00, 628kB/s]


In [13]:
import pickle
if os.path.exists(DATA_OUT_PATH):
    print("{} -- Folder already exists\n".format(DATA_OUT_PATH))
else:
    os.makedirs(DATA_OUT_PATH, exist_ok=True)
    print("{} -- Folder create complete\n".format(DATA_OUT_PATH))
with open(DATA_OUT_PATH+"bert_tokenizer.pickle", 'wb') as file:
    pickle.dump(tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)


c:\pytest\data\KOR\BERT\data_out\ -- Folder already exists



In [14]:
test_sentence = "안녕하세요, 반갑습니다."
encode = tokenizer.encode(test_sentence)
token_print = [tokenizer.decode(token) for token in encode]
encode = tokenizer.encode(test_sentence)
print(encode)
print(token_print)


[101, 9521, 118741, 35506, 24982, 48549, 117, 9321, 118610, 119081, 48345, 119, 102]
['[ C L S ]', '안', '# # 녕', '# # 하', '# # 세', '# # 요', ',', '반', '# # 갑', '# # 습', '# # 니 다', '.', '[ S E P ]']


In [16]:
DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, "ratings_train.txt")
DATA_TEST_PATH = os.path.join(DATA_IN_PATH, "ratings_test.txt")

train_data = pd.read_csv(DATA_TRAIN_PATH, header=0, delimiter='\t', quoting=3)
train_data = train_data.dropna()
train_data.head()


Unnamed: 0,id,document,label
0,7743368,박흥식영화 사랑해 말순씨 ***보면 장애우를 배려하지않는 저질 영화장애우가 성희롱이...,0
1,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1
2,4254270,성형부작용 같은 주인공 얼굴 때문에 집중이 안됨..;;;,0
3,530171,맥티어난의 최고 졸작품,0
4,9462634,어린 외계인역 하신분 귀엽네요..ㅎ,1


In [18]:
kor_encode = tokenizer.encode("안녕하세요, 반갑습니다")
eng_encode = tokenizer.encode("Hello world")
kor_decode = tokenizer.decode(kor_encode)
eng_decode = tokenizer.decode(eng_encode)
print(kor_encode)
print([tokenizer.decode(token) for token in kor_encode])
print(eng_encode)
print([tokenizer.decode(token) for token in eng_encode])
print(kor_decode)
print(eng_decode)


[101, 9521, 118741, 35506, 24982, 48549, 117, 9321, 118610, 119081, 48345, 102]
['[ C L S ]', '안', '# # 녕', '# # 하', '# # 세', '# # 요', ',', '반', '# # 갑', '# # 습', '# # 니 다', '[ S E P ]']
[101, 31178, 11356, 102]
['[ C L S ]', 'H e l l o', 'w o r l d', '[ S E P ]']
[CLS] 안녕하세요, 반갑습니다 [SEP]
[CLS] Hello world [SEP]


In [33]:
# 16
print(tokenizer.all_special_tokens, "\n", tokenizer.all_special_ids)
kor_encode = tokenizer.encode("안녕하세요, 반갑습니다.")
eng_encode = tokenizer.encode("Hello world")
kor_decode = tokenizer.decode(kor_encode)
eng_decode = tokenizer.decode(eng_encode)
print(kor_encode)
print([tokenizer.decode(token) for token in kor_encode])
print(eng_encode)
print([tokenizer.decode(token) for token in eng_encode])
print(kor_decode)
print(eng_decode)


['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'] 
 [100, 102, 0, 101, 103]
[101, 9521, 118741, 35506, 24982, 48549, 117, 9321, 118610, 119081, 48345, 119, 102]
['[ C L S ]', '안', '# # 녕', '# # 하', '# # 세', '# # 요', ',', '반', '# # 갑', '# # 습', '# # 니 다', '.', '[ S E P ]']
[101, 31178, 11356, 102]
['[ C L S ]', 'H e l l o', 'w o r l d', '[ S E P ]']
[CLS] 안녕하세요, 반갑습니다. [SEP]
[CLS] Hello world [SEP]


In [24]:
def bert_tokenizer(sent, MAX_LEN):
    encoded_dict = tokenizer.encode_plus(text=sent, add_special_tokens=True, max_length=MAX_LEN,
                                         padding='max_length', truncation=True, return_attention_mask=True) #padding적용에사용할문장의최대길이(‘longest’, ‘max_length’, ‘do_not_pad’)
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_id = encoded_dict['token_type_ids']
    return input_id, attention_mask, token_type_id


In [28]:
input_ids = []
attention_masks = []
token_type_ids = []
train_data_labels = []
for train_sent, train_label in tqdm(zip(train_data["document"], train_data["label"]), total=len(train_data)):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(train_sent, MAX_LEN)
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        train_data_labels.append(train_label)
    except Exception as e:
        print(e)
        print(train_sent)
        pass


100%|██████████| 375/375 [00:00<00:00, 3269.17it/s]


In [30]:
train_movie_input_ids = np.array(input_ids, dtype=int)
train_movie_attention_masks = np.array(attention_masks, dtype=int)
train_movie_type_ids = np.array(token_type_ids, dtype=int)
train_movie_inputs = (train_movie_input_ids, train_movie_attention_masks, train_movie_type_ids)
train_data_labels = np.asarray(train_data_labels, dtype=np.int32)
print("# sents:{}, # labels:{}".format(len(train_movie_input_ids), len(train_data_labels)))


# sents:375, # labels:375


In [31]:
input_id = train_movie_input_ids[1]
attention_mask = train_movie_attention_masks[1]
token_type_id = train_movie_type_ids[1]

print(input_id)
print(attention_mask)
print(token_type_id)
print(tokenizer.decode(input_id))


[   101   9521  21789   9651 119168  11102   9326  35506 118762  10530
   9138  13767   9757  48210  89851  18589  42428    119    102      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]
[CLS] 안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [36]:
class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class, name="classifier", activation="softmax",
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range))

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooler_output = outputs[1]
        pooler_output = self.dropout(pooler_output, training=training)
        logits = self.classifier(pooler_output)
        return logits


In [None]:
cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',
                             dir_path=os.path.join(BERT_CKPT, "model"), num_class=CLASS_NUMBER)


In [None]:
optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


In [None]:
model_name = "tf2_bert"
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)
checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists\n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete\n".format(checkpoint_dir))
cp_callback = ModelCheckpoint(checkpoint_path, monitor='val_accuracy',
                              verbose=1, save_best_only=True, save_weights_only=True)


In [None]:
history = cls_model.fit(train_movie_inputs, train_data_labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_split=VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])
print(history.history)


In [None]:
plot_graphs(history,'accuracy')
plot_graphs(history,'loss')

In [None]:
test_data=pd.read_csv(DATA_TEST_PATH,header=0,delimiter='\t',quoting=3)
test_data=test_data.dropna()
test_data.head()

In [None]:
test_movie_inputs=(test_movie_input_ids,test_movie_attention_masks,test_movie_type_ids)
test_data_labels=np.asarray(test_data_labels,dtype=np.int32)

In [None]:
input_ids = []
attention_masks = []
token_type_ids = []
test_data_labels = []
for test_sent, test_label in tqdm(zip(test_data["document"], test_data["label"])):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(test_sent, MAX_LEN)
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        test_data_labels.append(test_label)
    except Exception as e:
        print(e)
        pass
test_movie_input_ids=np.array(input_ids,dtype=int)
test_movie_attention_masks=np.array(attention_masks,dtype=int)
test_movie_type_ids=np.array(token_type_ids,dtype=int)
test_movie_inputs=(test_movie_input_ids,test_movie_attention_masks,test_movie_type_ids)

test_data_labels=np.asarray(test_data_labels,dtype=np.int32)


In [None]:
results=cls_model.evaluate(test_movie_inputs,test_data_labels,batch_size=BATCH_SIZE)
print("testloss, testacc:",results)