In [1]:
from tokenization import *

import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Bidirectional, LSTM, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, Dropout, Dense
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer
from transformers import BertConfig, TFBertModel

In [2]:
def categorizer(label):
    
    if label == 'entailment':
        return 2
    elif label == 'neutral':
        return 1
    else:
        return 0
    
    
def get_similarity(sentence1, sentence2):
    
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    test_data = BertSemanticDataGenerator(
        sentence_pairs, 
        labels = None, 
        batch_size = 1, 
        shuffle = False, 
        include_targets = False
    )

    proba = model.predict(test_data)[0]
    idx = np.argmax(proba)
    proba = f"{proba[idx] * 100: .2f}%"
    pred = labels[idx]
    
    return pred, proba

In [3]:
def build_model(max_length, gpus = False):
    
    if gpus >= 2:
        
        strategy = tf.distribute.MirroredStrategy()
        with strategy.scope():

            input_ids = Input(
                shape = (max_length, ), dtype = tf.int32, name = 'input_ids'
            )
            attention_masks = tf.keras.layers.Input(
                shape = (max_length, ), dtype = tf.int32, name = 'attention_masks'
            )
            token_type_ids = tf.keras.layers.Input(
                shape = (max_length, ), dtype = tf.int32, name = 'segment_ids'
            )

            # model load
            config = BertConfig.from_pretrained(r'model', output_hidden_states = True)
            bert_model = bert_model = TFBertModel.from_pretrained(r'model', from_pt = True, config = config)
            # Freeze the BERT model to reuse the pretrained features without modifying them.
            bert_model.trainable = False

            outputs = bert_model([input_ids, token_type_ids, attention_masks])
            sequence_output, _ = outputs.last_hidden_state, outputs.pooler_output
            # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
            bi_lstm = Bidirectional(
                LSTM(units = 64, return_sequences = True)
            )(sequence_output)
            # Applying hybrid pooling approach to bi_lstm sequence output.
            avg_pool = GlobalAveragePooling1D()(bi_lstm)
            max_pool = GlobalMaxPooling1D()(bi_lstm)
            concat = concatenate([avg_pool, max_pool])
            dropout = Dropout(rate = 0.3)(concat)
            output = Dense(units = 3, activation = 'softmax')(dropout)
            model = Model(
                inputs = [input_ids, token_type_ids, attention_masks], 
                outputs = output
            )

            model.compile(
                optimizer = Adam(),
                loss = 'categorical_crossentropy',
                metrics = ['accuracy'],
            )

            model.summary()

            return model
        
    else:
        input_ids = Input(
                shape = (max_length, ), dtype = tf.int32, name = 'input_ids'
            )
        attention_masks = tf.keras.layers.Input(
            shape = (max_length, ), dtype = tf.int32, name = 'attention_masks'
        )
        token_type_ids = tf.keras.layers.Input(
            shape = (max_length, ), dtype = tf.int32, name = 'segment_ids'
        )

        # model load
        config = BertConfig.from_pretrained(r'model', output_hidden_states = True)
        bert_model = bert_model = TFBertModel.from_pretrained(r'model', from_pt = True, config = config)
        # Freeze the BERT model to reuse the pretrained features without modifying them.
        bert_model.trainable = False

        outputs = bert_model([input_ids, token_type_ids, attention_masks])
        sequence_output, _ = outputs.last_hidden_state, outputs.pooler_output
        # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
        bi_lstm = Bidirectional(
            LSTM(units = 64, return_sequences = True)
        )(sequence_output)
        # Applying hybrid pooling approach to bi_lstm sequence output.
        avg_pool = GlobalAveragePooling1D()(bi_lstm)
        max_pool = GlobalMaxPooling1D()(bi_lstm)
        concat = concatenate([avg_pool, max_pool])
        dropout = Dropout(rate = 0.3)(concat)
        output = Dense(units = 3, activation = 'softmax')(dropout)
        model = Model(
            inputs = [input_ids, token_type_ids, attention_masks], 
            outputs = output
        )

        model.compile(
            optimizer = Adam(),
            loss = 'categorical_crossentropy',
            metrics = ['accuracy'],
        )

        model.summary()

        return model

In [4]:
class BertSemanticDataGenerator(Sequence):
    """Generates batches of data.
    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.
    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size,
        shuffle = True,
        include_targets = True,
        max_len = 128
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets        
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = FullTokenizer('vocab.korean.rawtext.list')
        self.max_len = max_len
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()


    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size
    
    

    def get_batch_bert_input_data(self, sentence_pairs):
        
    
        sentence_pairs = list(map(lambda x: ' '.join(['[CLS]', x[0], '[SEP]', x[1], '[SEP]']), sentence_pairs))
    
        input_ids = map(lambda x: self.tokenizer.wordpiece_tokenizer.tokenize(x), sentence_pairs)
        input_ids = list(map(lambda x: self.tokenizer.convert_tokens_to_ids(x), input_ids))
                
        mask_array = list(map(lambda x: [1] * len(x), input_ids))
        input_mask_array = pad_sequences(mask_array, maxlen = self.max_len, padding = 'post')
        
        segment_index_lists = list(map(lambda x: np.where(x == tf.constant(3))[0], input_ids))
        input_segment_array = list(map(lambda x: ( [0] * (x[0] + 1) ) + [1] * ( x[1] - x[0] ), segment_index_lists))
        input_segment_array = pad_sequences(input_segment_array, maxlen = self.max_len, padding = 'post')
        
        input_id_array = pad_sequences(input_ids, maxlen = self.max_len, padding = 'post', dtype = 'int32')
        
        return [input_id_array, input_segment_array, input_mask_array]
    
        

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return self.get_batch_bert_input_data(sentence_pairs), labels
        else:
            return self.get_batch_bert_input_data(sentence_pairs)


    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [5]:
train_dataset = pd.read_csv(r'multinli.train.ko.tsv.txt', sep = '\t', error_bad_lines = False).dropna().reset_index(drop = True)    
train_dataset.gold_label = train_dataset.gold_label.apply(lambda x: categorizer(x))
y_train = to_categorical(train_dataset.gold_label, num_classes = 3)
train_data = BertSemanticDataGenerator(
    train_dataset[['sentence1', 'sentence2']].values.astype('str'), 
    y_train, 
    batch_size = 128, 
    max_len = 128,
    shuffle = True
    )

valid_dataset = pd.read_csv(r'xnli.test.ko.tsv.txt', sep = '\t', error_bad_lines = False).dropna().reset_index(drop = True)
valid_dataset.gold_label = valid_dataset.gold_label.apply(lambda x: categorizer(x))
y_valid = to_categorical(valid_dataset.gold_label, num_classes = 3)
valid_data = BertSemanticDataGenerator(
    valid_dataset[['sentence1', 'sentence2']].values.astype('str'), 
    y_valid, 
    batch_size = 128, 
    max_len = 128,
    shuffle = False
    )

b'Skipping line 24426: expected 3 fields, saw 4\nSkipping line 156343: expected 3 fields, saw 4\nSkipping line 218766: expected 3 fields, saw 4\nSkipping line 232318: expected 3 fields, saw 4\nSkipping line 253493: expected 3 fields, saw 4\n'
b'Skipping line 265734: expected 3 fields, saw 4\nSkipping line 282588: expected 3 fields, saw 4\nSkipping line 350969: expected 3 fields, saw 4\n'


In [6]:
early_stopping = EarlyStopping(
    monitor = 'val_loss',
    mode = 'min',
    verbose = 1,
    patience = 2
    )

In [7]:
EPOCHS = 10
labels = ['contradiction', 'neutral', 'entailment']
model = build_model(max_length = 128, gpus = 2)
# feature extraction
history = model.fit(
    train_data,
    validation_data = valid_data,
    epochs = EPOCHS,
    use_multiprocessing = True,
    workers = -1,
    callbacks = [early_stopping]
    )

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f4ec5792a58> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f4ec5792a58> is not a module, class, method, function, traceback, frame, or code object

Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
train_data = BertSemanticDataGenerator(
    train_dataset[['sentence1', 'sentence2']].values.astype('str'), 
    y_train, 
    batch_size = 24, 
    max_len = 128,
    shuffle = True
    )

valid_data = BertSemanticDataGenerator(
    valid_dataset[['sentence1', 'sentence2']].values.astype('str'), 
    y_valid, 
    batch_size = 24, 
    max_len = 128,
    shuffle = False
    )

In [9]:
# fine-tuning
model.trainable = True
model.compile(
    optimizer = Adam(1e-5),
    loss = 'categorical_crossentropy',
    metrics = ['acc']
    )

model.summary()

history = model.fit(        
    train_data,
    validation_data = valid_data,
    epochs = 50,
    use_multiprocessing = True,
    workers = -1,
    callbacks = [early_stopping]
    )

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109693440   input_ids[0][0]                  
                                                                 segment_ids[0][0]     

Epoch 00005: early stopping


In [10]:
model = build_model(max_length = 128, gpus = 0)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)    

In [11]:
get_similarity('델타변이, 마스크도 돌파…대전 태권도장발 185명 집단감염', '27일부터 비수도권 거리두기 3단계로 격상…식당·카페 밤 10시까지(상보)')

('neutral', ' 54.45%')

In [13]:
get_similarity('美, 하루 코로나 확진자 10만명 돌파…“6개월만에 최다”', '미, 하루 코로나 확진자 10만명선 넘어…6개월만에 최고치')

('neutral', ' 48.57%')

In [14]:
get_similarity('[올림픽] 김연경, 최초로 올림픽서 4차례 한 경기 30득점 이상', '‘최초’ 김연경, 올림픽서 4차례 한 경기 30득점 이상')

('neutral', ' 54.95%')

In [15]:
get_similarity('원희룡, 지사직 사퇴…"고뇌했지만 정권교체 위해 던진다"', '원희룡, 오늘 지사직 사퇴 기자회견…국민의힘 경선 집중')

('neutral', ' 62.85%')

In [16]:
get_similarity('그리고 그가 말했다, "엄마, 저 왔어요."', '그는 한마디도 하지 않았다.')

('neutral', ' 54.70%')

In [17]:
get_similarity('그리고 그가 말했다, "엄마, 저 왔어요."', '그는 엄마에게 집에 갔다고 말했다.')

('neutral', ' 45.63%')