In [1]:
import csv
import time
import random
import numpy as np
import pandas as pd
import tensorflow as tf
# import tensorflow_addons as tfa

import sentencepiece as spm
tf.__version__

'2.3.0'

In [2]:
from tensorflow.raw_ops import RaggedTensorToVariant

@tf.RegisterGradient("RaggedTensorFromVariant")
def _RaggedTensorFromVariantGrad(*args):
    if len(args) == 2:
        op, grad = args
        res = [RaggedTensorToVariant(rt_nested_splits=[], rt_dense_values=grad,
                                      batched_input=False)]
    else:
        op, empty, grad = args
        res = [RaggedTensorToVariant(rt_nested_splits=[op.outputs[0]], rt_dense_values=grad,
                                    batched_input=True)]

### data load

In [3]:
train = pd.read_csv('./data/train.csv').fillna('')
# test = pd.read_csv('./data/test.csv').fillna('')
# test_df = train.copy()

In [4]:
train['query'] = train.apply(lambda x: x['query'].lower(), axis=1)
train['product_title'] = train.apply(lambda x: x['product_title'].lower(), axis=1)
train['product_description'] = train.apply(lambda x: x['product_description'].lower(), axis=1)

In [5]:
sentences_train = list(train['query'].unique()) + \
                  list(train['product_title'].unique()) + \
                  list(train['product_description'].unique())

In [6]:
# with open('./sentences_.txt', 'w', encoding='utf8') as f:
#     f.write('\n'.join(sentences_train))

In [7]:
parameter = '--input={} --model_prefix={} --vocab_size={} --user_defined_symbols={} --model_type={}'

input_file = './sentences_.txt'
model_prefix = 'sentences_'
vocab_size = 2000
user_defined_symbols = '▁[PAD],▁[UNK],▁[CLS],▁[SEP],▁[MASK]'
model_type = 'bpe'

cmd = parameter.format(input_file, model_prefix, vocab_size, user_defined_symbols, model_type)
spm.SentencePieceTrainer.Train(cmd)

In [8]:
vocab_list = pd.read_csv('./sentences_.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)
vocab_size = len(vocab_list)

In [9]:
sp = spm.SentencePieceProcessor()
vocab_file = './sentences_.model'
sp.load(vocab_file)

True

In [10]:
train['query_processed'] = train.apply(lambda x: sp.encode_as_ids(x['query']), axis=1)
train['product_title_processed'] = train.apply(lambda x: sp.encode_as_ids(x['product_title']), axis=1)
train['product_description_processed'] = train.apply(lambda x: sp.encode_as_ids(x['product_description']), axis=1)
train['NSP_label'] = train.apply(lambda x: 0 if random.uniform(0, 1) <0.5 else 1, axis=1)

In [11]:
q_dict = {}
for q in train['query'].unique():
    q_dict[q] = train.groupby('query').get_group(q)['product_title_processed']

In [12]:
temp = list(train['query'].unique())
def sentence_change(x):
    if x['NSP_label'] == 0:
        tmp = random.choice(temp)
        while tmp == x['query']:
            tmp = random.choice(temp)
        return random.choice(list(q_dict[tmp]))
    return x['product_title_processed']

In [13]:
train['changed_product_title'] = train.apply(lambda x: sentence_change(x), axis=1)

In [14]:
del q_dict

In [15]:
def _mask(x):
    new_query = []
    masked_indexes = []
    for i, w in enumerate(x):
        if random.uniform(0, 1) <= 0.15:
            if random.uniform(0, 1) <= 0.8:
                new_query.append(7)
                masked_indexes.append(i)
            elif random.uniform(0, 1) <= 0.5:
                random_word = random.randint(8, vocab_size-8)
                new_query.append(random_word)
                masked_indexes.append(i)
            else :
                new_query.append(w)
                masked_indexes.append(i)
        else :
            new_query.append(w)
    return new_query, masked_indexes

In [16]:
def test_func(x):
    query, label = _mask(x['query_processed'])
    x['masked_query'] = [5] + query + [6]
    query_label = [l+1 for l in label]
    title, label = _mask(x['changed_product_title'])
    x['masked_product_title'] = title + [6]
    title_label = [l+len(x['masked_query']) for l in label]
    x['LM_label_idx'] = query_label + title_label
    return x

In [17]:
train = train.apply(test_func, axis=1)

In [18]:
def gen_pair_label(x):
    return [5] + x['query_processed'] + [6] + x['changed_product_title'] + [6]

In [19]:
train['original_label'] = train.apply(lambda x: gen_pair_label(x), axis=1)

In [20]:
def get_LM_label(x):
    return [x['original_label'][i] for i in x['LM_label_idx']]

In [21]:
train['LM_label'] = train.apply(lambda x: get_LM_label(x), axis=1)

In [22]:
print(train['masked_query'].iloc[0] + train['masked_product_title'].iloc[0])
print(train['original_label'][0])

print(sp.DecodeIds(train['masked_query'].iloc[0] + train['masked_product_title'].iloc[0]))
print(sp.DecodeIds(train['original_label'][0]))

[5, 217, 66, 31, 1803, 627, 905, 6, 103, 7, 598, 1953, 7, 1952, 7, 1235, 6]
[5, 217, 66, 31, 1803, 627, 905, 6, 103, 268, 598, 1953, 831, 1952, 1304, 1235, 6]
[CLS] bridal shower decorations [SEP] v [MASK] otw [MASK]b [MASK] backpack [SEP]
[CLS] bridal shower decorations [SEP] vans otw washburn backpack [SEP]


In [23]:
from sklearn.model_selection import train_test_split
# position embedding max_len
max_len = train.masked_query.map(len).max() + train.masked_product_title.map(len).max()
train, dev = train_test_split(train, test_size=0.05)

#### Hyper parameter

In [24]:
# voca 수
vocab_size = len(vocab_list)

# 임베딩 벡터의 크기
d_model = 256

# encoder layer 수
num_layers = 4

# attentin 수
num_heads = 4
depth = d_model/num_heads

# max_len = train.masked_query.map(len).max() + train.masked_product_title.map(len).max()


In [25]:
metadata = {'masked_query' : tf.ragged.constant(train['masked_query'], dtype=tf.int32, ragged_rank=1),
            'masked_product_title' : tf.ragged.constant(train['masked_product_title'], dtype=tf.int32, ragged_rank=1),
            'LM_label_idx' : tf.ragged.constant(train['LM_label_idx'], ragged_rank=1),
            'NSP_label' : tf.constant(train['NSP_label']),
            'LM_label' : tf.ragged.constant(train['LM_label'], dtype=tf.int32, ragged_rank=1)}

In [26]:
metadata_dev = {'masked_query' : tf.ragged.constant(dev['masked_query'], dtype=tf.int32, ragged_rank=1),
            'masked_product_title' : tf.ragged.constant(dev['masked_product_title'], dtype=tf.int32, ragged_rank=1),
            'LM_label_idx' : tf.ragged.constant(dev['LM_label_idx'], ragged_rank=1),
            'NSP_label' : tf.constant(dev['NSP_label']),
            'LM_label' : tf.ragged.constant(dev['LM_label'], dtype=tf.int32, ragged_rank=1)}

In [27]:
ds_dev = tf.data.Dataset.from_tensor_slices(metadata_dev)
ds_dev = ds_dev.shuffle(buffer_size=len(dev))
batchs_dev = 32
ds_devs = ds_dev.batch(batchs_dev)
example_batch_dev = next(iter(ds_devs))

In [28]:
ds = tf.data.Dataset.from_tensor_slices(metadata)

ds = ds.shuffle(buffer_size=len(train))
batchs = 128
ds = ds.batch(batchs).repeat()
example_batch = next(iter(ds))
# example_batch

In [29]:
class Bert_Embedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model, max_len, name='Bert_Embedding'):
        super(Bert_Embedding, self).__init__(name=name)
        self._supports_ragged_inputs = True 
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.max_len = max_len
        
    def build(self, input_shape):
        self.Token_Embedding = tf.Variable(tf.random.truncated_normal([self.vocab_size, self.d_model],
                                                                      stddev=1.0 / np.sqrt(self.d_model)), 
                                           trainable=True)
        self.Segment_Embedding = tf.Variable(tf.random.truncated_normal([2, self.d_model],
                                                                        stddev=1.0 / np.sqrt(self.d_model)), 
                                             trainable=True)
        self.Position_Embedding = tf.Variable(tf.random.truncated_normal([self.max_len, self.d_model],
                                                                         stddev=1.0 / np.sqrt(self.d_model)), 
                                              trainable=True)
        
    def call(self, inputs):
        sentence_pair = tf.concat([inputs['masked_query'], inputs['masked_product_title']], axis=1)
        T_embedding = tf.nn.embedding_lookup(self.Token_Embedding, sentence_pair)
        
        Sa = tf.zeros_like(inputs['masked_query'])
        Sb = tf.ones_like(inputs['masked_product_title'])
        S_embedding = tf.gather(self.Segment_Embedding, tf.concat([Sa, Sb], axis=1))
        
        elems = tf.math.reduce_sum(tf.ones_like(sentence_pair), axis=1)
        ragged_range = tf.map_fn(tf.range, elems, fn_output_signature=tf.RaggedTensorSpec(shape=[None], dtype=tf.int32))
#         P_embedding = tf.ragged.map_flat_values(tf.nn.embedding_lookup, self.Position_Embedding, ragged_range)
        P_embedding = tf.gather(self.Position_Embedding, ragged_range)
    
        return tf.math.add_n([T_embedding, S_embedding, P_embedding])
        
    def compute_output_shape(self, inputs):
        return (inputs['masked_query'].shape[0], None, self.d_model)
    
# bert_embedding = Bert_Embedding(vocab_size, d_model, max_len)
# result = bert_embedding(example_batch)
# result.shape

In [30]:
class Scaled_Dot_Product_Attention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, name='Scaled_Dot_Product_Attention'):
        super(Scaled_Dot_Product_Attention, self).__init__(name=name)
        self._supports_ragged_inputs = True
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = self.d_model/self.num_heads
        self.Q_layer = tf.keras.layers.Dense(units=self.depth)
        self.K_layer = tf.keras.layers.Dense(units=self.depth)
        self.V_layer = tf.keras.layers.Dense(units=self.depth)
    
    def QKV_Gen(self, inputs):
        Query = tf.ragged.map_flat_values(self.Q_layer, inputs)
        Key = tf.ragged.map_flat_values(self.K_layer, inputs)
        Value = tf.ragged.map_flat_values(self.V_layer, inputs)
        return Query, Key, Value

#     def calculate_attention(self, x):
#         matmul_qk = tf.matmul(x[0], x[1], transpose_b=True)
#         logits = matmul_qk / tf.math.sqrt(depth)
#         attention_weights = tf.nn.softmax(logits)
#         return tf.matmul(attention_weights, x[2])

#     def call(self, inputs):
#         query, key, value = self.QKV_Gen(inputs)
#         attention_value=tf.map_fn(fn=self.calculate_attention, 
#                                   elems=(query, key, value),
#                                   dtype=tf.float32,
#                                   fn_output_signature=tf.RaggedTensorSpec(shape=[None, int(self.depth)], ragged_rank=0))
#         return attention_value
    
    def call(self, inputs):
        query, key, value = self.QKV_Gen(inputs)
        matmul_qk = tf.ragged.map_flat_values(tf.matmul, query, key, transpose_b=True) 
        logits = matmul_qk / tf.math.sqrt(self.depth)
        attention_weights = tf.ragged.map_flat_values(tf.nn.softmax, logits, axis=-1) 
        attention_value = tf.ragged.map_flat_values(tf.matmul, attention_weights, value) 
        return attention_value    
    
    def compute_output_shape(self, inputs):
        return (inputs.shape[0], None, self.d_model)
    
# scaled_Dot_Product_Attention = Scaled_Dot_Product_Attention(d_model, num_heads, name='Scaled_Dot_Product_Attention')
# qattention_value = scaled_Dot_Product_Attention(result)
# attention_value.shape

In [31]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, name='multi_head_attention'):
        super(MultiHeadAttention, self).__init__(name=name)
        self._supports_ragged_inputs = True
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // self.num_heads
        self.scaled_dot_product_attention_dict = {}
        for i in range(self.num_heads):
            self.scaled_dot_product_attention_dict[i] = Scaled_Dot_Product_Attention(self.d_model, self.num_heads)
        self.drop_out = tf.keras.layers.Dropout(rate=0.1)
        self.norm =  tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
    def build(self, input_shape):
        w_init = tf.random_normal_initializer()
        b_init = tf.zeros_initializer()
        self.W = tf.Variable(initial_value=w_init(shape=(input_shape[-1], self.d_model), dtype='float32'), trainable=True)
        self.b = tf.Variable(initial_value=b_init(shape=(self.d_model,), dtype='float32'), trainable=True)

    def call(self, inputs):
        attention_value_dict = {}
        for i in range(self.num_heads):
            attention_value_dict[i] = self.scaled_dot_product_attention_dict[i](inputs)
        concat_attention = tf.concat([attention_value_dict[i] for i in range(self.num_heads)], axis=-1)
        
        outputs = self.drop_out(tf.ragged.map_flat_values(tf.matmul, concat_attention, self.W) + self.b)
        return tf.ragged.map_flat_values(self.norm, tf.math.add(outputs, inputs))
    
    def compute_output_shape(self, inputs):
        return (inputs.shape[0], None, self.d_model)r
    
    
# multiheadattention = MultiHeadAttention(d_model, num_heads)
# multiheadattentionmatrix = multiheadattention(result)
# # multiheadattentionmatrix.shape

In [32]:
class Add_and_Norm(tf.keras.layers.Layer):
    def __init__(self, name='Add_and_Norm'):
        super(Add_and_Norm, self).__init__(name=name)
        self._supports_ragged_inputs = True
        self.drop_out = tf.keras.layers.Dropout(rate=0.1)
        self.norm =  tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, residual):
        outputs = self.drop_out(inputs)
        outputs = tf.ragged.map_flat_values(self.norm, tf.math.add(outputs, residual)) 
        return outputs
    
    def compute_output_shape(self, inputs):
        return (inputs.shape)
    
# add_and_norm = Add_and_Norm()
# norm = add_and_norm(multiheadattentionmatrix, result)
# # norm.shape

In [33]:
class Feed_Forward_NN(tf.keras.layers.Layer):
    def __init__(self, d_model, name='Feed_Forward_NN'):
        super(Feed_Forward_NN, self).__init__(name=name)
        self._supports_ragged_inputs = True
        self.d_model = d_model
        self.dff = self.d_model*4
        self.layer1 = tf.keras.layers.Dense(units=self.dff, activation='relu')
        self.layer2 = tf.keras.layers.Dense(units=self.d_model)
        self.drop_out = tf.keras.layers.Dropout(rate=0.1)
        self.norm =  tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs):
        x = tf.ragged.map_flat_values(self.layer1, inputs) 
        x = tf.ragged.map_flat_values(self.layer2, x) 
        x = self.drop_out(x)
        return tf.ragged.map_flat_values(self.norm, tf.math.add(x, inputs)) 
    
# ffnn = FFNN(d_model)
# output = ffnn(multiheadattentionmatrix, result)
# # output.shape

In [34]:
class BERT(tf.keras.models.Model):
    def __init__(self, vocab_size, d_model, num_heads, max_len, num_layers, name='BERT'):
        super(BERT, self).__init__(name=name)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.max_len = max_len
        self.num_layers = num_layers
        
        self.bert_embedding = Bert_Embedding(self.vocab_size, self.d_model, self.max_len)
        self.ffnn_dict = {}
        self.mha_dict = {}
        for i in range(self.num_layers):
            self.mha_dict[i] = MultiHeadAttention(self.d_model, self.num_heads, name='MultiHeadAttention_%d'%(i+1))
            self.ffnn_dict[i] = Feed_Forward_NN(self.d_model, name='Feed_Forward_NN_%d'%(i+1))
        self.lm_layer = tf.keras.layers.Dense(self.vocab_size, activation='softmax', name='lm_layer')
        self.nsp_layer = tf.keras.layers.Dense(2, activation='softmax', name='nsp_layer')
        self.finetune_layer = tf.keras.layers.Dense(4, activation='softmax', name='finetune_layer')
        
    def call(self, inputs, mode='pretrain'):
        x = self.bert_embedding(inputs)
        for i in range(self.num_layers):
            x = self.mha_dict[i](x)
            x = self.ffnn_dict[i](x)
        lm_x = tf.map_fn(fn=lambda rt: tf.gather(rt[0], rt[1]), 
                         elems=(x, inputs['LM_label_idx']),
                         dtype=tf.float32,
                         fn_output_signature=tf.RaggedTensorSpec(ragged_rank=0))
        lm = tf.ragged.map_flat_values(self.lm_layer, lm_x) 
        nsp = self.nsp_layer(x[:,:1].to_tensor())
        pred = self.finetune_layer(x[:,:1].to_tensor())

        if mode == 'pretrain':
            self.finetune_layer.trainable = False
            self.lm_layer.trainable = True
            self.nsp_layer.trainable = True
            return lm, tf.squeeze(nsp)
        elif mode == 'finetune':
            self.lm_layer.trainable = False
            self.nsp_layer.trainable = False
            self.finetune_layer.trainable = True
            return tf.squeeze(pred)
        
    def predict(self, inputs):
        self.lm_layer.trainable = False
        self.nsp_layer.trainable = False
        self.finetune_layer.trainable = False
        return self.call(inputs, mode='finetune')

In [35]:
def Pairwise_loss(y_true, y_pred):
    lm_true = y_true['LM_label']   
    lm_pred = y_pred[0]
    lm_loss = tf.ragged.map_flat_values(tf.keras.losses.sparse_categorical_crossentropy, lm_true, lm_pred)
    lm_loss = tf.keras.backend.mean(lm_loss)
    
    nsp_true = tf.reshape(y_true['NSP_label'], shape=(-1, 1))
    nsp_pred = y_pred[1]
    nsp_loss = tf.keras.losses.BinaryCrossentropy()(nsp_true, nsp_pred)
    return lm_loss + nsp_loss

In [36]:
tf.keras.backend.clear_session()
model = BERT(vocab_size, d_model, num_heads, max_len, num_layers, name='pretrain')

Pairwise_loss(y_true=example_batch, y_pred=model(example_batch))

<tf.Tensor: shape=(), dtype=float32, numpy=8.625193>

In [37]:
model.summary()

Model: "pretrain"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Bert_Embedding (Bert_Embeddi multiple                  540672    
_________________________________________________________________
Feed_Forward_NN_1 (Feed_Forw multiple                  526080    
_________________________________________________________________
Feed_Forward_NN_2 (Feed_Forw multiple                  526080    
_________________________________________________________________
Feed_Forward_NN_3 (Feed_Forw multiple                  526080    
_________________________________________________________________
Feed_Forward_NN_4 (Feed_Forw multiple                  526080    
_________________________________________________________________
MultiHeadAttention_1 (MultiH multiple                  263680    
_________________________________________________________________
MultiHeadAttention_2 (MultiH multiple                  263

In [38]:
learning_rate = .00001
print_step = 10
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [39]:
loss_sum = 0
loss_history = []
loss_history_dev = []

start = time.time()
for step, batch_train in enumerate(ds):
    with tf.GradientTape() as tape:
        logits = model(batch_train, mode='pretrain')
        loss_value = Pairwise_loss(y_true=batch_train, y_pred=logits)
        loss_sum += loss_value 
        
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

    if step % print_step == 0:
        current_loss_average = float(loss_sum)/print_step
        if step ==0:
            current_loss_average = loss_sum
        loss_history.append(current_loss_average)
        loss_sum = 0
        
        loss_sum_dev = 0
        for dev_step, batch_dev in enumerate(ds_devs):
            logits_dev = model(batch_dev, mode='pretrain')
            loss_value_dev = Pairwise_loss(y_true=batch_dev, y_pred=logits_dev)
            loss_sum_dev += loss_value_dev 
        current_loss_average_dev = loss_sum_dev/dev_step
        loss_history_dev.append(current_loss_average_dev)
        ds_devs = ds_dev.batch(batchs_dev)
        print("Training loss at step %d: %.5f, dev loss : %.5f, time : "% (step, current_loss_average, current_loss_average_dev), time.time() - start)

#         print("Training loss at step %d: %.5f"% (step, current_loss_average))
#         print("Seen so far: %s train samples, learning rate: %.5f" % ((step + 1) * batchs, learning_rate))
#         print("time :", time.time() - start)
        start = time.time() 
        
    if step == 100:
        break

Training loss at step 0: 8.60710, dev loss : 9.00872, time :  18.680289268493652
Training loss at step 10: 8.41563, dev loss : 8.99298, time :  90.47512412071228
Training loss at step 20: 8.39720, dev loss : 8.94570, time :  89.21184635162354
Training loss at step 30: 8.36994, dev loss : 8.93658, time :  89.20885038375854
Training loss at step 40: 8.36278, dev loss : 8.91974, time :  89.76152849197388
Training loss at step 50: 8.35014, dev loss : 8.90440, time :  83.96385431289673
Training loss at step 60: 8.33517, dev loss : 8.89219, time :  90.35027575492859
Training loss at step 70: 8.32502, dev loss : 8.88064, time :  84.92830109596252
Training loss at step 80: 8.30131, dev loss : 8.86932, time :  81.28838992118835
Training loss at step 90: 8.29911, dev loss : 8.85437, time :  91.16472721099854
Training loss at step 100: 8.27529, dev loss : 8.84190, time :  90.00339365005493
