In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd
from keras.models import load_model, Model
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
import random
from tqdm import tqdm
import collections
import math
import string

def fix_random_seed(seed_value = 42):
    """Seed every random number generator the notebook relies on.

    Seeds NumPy, the ``random`` module and TensorFlow (in that order) and then
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed_value: Seed handed to each generator. Defaults to 42.
    """
    seeders = (np.random.seed, random.seed, tf.random.set_seed)
    for seed_fn in seeders:
        seed_fn(seed_value)
    # NOTE: the variable is written for completeness; Python reads it at
    # interpreter start-up, so it mainly affects processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

## Chuẩn bị dữ liệu và cấu hình

In [3]:
#!wget http://www.manythings.org/anki/vie-eng.zip -O vie-eng.zip
#!unzip vie-eng.zip

# Load the tab-separated English/Vietnamese sentence pairs, discard the third
# column ('c') and keep only the first 5000 rows for a quick look.
data_path = './data/vie.txt'
lines = (
    pd.read_table(data_path, names=['eng', 'vie', 'c'])
    .drop(columns=['c'])
    .iloc[:5000]
)
lines.tail()

--2024-10-01 07:53:14--  http://www.manythings.org/anki/vie-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 381617 (373K) [application/zip]
Saving to: ‘vie-eng.zip’


2024-10-01 07:53:14 (1.34 MB/s) - ‘vie-eng.zip’ saved [381617/381617]

Archive:  vie-eng.zip
replace _about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: _about.txt              
replace vie.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: vie.txt                 


Unnamed: 0,eng,vie
4995,Who's your favorite movie star?,Ngôi sao điện ảnh mà bạn yêu thích là ai thế?
4996,Who's your favorite movie star?,Ngôi sao điện ảnh mà bạn yêu thích là ai vậy?
4997,Why aren't you in your uniform?,Tại sao bạn không mặc đồng phục?
4998,Will he be coming this evening?,Tối nay anh ấy có đến không?
4999,Will he be coming this evening?,Tối nay ông ấy có đến không?


In [4]:
# Paths and hyper-parameters shared by the cells below.
configs = dict(
    file_name=data_path,   # tab-separated corpus that is re-read before preprocessing
    batch_size=8,          # forwarded to model.fit
    embedding_dim=64,      # becomes d_model
    dff=128,               # width of the feed-forward hidden layer
    n_layers=2,            # number of encoder layers and of decoder layers
    n_heads=2,             # attention heads per attention block
    dropout=0.1,           # dropout rate handed to the Translator
    n_epochs=12,           # forwarded to model.fit
)

def process_input(lines):
    """Tokenise the parallel corpus and build the arrays used for training.

    Each row is assigned to the training split with probability 0.9 and to
    the validation split otherwise, using one ``random.random()`` draw per row
    (seed the ``random`` module beforehand for a reproducible split).  Both
    tokenizers are fitted on the complete corpus.  Vietnamese sentences are
    wrapped in ``<START>``/``<END>`` markers before tokenisation.

    Every padded array (encoder inputs, decoder inputs and the one-hot decoder
    target) is post-padded to the same length,
    ``max(max_input_length, max_output_length)``, so the decoder input and its
    target always agree along the time axis.

    Args:
        lines: DataFrame with the columns ``eng``, ``vie`` and ``c``; the
            ``c`` column is dropped.

    Returns:
        dict with the keys ``encoder_train_input_data``,
        ``encoder_val_input_data``, ``decoder_train_input_data``,
        ``decoder_target_data`` (one-hot, decoder input shifted left by one
        token), ``eng_tokenizer``, ``vie_tokenizer``, ``num_eng_tokens``,
        ``num_vie_tokens``, ``max_input_length``, ``max_output_length``,
        ``tokenized_val_vie_lines`` and ``eng_val_lines``.
    """
    lines = lines.drop(['c'], axis=1)

    eng_train_lines, vie_train_lines = [], []
    eng_val_lines, vie_val_lines = [], []
    eng_full_lines, vie_full_lines = [], []

    for line_eng, line_vie in zip(lines.eng, lines.vie):
        tagged_vie = '<START> ' + line_vie + ' <END>'
        eng_full_lines.append(line_eng)
        vie_full_lines.append(tagged_vie)
        # One random draw per sentence pair decides its split.
        if random.random() < 0.9:
            eng_train_lines.append(line_eng)
            vie_train_lines.append(tagged_vie)
        else:
            eng_val_lines.append(line_eng)
            vie_val_lines.append(tagged_vie)

    # English side: vocabulary from the full corpus, sequences per split.
    eng_tokenizer = preprocessing.text.Tokenizer()
    eng_tokenizer.fit_on_texts(eng_full_lines)
    tokenized_full_eng_lines = eng_tokenizer.texts_to_sequences(eng_full_lines)
    tokenized_train_eng_lines = eng_tokenizer.texts_to_sequences(eng_train_lines)
    tokenized_val_eng_lines = eng_tokenizer.texts_to_sequences(eng_val_lines)
    max_input_length = np.array([len(token_seq) for token_seq in tokenized_full_eng_lines]).max()

    # Vietnamese side, same procedure.
    vie_tokenizer = preprocessing.text.Tokenizer()
    vie_tokenizer.fit_on_texts(vie_full_lines)
    tokenized_full_vie_lines = vie_tokenizer.texts_to_sequences(vie_full_lines)
    tokenized_train_vie_lines = vie_tokenizer.texts_to_sequences(vie_train_lines)
    tokenized_val_vie_lines = vie_tokenizer.texts_to_sequences(vie_val_lines)
    max_output_length = np.array([len(token_seq) for token_seq in tokenized_full_vie_lines]).max()

    # A single padded length shared by every array.
    pad_length = max(max_input_length, max_output_length)

    # pad_sequences already returns a NumPy array, so no extra copy is made.
    encoder_train_input_data = preprocessing.sequence.pad_sequences(
        tokenized_train_eng_lines, maxlen=pad_length, padding='post')
    encoder_val_input_data = preprocessing.sequence.pad_sequences(
        tokenized_val_eng_lines, maxlen=pad_length, padding='post')
    decoder_train_input_data = preprocessing.sequence.pad_sequences(
        tokenized_train_vie_lines, maxlen=pad_length, padding='post')

    # Index 0 is reserved for padding, hence the +1.
    num_eng_tokens = len(eng_tokenizer.word_index) + 1
    num_vie_tokens = len(vie_tokenizer.word_index) + 1

    print( 'Độ dài lớn nhất của English là {}'.format( max_input_length ))
    print( 'Kích thước dữ liệu của Encoder  -> {}'.format( encoder_train_input_data.shape ))
    print( 'Số lượng English tokens = {}'.format( num_eng_tokens))

    print( 'Độ dài lớn nhất của tiếng việt là {}'.format( max_output_length ))
    print( 'kích thước dữ liệu đầu vào của Decoder -> {}'.format( decoder_train_input_data.shape ))
    print( 'Số lượng Vietnamese tokens = {}'.format( num_vie_tokens))

    # Target = decoder input shifted left by one token (leading marker removed).
    shifted_targets = [token_seq[1:] for token_seq in tokenized_train_vie_lines]
    # Pad to pad_length (not max_output_length) so the target has the same
    # number of time steps as decoder_train_input_data even when the English
    # side is the longer one.
    padded_targets = preprocessing.sequence.pad_sequences(
        shifted_targets, maxlen=pad_length, padding='post')
    # The one-hot tensor is large; it is returned as-is instead of being
    # duplicated through another np.array() call.
    decoder_target_data = utils.to_categorical(padded_targets, num_vie_tokens)

    return {
        "encoder_train_input_data": encoder_train_input_data,
        "encoder_val_input_data": encoder_val_input_data,
        "decoder_train_input_data": decoder_train_input_data,
        "decoder_target_data": decoder_target_data,
        "eng_tokenizer": eng_tokenizer,
        "vie_tokenizer": vie_tokenizer,
        "num_eng_tokens": num_eng_tokens,
        "num_vie_tokens": num_vie_tokens,
        "max_input_length": max_input_length,
        "max_output_length": max_output_length,
        "tokenized_val_vie_lines": tokenized_val_vie_lines,
        "eng_val_lines": eng_val_lines,
    }

In [5]:
# Re-read the complete corpus: process_input() drops the 'c' column itself, and
# the frame kept by the loading cell was already stripped of 'c' and cut to
# 5000 rows.
lines = pd.read_table(configs['file_name'] , names=['eng' , 'vie' , 'c' ] )

# NOTE(review): no seeding call precedes this line in the cells shown, so the
# random train/validation split may differ between runs -- confirm.
data_input = process_input(lines)


# Model hyper-parameters as module-level names; later cells read these directly.
num_layers = configs["n_layers"]
d_model = configs["embedding_dim"]
dff = configs["dff"]
num_heads = configs["n_heads"]
dropout_rate = configs["dropout"]

def masked_loss(label, pred):
    """Categorical cross-entropy averaged over non-padding positions.

    A position counts as padding when the arg-max of ``label`` along the last
    axis is 0.

    Args:
        label: One-hot encoded targets.
        pred: Predicted probabilities (the loss is built with
            ``from_logits=False``).

    Returns:
        Sum of the per-position losses at kept positions divided by the
        number of kept positions.
    """
    cross_entropy = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False, reduction='none')
    per_position = cross_entropy(label, pred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    keep = tf.cast(tf.argmax(label, axis=-1) != 0, dtype=per_position.dtype)

    return tf.reduce_sum(per_position * keep) / tf.reduce_sum(keep)

def masked_accuracy(label, pred):
    """Share of non-padding positions whose arg-max prediction is correct.

    Args:
        label: One-hot encoded targets; class 0 marks padding.
        pred: Per-class scores; the arg-max over axis 2 is the predicted id.

    Returns:
        Number of correct non-padding positions divided by the number of
        non-padding positions, as a float32 scalar.
    """
    predicted_ids = tf.argmax(pred, axis=2)
    true_ids = tf.cast(tf.argmax(label, axis=2), predicted_ids.dtype)

    keep = true_ids != 0
    correct = tf.logical_and(true_ids == predicted_ids, keep)

    n_correct = tf.reduce_sum(tf.cast(correct, dtype=tf.float32))
    n_kept = tf.reduce_sum(tf.cast(keep, dtype=tf.float32))
    return n_correct / n_kept

# NOTE(review): verbatim re-definition of the fix_random_seed() declared in the
# first cell; it rebinds the same name to identical code.
def fix_random_seed(seed_value = 42):
    """Seed NumPy, ``random`` and TensorFlow and export PYTHONHASHSEED.

    Args:
        seed_value: Seed passed to every generator (default 42).
    """
    np.random.seed(seed_value)
    random.seed(seed_value)
    tf.random.set_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with warm-up followed by inverse-sqrt decay.

    ``lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)``

    Args:
        d_model: Model width used to scale the rate.
        warmup_steps: Step count that appears in the warm-up term (default 4000).
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        # Keep the value exactly as supplied so get_config() can return it;
        # the float32 tensor stored in self.d_model is not serialisable.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimiser step ``step``."""
        step = tf.cast(step, dtype=tf.float32)
        decay_term = tf.math.rsqrt(step)
        warmup_term = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay_term, warmup_term)

    def get_config(self):
        """Return the constructor arguments as a plain dict."""
        return {
            'd_model': self._d_model_config,
            'warmup_steps': self.warmup_steps,
        }

Độ dài lớn nhất của English là 32
Kích thước dữ liệu của Encoder  -> (8515, 43)
Số lượng English tokens = 4115
Độ dài lớn nhất của tiếng việt là 43
kích thước dữ liệu đầu vào của Decoder -> (8515, 43)
Số lượng Vietnamese tokens = 2486


## Hàm thành phần của Transformer

In [6]:
def positional_encoding(length, depth):
    """Build a sinusoidal position table.

    The sine values fill the first half of the last axis and the cosine values
    the second half (the two halves are concatenated, not interleaved).

    Args:
        length: Number of positions (rows).
        depth: Requested encoding width; half of it is used for sines and
            half for cosines.

    Returns:
        float32 tensor with one row per position.
    """
    half_depth = depth/2

    position_column = np.arange(length)[:, np.newaxis]               # (length, 1)
    exponent_row = np.arange(half_depth)[np.newaxis, :]/half_depth   # (1, depth/2)
    rates = 1 / (10000**exponent_row)                                # (1, depth/2)
    angles = position_column * rates                                 # (length, depth/2)

    table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
    return tf.cast(table, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding (id 0 masked) with a fixed positional encoding added."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        # Table precomputed for 2048 positions; call() slices what it needs.
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the wrapped Embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """Embed the token ids in ``x`` and add the positional encoding."""
        seq_len = tf.shape(x)[1]
        # Scaling sets the relative magnitude of embedding vs. positional term.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = self.embedding(x) * scale
        return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]

In [7]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: global self-attention followed by a feed-forward net."""

    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        attention_kwargs = dict(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)
        self.self_attention = GlobalSelfAttention(**attention_kwargs)

        # NOTE(review): dropout_rate is not forwarded here, so FeedForward
        # uses its own default -- confirm this is intended.
        self.ffn = FeedForward(d_model, dff)

    def call(self, x):
        """Apply self-attention and then the feed-forward sub-layer to ``x``."""
        return self.ffn(self.self_attention(x))


class Encoder(tf.keras.layers.Layer):
    """Positional embedding, dropout and a stack of ``num_layers`` encoder layers."""

    def __init__(self, *, num_layers, d_model, num_heads,
                dff, vocab_size, dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model)

        layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)
        self.enc_layers = [EncoderLayer(**layer_kwargs) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        """Encode token ids of shape (batch, seq_len) into (batch, seq_len, d_model)."""
        hidden = self.dropout(self.pos_embedding(x))

        for encoder_layer in self.enc_layers:
            hidden = encoder_layer(hidden)

        return hidden

class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, cross-attention, feed-forward."""

    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        attention_kwargs = dict(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)
        self.causal_self_attention = CausalSelfAttention(**attention_kwargs)
        self.cross_attention = CrossAttention(**attention_kwargs)

        # NOTE(review): dropout_rate is not forwarded here, so FeedForward
        # uses its own default -- confirm this is intended.
        self.ffn = FeedForward(d_model, dff)

    def call(self, x, context):
        """Run the block on decoder states ``x`` attending to ``context``."""
        attended = self.causal_self_attention(x=x)
        attended = self.cross_attention(x=attended, context=context)

        # Expose the most recent cross-attention scores for plotting.
        self.last_attn_scores = self.cross_attention.last_attn_scores

        return self.ffn(attended)  # (batch_size, seq_len, d_model)


class Decoder(tf.keras.layers.Layer):
    """Positional embedding, dropout and a stack of ``num_layers`` decoder layers."""

    def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
                dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                                d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)
        self.dec_layers = [DecoderLayer(**layer_kwargs) for _ in range(num_layers)]

        # Filled in by call() with the last layer's cross-attention scores.
        self.last_attn_scores = None

    def call(self, x, context):
        """Decode target token ids ``x`` (batch, target_seq_len) given ``context``."""
        hidden = self.dropout(self.pos_embedding(x))  # (batch, target_seq_len, d_model)

        for decoder_layer in self.dec_layers:
            hidden = decoder_layer(hidden, context)

        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        return hidden  # (batch, target_seq_len, d_model)

In [8]:
class Translator:
    """Encoder-decoder Transformer with helpers to build, train and translate.

    ``build()`` must be called before any method that uses ``self.model``.
    """

    def __init__(self, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.0):
        self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

        self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)
        # Softmax output: the model emits probabilities, not logits.
        self.final_layer = tf.keras.layers.Dense(target_vocab_size, activation=tf.keras.activations.softmax)
        self.d_model = d_model

    def build(self):
        """Wire encoder, decoder and output layer into ``self.model``."""
        encoder_inputs = tf.keras.layers.Input(shape=( None ,))
        decoder_inputs = tf.keras.layers.Input(shape=( None ,))

        context = self.encoder(encoder_inputs)  # (batch_size, context_len, d_model)

        output = self.decoder(decoder_inputs, context)  # (batch_size, target_len, d_model)

        output = self.final_layer(output)  # (batch_size, target_len, target_vocab_size)

        try:
            # Delete the keras mask, so keras doesn't scale the loss+accuracy.
            del output._keras_mask
        except AttributeError:
            # No mask attached to the output tensor; nothing to remove.
            pass

        self.model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )

    def train(self, encoder_input_data , decoder_input_data, decoder_target_data,cfg):
        """Compile the model and fit it.

        Args:
            encoder_input_data: Padded source token ids.
            decoder_input_data: Padded target token ids fed to the decoder.
            decoder_target_data: One-hot targets.
            cfg: Mapping providing ``"batch_size"`` and ``"n_epochs"``.
        """
        learning_rate = CustomSchedule(self.d_model)
        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                        epsilon=1e-9)
        self.model.compile(
                loss=masked_loss,
                optimizer=optimizer,
                metrics=[masked_accuracy],
                )
        self.model.fit([encoder_input_data , decoder_input_data], decoder_target_data, batch_size=cfg["batch_size"], epochs=cfg["n_epochs"])

    def load_weights(self, model_file):
        """Load model weights from ``model_file``."""
        self.model.load_weights(model_file).expect_partial()

    def save_weights(self, model_file):
        """Save the current model weights to ``model_file``."""
        self.model.save_weights(model_file)

    def summary(self):
        """Print the architecture summary of the built model."""
        self.model.summary()

    def predict(self, x_test):
        """Run ``self.model.predict`` on ``x_test`` and return its output."""
        return self.model.predict(x_test)

    def translate(self, input,max_output_length, vie_tokenizer, return_attention=False):
        """Greedily decode a translation for one encoded source sentence.

        Decoding starts from the ``'start'`` vocabulary entry and appends the
        arg-max token at every step.  It stops after ``max_output_length``
        steps or as soon as the ``'end'`` entry or id 0 is predicted.

        Args:
            input: Encoder token ids; assumed to be a batch holding a single
                sentence.
            max_output_length: Upper bound on the number of decoding steps.
            vie_tokenizer: Fitted target tokenizer; its ``word_index`` must
                contain ``'start'`` and ``'end'``.
            return_attention: When true, also return the cross-attention
                scores of the last decoder layer.

        Returns:
            The list of predicted words (start marker excluded), or a
            ``(words, attention_heads)`` tuple when ``return_attention`` is set.
        """
        vie_word_dict=vie_tokenizer.word_index
        map_i2w = {val: key for key, val in vie_word_dict.items()}
        output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        start, end = vie_word_dict['start'], vie_word_dict['end']
        output_array = output_array.write(0, [start])

        for i in tf.range(max_output_length):
            output = tf.transpose(output_array.stack())
            predictions = self.model([input, output], training=False)

            # Select the last token from the `seq_len` dimension.
            predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.
            predicted_id = tf.argmax(predictions, axis=-1)

            # Stop on the end marker or on the padding id.
            if predicted_id[0] == end or predicted_id[0]==0:
                break

            # Append the prediction; it is fed back to the decoder next step.
            output_array = output_array.write(i+1, predicted_id[0])
        output = tf.transpose(output_array.stack())
        # Skip position 0, which holds the start marker.
        text = [map_i2w.get(token_id, '') for token_id in output[0].numpy()[1:]]
        if return_attention:
            # Re-run on the final sequence so the cached scores match it.
            self.model([input, output], training=False)
            attention_weights = self.decoder.last_attn_scores
            attention_heads = tf.squeeze(attention_weights, 0)
            return text, attention_heads
        return text

    def plot_attention(self, text, *, eng_tokenizer=None, vie_tokenizer=None,
                       max_input_length=None, max_output_length=None, **kwargs):
        """Translate ``text`` and draw one attention heat-map per head.

        The tokenizers and lengths are taken as explicit keyword arguments
        (for example from the dict returned by ``process_input``) instead of
        being read from notebook globals.

        Args:
            text: English sentence to translate.
            eng_tokenizer: Fitted source tokenizer.
            vie_tokenizer: Fitted target tokenizer.
            max_input_length: Length the encoded source sentence is padded to.
            max_output_length: Maximum number of decoding steps.
            **kwargs: Accepted for compatibility; currently unused.

        Raises:
            ValueError: If any of the four keyword arguments is missing.
        """
        assert isinstance(text, str)

        required = {
            'eng_tokenizer': eng_tokenizer,
            'vie_tokenizer': vie_tokenizer,
            'max_input_length': max_input_length,
            'max_output_length': max_output_length,
        }
        missing = [name for name, value in required.items() if value is None]
        if missing:
            raise ValueError(
                'plot_attention() needs the keyword argument(s): ' + ', '.join(missing))

        encoder_input = eng_tokenizer.texts_to_sequences([text])
        encoder_input = preprocessing.sequence.pad_sequences(
            encoder_input, maxlen=max_input_length, padding='post')
        output, attention = self.translate(
            encoder_input, max_output_length, vie_tokenizer, return_attention=True)
        attention_weights = tf.concat(attention, 0)
        context = text.split()

        for i in range(len(attention_weights)):
            attention = attention_weights[i]
            fig = plt.figure(figsize=(10, 10))
            ax = fig.add_subplot(1, 1, 1)

            ax.matshow(attention, cmap='viridis', vmin=0.0)

            fontdict = {'fontsize': 14}

            # The leading '' offsets the labels by one tick position.
            ax.set_xticklabels([''] + context, fontdict=fontdict, rotation=90)
            ax.set_yticklabels([''] + output, fontdict=fontdict)

            ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
            ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

            ax.set_ylabel('Output text')
            ax.set_xlabel(f'Head {i}')

## Kiến trúc Transformer tiêu chuẩn

In [None]:
class BaseAttention(tf.keras.layers.Layer):
    """Shared parts of every attention block: MHA, layer norm and residual add.

    All keyword arguments are forwarded to ``tf.keras.layers.MultiHeadAttention``.
    """

    def __init__(self, **mha_kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**mha_kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

class CrossAttention(BaseAttention):
    """Attention from decoder states (queries) to the encoder output."""

    def call(self, x, context):
        """Attend from ``x`` to ``context``; residual add, then layer norm."""
        attended, scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)
        # Kept on the layer so the scores can be plotted later.
        self.last_attn_scores = scores
        return self.layernorm(self.add([x, attended]))

class GlobalSelfAttention(BaseAttention):
    """Self-attention without a causal mask."""

    def call(self, x):
        """Attend from ``x`` to itself; residual add, then layer norm."""
        attended = self.mha(query=x, value=x, key=x)
        return self.layernorm(self.add([x, attended]))

class CausalSelfAttention(BaseAttention):
    """Self-attention computed with ``use_causal_mask=True``."""

    def call(self, x):
        """Causally attend from ``x`` to itself; residual add, then layer norm."""
        attended = self.mha(query=x, value=x, key=x, use_causal_mask = True)
        return self.layernorm(self.add([x, attended]))

class FeedForward(tf.keras.layers.Layer):
    """Two-layer position-wise network with residual connection and layer norm.

    Args:
        d_model: Output width of the second Dense layer.
        dff: Width of the hidden ReLU layer.
        dropout_rate: Dropout applied after the second Dense layer.
    """

    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        hidden = tf.keras.layers.Dense(dff, activation='relu')
        projection = tf.keras.layers.Dense(d_model)
        drop = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([hidden, projection, drop])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        """Return ``layer_norm(x + seq(x))``."""
        residual = self.add([x, self.seq(x)])
        return self.layer_norm(residual)

In [None]:
# Seed all generators before the layers are created.
fix_random_seed(24)

# Baseline ("standard") Transformer built from the hyper-parameters read out
# of `configs` and the vocabulary sizes produced by process_input().
standard =Translator(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=data_input["num_eng_tokens"],
    target_vocab_size=data_input["num_vie_tokens"],
    dropout_rate=dropout_rate)

# build() creates the Keras model that summary() prints.
standard.build()
standard.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 encoder (Encoder)           (None, None, 64)             363456    ['input_1[0][0]']             
                                                                                                  
 decoder (Decoder)           (None, None, 64)             325888    ['input_2[0][0]',             
                                                                     'encoder[0][0]']         

## 5. Scaled Dot Product Attention --> Xoá phần Scale
Trong phần này, chúng ta sẽ thử nghiệm phiên bản **Beta** của Transformer khi xoá phần **Scale** trong hàm Dot-Product của kiến trúc Attention.


**TODO:** Cài đặt lại hàm **_compute_attention** của lớp MultiHeadAttention, và bỏ phần scale khi xử lý **query**

In [9]:
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, Input, Dropout, LayerNormalization
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForTokenClassification

class custom_MultiHeadAttention(MultiHeadAttention):
    """MultiHeadAttention whose dot-product scores are left unscaled."""

    def _compute_attention(self, query, key, value, attention_mask=None, training=None):
        """Compute attention without scaling ``query`` first.

        Returns:
            Tuple ``(attention_output, attention_scores)``; the scores are the
            soft-maxed values before dropout.
        """
        ### BEGIN SOLUTION
        # Raw key/query products -- the query is deliberately not scaled.
        raw_scores = tf.einsum(self._dot_product_equation, key, query)
        attention_scores = self._masked_softmax(raw_scores, attention_mask)
        # Dropout is applied only to the copy used for the weighted sum.
        dropped_scores = self._dropout_layer(attention_scores, training=training)
        attention_output = tf.einsum(self._combine_equation, dropped_scores, value)
        ### END SOLUTION

        return attention_output, attention_scores

In [10]:
class BaseAttention(tf.keras.layers.Layer):
    """Attention building block of the "beta" model (no score scaling).

    Identical to the standard block except that the attention layer is
    ``custom_MultiHeadAttention``, whose ``_compute_attention`` skips the
    scaling of the query.  All keyword arguments are forwarded to it.
    """

    def __init__(self, **kwargs):
        super().__init__()

        ### BEGIN SOLUTION
        # Use the unscaled variant; the stock MultiHeadAttention would make
        # this model the same as the standard one.
        self.mha = custom_MultiHeadAttention(**kwargs)
        ### END SOLUTION

        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

class CrossAttention(BaseAttention):
    """Decoder-to-encoder attention, re-declared on this cell's BaseAttention."""

    def call(self, x, context):
        """Attend from ``x`` to ``context``; residual add, then layer norm."""
        mha_result = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)
        attn_output, attn_scores = mha_result
        # Cached so the attention map can be plotted afterwards.
        self.last_attn_scores = attn_scores
        summed = self.add([x, attn_output])
        return self.layernorm(summed)

class GlobalSelfAttention(BaseAttention):
    """Unmasked self-attention, re-declared on this cell's BaseAttention."""

    def call(self, x):
        """Attend from ``x`` to itself; residual add, then layer norm."""
        summed = self.add([x, self.mha(query=x, value=x, key=x)])
        return self.layernorm(summed)

class CausalSelfAttention(BaseAttention):
    """Causally masked self-attention, re-declared on this cell's BaseAttention."""

    def call(self, x):
        """Causally attend from ``x`` to itself; residual add, then layer norm."""
        masked_output = self.mha(query=x, value=x, key=x, use_causal_mask = True)
        summed = self.add([x, masked_output])
        return self.layernorm(summed)

class FeedForward(tf.keras.layers.Layer):
    """Dense(relu) -> Dense -> Dropout with a residual add and layer norm.

    Args:
        d_model: Width of the block's output.
        dff: Width of the hidden layer.
        dropout_rate: Rate of the trailing Dropout layer.
    """

    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate),
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        """Return the layer-normalised sum of ``x`` and ``seq(x)``."""
        transformed = self.seq(x)
        return self.layer_norm(self.add([x, transformed]))

In [12]:
### BEGIN PUBLIC TESTS
# Smoke test: runs FeedForward on a random tensor and checks its output sums.
fix_random_seed(42)
# NOTE(review): batch_size and seq_len are assigned but not used below; the
# tensor shapes are hard-coded instead.
batch_size = 1
seq_len = 3
# These rebind the module-level d_model / num_heads (same values as in configs).
d_model = 64
num_heads= 2

x = tf.random.uniform((1, 8, 64))
# NOTE(review): `context` is created but never used in this cell.
context = tf.random.uniform((1, 8, 64))

Test_FeedForward = FeedForward(d_model, dff, dropout_rate)
Output_FeedForward = Test_FeedForward(x)

# Despite the name, this is the sum of the FeedForward output over the last
# axis; the assertion requires that none of these sums is exactly 1.0.
attention_scores_sum = tf.reduce_sum(Output_FeedForward, axis=-1)
assert tf.reduce_all(tf.not_equal(attention_scores_sum, 1.0)).numpy()

### END PUBLIC TESTS

In [None]:
# Same seed as the standard model so the two builds start from the same state.
fix_random_seed(24)

# "Beta" Transformer: same hyper-parameters as the standard model.
# NOTE(review): the layers look up the attention classes by name when they are
# constructed, so this uses whichever definitions were executed last -- make
# sure the beta attention cell has been run before this one.
beta=Translator(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=data_input["num_eng_tokens"],
    target_vocab_size=data_input["num_vie_tokens"],
    dropout_rate=dropout_rate)

beta.build()
beta.summary()

In [None]:
# Multiple-choice answers comparing the beta model with the original model.
# NOTE(review): option 5 says 'alpha' although this section is about the beta
# model, and option 0 contains the typo 'chêch' (probably 'chênh'); both are
# runtime strings and are left untouched here -- confirm with the author.
options = {0: 'hiệu suất beta xấp xỉ mô hình gốc (không chêch lệch quá 1%)',
           1: 'hiệu suất beta thấp hơn mô hình gốc k% (1<k<5)',
           2: 'hiệu suất beta thấp hơn mô hình gốc k% (5<=k<10)',
           3: 'hiệu suất beta thấp hơn mô hình gốc k% (10<=k<20)',
           4: 'hiệu suất beta thấp hơn mô hình gốc trên 20%',
           5: 'alpha gần như không học gì (hiệu suất thấp hơn 10%)'}
# Placeholder; replaced by the chosen option key inside the solution markers.
your_choice = None
### BEGIN SOLUTION
your_choice = 1
### END SOLUTION
# Prints the text of the selected option.
print("Theo mình thì: ", options[your_choice])