In [1]:
import numpy as np 
def reweight_distribution(original_distribution, temperature=0.5):   
    distribution = np.log(original_distribution) / temperature
    distribution = np.exp(distribution)
    return distribution / np.sum(distribution) 

In [2]:
original_distribution = np.array([0.1, 0.2, 0.7])
temperature = 0.5
new_distribution = reweight_distribution(original_distribution, temperature)
print(new_distribution)


[0.01851852 0.07407407 0.90740741]


In [3]:
import urllib.request

url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
urllib.request.urlretrieve(url, "aclImdb_v1.tar.gz")
print("Download complete.")


Download complete.


In [None]:
import tarfile

file_name = "aclImdb_v1.tar.gz"
with tarfile.open(file_name, "r:gz") as tar:
    tar.extractall()
    print("Extraction complete.")


In [None]:
import tensorflow as tf 
from tensorflow import keras
dataset = keras.utils.text_dataset_from_directory(
    directory="aclImdb", label_mode=None, batch_size=256)
dataset = dataset.map(lambda x: tf.strings.regex_replace(x, "<br />", " ")) 

Found 100006 files.


 Preparing a TextVectorization layer

In [None]:
from tensorflow.keras.layers import TextVectorization

sequence_length = 100 
vocab_size = 15000

text_vectorization = TextVectorization(
    max_tokens=vocab_size,                
    output_mode="int",               
    output_sequence_length=sequence_length,   
)
text_vectorization.adapt(dataset)


Setting up a language modeling dataset

In [None]:
def prepare_lm_dataset(text_batch):
    vectorized_sequences = text_vectorization(text_batch)
    x = vectorized_sequences[:, :-1] 
    y = vectorized_sequences[:, 1:]     
    return x, y

lm_dataset = dataset.map(prepare_lm_dataset, num_parallel_calls=4)

 A simple Transformer-based language model

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Embedding cho các từ
        self.token_embedding = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        # Embedding cho vị trí
        self.position_embedding = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)

    def call(self, inputs):
        # Tạo chuỗi các vị trí (0, 1, 2, ..., sequence_length - 1)
        positions = tf.range(start=0, limit=tf.shape(inputs)[-1], delta=1)
        positions = tf.expand_dims(positions, 0)  # Thêm batch dimension nếu cần
        # Nhúng các từ và vị trí
        embedded_tokens = self.token_embedding(inputs)
        embedded_positions = self.position_embedding(positions)
        return embedded_tokens + embedded_positions


In [None]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, encoder_outputs):
        attention_output = self.attention(query=inputs, value=encoder_outputs, key=encoder_outputs)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


In [None]:
from tensorflow.keras import layers
embed_dim = 256 
latent_dim = 2048 
num_heads = 2 

inputs = keras.Input(shape=(None,), dtype="int64")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, x)
outputs = layers.Dense(vocab_size, activation="softmax")(x)     
model = keras.Model(inputs, outputs)
model.compile(loss="sparse_categorical_crossentropy", optimizer="rmsprop")




 The text-generation callback

In [None]:
import numpy as np
tokens_index = dict(enumerate(text_vectorization.get_vocabulary())) 
def sample_next(predictions, temperature=1.0):      
    predictions = np.asarray(predictions).astype("float64")
    predictions = np.log(predictions) / temperature
    exp_preds = np.exp(predictions)
    predictions = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, predictions, 1)
    return np.argmax(probas)

class TextGenerator(keras.callbacks.Callback):
    def __init__(self,
         prompt,   
         generate_length,     
         model_input_length,
         temperatures=(1.,),   
         print_freq=1):
        self.prompt = prompt
        self.generate_length = generate_length
        self.model_input_length = model_input_length
        self.temperatures = temperatures
        self.print_freq = print_freq
    def on_epoch_end(self, epoch, logs=None):
        if(epoch + 1) % self.print_freq != 0:
            return
        for temperature in self.temperatures:
            print("== Generating with temperature", temperature)
            sentence = self.prompt
            for i in range(self.generate_length) :
                tokenized_sentence = text_vectorization([sentence])  
                predictions = self.model(tokenized_sentence)         
                next_token = sample_next(predictions[0, i, :])  
                sampled_token = tokens_index[next_token]        
                sentence += " " + sampled_token 
            print(sentence)

prompt = "This movie" 
text_gen_callback = TextGenerator(
    prompt,
    generate_length=50,
    model_input_length=sequence_length,
    temperatures=(0.2, 0.5, 0.7, 1., 1.5))

 Fitting the language model

In [None]:
model.fit(lm_dataset, epochs=1, callbacks=[text_gen_callback])


[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - loss: 6.1685== Generating with temperature 0.2
This movie show  cheesy [UNK]                                              
== Generating with temperature 0.5
This movie is made just  wistful say on passed off watch nervous                                       
== Generating with temperature 0.7
This movie movie  strong brilliantly how a price of crime film although the present                                     
== Generating with temperature 1.0
This movie is has really made touching  productions well made it go much towards                                     
== Generating with temperature 1.5
This movie guy has by a sea different  stay                                          
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2408s[0m 6s/step - loss: 6.1673


<keras.src.callbacks.history.History at 0x20505c1ac90>

In [None]:
# Lưu mô hình sau khi huấn luyện
model.save('text_generation_model', save_format='tf')


ValueError: The `save_format` argument is deprecated in Keras 3. Please remove this argument and pass a file path with either `.keras` or `.h5` extension.Received: save_format=tf