In [8]:
import tensorflow as tf
import numpy as np

class CharacterTransformer(tf.keras.Model):
    """Single-block Transformer that maps character ids to per-position logits.

    The forward pass is: embedding (scaled by ``sqrt(d_model)``) + fixed
    sin/cos positional table -> dropout -> multi-head self-attention with a
    residual connection and layer normalisation -> feed-forward network with
    a residual connection and layer normalisation -> dense projection to
    ``vocab_size`` logits.

    Note that attention is called with ``attention_mask=None``, so every
    position attends to every other position in the input window.

    Args:
        vocab_size: Number of distinct character ids (embedding rows and
            output logits).
        d_model: Width of the embeddings and of the residual stream.
        n_heads: Number of attention heads; each head uses
            ``key_dim = d_model // n_heads``.
        ff_dim: Hidden width of the feed-forward block.
        max_seq_length: Number of rows in the positional-encoding table, i.e.
            the longest sequence ``call`` accepts.
        dropout_rate: Dropout rate used after the embeddings, inside the
            attention layer and inside the feed-forward block.
    """

    def __init__(self,
                 vocab_size,
                 d_model=256,
                 n_heads=8,
                 ff_dim=512,
                 max_seq_length=100,
                 dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.ff_dim = ff_dim
        self.vocab_size = vocab_size
        self.max_seq_length = max_seq_length
        self.dropout_rate = dropout_rate

        # NOTE: sub-layers are created in the same order as before so that
        # previously saved weights keep lining up with the model's variables.

        # Token embedding and the fixed (non-trainable) positional table
        self.char_embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = self._positional_encoding()

        # Transformer sub-blocks
        self.attention_block = self.build_attention_block()
        self.ff_block = self.build_feedforward_block()

        # Projection from the residual stream to vocabulary logits
        self.final_layer = tf.keras.layers.Dense(vocab_size)

        # Regularisation / normalisation layers
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def _positional_encoding(self):
        """Build the ``(max_seq_length, d_model)`` sin/cos positional table.

        Even columns hold ``sin`` and odd columns hold ``cos`` of
        ``pos / 10000 ** (2 * (i // 2) / d_model)``.

        Returns:
            A ``tf.float32`` tensor of shape ``(max_seq_length, d_model)``.
        """
        pos = np.arange(self.max_seq_length)[:, np.newaxis]
        i = np.arange(self.d_model)[np.newaxis, :]
        angle = pos / np.power(10000, (2 * (i // 2)) / self.d_model)

        pos_encoding = np.zeros_like(angle)
        pos_encoding[:, 0::2] = np.sin(angle[:, 0::2])
        pos_encoding[:, 1::2] = np.cos(angle[:, 1::2])

        return tf.cast(pos_encoding, dtype=tf.float32)

    def build_attention_block(self):
        """Create the multi-head self-attention layer used by ``call``."""
        return tf.keras.layers.MultiHeadAttention(
            num_heads=self.n_heads,
            key_dim=self.d_model // self.n_heads,
            dropout=self.dropout_rate
        )

    def build_feedforward_block(self):
        """Create the Dense(relu) -> Dropout -> Dense feed-forward stack."""
        return tf.keras.Sequential([
            tf.keras.layers.Dense(self.ff_dim, activation='relu'),
            tf.keras.layers.Dropout(self.dropout_rate),
            tf.keras.layers.Dense(self.d_model)
        ])

    def call(self, x, training=False):
        """Compute logits for every position of a batch of id sequences.

        Args:
            x: Integer ids; indexed here as ``(batch, seq_length)``.
            training: Whether dropout is active.

        Returns:
            Logits with a trailing dimension of ``vocab_size``, one vector per
            input position.

        Raises:
            ValueError: If the statically known sequence length exceeds
                ``max_seq_length`` (the positional table has no rows for the
                extra positions).
        """
        # Fail early with a readable message instead of an opaque shape
        # mismatch when adding a too-short positional slice below.
        static_shape = tf.TensorShape(x.shape)
        if static_shape.rank is not None and static_shape.rank > 1:
            static_len = static_shape[1]
            if static_len is not None and static_len > self.max_seq_length:
                raise ValueError(
                    f"Input sequence length {static_len} exceeds "
                    f"max_seq_length={self.max_seq_length}"
                )

        # Dynamic sequence length, used to slice the positional table
        seq_length = tf.shape(x)[1]

        # Embed the ids and scale by sqrt(d_model)
        x = self.char_embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        # Add positional information for the first `seq_length` positions
        x += self.pos_encoding[:seq_length, :]

        x = self.dropout(x, training=training)

        # Self-attention sub-block (unmasked) with residual + layer norm
        attention_output = self.attention_block(
            query=x,
            value=x,
            key=x,
            attention_mask=None,
            training=training
        )
        x = self.layer_norm1(x + attention_output)

        # Feed-forward sub-block with residual + layer norm
        ff_output = self.ff_block(x, training=training)
        x = self.layer_norm2(x + ff_output)

        # Project to vocabulary logits
        x = self.final_layer(x)

        return x

def prepare_data(text, seq_length):
    """Turn raw text into sliding-window training pairs for next-char prediction.

    Args:
        text: The training corpus as a single string.
        seq_length: Number of characters in each input window.

    Returns:
        A tuple ``(x, y, char_to_idx, idx_to_char)`` where ``x`` is an array of
        id windows of length ``seq_length``, ``y`` holds the id of the
        character that follows each window, and the two dicts map characters
        to ids and back (ids follow the sorted order of the distinct
        characters in ``text``).
    """
    # Vocabulary: distinct characters in sorted order, numbered from 0
    vocabulary = sorted(set(text))
    char_to_idx = {}
    idx_to_char = {}
    for position, symbol in enumerate(vocabulary):
        char_to_idx[symbol] = position
        idx_to_char[position] = symbol

    # Encode the whole corpus once
    encoded = list(map(char_to_idx.__getitem__, text))

    # One window per start offset; the target is the character right after it
    window_starts = range(len(encoded) - seq_length)
    sequences = [encoded[start:start + seq_length] for start in window_starts]
    next_chars = [encoded[start + seq_length] for start in window_starts]

    return np.array(sequences), np.array(next_chars), char_to_idx, idx_to_char

def train_model(model, x, y, epochs=10, batch_size=64, shuffle_buffer_size=10000):
    """Train ``model`` to predict the character that follows each input window.

    Only the logits at the last position of each window are scored against
    the target, using sparse categorical cross-entropy on logits and Adam
    with a learning rate of 0.001. The mean batch loss is printed after each
    epoch.

    Args:
        model: Callable as ``model(batch_x, training=True)`` returning logits
            indexed as ``[batch, position, vocab]``; must expose
            ``trainable_variables``.
        x: Input id windows, one row per sample.
        y: Target ids, one per sample.
        epochs: Number of passes over the dataset.
        batch_size: Samples per gradient step; incomplete final batches are
            dropped.
        shuffle_buffer_size: Size of the ``tf.data`` shuffle buffer. A buffer
            smaller than the dataset only shuffles locally.

    Raises:
        ValueError: If there are fewer than ``batch_size`` samples, because
            dropping the remainder would then leave no batches to train on.
    """
    # With drop_remainder=True a dataset smaller than one batch yields zero
    # batches, which previously surfaced as a ZeroDivisionError at the end of
    # the first epoch.
    num_examples = len(x)
    if num_examples < batch_size:
        raise ValueError(
            f"Need at least batch_size={batch_size} samples to form one "
            f"batch, got {num_examples}"
        )

    # Build the input pipeline
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.shuffle(shuffle_buffer_size).batch(batch_size, drop_remainder=True)

    # Optimizer and loss
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0

        for batch_x, batch_y in dataset:
            with tf.GradientTape() as tape:
                logits = model(batch_x, training=True)
                # Score only the prediction made at the final position
                batch_loss = loss_fn(batch_y, logits[:, -1, :])

            gradients = tape.gradient(batch_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            # Accumulate as a plain float so the running total is not a tensor
            total_loss += float(batch_loss)
            num_batches += 1

        avg_loss = total_loss / num_batches
        print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')

def generate_text(model, start_string, char_to_idx, idx_to_char, num_chars=1000, temperature=1.0):
    """Sample ``num_chars`` characters from ``model``, seeded with ``start_string``.

    At each step the logits at the last position are divided by
    ``temperature`` and one id is drawn with ``tf.random.categorical``. The
    sampled id is appended to the context, which is allowed to grow up to
    ``model.max_seq_length`` tokens before the oldest tokens are dropped.
    (Previously the context was pinned to ``len(start_string)`` tokens, so a
    short prompt meant the model only ever saw a few characters of history.)
    If the model does not expose ``max_seq_length``, the window stays at the
    prompt length as before.

    Args:
        model: Callable as ``model(ids, training=False)`` returning logits
            indexed as ``[batch, position, vocab]``.
        start_string: Non-empty prompt; every character must be a key of
            ``char_to_idx``.
        char_to_idx: Mapping from character to integer id.
        idx_to_char: Mapping from integer id to character.
        num_chars: Number of characters to generate.
        temperature: Positive divisor applied to the logits before sampling;
            lower values make sampling more conservative.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character missing from
            ``char_to_idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    # Encode the prompt as a batch of one sequence
    prompt_ids = [char_to_idx[s] for s in start_string]

    # Longest context the model can take; fall back to the prompt length when
    # the model does not say.
    max_context = getattr(model, "max_seq_length", None) or len(prompt_ids)

    input_eval = tf.constant([prompt_ids], dtype=tf.int32)[:, -max_context:]

    text_generated = []

    for _ in range(num_chars):
        predictions = model(input_eval, training=False)
        # Keep only the distribution for the next character
        predictions = predictions[:, -1, :] / temperature

        # Sample from the predicted distribution
        predicted_id = int(tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy())

        text_generated.append(idx_to_char[predicted_id])

        # Append the new token and keep at most `max_context` recent tokens
        next_token = tf.constant([[predicted_id]], dtype=input_eval.dtype)
        input_eval = tf.concat([input_eval, next_token], axis=-1)[:, -max_context:]

    return start_string + ''.join(text_generated)

In [11]:
# Seed Python, NumPy and TensorFlow so a Restart & Run All is repeatable
tf.keras.utils.set_random_seed(42)

# Load the training corpus; the encoding is explicit so the resulting
# vocabulary does not depend on the platform's default codec
with open('training_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Length of each input window fed to the model
seq_length = 100

# Build sliding-window samples and the character <-> id mappings
x, y, char_to_idx, idx_to_char = prepare_data(text, seq_length)

# Take the vocabulary size from the mapping the data was encoded with
vocab_size = len(char_to_idx)

# Create and train the model; the positional table is sized to the window
# length so changing `seq_length` above cannot outgrow it
model = CharacterTransformer(vocab_size=vocab_size, max_seq_length=seq_length)
train_model(model, x, y, epochs=10)

Epoch 1, Loss: 2.0332
Epoch 2, Loss: 2.0738
Epoch 3, Loss: 2.0846
Epoch 4, Loss: 2.0976
Epoch 5, Loss: 2.1084
Epoch 6, Loss: 2.1103
Epoch 7, Loss: 2.1027
Epoch 8, Loss: 2.0834
Epoch 9, Loss: 2.1069
Epoch 10, Loss: 2.0972


In [25]:
# Sample text from the trained model, seeded with the prompt "Why"
# (length and temperature are left at generate_text's defaults)
generated_text = generate_text(
    model,
    "Why",
    char_to_idx,
    idx_to_char,
)
print(generated_text)

Whyos domanfinae, t l;
rothereansly: r aa,
r
t k
:
srnevey m,
si e
te
-'n on


,
d,
e
ow--paoIiN:
e
t.
igiso s
,

s
ted ! e
, llet.s
tusinothneethewaw,
lestotoforendgend; ao?uthau

baicheure.
u:
:
!,
,

!
sothactaheeatahIfeHIng f?
IahoeathevON:
 .
s
lionopeikno t 
o:
IO
t inf he r l tyorido lolnon: ha
.
.
,

 
Tag.
isiealethaiaN:
tl i!
dinion: aneremewobasswaivel l
sh

dt icembd t io,ofeibag,
s
laxesuyethathaulenedsmofen,

r, p,
uselon is
ak
-
gs
,
dotofoigsage:'t

!
g

.
s.
e
r m o ok
t,
as
a ,

s
sa,
! d

e, aengonetud end h sddODO, cho io a:


:
wige reifet yeeat.

e,

d
t
s n
tintnnthaaacerolibe;
e
ak't ssccouseang
; 
 h!
n.
ngsusheeasiasofag
,
fabrias e: eseantpof ceaheeaA'bsealo le
 m 
r;

sh a,
n,

mowslinenout t tsuwve r,
's,
d
-g,
t bf;IONIOre

r
s?
ion st, w,
ss; hepubt led 


xcet to uglarerooue
co l eo'l dasoeihfoaesan:
d
stheleaela:'d
d,
il I ,

ow

t.
re.


l co'letowngt t ds

'


s
,


ge?oyon:

 


h,
t:

,
n;

'

;

s

,
ch ,

f e.

y

e
ly
:
a d
us 
,
ng


ce
t kis m


In [24]:
# Destination for the checkpoint; the name carries the required
# `.weights.h5` suffix
file_path = "character_transformer_weights.weights.h5"

# Write the trained model's weights to disk and confirm where they went
model.save_weights(filepath=file_path)
print(f"Model weights saved to {file_path}")


Model weights saved to character_transformer_weights.weights.h5
