# Speed Test

### **Conclusion**
- **Training speed is roughly the same on CPU and GPU** for this model and setup.  

---

### **Model Details**
- **Model Type:** Transformer (encoder only)  
- **Total Parameters:** 4,934,529 (18.82 MB)
- **Training Data:** 25,000 sequences  
- **Features:** 1 per token (the word's index, ranked by its frequency in the corpus)

---

### **Rules of Thumb**
1. **Sample Size:**  
   `nb_samples > 10 × nb_features × nb_classes`  
   (with `nb_classes = 5` for regression).

2. **Parameter-Sample Ratio:**  
   `nb_parameters < nb_samples / 10`  
   (or even `nb_samples / 50` for deep learning).

---

### **GPU Performance**

#### **tf.keras**  
- Run 1 (Apple Silicon) : 412s 525ms/step - loss: 7.6209 - accuracy: 0.4999 - val_loss: 7.6246 - val_accuracy: 0.5000

#### **Keras**  
- Run 1 (Apple Silicon) : 419s 534ms/step - loss: 7.6191 - accuracy: 0.5002 - val_loss: 7.6246 - val_accuracy: 0.5000


---

### **CPU Performance**

#### **tf.keras**  
- Run 1 (Apple Silicon) : 439s 560ms/step - accuracy: 0.4992 - loss: 8.0630 - val_accuracy: 0.5000 - val_loss: 8.0590

#### **Keras**  
- Run 1 (Apple Silicon) : 417s 531ms/step - accuracy: 0.5006 - loss: 7.9368 - val_accuracy: 0.5000 - val_loss: 7.9712



In [None]:
import tensorflow as tf

# Query TensorFlow once for every physical device and once for the GPUs only.
devices = tf.config.list_physical_devices()
gpus = tf.config.list_physical_devices("GPU")

print("\nDevices: ", devices)

# When at least one GPU is visible, also report what TensorFlow knows about
# the first one.
if len(gpus) > 0:
    details = tf.config.experimental.get_device_details(gpus[0])
    print("GPU details: ", details)

In [None]:
# get data
import tensorflow as tf
from tensorflow import keras
import numpy as np

# Seed the Python, NumPy and TensorFlow random generators together.
# Seeding NumPy alone does not cover the generators Keras/TensorFlow use
# (e.g. for weight initialisation), so runs would still differ.
keras.utils.set_random_seed(42)  # for reproducibility

max_features = 20000  # vocabulary size: only the most frequent words are kept
maxlen = 100  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

# IMDB reviews encoded as lists of word indices, restricted to `max_features` words.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(
    num_words=max_features
)

# Bring every review to exactly `maxlen` tokens. With the default settings
# pad_sequences pads and truncates at the start, so the last `maxlen` tokens
# of a long review are the ones kept.
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
print(x_train.shape, x_train[:2], y_train.shape, y_train[:2])
print(
    f"{x_train.shape[0]} train samples and {y_test.shape[0]} tests samples, for a total of {x_train.shape[0] + x_test.shape[0]} samples."
)

# Make sure the labels are NumPy arrays (no copy is made if they already are).
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [None]:
from tensorflow import keras

# Hyperparameters consumed by transformer_encoder() and the model built below.
num_layers = 2  # number of stacked encoder blocks
num_heads = 16  # attention heads in each block
head_size = 128  # key dimension of every attention head
ff_dim = 512  # units in the hidden feed-forward layer

print(f"Using tf.keras version {keras.__version__}")


def transformer_encoder(inputs, num_layers, head_size, num_heads, ff_dim, dropout=0):
    """Stack ``num_layers`` Transformer encoder blocks on top of ``inputs``.

    Every block runs two residual sub-layers, each preceded by layer
    normalisation: multi-head self-attention, then a feed-forward network
    (ReLU hidden layer, dropout, linear projection back to the input width).

    Args:
        inputs: Keras tensor fed to the first block; the size of its last
            dimension is the width each block projects back to.
        num_layers: Number of encoder blocks to stack.
        head_size: ``key_dim`` given to each ``MultiHeadAttention`` layer.
        num_heads: Number of attention heads in each block.
        ff_dim: Units in the hidden feed-forward ``Dense`` layer.
        dropout: Rate used inside attention and after the hidden
            feed-forward layer.

    Returns:
        The output tensor of the last block, or ``inputs`` itself when
        ``num_layers`` is 0.
    """
    hidden = inputs
    for _ in range(num_layers):
        # Self-attention sub-layer: normalise, attend, add the residual.
        normed = keras.layers.LayerNormalization(epsilon=1e-6)(hidden)
        attention = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=head_size, dropout=dropout
        )
        attended = attention(normed, normed)
        residual = keras.layers.Add()([attended, hidden])

        # Feed-forward sub-layer: normalise, expand, project back, add the residual.
        ff = keras.layers.LayerNormalization(epsilon=1e-6)(residual)
        ff = keras.layers.Dense(ff_dim, activation="relu")(ff)
        ff = keras.layers.Dropout(dropout)(ff)
        ff = keras.layers.Dense(hidden.shape[-1])(ff)
        hidden = keras.layers.Add()([ff, residual])

    return hidden


# Define the input layer
inputs = keras.layers.Input(
    shape=(maxlen,)
)  # maxlen is the input length for each sequence

# Embedding layer
x = keras.layers.Embedding(max_features, 128)(
    inputs
)  # max_features is the vocabulary size
x = transformer_encoder(x, num_layers, head_size, num_heads, ff_dim)
x = keras.layers.GlobalAveragePooling1D()(x)
x = keras.layers.LayerNormalization(epsilon=1e-6)(x)
# Output layer for binary classification. "binary_crossentropy" expects
# probabilities by default (from_logits=False), so the single unit must use a
# sigmoid; an unbounded linear output makes the loss meaningless and training
# stalls at chance accuracy.
outputs = keras.layers.Dense(1, activation="sigmoid")(x)

# Build the model
model = keras.Model(inputs=inputs, outputs=outputs, name="transformer")

model.summary()
# try using different optimizers and different optimizer configs
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

print("Train...")
batch_size = 32
epoch = 100  # NOTE: not passed to fit(); the speed test trains for a single epoch
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=1,
    # validation data is passed as the documented (x_val, y_val) tuple
    validation_data=(x_test, y_test),
)

In [None]:
import keras

# Hyperparameters consumed by transformer_encoder() and the model built below.
num_layers = 2  # number of stacked encoder blocks
num_heads = 16  # attention heads in each block
head_size = 128  # key dimension of every attention head
ff_dim = 512  # units in the hidden feed-forward layer

print(f"Using keras version {keras.__version__}")


def transformer_encoder(inputs, num_layers, head_size, num_heads, ff_dim, dropout=0):
    """Stack ``num_layers`` Transformer encoder blocks on top of ``inputs``.

    Every block runs two residual sub-layers, each preceded by layer
    normalisation: multi-head self-attention, then a feed-forward network
    (ReLU hidden layer, dropout, linear projection back to the input width).

    Args:
        inputs: Keras tensor fed to the first block; the size of its last
            dimension is the width each block projects back to.
        num_layers: Number of encoder blocks to stack.
        head_size: ``key_dim`` given to each ``MultiHeadAttention`` layer.
        num_heads: Number of attention heads in each block.
        ff_dim: Units in the hidden feed-forward ``Dense`` layer.
        dropout: Rate used inside attention and after the hidden
            feed-forward layer.

    Returns:
        The output tensor of the last block, or ``inputs`` itself when
        ``num_layers`` is 0.
    """
    hidden = inputs
    for _ in range(num_layers):
        # Self-attention sub-layer: normalise, attend, add the residual.
        normed = keras.layers.LayerNormalization(epsilon=1e-6)(hidden)
        attention = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=head_size, dropout=dropout
        )
        attended = attention(normed, normed)
        residual = keras.layers.Add()([attended, hidden])

        # Feed-forward sub-layer: normalise, expand, project back, add the residual.
        ff = keras.layers.LayerNormalization(epsilon=1e-6)(residual)
        ff = keras.layers.Dense(ff_dim, activation="relu")(ff)
        ff = keras.layers.Dropout(dropout)(ff)
        ff = keras.layers.Dense(hidden.shape[-1])(ff)
        hidden = keras.layers.Add()([ff, residual])

    return hidden


# Define the input layer
inputs = keras.layers.Input(
    shape=(maxlen,)
)  # maxlen is the input length for each sequence

# Embedding layer
x = keras.layers.Embedding(max_features, 128)(
    inputs
)  # max_features is the vocabulary size
x = transformer_encoder(x, num_layers, head_size, num_heads, ff_dim)
x = keras.layers.GlobalAveragePooling1D()(x)
x = keras.layers.LayerNormalization(epsilon=1e-6)(x)
# Output layer for binary classification. "binary_crossentropy" expects
# probabilities by default (from_logits=False), so the single unit must use a
# sigmoid; an unbounded linear output makes the loss meaningless and training
# stalls at chance accuracy.
outputs = keras.layers.Dense(1, activation="sigmoid")(x)

# Build the model
model = keras.Model(inputs=inputs, outputs=outputs, name="transformer")

model.summary()
# try using different optimizers and different optimizer configs
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

print("Train...")
batch_size = 32
epoch = 100  # NOTE: not passed to fit(); the speed test trains for a single epoch
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=1,
    # validation data is passed as the documented (x_val, y_val) tuple
    validation_data=(x_test, y_test),
)