In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Embedding
import numpy as np
import math

2025-01-04 17:51:56.580586: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735993316.649988    9992 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735993316.668516    9992 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
with open("training_data.txt", "r") as f:
    data = f.read()
    data = data.replace("\n", " ")

In [4]:
chars = list(set(list(data)))

In [5]:
len(chars)

64

In [6]:
char_to_code = {}
code_to_char = {}
for char in chars:
    char_to_code[char] = len(char_to_code)
    code_to_char[len(code_to_char)] = char
print(char_to_code)
print(code_to_char)

{'c': 0, 'l': 1, '!': 2, 'v': 3, 'Q': 4, 'a': 5, 'y': 6, 'o': 7, 'I': 8, "'": 9, 'p': 10, 'B': 11, 'f': 12, 'A': 13, 'm': 14, 'U': 15, 'i': 16, 'F': 17, 'K': 18, 'k': 19, 'b': 20, 'D': 21, 'V': 22, 'r': 23, 'R': 24, 'z': 25, 'e': 26, 'n': 27, 'w': 28, 'E': 29, 'M': 30, 'u': 31, 'd': 32, ',': 33, 'O': 34, 'T': 35, 'x': 36, 'g': 37, 'N': 38, 'P': 39, '&': 40, '3': 41, ';': 42, ':': 43, 'S': 44, 'Y': 45, 'H': 46, 'X': 47, 'h': 48, 'G': 49, ' ': 50, 's': 51, 'q': 52, '.': 53, 'Z': 54, 't': 55, '?': 56, 'J': 57, 'W': 58, 'j': 59, 'L': 60, '-': 61, 'C': 62, '$': 63}
{0: 'c', 1: 'l', 2: '!', 3: 'v', 4: 'Q', 5: 'a', 6: 'y', 7: 'o', 8: 'I', 9: "'", 10: 'p', 11: 'B', 12: 'f', 13: 'A', 14: 'm', 15: 'U', 16: 'i', 17: 'F', 18: 'K', 19: 'k', 20: 'b', 21: 'D', 22: 'V', 23: 'r', 24: 'R', 25: 'z', 26: 'e', 27: 'n', 28: 'w', 29: 'E', 30: 'M', 31: 'u', 32: 'd', 33: ',', 34: 'O', 35: 'T', 36: 'x', 37: 'g', 38: 'N', 39: 'P', 40: '&', 41: '3', 42: ';', 43: ':', 44: 'S', 45: 'Y', 46: 'H', 47: 'X', 48: 'h', 4

In [7]:
def encode_string(string):
    encoding = []
    for char in string:
        encoding.append(char_to_code[char])
    return encoding

def decode_string(string):
    decoding = []
    for code in string:
        decoding.append(code_to_char[code])
    return decoding

In [8]:
input_data = encode_string(data)
train = input_data[:int(len(input_data) * 0.9)]
test = input_data[int(len(input_data) * 0.9):]

In [9]:
class ScaledDotProductAttention(layers.Layer):
    def __init__(self, embed_dim, keyquery_dim):
        super(ScaledDotProductAttention, self).__init__()
        self.embed_dim = embed_dim
        self.keyquery_dim = keyquery_dim

    def build(self, input_shape):
        self.Wq = self.add_weight(name='query_weights',shape=(self.embed_dim, self.keyquery_dim), initializer=tf.random_normal_initializer(), trainable=True)
        self.Wk = self.add_weight(name='key_weights',shape=(self.embed_dim, self.keyquery_dim), initializer=tf.random_normal_initializer(), trainable=True)
        self.Wdown = self.add_weight(name='vdown_weights', shape = (self.embed_dim, self.keyquery_dim), initializer=tf.random_normal_initializer(), trainable=True)
        self.Wup = self.add_weight(name='vup_weights', shape = (self.keyquery_dim, self.embed_dim), initializer=tf.random_normal_initializer(), trainable=True)

    def call(self, inputs):
        q = tf.matmul(inputs, self.Wq)
        k = tf.matmul(inputs, self.Wk)
        attention_score = tf.matmul(q, k, transpose_b=True)
        attention_score = attention_score / tf.math.sqrt(tf.cast(self.keyquery_dim, tf.float32))
        attention_score = tf.linalg.band_part(attention_score, 0, -1) # upper triangular matrix
        attention_score = tf.where(tf.equal(attention_score, 0), tf.float32.min, attention_score)
        attention_score = tf.nn.softmax(attention_score, axis=-1)

        v = tf.matmul(inputs,tf.matmul(self.Wdown, self.Wup))

        attention_score = tf.matmul(attention_score, v)

        return attention_score


In [10]:
class MultiHeadAttention(layers.Layer):
    def __init__(self, num_heads, embed_dim, keyquery_dim):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.keyquery_dim = keyquery_dim
        self.embed_dim = embed_dim

    def build(self, input_shape):
        self.attentionheads = []
        for i in range(self.num_heads):
            self.attentionheads.append(ScaledDotProductAttention(embed_dim=self.embed_dim, keyquery_dim=self.keyquery_dim))

    def call(self, inputs):
        head_attention_scores = []
        for head in self.attentionheads:
            head_attention_scores.append(head(inputs))
        return tf.math.add_n(head_attention_scores)

In [11]:
class MultilayerPerceptron(layers.Layer):
    def __init__(self, embed_dim, feedforward_dim):
        super(MultilayerPerceptron, self).__init__()
        self.embed_dim = embed_dim
        self.feedforward_dim = feedforward_dim

    def build(self, input_shape):
        self.Wup = self.add_weight(name='ffup_weights', shape=(self.embed_dim, self.feedforward_dim), initializer=tf.random_normal_initializer(), trainable=True)
        self.Bup = self.add_weight(name='ffup_bias', shape=(1, self.feedforward_dim), initializer=tf.zeros_initializer(), trainable=True)
        self.Wdown = self.add_weight(name='ffdown_weights', shape=(self.feedforward_dim, self.embed_dim), initializer=tf.random_normal_initializer(), trainable=True)
        self.Bdown = self.add_weight(name='ffdown_bias', shape=(1, self.embed_dim), initializer=tf.zeros_initializer(), trainable=True)

    def call(self, inputs):
        x = tf.matmul(inputs, self.Wup)
        x = tf.add(x, self.Bup)
        x = tf.nn.relu(x)
        x = tf.matmul(x, self.Wdown)
        x = tf.add(x, self.Bdown)
        return x

In [12]:
class TransformerBlock(layers.Layer):
    def __init__(self, num_heads, embed_dim, keyquery_dim, feedforward_dim):
        super(TransformerBlock, self).__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.keyquery_dim = keyquery_dim
        self.feedforward_dim = feedforward_dim

    def build(self, input_shape):
        self.multiheadattention = MultiHeadAttention(self.num_heads, self.embed_dim, self.keyquery_dim)
        self.feedforward = MultilayerPerceptron(self.embed_dim, self.feedforward_dim)
        self.norm = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        mla_output = self.multiheadattention(inputs)
        x = self.feedforward(mla_output)+mla_output
        x = self.norm(x)
        return x


In [13]:
class Embed(layers.Layer):
    def __init__(self, vocab_size,embed_dim, MAXTOKENS):
        super(Embed, self).__init__()
        self.embed_dim = embed_dim
        self.maxtokens = MAXTOKENS
        self.vocab_size = vocab_size

    def build(self, input_shape):
        self.embed = Embedding(self.vocab_size, self.embed_dim)

        self.pos_embed = np.zeros((1,self.maxtokens))

        for i in range(self.maxtokens):
            if (i%2==0):
                self.pos_embed[0][i]=(math.sin(i/(10000**(2*i/self.embed_dim))))
            else:
                self.pos_embed[0][i]=(math.cos(i/(10000**(2*i/self.embed_dim))))

        a = np.array(self.pos_embed)
        a = np.expand_dims(a, axis=2)
        self.pos_embed = tf.Variable(initial_value=a,trainable=False,dtype=tf.float32)


    def call(self, inputs):
        inputshape = tf.shape(inputs)
        x = self.embed(inputs)
        x = x + self.pos_embed[:,:inputshape[1],:]
        return x

In [14]:
class FinalLayer(layers.Layer):
    def __init__(self, embed_dim, MAXTOKENS):
        super(FinalLayer, self).__init__()
        self.embed_dim = embed_dim
        self.MAXTOKENS = MAXTOKENS

    def build(self, input_shape):
        self.W = self.add_weight(name='final_weights', shape=(self.embed_dim, self.MAXTOKENS), initializer=tf.random_normal_initializer(), trainable=True)

    def call(self, inputs):
        final_char = inputs[:,-1,:]
        x = tf.matmul(final_char, self.W)
        return x

In [15]:
class Transformer(keras.Model):
    def __init__(self, num_heads, embed_dim, keyquery_dim, feedforward_dim, MAXTOKENS, num_blocks, vocab_size):
        super(Transformer, self).__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.keyquery_dim = keyquery_dim
        self.feedforward_dim = feedforward_dim
        self.MAXTOKENS = MAXTOKENS
        self.num_blocks = num_blocks
        self.vocab_size = vocab_size

    def build(self, input_shape):
        self.embed = Embed(self.vocab_size,self.embed_dim, self.MAXTOKENS)
        self.transformerblocks = []
        for i in range(self.num_blocks):
            self.transformerblocks.append(TransformerBlock(self.num_heads, self.embed_dim, self.keyquery_dim, self.feedforward_dim))
        self.finallayer = FinalLayer(self.embed_dim, self.MAXTOKENS)

    def call(self, inputs):
        x = self.embed(inputs)
        for i in range(self.num_blocks):
            x = self.transformerblocks[i](x)
        x = self.finallayer(x)
        return x

In [16]:
batch_size = 32
token_block_size = 128
num_heads = 8
vocab_size = len(chars) + 1 # +1 to account for unknown tokens or characters
num_heads = 8
embed_dim = 512
keyquery_dim = 64
feedforward_dim = 2048
num_blocks = 6

In [17]:
x_train = []
y_train = []
for i in range(len(train) - token_block_size):
    x_train.append(train[i:i+token_block_size])
    y_train.append(train[i+token_block_size])

x_train = np.array(x_train)
y_train = np.array(y_train)

In [18]:
print(x_train.shape)

(1003726, 128)


In [19]:
print(y_train.shape)

(1003726,)


In [20]:
print(train[:130])

[17, 16, 23, 51, 55, 50, 62, 16, 55, 16, 25, 26, 27, 43, 50, 11, 26, 12, 7, 23, 26, 50, 28, 26, 50, 10, 23, 7, 0, 26, 26, 32, 50, 5, 27, 6, 50, 12, 31, 23, 55, 48, 26, 23, 33, 50, 48, 26, 5, 23, 50, 14, 26, 50, 51, 10, 26, 5, 19, 53, 50, 50, 13, 1, 1, 43, 50, 44, 10, 26, 5, 19, 33, 50, 51, 10, 26, 5, 19, 53, 50, 50, 17, 16, 23, 51, 55, 50, 62, 16, 55, 16, 25, 26, 27, 43, 50, 45, 7, 31, 50, 5, 23, 26, 50, 5, 1, 1, 50, 23, 26, 51, 7, 1, 3, 26, 32, 50, 23, 5, 55, 48, 26, 23, 50, 55, 7, 50, 32, 16]


In [21]:
print(x_train[0])

[17 16 23 51 55 50 62 16 55 16 25 26 27 43 50 11 26 12  7 23 26 50 28 26
 50 10 23  7  0 26 26 32 50  5 27  6 50 12 31 23 55 48 26 23 33 50 48 26
  5 23 50 14 26 50 51 10 26  5 19 53 50 50 13  1  1 43 50 44 10 26  5 19
 33 50 51 10 26  5 19 53 50 50 17 16 23 51 55 50 62 16 55 16 25 26 27 43
 50 45  7 31 50  5 23 26 50  5  1  1 50 23 26 51  7  1  3 26 32 50 23  5
 55 48 26 23 50 55  7 50]


In [22]:
print(y_train[0])

32


In [23]:
model = Transformer(num_heads=num_heads, embed_dim=embed_dim, keyquery_dim=keyquery_dim, feedforward_dim=feedforward_dim, MAXTOKENS=token_block_size, num_blocks=num_blocks, vocab_size=vocab_size)
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    metrics=["accuracy"]
)
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=10,
    verbose=1
)
model.evaluate(x_train, y_train)

I0000 00:00:1735993374.779192    9992 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2278 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2050, pci bus id: 0000:01:00.0, compute capability: 8.6


Epoch 1/10


I0000 00:00:1735993403.302667   10573 service.cc:148] XLA service 0x779af002d5f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735993403.302708   10573 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 2050, Compute Capability 8.6
I0000 00:00:1735993407.383905   10573 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1735993423.474398   10573 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


ResourceExhaustedError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/siddhant/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/home/siddhant/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/siddhant/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/siddhant/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/siddhant/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 542, in dispatch_queue

  File "/home/siddhant/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 531, in process_one

  File "/home/siddhant/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/siddhant/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 359, in execute_request

  File "/home/siddhant/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 775, in execute_request

  File "/home/siddhant/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 446, in do_execute

  File "/home/siddhant/.local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/siddhant/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell

  File "/home/siddhant/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell

  File "/home/siddhant/.local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/siddhant/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async

  File "/home/siddhant/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes

  File "/home/siddhant/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/tmp/ipykernel_9992/3547343586.py", line 7, in <module>

  File "/home/siddhant/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/siddhant/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 368, in fit

  File "/home/siddhant/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 216, in function

  File "/home/siddhant/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 129, in multi_step_on_iterator

Out of memory while trying to allocate 1394586936 bytes.
	 [[{{node StatefulPartitionedCall}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_multi_step_on_iterator_53804]