In [1]:
import os
# Quiet TensorFlow's native (C++) logging. This cell runs before the
# `import tensorflow` in the next cell so the setting is already in the
# environment when TensorFlow is loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Embedding
import numpy as np
import math

In [3]:
# Read the entire corpus into memory as one string, turning every newline into
# a space so the text becomes a single continuous line.
with open("training_data.txt", "r") as f:
    data = f.read().replace("\n", " ")

In [4]:
# Vocabulary: every distinct character in the corpus, sorted so that the
# character <-> code mapping built from this list is identical on every run.
# Iterating a bare set of strings is not stable: str hashing is randomized per
# interpreter process, so the order (and therefore every code) could change
# after a kernel restart, invalidating saved weights or previously encoded data.
chars = sorted(set(data))

In [None]:
# Number of distinct characters found in the corpus.
len(chars)

In [None]:
# Two lookup tables over the vocabulary: character -> integer code and its
# inverse. A character's code is its position in `chars`.
char_to_code = {char: code for code, char in enumerate(chars)}
code_to_char = {code: char for code, char in enumerate(chars)}
print(char_to_code)
print(code_to_char)

In [7]:
def encode_string(string):
    """Translate text into integer codes.

    Args:
        string: Iterable of characters (normally a ``str``).

    Returns:
        A list holding the ``char_to_code`` entry of each character, in
        input order.

    Raises:
        KeyError: If a character has no entry in ``char_to_code``.
    """
    return [char_to_code[symbol] for symbol in string]

def decode_string(string):
    """Translate integer codes back into characters.

    Args:
        string: Iterable of integer codes (despite the parameter name, this
            is a sequence of codes rather than a ``str``).

    Returns:
        A list of characters, one per code, in input order. The characters
        are not joined into a single string.

    Raises:
        KeyError: If a code has no entry in ``code_to_char``.
    """
    return [code_to_char[token] for token in string]

In [8]:
# Encode the full corpus, then cut it once at the 90% mark: everything before
# the cut is the training split, everything after it is the test split. The
# cut is positional (no shuffling).
input_data = encode_string(data)
split_index = int(len(input_data) * 0.9)
train, test = input_data[:split_index], input_data[split_index:]

In [9]:
class ScaledDotProductAttention(layers.Layer):
    """A single causal self-attention head with a factored value map.

    Queries and keys are linear projections of the input into
    ``keyquery_dim`` dimensions. Values are produced by the product
    ``Wdown @ Wup`` (``embed_dim -> keyquery_dim -> embed_dim``), so the head's
    output has ``embed_dim`` features, the same as its input.

    Args:
        embed_dim: Size of the last axis of the input and of the output.
        keyquery_dim: Size of the query/key projections and inner size of the
            factored value map.
    """

    def __init__(self, embed_dim, keyquery_dim):
        super(ScaledDotProductAttention, self).__init__()
        self.embed_dim = embed_dim
        self.keyquery_dim = keyquery_dim

    def build(self, input_shape):
        """Create the query, key and (down/up) value projection matrices."""
        self.Wq = self.add_weight(name='query_weights', shape=(self.embed_dim, self.keyquery_dim), initializer=tf.random_normal_initializer(), trainable=True)
        self.Wk = self.add_weight(name='key_weights', shape=(self.embed_dim, self.keyquery_dim), initializer=tf.random_normal_initializer(), trainable=True)
        self.Wdown = self.add_weight(name='vdown_weights', shape=(self.embed_dim, self.keyquery_dim), initializer=tf.random_normal_initializer(), trainable=True)
        self.Wup = self.add_weight(name='vup_weights', shape=(self.keyquery_dim, self.embed_dim), initializer=tf.random_normal_initializer(), trainable=True)

    def call(self, inputs):
        """Apply causally masked scaled dot-product attention.

        Args:
            inputs: Tensor whose last two axes are (sequence, embed_dim).

        Returns:
            Tensor of the same shape as ``inputs``: for every position, the
            attention-weighted sum of the value vectors of that position and
            the positions before it.
        """
        q = tf.matmul(inputs, self.Wq)
        k = tf.matmul(inputs, self.Wk)

        # scores[..., i, j] measures how well query position i matches key
        # position j (rows index queries, columns index keys).
        scores = tf.matmul(q, k, transpose_b=True)
        scores = scores / tf.math.sqrt(tf.cast(self.keyquery_dim, scores.dtype))

        # Causal mask: position i may only look at positions j <= i, which is
        # the LOWER triangle (diagonal included) of the query-by-key matrix.
        # Keeping the upper triangle instead would let each position see only
        # itself and the future, so the last position -- the one the model's
        # output is read from -- would attend to nothing but itself.
        # The mask is built from ones rather than inferred from zero-valued
        # scores, so a legitimately zero score is never masked by accident.
        causal_mask = tf.linalg.band_part(tf.ones_like(scores), -1, 0)
        # Masked entries get the most negative finite value of the dtype so
        # they receive (numerically) zero weight after the softmax.
        scores = tf.where(causal_mask > 0, scores, scores.dtype.min)
        attention_weights = tf.nn.softmax(scores, axis=-1)

        # Values via the factored map embed_dim -> keyquery_dim -> embed_dim.
        v = tf.matmul(inputs, tf.matmul(self.Wdown, self.Wup))

        return tf.matmul(attention_weights, v)


In [10]:
class MultiHeadAttention(layers.Layer):
    """Runs several attention heads on the same input and sums their outputs.

    Every head is a ``ScaledDotProductAttention`` built with the same
    ``embed_dim`` and ``keyquery_dim``. The head outputs are combined by
    element-wise addition, not concatenation.

    Args:
        num_heads: How many attention heads to create.
        embed_dim: Feature size passed to every head.
        keyquery_dim: Query/key size passed to every head.
    """

    def __init__(self, num_heads, embed_dim, keyquery_dim):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.keyquery_dim = keyquery_dim
        self.embed_dim = embed_dim

    def build(self, input_shape):
        """Instantiate ``num_heads`` independent attention heads."""
        self.attentionheads = [
            ScaledDotProductAttention(embed_dim=self.embed_dim, keyquery_dim=self.keyquery_dim)
            for _ in range(self.num_heads)
        ]

    def call(self, inputs):
        """Return the element-wise sum of every head's output for ``inputs``."""
        per_head_outputs = [head(inputs) for head in self.attentionheads]
        return tf.math.add_n(per_head_outputs)

In [11]:
class MultilayerPerceptron(layers.Layer):
    """Two-layer feed-forward network: linear -> ReLU -> linear.

    The first linear map widens the last axis from ``embed_dim`` to
    ``feedforward_dim``; the second maps it back to ``embed_dim``.

    Args:
        embed_dim: Size of the last axis of the input and of the output.
        feedforward_dim: Size of the hidden layer.
    """

    def __init__(self, embed_dim, feedforward_dim):
        super(MultilayerPerceptron, self).__init__()
        self.embed_dim = embed_dim
        self.feedforward_dim = feedforward_dim

    def build(self, input_shape):
        """Create the weights and zero-initialised biases of both linear maps."""
        normal_init = tf.random_normal_initializer
        zeros_init = tf.zeros_initializer
        # Biases are stored with a leading axis of 1 and broadcast when added.
        self.Wup = self.add_weight(name='ffup_weights', shape=(self.embed_dim, self.feedforward_dim), initializer=normal_init(), trainable=True)
        self.Bup = self.add_weight(name='ffup_bias', shape=(1, self.feedforward_dim), initializer=zeros_init(), trainable=True)
        self.Wdown = self.add_weight(name='ffdown_weights', shape=(self.feedforward_dim, self.embed_dim), initializer=normal_init(), trainable=True)
        self.Bdown = self.add_weight(name='ffdown_bias', shape=(1, self.embed_dim), initializer=zeros_init(), trainable=True)

    def call(self, inputs):
        """Apply the expand -> ReLU -> project-back transformation to ``inputs``."""
        hidden = tf.nn.relu(tf.matmul(inputs, self.Wup) + self.Bup)
        return tf.matmul(hidden, self.Wdown) + self.Bdown

In [12]:
class TransformerBlock(layers.Layer):
    """One transformer block: attention, feed-forward, layer normalisation.

    Both sub-layers are wrapped in residual (skip) connections, and the result
    is layer-normalised once at the end of the block.

    Args:
        num_heads: Number of attention heads.
        embed_dim: Feature size of the block's input and output.
        keyquery_dim: Query/key size used inside each attention head.
        feedforward_dim: Hidden size of the feed-forward sub-layer.
    """

    def __init__(self, num_heads, embed_dim, keyquery_dim, feedforward_dim):
        super(TransformerBlock, self).__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.keyquery_dim = keyquery_dim
        self.feedforward_dim = feedforward_dim

    def build(self, input_shape):
        """Create the attention, feed-forward and normalisation sub-layers."""
        self.multiheadattention = MultiHeadAttention(self.num_heads, self.embed_dim, self.keyquery_dim)
        self.feedforward = MultilayerPerceptron(self.embed_dim, self.feedforward_dim)
        self.norm = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        """Run the block on ``inputs`` and return a tensor of the same shape."""
        # Residual connection around attention: the heads produce an
        # embed_dim-sized update that is ADDED to the incoming representation.
        # Without the skip, each position's own representation is discarded
        # and survives only as far as the attention mix happens to preserve it.
        attended = inputs + self.multiheadattention(inputs)
        # Residual connection around the feed-forward sub-layer.
        x = attended + self.feedforward(attended)
        return self.norm(x)


In [13]:
class Embed(layers.Layer):
    """Token embedding plus fixed sinusoidal positional encoding.

    Args:
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Number of features per token.
        MAXTOKENS: Longest sequence length for which positional encodings are
            precomputed.
    """

    def __init__(self, vocab_size, embed_dim, MAXTOKENS):
        super(Embed, self).__init__()
        self.embed_dim = embed_dim
        self.maxtokens = MAXTOKENS
        self.vocab_size = vocab_size

    def build(self, input_shape):
        """Create the embedding layer and the positional-encoding table."""
        self.embed = Embedding(self.vocab_size, self.embed_dim)

        # Sinusoidal table of shape (maxtokens, embed_dim):
        #   table[pos, 2k]   = sin(pos / 10000 ** (2k / embed_dim))
        #   table[pos, 2k+1] = cos(pos / 10000 ** (2k / embed_dim))
        # The position index and the feature index are independent axes, so
        # each position gets a distinct embed_dim-sized vector rather than a
        # single scalar broadcast over every feature.
        positions = np.arange(self.maxtokens, dtype=np.float64)[:, np.newaxis]
        feature_index = np.arange(self.embed_dim)[np.newaxis, :]
        # Features 2k and 2k+1 share one frequency, hence the integer division.
        inverse_wavelength = 1.0 / np.power(10000.0, (2 * (feature_index // 2)) / self.embed_dim)
        angles = positions * inverse_wavelength

        table = np.zeros((self.maxtokens, self.embed_dim), dtype=np.float64)
        table[:, 0::2] = np.sin(angles[:, 0::2])
        table[:, 1::2] = np.cos(angles[:, 1::2])

        # Leading axis of 1 lets the table broadcast over the batch axis.
        self.pos_embed = tf.Variable(
            initial_value=table[np.newaxis, :, :].astype(np.float32),
            trainable=False,
            dtype=tf.float32,
        )

    def call(self, inputs):
        """Embed integer token ids and add the positional encodings.

        Args:
            inputs: Integer tensor of token ids with the sequence on axis 1.

        Returns:
            The embedded tokens with the positional encoding of each position
            added; the table is sliced to the actual sequence length.
        """
        seq_len = tf.shape(inputs)[1]
        x = self.embed(inputs)
        return x + self.pos_embed[:, :seq_len, :]

In [14]:
class FinalLayer(layers.Layer):
    """Bias-free linear read-out applied to the last sequence position.

    Args:
        embed_dim: Number of input features per position.
        MAXTOKENS: Number of output units. Within this layer the value is used
            only as the width of the projection matrix.
    """

    def __init__(self, embed_dim, MAXTOKENS):
        super(FinalLayer, self).__init__()
        self.embed_dim = embed_dim
        self.MAXTOKENS = MAXTOKENS

    def build(self, input_shape):
        """Create the (embed_dim, MAXTOKENS) projection matrix."""
        projection_shape = (self.embed_dim, self.MAXTOKENS)
        self.W = self.add_weight(name='final_weights', shape=projection_shape, initializer=tf.random_normal_initializer(), trainable=True)

    def call(self, inputs):
        """Project the features of the last position (index -1 on axis 1)."""
        last_position = inputs[:, -1, :]
        return tf.matmul(last_position, self.W)

In [15]:
class Transformer(keras.Model):
    """Next-token model: embedding, a stack of transformer blocks, read-out.

    The model maps a batch of token-id sequences to one vector of
    ``vocab_size`` logits per sequence, computed from the last position.

    Args:
        num_heads: Attention heads per block.
        embed_dim: Feature size used throughout the model.
        keyquery_dim: Query/key size inside each attention head.
        feedforward_dim: Hidden size of each block's feed-forward sub-layer.
        MAXTOKENS: Longest supported input sequence (size of the positional
            table).
        num_blocks: Number of stacked transformer blocks.
        vocab_size: Number of distinct token ids; also the number of logits.
    """

    def __init__(self, num_heads, embed_dim, keyquery_dim, feedforward_dim, MAXTOKENS, num_blocks, vocab_size):
        super(Transformer, self).__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.keyquery_dim = keyquery_dim
        self.feedforward_dim = feedforward_dim
        self.MAXTOKENS = MAXTOKENS
        self.num_blocks = num_blocks
        self.vocab_size = vocab_size

    def build(self, input_shape):
        """Create the embedding, the block stack and the output projection."""
        self.embed = Embed(self.vocab_size, self.embed_dim, self.MAXTOKENS)
        self.transformerblocks = [
            TransformerBlock(self.num_heads, self.embed_dim, self.keyquery_dim, self.feedforward_dim)
            for _ in range(self.num_blocks)
        ]
        # FinalLayer's second constructor argument is the width of its output
        # projection. The logits must cover every token id, so the width has
        # to be vocab_size -- not the context length. Sizing it by MAXTOKENS
        # only works by coincidence while vocab_size <= MAXTOKENS and produces
        # out-of-range labels for the sparse cross-entropy loss otherwise.
        self.finallayer = FinalLayer(self.embed_dim, self.vocab_size)

    def call(self, inputs):
        """Return next-token logits of shape (batch, vocab_size) for ``inputs``."""
        x = self.embed(inputs)
        for block in self.transformerblocks:
            x = block(x)
        return self.finallayer(x)

In [16]:
# --- Data / training settings ---
batch_size = 32            # examples per gradient step
token_block_size = 128     # context length: tokens fed to the model per example

# --- Model size ---
vocab_size = len(chars) + 1 # +1 to account for unknown tokens or characters
num_heads = 8              # attention heads per block (was assigned twice; kept once)
embed_dim = 512            # features per token
keyquery_dim = 64          # query/key size inside each head
feedforward_dim = 2048     # hidden size of each block's feed-forward sub-layer
num_blocks = 6             # number of stacked transformer blocks

In [17]:
# Slide a window of `token_block_size` codes over the training split, one step
# at a time: each window is an input example and the code immediately after
# the window is its target.
num_examples = len(train) - token_block_size
x_train = np.array([train[start:start + token_block_size] for start in range(num_examples)])
y_train = np.array([train[start + token_block_size] for start in range(num_examples)])

In [None]:
# Sanity check: one row per window, each row token_block_size codes long.
print(x_train.shape)

In [None]:
# Sanity check: one target code per window, so the length should match
# x_train's first axis.
print(y_train.shape)

In [None]:
# First 130 encoded training tokens, for comparing by eye against the first
# window and its target printed in the following cells.
print(train[:130])

In [None]:
# First input window; it should equal train[:token_block_size].
print(x_train[0])

In [None]:
# Target of the first window; it should equal train[token_block_size].
print(y_train[0])

In [None]:
# Build the held-out evaluation windows first, using the same sliding-window
# scheme as the training data, so that a test split that is too short is
# reported before any time is spent on training.
num_test_examples = len(test) - token_block_size
if num_test_examples <= 0:
    raise ValueError("test split is shorter than token_block_size; cannot build evaluation windows")
x_test = np.array([test[start:start + token_block_size] for start in range(num_test_examples)])
y_test = np.array([test[start + token_block_size] for start in range(num_test_examples)])

# Instantiate the model from the hyperparameters defined above.
model = Transformer(num_heads=num_heads, embed_dim=embed_dim, keyquery_dim=keyquery_dim, feedforward_dim=feedforward_dim, MAXTOKENS=token_block_size, num_blocks=num_blocks, vocab_size=vocab_size)

# The model emits raw logits, hence from_logits=True; targets are integer
# token ids, hence the sparse variant of the loss.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    metrics=["accuracy"]
)
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=10,
    verbose=1
)

# Score the model on the held-out split. Evaluating on x_train/y_train would
# only repeat the training metrics and say nothing about generalisation.
model.evaluate(x_test, y_test, batch_size=batch_size)