In [37]:
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from os import path
from math import sqrt
import plotly.express as px

In [3]:
class Vocabulary:
	def __init__(self):
		self.vocabulary = set()
		self.stoi = {'<N>':0}
		self.itos = {0:'<N>'}

	def add(self, v):
		if type(v) == str:
			self.vocabulary.add(v)
		elif type(v) == list:
			self.vocabulary = self.vocabulary.union(set(v)) 

	def create_mappings(self):
		self.stoi |= {v:i+len(self.stoi) for i, v in enumerate(self.vocabulary)}
		self.itos |= {i+len(self.itos):v for i, v in enumerate(self.vocabulary)}

	def encode(self, s): 
		return [self.stoi[c] for c in s]
	
	def decode(self, i): 
		return [self.itos[n] for n in i]
	

class PreProcessor:
	def __init__(self):
		self.english_vocabulary = Vocabulary()
		self.cherokee_vocabulary = Vocabulary()
		self.cherokee = []
		self.english = []
		self.max_length = 0
		self.count = 0

	def load_text(self, file_name):
		data, language = [], file_name.split('.')[0]

		with open(path.join('chr_en_data', file_name)) as f:
			for line in f.readlines():
				sentence = ['<S>'] + word_tokenize(line) + ['<E>']

				if language == 'en':
					self.english_vocabulary.add(sentence)
				else:
					self.cherokee_vocabulary.add(sentence)

				self.max_length = max(self.max_length, len(sentence))
				data.append(sentence)
				self.count += 1 
		return data
	
	def get_data(self):
		cherokee = self.load_text('chr.txt')
		english  = self.load_text('en.txt' )
		assert len(cherokee) == len(english)
		self.cherokee += cherokee
		self.english  += english

		return cherokee, english

	
	def create_tensors(self):
		self.english_vocabulary.create_mappings()
		self.cherokee_vocabulary.create_mappings()

		english  = torch.zeros(size=(self.count//2, self.max_length), dtype=int)
		cherokee = torch.zeros(size=(self.count//2, self.max_length), dtype=int)

		for i, sen in enumerate(self.english):
			for j, v in enumerate(self.english_vocabulary.encode(sen)):
				english[i, j] = v
		
		for i, sen in enumerate(self.cherokee):
			for j, v in enumerate(self.cherokee_vocabulary.encode(sen)):
				cherokee[i, j] = v

		self.cherokee, self.english = cherokee, english


preprocessor = PreProcessor()
preprocessor.get_data()
preprocessor.create_tensors()

test = word_tokenize('ᎤᎵᎦᎵᏴᎮᎢ ᎠᏴᏤᏂ ᏫᎵᎻ.')

assert preprocessor.cherokee_vocabulary.decode(preprocessor.cherokee_vocabulary.encode(test)) == test
assert preprocessor.english.shape == preprocessor.cherokee.shape

In [4]:
cherokee_vocab_size, english_vocab_size = len(preprocessor.cherokee_vocabulary.stoi), len(preprocessor.english_vocabulary.stoi)

print(f'Cherokee Vocabulary Size: {cherokee_vocab_size}')
print(f'English  Vocabulary Size: {english_vocab_size}')

Cherokee Vocabulary Size: 12790
English  Vocabulary Size: 6401


In [5]:
cherokee_in, english_in, expected_probabilities = [], [], []

for i, c in enumerate(preprocessor.cherokee):
	cherokee_tensor = torch.tensor(list(c))
	for j in range(1, preprocessor.max_length - 1):
		english_tensor = torch.zeros(preprocessor.max_length)
		english_tensor[:j] = preprocessor.english[i, :j]
		
		probability = torch.zeros(english_vocab_size)
		probability[preprocessor.english[i, j].item()] = 1
		if preprocessor.english[i, j].item() != 0:
			cherokee_in.append(cherokee_tensor)
			english_in.append(english_tensor)
			expected_probabilities.append(probability)


cherokee, english, expected_probabilities = torch.stack(cherokee_in).int(), torch.stack(english_in).int(), torch.stack(expected_probabilities).float()

print(cherokee.shape)
print(english.shape)
print(expected_probabilities.shape)

torch.Size([75406, 106])
torch.Size([75406, 106])
torch.Size([75406, 6401])


In [6]:
size = int(cherokee.shape[0])

train_cherokee = cherokee[:int(0.8*size)]
train_english  = english[:int(0.8*size)]
train_probabilities = expected_probabilities[:int(0.8*size)]

test_cherokee  = cherokee[int(0.8*size):int(0.9*size)]
test_english   = english[int(0.8*size):int(0.9*size)]
test_probabilities = expected_probabilities[int(0.8*size):int(0.9*size)]

val_cherokee   = cherokee[int(0.9*size):]
val_english    = english[int(0.9*size):]
val_probabilities = expected_probabilities[int(0.9*size):]

In [7]:
EMBEDDING_DIMENSIONS = 64
QKV_DIMENSIONS       = 64
SEQUENCE_LENGTH      = preprocessor.max_length
ATTENTION_HEADS      = 8
DECODERS             = 4
ENCODERS             = 4
BATCH_SIZE           = 256

In [8]:
def mask_tensor(t):
	mask = torch.tril(torch.ones(size=(t.shape)))
	mask[mask==0], mask[mask==1] = float('-inf'), 0
	
	return t + mask

test = torch.randn(SEQUENCE_LENGTH, SEQUENCE_LENGTH)
print(test)
print(mask_tensor(test))

tensor([[-1.1780,  1.4851, -1.3170,  ...,  1.0396, -0.1582, -0.3851],
        [ 1.2525, -1.6258, -0.0777,  ..., -0.7738,  0.5700, -1.1544],
        [ 0.7286,  0.4564, -1.0594,  ...,  0.4053, -0.8341,  0.5077],
        ...,
        [ 2.1043,  1.0811, -0.2091,  ...,  1.3817,  0.1229,  0.2036],
        [-0.9733,  0.2396, -1.1271,  ...,  0.0310,  0.1468, -2.1297],
        [-1.1083,  0.8323,  0.4088,  ..., -0.9980, -0.9937,  0.0900]])
tensor([[-1.1780,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
        [ 1.2525, -1.6258,    -inf,  ...,    -inf,    -inf,    -inf],
        [ 0.7286,  0.4564, -1.0594,  ...,    -inf,    -inf,    -inf],
        ...,
        [ 2.1043,  1.0811, -0.2091,  ...,  1.3817,    -inf,    -inf],
        [-0.9733,  0.2396, -1.1271,  ...,  0.0310,  0.1468,    -inf],
        [-1.1083,  0.8323,  0.4088,  ..., -0.9980, -0.9937,  0.0900]])


In [9]:
class AttentionHead(nn.Module):
	def __init__(self, masked=False):
		super().__init__()
		self.obtain_key   = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_query = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_value = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.masked = masked

	def forward(self, data, encoder_output=None):
		if encoder_output is None: Q, K, V = self.obtain_query(data), self.obtain_key(data), self.obtain_value(data)
		else: Q, K, V = self.obtain_query(data), self.obtain_key(encoder_output), self.obtain_value(encoder_output)
		mat_mul = Q @ K.transpose(-2, -1)
		scaled_mat_mul = mat_mul / sqrt(QKV_DIMENSIONS)
		if self.masked: scaled_mat_mul = mask_tensor(scaled_mat_mul)
		softmax_mat_mul = torch.softmax(scaled_mat_mul, dim=-1)
		output = softmax_mat_mul @ V

		return output

test, e_output = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)), torch.randn(size=(SEQUENCE_LENGTH, QKV_DIMENSIONS))
test_module = AttentionHead()
print(test_module(test, encoder_output=e_output))



tensor([[[-3.0545e-02, -5.5631e-02,  1.5919e-01,  ..., -5.1795e-02,
           1.9747e-01, -3.6008e-02],
         [-7.4673e-02, -5.7112e-02,  1.6675e-01,  ..., -5.1271e-02,
           1.7088e-01,  3.2525e-02],
         [-1.5145e-01, -3.6001e-02,  2.2089e-01,  ..., -7.2068e-02,
           2.3880e-01,  5.7365e-02],
         ...,
         [-3.2407e-02, -5.0771e-02,  1.4736e-01,  ..., -4.8669e-02,
           1.4239e-01,  6.0525e-02],
         [-4.7254e-02, -5.2809e-03,  2.3061e-01,  ..., -9.7871e-02,
           2.0430e-01,  6.5788e-02],
         [-5.3027e-02, -4.7955e-02,  1.2685e-01,  ..., -7.8321e-02,
           2.2337e-01, -4.7087e-02]],

        [[-7.8454e-02, -2.1723e-02,  1.7472e-01,  ..., -4.8120e-02,
           2.0975e-01,  4.8083e-02],
         [-4.4241e-02, -1.2681e-02,  1.7373e-01,  ..., -9.8478e-02,
           2.0480e-01,  3.3493e-02],
         [-6.3327e-02, -2.4110e-02,  1.4606e-01,  ..., -7.6299e-02,
           2.2843e-01, -2.1622e-02],
         ...,
         [-1.2856e-01, -2

In [10]:
class MultiHeadedAttention(nn.Module):
	def __init__(self, masked=False):
		super().__init__()
		self.heads  = [AttentionHead(masked=masked) for _ in range(ATTENTION_HEADS)]
		self.linear = nn.Linear(EMBEDDING_DIMENSIONS*ATTENTION_HEADS, EMBEDDING_DIMENSIONS)

	def forward(self, data, encoder_output=None):
		vectors = torch.cat([head(data, encoder_output=encoder_output) for head in self.heads], dim=-1)
		return self.linear(vectors)
	

test = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
test_module = MultiHeadedAttention()
print(test_module(test))


tensor([[[ 0.0097,  0.0286,  0.0925,  ...,  0.1253,  0.0720, -0.0702],
         [ 0.0121,  0.0350,  0.1462,  ...,  0.0872,  0.0444, -0.0661],
         [-0.0182,  0.0154,  0.1256,  ...,  0.0684,  0.0717, -0.0280],
         ...,
         [ 0.0106,  0.0477,  0.0930,  ...,  0.0838,  0.0693, -0.0475],
         [ 0.0325,  0.0539,  0.1172,  ...,  0.0770,  0.0422, -0.0317],
         [ 0.0058,  0.0198,  0.0914,  ...,  0.0942,  0.0651, -0.0626]],

        [[ 0.0494,  0.0927,  0.0940,  ...,  0.0379,  0.0245, -0.0365],
         [ 0.0501,  0.0349,  0.1080,  ...,  0.0564,  0.0592, -0.0576],
         [ 0.0510,  0.0658,  0.0927,  ...,  0.0650,  0.0624, -0.0588],
         ...,
         [ 0.0476,  0.0587,  0.0786,  ...,  0.0830,  0.0597, -0.0796],
         [ 0.0694,  0.0671,  0.1079,  ...,  0.0392,  0.0493, -0.0666],
         [ 0.0753,  0.0783,  0.1124,  ...,  0.0462,  0.0639, -0.0583]],

        [[-0.0325,  0.0825,  0.0793,  ...,  0.1119,  0.0321,  0.0648],
         [-0.0338,  0.0746,  0.0575,  ...,  0

In [11]:
class FeedForward(nn.Module):
	def __init__(self):
		super().__init__()
		self.network = nn.Sequential(
			nn.Linear(EMBEDDING_DIMENSIONS, EMBEDDING_DIMENSIONS),
			nn.ReLU(),
			nn.Linear(EMBEDDING_DIMENSIONS, EMBEDDING_DIMENSIONS)
		)

	def forward(self, data):
		return self.network(data)

test = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
test_module = FeedForward()
print(test_module(test))


tensor([[[ 2.4419e-01,  6.5899e-02, -2.7589e-01,  ..., -1.4690e-01,
           2.2770e-01,  9.7680e-02],
         [ 1.4047e-01,  8.2262e-02, -4.1861e-01,  ...,  5.4490e-02,
          -9.1217e-02,  2.6626e-01],
         [ 8.7623e-02,  5.8929e-02, -1.9815e-01,  ..., -1.5461e-01,
           1.5912e-01,  2.2302e-01],
         ...,
         [ 1.0174e-01,  9.6830e-02, -3.6062e-01,  ..., -1.8408e-01,
           2.4727e-01,  5.0433e-01],
         [ 3.4881e-01, -1.6631e-01, -5.9349e-01,  ..., -2.7988e-01,
           2.8831e-01,  3.9413e-01],
         [ 2.5956e-01,  3.4238e-01, -4.0262e-01,  ..., -1.0237e-01,
          -4.5205e-01,  4.2216e-01]],

        [[ 4.1898e-01, -9.2823e-02, -5.3060e-01,  ..., -6.1057e-01,
           2.5743e-01,  2.1742e-02],
         [-1.5915e-01,  3.3048e-02, -4.0627e-01,  ..., -2.2890e-01,
           3.2257e-01,  2.7149e-01],
         [-3.9316e-02,  1.4449e-01, -2.6642e-01,  ..., -2.9766e-01,
          -3.0109e-02,  9.9205e-02],
         ...,
         [ 9.3181e-02,  3

In [12]:
class Encoder(nn.Module):
	def __init__(self):
		super().__init__()
		self.multi_headed_attention = MultiHeadedAttention()
		self.norm1 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.feed_forward = FeedForward()
		self.norm2 = nn.LayerNorm(EMBEDDING_DIMENSIONS)

	def forward(self, data):
		attention_vectors = self.multi_headed_attention(data)
		normalised = self.norm1(attention_vectors) + data # Residual Connection
		fed_through = self.feed_forward(normalised)
		normalised_2 = self.norm2(fed_through) + normalised

		return normalised_2
	
test = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
print(test)
test_module = Encoder()
print(test_module(test))

tensor([[[-2.8996e-01, -1.6537e+00,  1.2328e+00,  ..., -1.2368e-01,
          -1.7143e+00, -6.2563e-01],
         [-1.3953e-01, -4.3369e-01, -1.5503e-01,  ...,  1.8496e+00,
           3.1170e-01, -9.5862e-01],
         [-4.1079e-01,  1.0137e+00,  9.1881e-01,  ...,  6.7879e-01,
          -6.1745e-01,  1.0037e+00],
         ...,
         [-1.6973e+00, -9.8921e-01, -2.3452e-01,  ...,  9.9356e-01,
          -5.4330e-01,  5.8571e-01],
         [ 1.0433e+00,  2.9293e-01, -1.6109e+00,  ...,  6.6787e-01,
           3.7960e-02,  1.5968e+00],
         [-1.0673e+00,  4.8107e-01,  1.5464e+00,  ...,  2.3817e-01,
          -2.1027e+00,  1.1608e+00]],

        [[-6.4214e-01, -1.3467e-01,  4.8817e-01,  ...,  9.3230e-01,
          -1.4842e+00,  1.6568e+00],
         [-1.6590e+00, -6.7137e-01, -1.3637e-01,  ..., -1.9541e-03,
          -6.0459e-01, -3.3372e+00],
         [ 9.7985e-01, -1.1091e+00,  1.6497e+00,  ..., -1.0765e-01,
          -1.8404e+00,  2.4165e-01],
         ...,
         [-4.2877e-01, -1

In [13]:
class Decoder(nn.Module):
	def __init__(self):
		super().__init__()
		self.masked_attention = MultiHeadedAttention(masked=True)
		self.norm1 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.cross_attention  = MultiHeadedAttention()
		self.norm2 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.feed_forward = FeedForward()
		self.norm3 = nn.LayerNorm(EMBEDDING_DIMENSIONS)

	def forward(self, data, encoder_output):
		attention_vectors = self.masked_attention(data)
		normalised = self.norm1(attention_vectors) + data
		cross_attention = self.cross_attention(normalised, encoder_output=encoder_output)
		normalised = self.norm2(cross_attention) + normalised
		linear = self.feed_forward(normalised)
		normalised = self.norm3(linear) + normalised

		return normalised


test = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
encoder_output = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, QKV_DIMENSIONS))
test_module = Decoder()
print(test_module(test, encoder_output))

tensor([[[-2.7892e+00,  5.6026e-01, -8.0965e-01,  ..., -3.1768e+00,
           3.5851e+00, -4.1023e+00],
         [-2.0228e+00,  2.2488e+00,  1.8469e-01,  ..., -3.1232e+00,
           2.8538e-02, -1.4859e+00],
         [-1.1687e+00,  1.4930e+00, -1.3627e+00,  ...,  3.2546e-01,
          -7.8101e-01, -3.2138e+00],
         ...,
         [-2.6029e+00, -3.0364e-01,  1.2680e+00,  ...,  9.5971e-02,
          -2.2976e+00, -2.7613e+00],
         [-7.7358e-01,  3.2063e+00,  6.5864e-01,  ..., -7.8532e-02,
          -2.3526e+00, -9.0300e-01],
         [-4.8518e-01,  1.1057e+00,  1.4585e+00,  ..., -3.6011e-01,
          -7.5788e-01, -2.3041e+00]],

        [[-1.2609e+00,  1.0717e+00,  6.4770e-01,  ...,  1.4167e+00,
           1.3495e+00, -1.0517e+00],
         [-1.3596e+00,  2.9197e+00,  3.0889e+00,  ...,  1.3776e+00,
           1.2732e+00, -1.6888e+00],
         [ 8.8238e-02,  2.1040e+00,  2.0326e+00,  ..., -5.5462e-01,
          -4.6152e-01, -2.1493e+00],
         ...,
         [-2.2370e+00,  4

In [22]:
class Transformer(nn.Module):
	def __init__(self):
		super().__init__()
		self.cherokee_embeddings = nn.Embedding(cherokee_vocab_size, EMBEDDING_DIMENSIONS)
		self.cherokee_positional_encodings = nn.Embedding(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)
		self.encoders = [Encoder() for _ in range(ENCODERS)]
		self.decoders = [Decoder() for _ in range(DECODERS)]
		self.english_embeddings = nn.Embedding(english_vocab_size, EMBEDDING_DIMENSIONS)
		self.english_positional_encodings = nn.Embedding(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)
		self.linear   = nn.Linear(SEQUENCE_LENGTH, english_vocab_size) 

	def forward(self, source, target_context):
		encoder_output = self.cherokee_embeddings(source) + self.cherokee_positional_encodings(torch.tensor([i for i in range(SEQUENCE_LENGTH)]))

		for encoder in self.encoders:
			encoder_output = encoder(encoder_output)

		decoder_output = self.english_embeddings(target_context) + self.english_positional_encodings(torch.tensor([i for i in range(SEQUENCE_LENGTH)]))

		for decoder in self.decoders:
			decoder_output = decoder(decoder_output, encoder_output=encoder_output)

		decoder_output = torch.mean(decoder_output, dim=-1)
		logits = self.linear(decoder_output)
		probabilities = torch.softmax(logits, dim=-1)

		return probabilities
	
cherokee_input = torch.randint(low=0, high=cherokee_vocab_size-1, size=(BATCH_SIZE, SEQUENCE_LENGTH,))
target_context = torch.zeros(size=(BATCH_SIZE, SEQUENCE_LENGTH,), dtype=int)
print(cherokee_input.shape, target_context.shape)
model = Transformer()
print(model(cherokee_input, target_context).shape)

torch.Size([256, 106]) torch.Size([256, 106])
torch.Size([256, 6401])


In [65]:
class Interface:
	def __init__(self, learning_rate, model):
		self.model = model
		self.learning_rate = learning_rate
		self.optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
		self.losses = []

	def get_batch(self, english, cherokee, probabilities):
		indexes = torch.randint(0, english.shape[0], size=(BATCH_SIZE,))
		english  = torch.stack([english[i] for i in indexes])
		cherokee = torch.stack([cherokee[i] for i in indexes])
		probabilities = torch.stack([probabilities[i] for i in indexes])

		return english, cherokee, probabilities
	
	def pass_batch(self, english, cherokee, probabilities):
		english, cherokee, probabilities = self.get_batch(english, cherokee, probabilities)
		logits = self.model(cherokee, english)
		loss = nn.functional.cross_entropy(logits, probabilities.float())

		return logits, loss
	
	def plot_loss(self):
		fig = px.line(x=range(len(self.losses)), y=self.losses, title='Line Graph of Losses')
		fig.update_xaxes(title_text='Data Point Index')
		fig.update_yaxes(title_text='Loss Values')
		fig.show()
	
	def train(self, epochs):
		for i in range(epochs):
			logits, loss = self.pass_batch(train_english, train_cherokee, train_probabilities)
			self.optimizer.zero_grad(set_to_none=True)
			loss.backward()
			self.optimizer.step()

			self.losses.append(loss.item())


In [67]:
model = Transformer()
trainer = Interface(learning_rate=0.05, model=model)
trainer.train(100)
trainer.plot_loss()

tensor(8.7642, grad_fn=<DivBackward1>)
tensor(8.7639, grad_fn=<DivBackward1>)
tensor(8.7620, grad_fn=<DivBackward1>)
tensor(8.7551, grad_fn=<DivBackward1>)
tensor(8.7407, grad_fn=<DivBackward1>)
tensor(8.7152, grad_fn=<DivBackward1>)
