In [1]:
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from os import path
from math import sqrt

In [2]:
class Vocabulary:
	def __init__(self):
		self.vocabulary = set()
		self.stoi = {'<N>':0}
		self.itos = {0:'<N>'}

	def add(self, v):
		if type(v) == str:
			self.vocabulary.add(v)
		elif type(v) == list:
			self.vocabulary = self.vocabulary.union(set(v)) 

	def create_mappings(self):
		self.stoi |= {v:i+len(self.stoi) for i, v in enumerate(self.vocabulary)}
		self.itos |= {i+len(self.itos):v for i, v in enumerate(self.vocabulary)}

	def encode(self, s): 
		return [self.stoi[c] for c in s]
	
	def decode(self, i): 
		return [self.itos[n] for n in i]
	

class PreProcessor:
	def __init__(self):
		self.english_vocabulary = Vocabulary()
		self.cherokee_vocabulary = Vocabulary()
		self.cherokee = []
		self.english = []
		self.max_length = 0
		self.count = 0

	def load_text(self, file_name):
		data, language = [], file_name.split('.')[0]

		with open(path.join('chr_en_data', file_name)) as f:
			for line in f.readlines():
				sentence = ['<S>'] + word_tokenize(line) + ['<E>']

				if language == 'en':
					self.english_vocabulary.add(sentence)
				else:
					self.cherokee_vocabulary.add(sentence)

				self.max_length = max(self.max_length, len(sentence))
				data.append(sentence)
				self.count += 1 
		return data
	
	def get_data(self):
		cherokee = self.load_text('chr.txt')
		english  = self.load_text('en.txt' )
		assert len(cherokee) == len(english)
		self.cherokee += cherokee
		self.english  += english

		return cherokee, english

	
	def create_tensors(self):
		self.english_vocabulary.create_mappings()
		self.cherokee_vocabulary.create_mappings()

		english  = torch.zeros(size=(self.count//2, self.max_length), dtype=int)
		cherokee = torch.zeros(size=(self.count//2, self.max_length), dtype=int)

		for i, sen in enumerate(self.english):
			for j, v in enumerate(self.english_vocabulary.encode(sen)):
				english[i, j] = v
		
		for i, sen in enumerate(self.cherokee):
			for j, v in enumerate(self.cherokee_vocabulary.encode(sen)):
				cherokee[i, j] = v

		self.cherokee, self.english = cherokee, english


preprocessor = PreProcessor()
preprocessor.get_data()
preprocessor.create_tensors()

test = word_tokenize('ᎤᎵᎦᎵᏴᎮᎢ ᎠᏴᏤᏂ ᏫᎵᎻ.')

assert preprocessor.cherokee_vocabulary.decode(preprocessor.cherokee_vocabulary.encode(test)) == test
assert preprocessor.english.shape == preprocessor.cherokee.shape

In [3]:
cherokee_vocab_size, english_vocab_size = len(preprocessor.cherokee_vocabulary.stoi), len(preprocessor.english_vocabulary.stoi)

print(f'Cherokee Vocabulary Size: {cherokee_vocab_size}')
print(f'English  Vocabulary Size: {english_vocab_size}')

Cherokee Vocabulary Size: 12790
English  Vocabulary Size: 6401


In [75]:
cherokee_in, english_in, expected_probabilities = [], [], []

for i, c in enumerate(preprocessor.cherokee):
	cherokee_tensor = torch.tensor(list(c))
	for j in range(1, preprocessor.max_length - 1):
		english_tensor = torch.zeros(preprocessor.max_length)
		english_tensor[:j] = preprocessor.english[i, :j]
		
		probability = torch.zeros(english_vocab_size)
		probability[preprocessor.english[i, j].item()] = 1
		if preprocessor.english[i, j].item() != 0:
			cherokee_in.append(cherokee_tensor)
			english_in.append(english_tensor)
			expected_probabilities.append(probability)


cherokee, english, expected_probabilities = torch.stack(cherokee_in).int(), torch.stack(english_in).int(), torch.stack(expected_probabilities).float()

print(cherokee.shape)
print(english.shape)
print(expected_probabilities.shape)

torch.Size([75406, 106])
torch.Size([75406, 106])
torch.Size([75406, 6401])


In [76]:
size = int(cherokee.shape[0])

train_cherokee = cherokee[:int(0.8*size)]
train_english  = english[:int(0.8*size)]
train_probabilities = expected_probabilities[:int(0.8*size)]

test_cherokee  = cherokee[int(0.8*size):int(0.9*size)]
test_english   = english[int(0.8*size):int(0.9*size)]
test_probabilities = expected_probabilities[int(0.8*size):int(0.9*size)]

val_cherokee   = cherokee[int(0.9*size):]
val_english    = english[int(0.9*size):]
val_probabilities = expected_probabilities[int(0.9*size):]

In [68]:
print(train_probabilities)

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32)


In [152]:
EMBEDDING_DIMENSIONS = 64
QKV_DIMENSIONS       = 64
SEQUENCE_LENGTH      = preprocessor.max_length
ATTENTION_HEADS      = 4
DECODERS             = 4
ENCODERS             = 4
BATCH_SIZE           = 128

In [14]:
def mask_tensor(t):
	mask = torch.tril(torch.ones(size=(t.shape)))
	mask[mask==0], mask[mask==1] = float('-inf'), 0
	
	return t + mask

test = torch.randn(SEQUENCE_LENGTH, SEQUENCE_LENGTH)
print(test)
print(mask_tensor(test))

tensor([[-1.0723, -1.3801,  0.0585,  ..., -0.4914,  0.5458, -0.0301],
        [-0.2896, -1.0054,  0.6750,  ...,  0.0719, -0.1699,  0.0991],
        [-0.2460,  1.2158,  0.6026,  ..., -0.5683, -0.3942, -1.3164],
        ...,
        [-0.4786, -0.1223, -0.6923,  ...,  0.1017,  0.1417, -0.1078],
        [-2.2747, -0.4283,  0.7588,  ...,  0.5555,  0.4130, -0.6876],
        [-1.1190,  1.0133, -0.8959,  ..., -0.8756, -0.1275, -0.6334]])
tensor([[-1.0723,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
        [-0.2896, -1.0054,    -inf,  ...,    -inf,    -inf,    -inf],
        [-0.2460,  1.2158,  0.6026,  ...,    -inf,    -inf,    -inf],
        ...,
        [-0.4786, -0.1223, -0.6923,  ...,  0.1017,    -inf,    -inf],
        [-2.2747, -0.4283,  0.7588,  ...,  0.5555,  0.4130,    -inf],
        [-1.1190,  1.0133, -0.8959,  ..., -0.8756, -0.1275, -0.6334]])


In [15]:
class AttentionHead(nn.Module):
	def __init__(self, masked=False):
		super().__init__()
		self.obtain_key   = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_query = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_value = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.masked = masked

	def forward(self, data, encoder_output=None):
		if encoder_output is None: Q, K, V = self.obtain_query(data), self.obtain_key(data), self.obtain_value(data)
		else: Q, K, V = self.obtain_query(data), self.obtain_key(encoder_output), self.obtain_value(encoder_output)
		mat_mul = Q @ K.transpose(-2, -1)
		scaled_mat_mul = mat_mul / sqrt(QKV_DIMENSIONS)
		if self.masked: scaled_mat_mul = mask_tensor(scaled_mat_mul)
		softmax_mat_mul = torch.softmax(scaled_mat_mul, dim=-1)
		output = softmax_mat_mul @ V

		return output

test, e_output = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)), torch.randn(size=(SEQUENCE_LENGTH, QKV_DIMENSIONS))
test_module = AttentionHead()
print(test_module(test, encoder_output=e_output))



tensor([[[ 0.0989, -0.0104,  0.1618,  ..., -0.0822, -0.0214, -0.1309],
         [ 0.1341, -0.1206,  0.1161,  ..., -0.0746,  0.0038, -0.1836],
         [ 0.0848, -0.0751,  0.1078,  ..., -0.0734, -0.0183, -0.1206],
         ...,
         [ 0.0727, -0.1055,  0.0839,  ..., -0.0682, -0.0305, -0.1725],
         [ 0.1381, -0.1461,  0.0984,  ..., -0.0862, -0.0456, -0.1389],
         [ 0.1092, -0.0757,  0.1450,  ..., -0.0619, -0.0086, -0.1633]],

        [[ 0.0615, -0.0477,  0.0428,  ..., -0.0960, -0.0376, -0.1159],
         [ 0.0571, -0.0400,  0.1615,  ..., -0.0946,  0.0134, -0.1306],
         [ 0.1336, -0.0750,  0.1912,  ..., -0.0752,  0.0421, -0.1661],
         ...,
         [ 0.1189, -0.0608,  0.1153,  ..., -0.1013, -0.0057, -0.1614],
         [ 0.1168, -0.0460,  0.1086,  ..., -0.0842, -0.0174, -0.1716],
         [ 0.1043, -0.0606,  0.1712,  ..., -0.1196,  0.0169, -0.1200]],

        [[ 0.1384, -0.0572,  0.1658,  ..., -0.0829,  0.0125, -0.1179],
         [ 0.1133, -0.1130,  0.0573,  ..., -0

In [153]:
class MultiHeadedAttention(nn.Module):
	def __init__(self, masked=False):
		super().__init__()
		self.heads  = [AttentionHead(masked=masked) for _ in range(ATTENTION_HEADS)]
		self.linear = nn.Linear(EMBEDDING_DIMENSIONS*ATTENTION_HEADS, EMBEDDING_DIMENSIONS)

	def forward(self, data, encoder_output=None):
		vectors = torch.cat([head(data, encoder_output=encoder_output) for head in self.heads], dim=-1)
		return self.linear(vectors)
	

test = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
test_module = MultiHeadedAttention()
print(test_module(test))


tensor([[[ 0.0532,  0.0134,  0.1116,  ...,  0.0125,  0.0692,  0.0140],
         [ 0.0386,  0.0033,  0.1562,  ...,  0.0127,  0.0414,  0.0259],
         [ 0.0296,  0.0150,  0.1250,  ..., -0.0055,  0.0707,  0.0118],
         ...,
         [ 0.0437, -0.0078,  0.0705,  ..., -0.0169,  0.0904, -0.0044],
         [ 0.0256,  0.0057,  0.1296,  ..., -0.0132,  0.0663, -0.0025],
         [ 0.0096, -0.0156,  0.0901,  ..., -0.0201,  0.0468, -0.0008]],

        [[-0.0643,  0.0159,  0.0075,  ..., -0.0311,  0.0794,  0.0495],
         [-0.0438,  0.0520, -0.0041,  ...,  0.0014,  0.1282, -0.0014],
         [-0.0367,  0.0219,  0.0074,  ..., -0.0519,  0.1044,  0.0416],
         ...,
         [-0.0579,  0.0092,  0.0304,  ..., -0.0280,  0.1106,  0.0095],
         [-0.0696,  0.0053, -0.0124,  ..., -0.0529,  0.0528,  0.0436],
         [-0.0518,  0.0267,  0.0208,  ..., -0.0157,  0.1136,  0.0484]],

        [[-0.0551,  0.0543,  0.0953,  ..., -0.0137,  0.0985,  0.0387],
         [-0.0426,  0.0377,  0.0651,  ..., -0

In [154]:
class FeedForward(nn.Module):
	def __init__(self):
		super().__init__()
		self.network = nn.Sequential(
			nn.Linear(EMBEDDING_DIMENSIONS, EMBEDDING_DIMENSIONS),
			nn.ReLU(),
			nn.Linear(EMBEDDING_DIMENSIONS, EMBEDDING_DIMENSIONS)
		)

	def forward(self, data):
		return self.network(data)

test = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
test_module = FeedForward()
print(test_module(test))


tensor([[[-7.1829e-02, -4.0354e-02,  2.8878e-03,  ..., -1.2242e-01,
           9.5087e-02,  4.0177e-01],
         [ 1.9630e-02, -8.6136e-03, -1.8324e-01,  ...,  1.7885e-03,
           8.6181e-02,  7.6331e-02],
         [-6.2632e-02,  2.2289e-01, -1.9254e-01,  ..., -1.8628e-01,
           3.6278e-01,  1.3920e-01],
         ...,
         [-1.3837e-01, -5.0365e-02,  2.5320e-01,  ..., -9.7182e-02,
           2.9252e-02,  1.1883e-01],
         [ 3.6318e-01, -1.4389e-02,  1.4974e-01,  ..., -1.5681e-01,
           2.2288e-01, -1.2608e-01],
         [ 3.4192e-02, -6.1767e-02,  3.1173e-02,  ...,  8.6230e-02,
          -2.6353e-02, -5.5832e-02]],

        [[ 3.9902e-01,  3.0212e-01,  3.9483e-02,  ..., -1.9250e-01,
           3.0337e-01,  3.9445e-02],
         [ 4.4717e-02,  1.1588e-01, -1.1071e-01,  ..., -8.8618e-02,
           8.8362e-02, -9.9734e-02],
         [ 7.6705e-02, -5.0363e-02,  4.4998e-02,  ..., -1.3614e-02,
           4.6928e-01, -1.7085e-01],
         ...,
         [ 1.8555e-01, -9

In [155]:
class Encoder(nn.Module):
	def __init__(self):
		super().__init__()
		self.multi_headed_attention = MultiHeadedAttention()
		self.norm1 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.feed_forward = FeedForward()
		self.norm2 = nn.LayerNorm(EMBEDDING_DIMENSIONS)

	def forward(self, data):
		attention_vectors = self.multi_headed_attention(data)
		normalised = self.norm1(attention_vectors) + data # Residual Connection
		fed_through = self.feed_forward(normalised)
		normalised_2 = self.norm2(fed_through) + normalised

		return normalised_2
	
test = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
print(test)
test_module = Encoder()
print(test_module(test))

tensor([[[ 2.1378e+00,  2.2594e+00, -8.9774e-01,  ..., -3.9347e-01,
          -1.3277e-01, -1.1951e-01],
         [-1.1793e+00, -4.7634e-01,  7.8327e-01,  ...,  6.8740e-01,
           9.9575e-02, -1.1034e-01],
         [ 1.1214e+00,  9.1023e-01, -9.4066e-01,  ..., -4.2511e-01,
          -2.5348e-01, -6.8145e-01],
         ...,
         [ 1.2117e+00,  6.7151e-01,  9.5772e-01,  ..., -1.0077e-01,
           1.0680e-01, -1.4099e+00],
         [ 1.8021e+00,  2.0497e-01,  2.2148e+00,  ..., -1.1103e-01,
           8.5511e-01,  1.2522e+00],
         [ 5.0089e-02, -5.1241e-01, -1.2340e+00,  ..., -3.7484e-01,
          -2.7225e-02,  5.5570e-01]],

        [[-1.3226e+00, -6.2207e-01,  5.3128e-01,  ...,  2.4155e-02,
           1.4907e+00, -9.8801e-02],
         [ 1.0117e-02, -1.2984e+00, -1.8277e+00,  ..., -1.7832e-01,
          -2.6286e-01,  1.2925e+00],
         [ 5.8895e-01,  5.8580e-01, -3.5561e-02,  ...,  2.8899e-01,
           9.9369e-01, -1.3183e-01],
         ...,
         [-2.0359e+00,  1

In [156]:
class Decoder(nn.Module):
	def __init__(self):
		super().__init__()
		self.masked_attention = MultiHeadedAttention(masked=True)
		self.norm1 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.cross_attention  = MultiHeadedAttention()
		self.norm2 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.feed_forward = FeedForward()
		self.norm3 = nn.LayerNorm(EMBEDDING_DIMENSIONS)

	def forward(self, data, encoder_output):
		attention_vectors = self.masked_attention(data)
		normalised = self.norm1(attention_vectors) + data
		cross_attention = self.cross_attention(normalised, encoder_output=encoder_output)
		normalised = self.norm2(cross_attention) + normalised
		linear = self.feed_forward(normalised)
		normalised = self.norm3(linear) + normalised

		return normalised


test = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
encoder_output = torch.randn(size=(BATCH_SIZE, SEQUENCE_LENGTH, QKV_DIMENSIONS))
test_module = Decoder()
print(test_module(test, encoder_output))

tensor([[[-1.2642,  5.4909,  2.0919,  ..., -1.5268,  2.8451,  0.6862],
         [ 1.0624,  2.7324,  3.4598,  ...,  1.3091,  2.2959,  2.8245],
         [-0.0172,  5.6192,  2.8797,  ...,  1.8983,  2.1877,  2.6392],
         ...,
         [ 0.1248,  2.6615,  3.6154,  ...,  2.2346,  0.0211,  0.2310],
         [-0.3420,  4.3324,  0.6619,  ...,  1.5645,  1.3461,  0.3359],
         [ 3.0530,  2.8983,  2.4913,  ..., -0.6784, -2.1746,  0.4604]],

        [[-1.2555,  4.3146,  1.0621,  ..., -0.9031, -3.2124,  1.0194],
         [-1.8580,  3.3886, -1.3947,  ...,  3.2414, -3.8878,  0.8742],
         [-2.1807,  2.9099,  2.9216,  ...,  3.9311, -1.1293,  0.5376],
         ...,
         [ 0.4995, -0.1362,  1.6128,  ...,  4.9181, -3.7591,  0.3999],
         [-0.1896,  2.5756,  0.2065,  ...,  4.1975, -1.3033, -0.0813],
         [-2.3787, -0.5187,  0.9549,  ...,  3.8166, -3.7047,  0.1514]],

        [[-0.2118, -1.0157, -0.7257,  ...,  3.9676, -0.3843,  4.3723],
         [-2.7645,  1.0665, -1.7705,  ...,  2

In [157]:
class Transformer(nn.Module):
	def __init__(self):
		super().__init__()
		self.cherokee_embeddings = nn.Embedding(cherokee_vocab_size, EMBEDDING_DIMENSIONS)
		self.cherokee_positional_encodings = nn.Embedding(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)
		self.encoders = [Encoder() for _ in range(ENCODERS)]
		self.decoders = [Decoder() for _ in range(DECODERS)]
		self.english_embeddings = nn.Embedding(english_vocab_size, EMBEDDING_DIMENSIONS)
		self.english_positional_encodings = nn.Embedding(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)
		self.linear   = nn.Linear(SEQUENCE_LENGTH, english_vocab_size) 

	def forward(self, source, target_context):
		encoder_output = self.cherokee_embeddings(source) + self.cherokee_positional_encodings(torch.tensor([i for i in range(SEQUENCE_LENGTH)]))
		for encoder in self.encoders:
			encoder_output = encoder(encoder_output)

		decoder_output = self.english_embeddings(target_context) + self.english_positional_encodings(torch.tensor([i for i in range(SEQUENCE_LENGTH)]))

		for decoder in self.decoders:
			decoder_output = decoder(decoder_output, encoder_output=encoder_output)
			
		decoder_output = torch.mean(decoder_output, dim=-1)
		logits = self.linear(decoder_output)
		probabilities = torch.softmax(logits, dim=-1)

		return probabilities
	
cherokee_input = torch.randint(low=0, high=cherokee_vocab_size-1, size=(BATCH_SIZE, SEQUENCE_LENGTH,))
target_context = torch.zeros(size=(BATCH_SIZE, SEQUENCE_LENGTH,), dtype=int)
print(cherokee_input.shape, target_context.shape)
model = Transformer()
print(model(cherokee_input, target_context).shape)

torch.Size([128, 106]) torch.Size([128, 106])
torch.Size([128, 6401])


In [158]:
class Trainer:
	def __init__(self, learning_rate, model):
		self.model = model
		self.learning_rate = learning_rate
		self.optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

	def get_batch(self, english, cherokee, probabilities):
		indexes = torch.randint(0, english.shape[0], size=(BATCH_SIZE,))
		english  = torch.stack([english[i] for i in indexes])
		cherokee = torch.stack([cherokee[i] for i in indexes])
		probabilities = torch.stack([probabilities[i] for i in indexes])

		return english, cherokee, probabilities
	
	def pass_batch(self, english, cherokee, probabilities):
		english, cherokee, probabilities = self.get_batch(english, cherokee, probabilities)
		logits = self.model(cherokee, english)
		loss = nn.functional.cross_entropy(logits, probabilities.float())

		return logits, loss
	
	def train(self, epochs):
		for i in range(epochs):
			logits, loss = self.pass_batch(train_english, train_cherokee, train_probabilities)
			self.optimizer.zero_grad(set_to_none=True)
			loss.backward()
			self.optimizer.step()

			#for param in self.model.parameters():
			#	print(param.grad.data.sum())
			
			print(loss)

In [159]:
model = Transformer()
trainer = Trainer(learning_rate=0.2, model=model)
trainer.train(100)

tensor(8.7642, grad_fn=<DivBackward1>)
tensor(8.7630, grad_fn=<DivBackward1>)
tensor(8.7401, grad_fn=<DivBackward1>)
tensor(8.7269, grad_fn=<DivBackward1>)
tensor(8.7279, grad_fn=<DivBackward1>)
tensor(8.7224, grad_fn=<DivBackward1>)
tensor(8.6864, grad_fn=<DivBackward1>)
tensor(8.7098, grad_fn=<DivBackward1>)
tensor(8.6864, grad_fn=<DivBackward1>)
tensor(8.6707, grad_fn=<DivBackward1>)
tensor(8.7176, grad_fn=<DivBackward1>)
tensor(8.7176, grad_fn=<DivBackward1>)
tensor(8.7176, grad_fn=<DivBackward1>)
tensor(8.7176, grad_fn=<DivBackward1>)
tensor(8.7176, grad_fn=<DivBackward1>)
tensor(8.6317, grad_fn=<DivBackward1>)
tensor(8.6864, grad_fn=<DivBackward1>)
tensor(8.7176, grad_fn=<DivBackward1>)
tensor(8.7098, grad_fn=<DivBackward1>)
tensor(8.7020, grad_fn=<DivBackward1>)
tensor(8.7176, grad_fn=<DivBackward1>)
tensor(8.6942, grad_fn=<DivBackward1>)
tensor(8.7098, grad_fn=<DivBackward1>)
tensor(8.6864, grad_fn=<DivBackward1>)
tensor(8.6942, grad_fn=<DivBackward1>)
tensor(8.6942, grad_fn=<D

KeyboardInterrupt: 