In [47]:
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from os import path
from math import sqrt

In [102]:
class Vocabulary:
	def __init__(self):
		self.vocabulary = set('')
		self.stoi = {'<N>':0, '<S>': 1, '<E>': 2}
		self.itos = {0:'<N>', 1:'<S>', 2: '<E>'}

	def add(self, v):
		if type(v) == str:
			self.vocabulary.add(v)
		elif type(v) == list:
			self.vocabulary = self.vocabulary.union(set(v)) 

	def create_mappings(self):
		self.stoi = {v:i+1 for i, v in enumerate(self.vocabulary)}
		self.itos = {i+1:v for i, v in enumerate(self.vocabulary)}

	def encode(self, s): 
		return [self.stoi[c] for c in s]
	
	def decode(self, i): 
		return [self.itos[n] for n in i]
	

class PreProcessor:
	def __init__(self):
		self.english_vocabulary = Vocabulary()
		self.cherokee_vocabulary = Vocabulary()
		self.cherokee = []
		self.english = []
		self.max_length = 0
		self.count = 0

	def load_text(self, file_name):
		data, language = [], file_name.split('.')[1]

		with open(file_name) as f:
			for line in f.readlines():
				sentence = ['<S>'] + word_tokenize(line) + ['<E>']

				if language == 'en':
					self.english_vocabulary.add(sentence)
				else:
					self.cherokee_vocabulary.add(sentence)

				self.max_length = max(self.max_length, len(sentence))
				data.append(sentence)
				self.count += 1 
		return data
	
	def get_data(self, file_set):
		cherokee = self.load_text(path.join('chr_en_data', f'{file_set}.chr'))
		english  = self.load_text(path.join('chr_en_data', f'{file_set}.en' ))
		assert len(cherokee) == len(english)
		self.cherokee += cherokee
		self.english  += english

		return cherokee, english

	
	def create_tensors(self):
		self.english_vocabulary.create_mappings()
		self.cherokee_vocabulary.create_mappings()

		english  = torch.zeros(size=(self.count//2, self.max_length), dtype=int)
		cherokee = torch.zeros(size=(self.count//2, self.max_length), dtype=int)

		for i, sen in enumerate(self.english):
			for j, v in enumerate(self.english_vocabulary.encode(sen)):
				english[i, j] = v
		
		for i, sen in enumerate(self.cherokee):
			for j, v in enumerate(self.cherokee_vocabulary.encode(sen)):
				cherokee[i, j] = v

		self.cherokee, self.english = cherokee, english


preprocessor = PreProcessor()
preprocessor.get_data('dev')
preprocessor.get_data('test')
preprocessor.get_data('train')
preprocessor.create_tensors()

test = word_tokenize('ᏣᏌᏙᏰᏃ ᎢᎦᎦᏛ ᏓᏳᏂᎷᏤᎵ ᏂᎬᎾᏛ ᏗᏁᎯ.')

assert preprocessor.cherokee_vocabulary.decode(preprocessor.cherokee_vocabulary.encode(test)) == test
assert preprocessor.english.shape == preprocessor.cherokee.shape

In [103]:
cherokee_vocab_size, english_vocab_size = len(preprocessor.cherokee_vocabulary.vocabulary), len(preprocessor.english_vocabulary.vocabulary)

print(f'Cherokee Vocabulary Size: {cherokee_vocab_size}')
print(f'English  Vocabulary Size: {english_vocab_size}')

Cherokee Vocabulary Size: 44566
English  Vocabulary Size: 13930


In [104]:
EMBEDDING_DIMENSIONS = 5
QKV_DIMENSIONS       = 5
SEQUENCE_LENGTH      = 5
ATTENTION_HEADS      = 2
DECODERS             = 2
ENCODERS             = 2

In [105]:
def mask_tensor(t):
	mask = torch.tril(torch.ones(size=(t.shape)))
	mask[mask==0], mask[mask==1] = float('-inf'), 0
	
	return t + mask

test = torch.randn(SEQUENCE_LENGTH, SEQUENCE_LENGTH)
print(test)
print(mask_tensor(test))

tensor([[ 0.0026, -0.9088,  1.6150,  1.0649, -0.1640],
        [ 0.2089, -1.5967,  2.2217,  0.3630, -1.1184],
        [-0.6699,  0.9744, -0.7703, -1.4703, -0.3523],
        [ 1.0055, -0.3507, -0.4039, -1.6458,  0.3344],
        [ 1.0550,  0.7338, -0.8857,  1.0953, -1.0944]])
tensor([[ 0.0026,    -inf,    -inf,    -inf,    -inf],
        [ 0.2089, -1.5967,    -inf,    -inf,    -inf],
        [-0.6699,  0.9744, -0.7703,    -inf,    -inf],
        [ 1.0055, -0.3507, -0.4039, -1.6458,    -inf],
        [ 1.0550,  0.7338, -0.8857,  1.0953, -1.0944]])


In [106]:
class AttentionHead(nn.Module):
	def __init__(self, masked=False):
		super().__init__()
		self.obtain_key   = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_query = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_value = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.masked = masked

	def forward(self, data, encoder_output=None):
		if encoder_output is None: Q, K, V = self.obtain_query(data), self.obtain_key(data), self.obtain_value(data)
		else: Q, K, V = self.obtain_query(data), self.obtain_key(encoder_output), self.obtain_value(encoder_output)
		mat_mul = Q @ K.transpose(-2, -1)
		scaled_mat_mul = mat_mul / sqrt(QKV_DIMENSIONS)
		if self.masked: scaled_mat_mul = mask_tensor(scaled_mat_mul)
		softmax_mat_mul = torch.softmax(scaled_mat_mul, dim=-1)
		output = softmax_mat_mul @ V

		return output


test, e_output = torch.randn(size=(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)), torch.randn(size=(SEQUENCE_LENGTH, QKV_DIMENSIONS))
test_module = AttentionHead()
print(test_module(test, encoder_output=e_output))



tensor([[ 0.5802, -0.6937,  0.0367,  0.1371, -0.0760],
        [ 0.7351, -0.9166,  0.0179,  0.2681, -0.2171],
        [ 0.3909, -0.5445,  0.2628, -0.1997,  0.0178],
        [ 0.6411, -0.7779,  0.0194,  0.2525, -0.0891],
        [ 0.3949, -0.4536,  0.1381, -0.0649,  0.1054]], grad_fn=<MmBackward0>)


In [107]:
class MultiHeadedAttention(nn.Module):
	def __init__(self, masked=False):
		super().__init__()
		self.heads  = [AttentionHead(masked=masked) for _ in range(ATTENTION_HEADS)]
		self.linear = nn.Linear(EMBEDDING_DIMENSIONS*ATTENTION_HEADS, EMBEDDING_DIMENSIONS)

	def forward(self, data, encoder_output=None):
		vectors = torch.cat([head(data, encoder_output=encoder_output) for head in self.heads], dim=-1)
		return self.linear(vectors)
	

test = torch.randn(size=(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
test_module = MultiHeadedAttention()
print(test_module(test))


tensor([[-0.1817, -0.3894, -0.2600, -0.2322, -0.1825],
        [-0.0993, -0.4125, -0.2427, -0.2454, -0.2715],
        [-0.1817, -0.4075, -0.2692, -0.1919, -0.1530],
        [-0.2235, -0.3671, -0.2861, -0.2582, -0.1797],
        [-0.0955, -0.4401, -0.2483, -0.2158, -0.2510]],
       grad_fn=<AddmmBackward0>)


In [108]:
class FeedForward(nn.Module):
	def __init__(self):
		super().__init__()
		self.network = nn.Sequential(
			nn.Linear(EMBEDDING_DIMENSIONS, EMBEDDING_DIMENSIONS),
			nn.ReLU(),
			nn.Linear(EMBEDDING_DIMENSIONS, EMBEDDING_DIMENSIONS)
		)

	def forward(self, data):
		return self.network(data)

test = torch.randn(size=(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
test_module = FeedForward()
print(test_module(test))


tensor([[-0.9341,  0.1639,  0.7006,  0.1613,  1.1755],
        [-0.5051,  0.2032,  0.4093, -0.0170,  0.4876],
        [-0.5818,  0.2931,  0.4013, -0.0212,  0.5960],
        [ 0.0598,  0.7212,  0.7467, -0.6149,  0.4439],
        [-0.5700,  0.5483,  0.4951, -0.2060,  0.7478]],
       grad_fn=<AddmmBackward0>)


In [109]:
class Encoder(nn.Module):
	def __init__(self):
		super().__init__()
		self.multi_headed_attention = MultiHeadedAttention()
		self.norm1 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.feed_forward = FeedForward()
		self.norm2 = nn.LayerNorm(EMBEDDING_DIMENSIONS)

	def forward(self, data):
		attention_vectors = self.multi_headed_attention(data)
		normalised = self.norm1(attention_vectors) + data # Residual Connection
		fed_through = self.feed_forward(normalised)
		normalised_2 = self.norm2(fed_through) + normalised

		return normalised_2
	
test = torch.randn(size=(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
print(test)
test_module = Encoder()
print(test_module(test))

tensor([[ 0.5764,  0.3275,  0.5348, -0.7746,  1.3217],
        [-0.0601, -0.0192, -0.9197,  0.4947, -0.2846],
        [-1.7239, -1.9580, -0.8073, -1.7400,  0.6799],
        [-0.1074, -0.1866, -0.4353, -1.0101,  0.7205],
        [-1.2910, -1.2811,  0.1341, -0.4968, -0.1988]])
tensor([[ 1.6232,  1.4327, -2.2987, -0.4335,  1.6620],
        [ 1.5448,  1.3409, -3.6880,  0.8506, -0.8372],
        [ 0.2254, -0.6989, -3.1884, -0.7318, -1.1555],
        [ 1.8617,  1.0154, -3.2904, -0.4599, -0.1459],
        [ 0.5982,  0.4254, -1.5111, -0.1768, -2.4691]], grad_fn=<AddBackward0>)


In [110]:
class Decoder(nn.Module):
	def __init__(self):
		super().__init__()
		self.masked_attention = MultiHeadedAttention(masked=True)
		self.norm1 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.cross_attention  = MultiHeadedAttention()
		self.norm2 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.feed_forward = FeedForward()
		self.norm3 = nn.LayerNorm(EMBEDDING_DIMENSIONS)

	def forward(self, data, encoder_output):
		attention_vectors = self.masked_attention(data)
		normalised = self.norm1(attention_vectors) + data
		cross_attention = self.cross_attention(normalised, encoder_output=encoder_output)
		normalised = self.norm2(cross_attention) + normalised
		linear = self.feed_forward(normalised)
		normalised = self.norm3(linear) + normalised

		return normalised


test = torch.randn(size=(EMBEDDING_DIMENSIONS, SEQUENCE_LENGTH))
encoder_output = torch.randn(size=(SEQUENCE_LENGTH, QKV_DIMENSIONS))
test_module = Decoder()
print(test_module(test, encoder_output))

tensor([[ 0.3078,  2.9139, -0.2311, -1.8258, -1.5334],
        [ 0.2316,  2.7685, -2.3870, -3.0375, -0.1999],
        [ 1.6120,  3.4501, -0.0888, -1.5791, -1.2621],
        [-0.9488,  2.1729,  0.0114, -0.1391, -0.8286],
        [ 0.0897,  2.3470, -1.0117, -1.3317, -1.0284]], grad_fn=<AddBackward0>)


In [112]:
class Transformer(nn.Module):
	def __init__(self):
		super().__init__()
		self.cherokee_embeddings = nn.Embedding(cherokee_vocab_size, EMBEDDING_DIMENSIONS)
		self.cherokee_positional_encodings = nn.Embedding(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)
		self.encoders = [Encoder() for _ in range(ENCODERS)]
		self.decoders = [Decoder() for _ in range(DECODERS)]
		self.english_embeddings = nn.Embedding(english_vocab_size, EMBEDDING_DIMENSIONS)
		self.english_positional_encodings = nn.Embedding(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)
		self.linear   = nn.Linear(SEQUENCE_LENGTH*QKV_DIMENSIONS, english_vocab_size) 

	def forward(self, source, target_context):
		encoder_output = self.cherokee_embeddings(source) + self.cherokee_positional_encodings(torch.tensor([i for i in range(SEQUENCE_LENGTH)]))
		for encoder in self.encoders:
			encoder_output = encoder(encoder_output)

		decoder_output = self.english_embeddings(target_context) + self.english_positional_encodings(torch.tensor([i for i in range(SEQUENCE_LENGTH)]))
		for decoder in self.decoders:
			decoder_output = decoder(decoder_output, encoder_output=encoder_output)

		decoder_output = torch.flatten(decoder_output)
		logits = self.linear(decoder_output)
		probabilities = torch.softmax(logits, dim=-1)

		return probabilities
	
cherokee_input = torch.randint(low=0, high=cherokee_vocab_size-1, size=(SEQUENCE_LENGTH,))
target_context = torch.zeros(size=(SEQUENCE_LENGTH,), dtype=int)
model = Transformer()
print(model(cherokee_input, target_context).shape)

torch.Size([13930])
