In [2]:
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from os import path
from math import sqrt

In [107]:
class Vocabulary:
	def __init__(self):
		self.vocabulary = set('')
		self.stoi = {'':0}
		self.itos = {0:''}

	def add(self, v):
		if type(v) == str:
			self.vocabulary(v)
		elif type(v) == list:
			self.vocabulary = self.vocabulary.union(set(v)) 

	def create_mappings(self):
		self.stoi = {v:i+1 for i, v in enumerate(self.vocabulary)}
		self.itos = {i+1:v for i, v in enumerate(self.vocabulary)}

	def encode(self, s): 
		return [self.stoi[c] for c in s]
	
	def decode(self, i): 
		return [self.itos[n] for n in i]
	

class PreProcessor:
	def __init__(self):
		self.english_vocabulary = Vocabulary()
		self.cherokee_vocabulary = Vocabulary()
		self.cherokee = []
		self.english = []
		self.max_length = 0
		self.count = 0

	def load_text(self, file_name):
		data, language = [], file_name.split('.')[1]

		with open(file_name) as f:
			for line in f.readlines():
				sentence = word_tokenize(line)

				if language == 'en': 
					self.english_vocabulary.add(sentence)
				else:
					self.cherokee_vocabulary.add(sentence)

				self.max_length = max(self.max_length, len(sentence))
				data.append(sentence)
				self.count += 1 
		return data
	
	def get_data(self, file_set):
		cherokee = self.load_text(path.join('chr_en_data', f'{file_set}.chr'))
		english  = self.load_text(path.join('chr_en_data', f'{file_set}.en' ))
		assert len(cherokee) == len(english)
		self.cherokee += cherokee
		self.english  += english

		return cherokee, english

	
	def create_tensors(self):
		self.english_vocabulary.create_mappings()
		self.cherokee_vocabulary.create_mappings()

		english  = torch.zeros(size=(self.count//2, self.max_length), dtype=int)
		cherokee = torch.zeros(size=(self.count//2, self.max_length), dtype=int)

		for i, sen in enumerate(self.english):
			for j, v in enumerate(self.english_vocabulary.encode(sen)):
				english[i, j] = v
		
		for i, sen in enumerate(self.cherokee):
			for j, v in enumerate(self.cherokee_vocabulary.encode(sen)):
				cherokee[i, j] = v

		self.cherokee, self.english = cherokee, english


preprocessor = PreProcessor()
preprocessor.get_data('dev')
preprocessor.get_data('test')
preprocessor.get_data('train')
preprocessor.create_tensors()

test = word_tokenize('ᏣᏌᏙᏰᏃ ᎢᎦᎦᏛ ᏓᏳᏂᎷᏤᎵ ᏂᎬᎾᏛ ᏗᏁᎯ.')

assert preprocessor.cherokee_vocabulary.decode(preprocessor.cherokee_vocabulary.encode(test)) == test
assert preprocessor.english.shape == preprocessor.cherokee.shape

In [109]:
cherokee_vocab_size, english_vocab_size = len(preprocessor.cherokee_vocabulary.vocabulary), len(preprocessor.english_vocabulary.vocabulary)

print(f'Cherokee Vocabulary Size: {cherokee_vocab_size}')
print(f'English  Vocabulary Size: {english_vocab_size}')

Cherokee Vocabulary Size: 44564
English  Vocabulary Size: 13928


In [99]:
EMBEDDING_DIMENSIONS = 5
QKV_DIMENSIONS       = 5
SEQUENCE_LENGTH      = 5
ATTENTION_HEADS      = 2

In [112]:
def mask_tensor(t):
	mask = torch.tril(torch.ones(size=(t.shape)))
	mask[mask==0], mask[mask==1] = float('-inf'), 0
	
	return t + mask

test = torch.randn(SEQUENCE_LENGTH, SEQUENCE_LENGTH)
print(test)
print(mask_tensor(test))

tensor([[-0.1047, -1.5705,  1.2003, -0.5949, -0.2761],
        [ 0.7957, -0.1390,  1.4010,  0.7270, -0.3648],
        [ 0.3677, -0.1476, -0.7904, -0.4763,  0.6792],
        [ 0.6952,  2.1270,  0.2976, -0.4413, -1.1226],
        [ 1.5522,  0.8237, -1.2119, -2.3446, -0.0524]])
tensor([[-0.1047,    -inf,    -inf,    -inf,    -inf],
        [ 0.7957, -0.1390,    -inf,    -inf,    -inf],
        [ 0.3677, -0.1476, -0.7904,    -inf,    -inf],
        [ 0.6952,  2.1270,  0.2976, -0.4413,    -inf],
        [ 1.5522,  0.8237, -1.2119, -2.3446, -0.0524]])


In [303]:
class AttentionHead(nn.Module):
	def __init__(self, masked=False):
		super().__init__()
		self.obtain_key   = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_query = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_value = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.masked = masked

	def forward(self, data, encoder_output=None):
		if encoder_output is None: Q, K, V = self.obtain_query(data), self.obtain_key(data), self.obtain_value(data)
		else: Q, K, V = self.obtain_query(data), self.obtain_key(encoder_output), self.obtain_value(encoder_output)
		mat_mul = Q @ K.transpose(-2, -1)
		scaled_mat_mul = mat_mul / sqrt(QKV_DIMENSIONS)
		if self.masked: scaled_mat_mul = mask_tensor(scaled_mat_mul)
		softmax_mat_mul = torch.softmax(scaled_mat_mul, dim=-1)
		output = softmax_mat_mul @ V

		return output


test, e_output = torch.randn(size=(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)), torch.randn(size=(SEQUENCE_LENGTH, QKV_DIMENSIONS))
test_module = AttentionHead()
print(test_module(test, encoder_output=e_output))



tensor([[ 0.5238, -0.1361, -0.5022,  0.0705,  0.7452],
        [ 0.5188, -0.1687, -0.5098,  0.0967,  0.7222],
        [ 0.5456, -0.0698, -0.5114,  0.0209,  0.7707],
        [ 0.5116, -0.1575, -0.5001,  0.0789,  0.7388],
        [ 0.5441, -0.0897, -0.5100,  0.0411,  0.7618]], grad_fn=<MmBackward0>)


In [153]:
class MultiHeadedAttention(nn.Module):
	def __init__(self, masked=False):
		super().__init__()
		self.heads  = [AttentionHead(masked=masked) for _ in range(ATTENTION_HEADS)]
		self.linear = nn.Linear(EMBEDDING_DIMENSIONS*ATTENTION_HEADS, EMBEDDING_DIMENSIONS)

	def forward(self, data, encoder_output=None):
		vectors = torch.cat([head(data, encoder_output=encoder_output) for head in self.heads], dim=-1)
		return self.linear(vectors)
	

test = torch.randn(size=(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
test_module = MultiHeadedAttention()
print(test_module(test))


tensor([[-0.0296,  0.0624,  0.3835, -0.0347,  0.2198],
        [ 0.0420,  0.0708,  0.4284, -0.0607,  0.2008],
        [-0.0293,  0.0509,  0.4014, -0.0481,  0.2192],
        [ 0.0223,  0.1215,  0.3265,  0.0107,  0.2050],
        [-0.0321,  0.0275,  0.4639, -0.0776,  0.2134]],
       grad_fn=<AddmmBackward0>)


In [104]:
class FeedForward(nn.Module):
	def __init__(self):
		super().__init__()
		self.network = nn.Sequential(
			nn.Linear(EMBEDDING_DIMENSIONS, EMBEDDING_DIMENSIONS),
			nn.ReLU(),
			nn.Linear(EMBEDDING_DIMENSIONS, EMBEDDING_DIMENSIONS)
		)

	def forward(self, data):
		return self.network(data)

test = torch.randn(size=(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
test_module = FeedForward()
print(test_module(test))


tensor([[ 0.2772,  0.7057, -0.5556,  0.3806, -0.3331],
        [ 0.2781,  0.4525, -0.2963,  0.2891,  0.0409],
        [ 0.1867,  0.0698, -0.1522, -0.1708,  0.4364],
        [ 0.1400,  0.0236, -0.2029, -0.1427,  0.4395],
        [ 0.0734,  0.0109, -0.2626, -0.0380,  0.4228]],
       grad_fn=<AddmmBackward0>)


In [110]:
class Encoder(nn.Module):
	def __init__(self):
		super().__init__()
		self.word_embeddings = nn.Embedding(cherokee_vocab_size, EMBEDDING_DIMENSIONS)
		self.positional_encodings = nn.Embedding(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)
		self.multi_headed_attention = MultiHeadedAttention()
		self.norm1 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.feed_forward = FeedForward()
		self.norm2 = nn.LayerNorm(EMBEDDING_DIMENSIONS)

	def forward(self, data):
		encodings = self.word_embeddings(data) + self.positional_encodings(torch.tensor([i for i in range(SEQUENCE_LENGTH)]))
		attention_vectors = self.multi_headed_attention(encodings)
		normalised = self.norm1(attention_vectors) + encodings # Residual Connection
		fed_through = self.feed_forward(normalised)
		normalised_2 = self.norm2(fed_through) + normalised

		return normalised_2
	
test = torch.randint(low=0, high=cherokee_vocab_size-1, size=(SEQUENCE_LENGTH,))
test_module = Encoder()
print(test_module(test))

tensor([[-2.6583,  1.4576,  1.4369,  7.5770, -1.2309],
        [-1.1150, -0.9315, -0.6729,  2.5359, -2.9129],
        [-2.6358, -3.7781,  1.2280,  1.8935,  0.0227],
        [-1.5676,  0.1664, -1.9978,  2.6061, -0.3017],
        [-4.1008,  0.0193, -1.0644,  2.3958, -1.5581]], grad_fn=<AddBackward0>)


In [340]:
class Decoder(nn.Module):
	def __init__(self):
		super().__init__()
		self.word_embeddings = nn.Embedding(english_vocab_size, EMBEDDING_DIMENSIONS)
		self.positional_encodings = nn.Embedding(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS)
		self.masked_attention = MultiHeadedAttention(masked=True)
		self.norm1 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.cross_attention  = MultiHeadedAttention()
		self.norm2 = nn.LayerNorm(EMBEDDING_DIMENSIONS)
		self.feed_forward = FeedForward()
		self.norm3 = nn.LayerNorm(EMBEDDING_DIMENSIONS)

	def forward(self, data, encoder_output):
		encodings = self.word_embeddings(data) + self.positional_encodings(torch.tensor([i for i in range(data.shape[0])]))
		print(encodings)
		attention_vectors = self.masked_attention(encodings)
		normalised = self.norm1(attention_vectors) + encodings
		cross_attention = self.cross_attention(normalised, encoder_output=encoder_output)
		normalised = self.norm2(cross_attention) + normalised
		linear = self.feed_forward(normalised)
		normalised = self.norm3(linear) + normalised

		return normalised


test = torch.randint(low=0, high=english_vocab_size-1, size=(SEQUENCE_LENGTH,))
encoder_output = torch.randn(size=(SEQUENCE_LENGTH, QKV_DIMENSIONS))
test_module = Decoder()
print(test_module(test, encoder_output=encoder_output))
		

tensor([[ 1.2994, -0.0791, -0.9142,  0.3885, -0.4406],
        [ 0.1903, -0.6254, -0.5760, -0.9234,  0.9340],
        [-1.2949, -1.8305, -1.9931, -1.0267, -1.1494],
        [ 0.2776,  0.1528, -0.5993, -1.9231, -2.2113],
        [-1.4634,  0.2542,  0.5791, -0.7519,  0.6976]], grad_fn=<AddBackward0>)
tensor([[ 2.0625,  1.1450, -3.0582,  0.2328, -0.1280],
        [ 2.0636,  0.5796, -1.5802, -2.0583, -0.0052],
        [-0.1194, -0.8987, -2.9507, -1.1116, -2.2142],
        [ 0.4009,  0.6837, -2.6979, -0.4149, -2.2751],
        [ 1.0244,  0.4895, -0.6037, -1.4303, -0.1642]], grad_fn=<AddBackward0>)
