In [1]:
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from os import path
from math import sqrt

In [2]:
class Vocabulary:
	def __init__(self):
		self.vocabulary = set('')
		self.stoi = {'':0}
		self.itos = {0:''}

	def add(self, v):
		if type(v) == str:
			self.vocabulary(v)
		elif type(v) == list:
			self.vocabulary = self.vocabulary.union(set(v)) 

	def create_mappings(self):
		self.stoi = {v:i+1 for i, v in enumerate(self.vocabulary)}
		self.itos = {i+1:v for i, v in enumerate(self.vocabulary)}

	def encode(self, s): 
		return [self.stoi[c] for c in s]
	
	def decode(self, i): 
		return [self.itos[n] for n in i]
	

class PreProcessor:
	def __init__(self):
		self.english_vocabulary = Vocabulary()
		self.cherokee_vocabulary = Vocabulary()
		self.cherokee = []
		self.english = []
		self.max_length = 0
		self.count = 0

	def load_text(self, file_name):
		data, language = [], file_name.split('.')[1]

		with open(file_name) as f:
			for line in f.readlines():
				sentence = word_tokenize(line)

				if language == 'en': 
					self.english_vocabulary.add(sentence)
				else:
					self.cherokee_vocabulary.add(sentence)

				self.max_length = max(self.max_length, len(sentence))
				data.append(sentence)
				self.count += 1 
		return data
	
	def get_data(self, file_set):
		cherokee = self.load_text(path.join('chr_en_data', f'{file_set}.chr'))
		english  = self.load_text(path.join('chr_en_data', f'{file_set}.en' ))
		assert len(cherokee) == len(english)
		self.cherokee += cherokee
		self.english  += english

		return cherokee, english

	
	def create_tensors(self):
		self.english_vocabulary.create_mappings()
		self.cherokee_vocabulary.create_mappings()

		english  = torch.zeros(size=(self.count//2, self.max_length), dtype=int)
		cherokee = torch.zeros(size=(self.count//2, self.max_length), dtype=int)

		for i, sen in enumerate(self.english):
			for j, v in enumerate(self.english_vocabulary.encode(sen)):
				english[i, j] = v
		
		for i, sen in enumerate(self.cherokee):
			for j, v in enumerate(self.cherokee_vocabulary.encode(sen)):
				cherokee[i, j] = v

		self.cherokee, self.english = cherokee, english


preprocessor = PreProcessor()
preprocessor.get_data('dev')
preprocessor.get_data('test')
preprocessor.get_data('train')
preprocessor.create_tensors()

test = word_tokenize('ᏣᏌᏙᏰᏃ ᎢᎦᎦᏛ ᏓᏳᏂᎷᏤᎵ ᏂᎬᎾᏛ ᏗᏁᎯ.')

assert preprocessor.cherokee_vocabulary.decode(preprocessor.cherokee_vocabulary.encode(test)) == test
assert preprocessor.english.shape == preprocessor.cherokee.shape

In [5]:
cherokee, english = preprocessor.cherokee, preprocessor.english
print(cherokee[0])

tensor([34498,  7141, 40884, 16430,  7141, 40884, 39221,  7141, 39464, 35046,
        22862, 17129, 16350, 19761, 23319, 40884, 23696, 17479, 33683,  8831,
        32897,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0])


In [24]:
EMBEDDING_DIMENSIONS = 2
SEQUENCE_LENGTH      = 5
QKV_DIMENSIONS       = 2

In [38]:
class AttentionHead(nn.Module):
	def __init__(self):
		super().__init__()
		self.obtain_key   = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_query = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)
		self.obtain_value = nn.Linear(EMBEDDING_DIMENSIONS, QKV_DIMENSIONS)

	def forward(self, data):
		Q, K, V = self.obtain_key(data), self.obtain_query(data), self.obtain_value(data)
		mat_mul = Q @ torch.transpose(K, 0, 1)
		scaled_mat_mul = mat_mul / sqrt(QKV_DIMENSIONS)
		softmax_mat_mul = torch.softmax(scaled_mat_mul, dim=-1)
		output = softmax_mat_mul @ V

		return output
	
test = torch.randn(size=(SEQUENCE_LENGTH, EMBEDDING_DIMENSIONS))
test_module = AttentionHead()
print(test_module(test))



tensor([[0.6418, 0.5756],
        [0.5064, 0.7219],
        [0.7315, 0.4322],
        [0.6999, 0.4847],
        [0.6573, 0.5526]], grad_fn=<MmBackward0>)
