In [122]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch import nn
import torch
import csv
from random import shuffle

In [244]:
# Build the idiom vocabulary from dataset.csv: translation clusters, per-idiom
# token ids, and one padded id matrix with a row per distinct idiom.
translation_clusters = {}  # idiom -> set of idioms the dataset links it to
idioms = set()             # every distinct value of the 'english' / 'german' columns
encoder = {}               # idiom -> token ids (replaced by its padded row further down)
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# newline='' is what the csv module asks for; the explicit encoding keeps non-ASCII
# text from depending on the platform default.
with open('dataset.csv', newline='', encoding='utf-8') as f:
	reader = csv.DictReader(f)
	for row in reader:
		english, german = row['english'], row['german']

		# Record the pair in both directions.
		translation_clusters.setdefault(english, set()).add(german)
		translation_clusters.setdefault(german, set()).add(english)

		idioms.add(english)
		idioms.add(german)

max_length = 0

for idiom in idioms:
	# Single pass: merge the clusters of the idiom's direct matches into its own.
	# The result depends on iteration order and is not a full transitive closure.
	expanded = set(translation_clusters[idiom])
	for match in translation_clusters[idiom]:
		expanded |= translation_clusters[match]
	translation_clusters[idiom] = expanded

	encoded = tokenizer(idiom, return_tensors='pt').input_ids
	max_length = max(max_length, encoded.shape[1])

	encoder[idiom] = encoded

num_idioms = len(idioms)
decoder = {}  # tuple(padded token ids) -> idiom

# Pad with the tokenizer's own pad id rather than -1, which is not a valid token id
# and cannot be looked up by an embedding layer.
pad_id = tokenizer.pad_token_id
idiom_tensor = torch.full(size=(num_idioms, max_length), fill_value=pad_id, dtype=torch.long)

# NOTE(review): `idioms` is a set of strings, so the row order below can differ
# between interpreter runs; anything that slices idiom_tensor by position inherits that.
for i, idiom in enumerate(idioms):
	encoded = encoder[idiom]
	idiom_tensor[i, :encoded.shape[1]] = encoded[0]
	encoder[idiom] = idiom_tensor[i]
	decoder[tuple(idiom_tensor[i].tolist())] = idiom

# Round-trip sanity check on one known idiom.
assert decoder[tuple(encoder['makes me feel like'].tolist())] == 'makes me feel like'

print(idiom_tensor.shape)


torch.Size([14673, 24])


In [242]:
# Positional 90 % / 5 % / 5 % split of the idiom rows.
train_end = int(0.9 * num_idioms)
test_end = int(0.95 * num_idioms)

train = idiom_tensor[:train_end]
test = idiom_tensor[train_end:test_end]
# Slice to the end: a bare index here would yield one 1-D row, not the last 5 %.
val = idiom_tensor[test_end:]

In [145]:

# Hyperparameters shared by the cells below.
latent_dimensions = 1000  # size of the vector produced by Model.output_layer

# Prefer Apple's MPS backend (the original setting), then CUDA, then CPU, so the
# notebook still runs on machines without MPS.
if getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
	device = 'mps'
elif torch.cuda.is_available():
	device = 'cuda'
else:
	device = 'cpu'

iterations = 2000
learning_rate = 0.01
batch_size = 64  # number of seed rows drawn per batch by get_batch

In [341]:
def get_batch(data):
    """Sample a batch of idiom rows from `data`, expanded with their translation clusters.

    Draws `batch_size` row indices (with replacement) from `data`, then adds the
    padded encoding of every idiom in each sampled idiom's translation cluster.
    Duplicates are removed, so the number of returned rows varies from call to
    call and their order is unspecified.

    Args:
        data: 2-D tensor of padded token ids whose rows are keys of `decoder`
            (e.g. a slice of `idiom_tensor`).

    Returns:
        2-D tensor holding the unique sampled rows plus their cluster members.
    """
    sampled = torch.randint(0, len(data), (batch_size,))
    unique_rows = set()

    for i in sampled:
        # Read the row from the split that was passed in, not from the global
        # idiom_tensor; otherwise every split would sample the first len(data) rows.
        key = tuple(data[i].tolist())
        unique_rows.add(key)
        for member in translation_clusters[decoder[key]]:
            unique_rows.add(tuple(encoder[member].tolist()))

    return torch.tensor([list(row) for row in unique_rows])

# Show how many rows one cluster-expanded training batch contains.
print(get_batch(train).shape[0])

240


In [165]:
class Model(nn.Module):
	"""Embed tokenized idioms as fixed-size vectors.

	Token ids are run through the XLM-RoBERTa masked-LM model, the per-token
	vocabulary logits are pooled over the non-padding positions, and the pooled
	vector is projected to `latent_dimensions` by a linear layer.

	Args:
		pooling: pooling strategy over the token axis; only 'average' is implemented.

	Raises:
		ValueError: if `pooling` is not a supported strategy.
	"""

	def __init__(self, pooling):
		super().__init__()
		# Fail here rather than with an unbound `pooled` inside forward().
		if pooling != 'average':
			raise ValueError(f"unsupported pooling {pooling!r}; only 'average' is implemented")
		self.pooling = pooling
		self.roberta = AutoModelForMaskedLM.from_pretrained('xlm-roberta-base')
		# Size the projection from the loaded config instead of hard-coding 250002.
		self.output_layer = nn.Linear(self.roberta.config.vocab_size, latent_dimensions)

	def forward(self, input):
		"""Return the latent vector(s) for `input`.

		Args:
			input: a tensor of token ids, or a mapping with an "input_ids" entry
				(tensor or nested list). Ids may be 1-D (one sequence) or 2-D
				(batch, sequence). Positions holding the pad id or a negative
				id are treated as padding.

		Returns:
			A 1-D tensor for 1-D ids, otherwise one row per input sequence.
		"""
		ids = input if torch.is_tensor(input) else input["input_ids"]
		# Keep the ids on the same device as the model parameters.
		ids = torch.as_tensor(ids, dtype=torch.long, device=self.output_layer.weight.device)

		single = ids.dim() == 1
		if single:
			ids = ids.unsqueeze(0)  # the transformer expects a (batch, sequence) layout

		pad_id = self.roberta.config.pad_token_id
		attention_mask = ((ids >= 0) & (ids != pad_id)).long()
		# Negative sentinel ids cannot be embedded; swap them for the real pad id.
		ids = ids.masked_fill(ids < 0, pad_id)

		roberta_logits = self.roberta(input_ids=ids, attention_mask=attention_mask).logits

		# Masked mean over the token axis so padding does not dilute the average.
		weights = attention_mask.unsqueeze(-1).to(roberta_logits.dtype)
		pooled = (roberta_logits * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
		vector_representation = self.output_layer(pooled)

		return vector_representation[0] if single else vector_representation

In [166]:
# Instantiate the embedding model with mean pooling over tokens.
model = Model('average')

In [189]:
# Smoke test: draw one training batch and run every row through the model.
# The outputs are discarded.
for _ in range(1):
	batch = get_batch(train)

	for data in batch.unbind(0):
		model(data)