In [56]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch import nn
import torch
import csv
from random import choice

In [57]:
# Load data
translation_clusters = {}
idioms = set()
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

with open('dataset.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        idiom = row['english']
        translation = row['german']
        idioms.add(idiom)
        idioms.add(translation)

        if idiom in translation_clusters:
            translation_clusters[idiom].add(translation)
        else:
            translation_clusters[idiom] = {translation}

        if translation in translation_clusters:
            translation_clusters[translation].add(idiom)
        else:
            translation_clusters[translation] = {idiom}

# Convert idioms to tokenized representations
max_length = 0
encoder = {}

for idiom in idioms:
    encoded = tokenizer(idiom, return_tensors='pt', padding=True, truncation=True).input_ids
    max_length = max(max_length, encoded.shape[1])
    encoder[idiom] = encoded

num_idioms = len(idioms)
decoder = {}

# Generate idiom tensor
idiom_tensor = torch.zeros(size=(num_idioms, max_length), dtype=torch.long)
for i, idiom in enumerate(idioms):
    encoded = encoder[idiom]
    idiom_tensor[i, :encoded.shape[1]] = encoded[0]
    encoder[idiom] = idiom_tensor[i]
    decoder[tuple(idiom_tensor[i].tolist())] = idiom

assert decoder[tuple(encoder['makes me feel like'].tolist())] == 'makes me feel like'

# Convert idiom clusters to tokenized representations
translation_clusters_tokenized = {}
for idiom in translation_clusters:
    translation_clusters_tokenized[tuple(encoder[idiom].tolist())] = set()
    for match in translation_clusters[idiom]:
        if tuple(encoder[match].tolist()) == tuple(encoder[idiom].tolist()): continue
        translation_clusters_tokenized[tuple(encoder[idiom].tolist())].add(tuple(encoder[match].tolist()))

# Print the shape of idiom_tensor
print(idiom_tensor.shape)

translation_clusters = translation_clusters_tokenized


torch.Size([14673, 24])


In [45]:
train = idiom_tensor[:int(0.9*num_idioms)]
test = idiom_tensor[int(0.9*num_idioms):int(0.95*num_idioms)]
val = idiom_tensor[int(0.95*num_idioms)]

In [67]:

latent_dimensions = 2
device = 'mps'
iterations = 2000
learning_rate = 0.01
batch_size = 2

In [47]:
def tensor_to_set(tensor):
    return {tuple(d.tolist()) for d in tensor}

In [68]:
def get_batch(data):
	indexes = torch.randint(0, len(data), (batch_size,))
	batch = set()
	set_data = tensor_to_set(data)

	for i in indexes:
		idiom = data[i]

		possible_idioms = set_data.intersection(translation_clusters[tuple(idiom.tolist())])

		if len(possible_idioms) == 0:
			continue

		assert not(tuple(idiom.tolist()) in possible_idioms)

		random_cluster_mate = choice(list(possible_idioms))
		batch.add(tuple(random_cluster_mate))
		batch.add(tuple(idiom.tolist()))

	batch = torch.tensor([list(x) for x in list(batch)]).to(device)
	return batch

print(get_batch(train).shape)


torch.Size([4, 24])


In [63]:
def get_positive_sample(data):
    positive_samples = []

    for anchor in data:
        possible_positive = translation_clusters[tuple(anchor.tolist())].intersection(tensor_to_set(data))


        chosen = torch.tensor(choice(list(possible_positive)))
        positive_samples.append(chosen)

    positive_samples = torch.stack(positive_samples).to(device)

    return positive_samples


def get_negative_sample(data):
    negative_samples = []

    for anchor in data:
        possible_negative = tensor_to_set(data).difference(translation_clusters[tuple(anchor.tolist())])

        chosen = torch.tensor(choice(list(possible_negative)))
        negative_samples.append(chosen)

    negative_samples = torch.stack(negative_samples).to(device)

    return negative_samples

In [71]:
class Model(nn.Module):
	def __init__(self, pooling):
		super().__init__()
		self.pooling = pooling
		self.roberta = AutoModelForMaskedLM.from_pretrained('xlm-roberta-base')
		self.output_layer = nn.Linear(250002, latent_dimensions) 

	def forward(self, input):
		input_ids = torch.tensor(input, dtype=torch.long).to(device)
		attention_mask = torch.LongTensor(torch.ones(input.shape, dtype=torch.long)).to(device)
		roberta_logits = self.roberta(input_ids=input_ids, attention_mask=attention_mask).logits
		if self.pooling == 'average': pooled = torch.mean(roberta_logits, dim=0)
		vector_representation = self.output_layer(pooled)
		
		return vector_representation

In [72]:
model = Model(pooling='average')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [73]:
for _ in range(10):
	batch = get_batch(train)
	positive_samples = get_positive_sample(batch)
	negative_samples = get_negative_sample(batch)

	encodings = model(batch)
	positive_sample_encodings = model(positive_samples)
	negative_sample_encodings = model(negative_samples)

	triplet_loss = nn.TripletMarginLoss()
	loss = triplet_loss(encodings, positive_sample_encodings, negative_sample_encodings)

	optimizer.zero_grad()
	loss.backward()
	optimizer.step()

	print(loss)


  input_ids = torch.tensor(input, dtype=torch.long).to(device)


RuntimeError: MPS backend out of memory (MPS allocated: 6.06 GB, other allocations: 2.93 GB, max allowed: 9.07 GB). Tried to allocate 91.55 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).