In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch import nn
import torch
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load dataset and create translation clusters
#
# Produces the following module-level names:
#   translation_clusters : idiom -> set of every idiom reachable from it
#                          through translation pairs (the idiom itself included)
#   idioms               : set of all idioms seen in either CSV column
#   encoder              : idiom -> input_ids tensor returned by the tokenizer
#   decoder              : input_ids tensor -> idiom (see NOTE below)
#   idiom_tensor         : (num_idioms, max_length) integer tensor of token ids,
#                          padded on the right with -1

translation_clusters = {}
idioms = set()

encoder = {}
decoder = {}

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# newline='' lets the csv module do its own newline handling, as its
# documentation requires for files passed to a reader.
# NOTE(review): no explicit encoding is given, so the platform default is used;
# presumably the file is UTF-8 -- confirm and pass encoding='utf-8' if so.
with open('dataset.csv', newline='') as f:
    reader = csv.DictReader(f)

    for row in reader:
        english, german = row['english'], row['german']

        # Record the pair in both directions so the graph is symmetric.
        translation_clusters.setdefault(english, set()).add(german)
        translation_clusters.setdefault(german, set()).add(english)

        idioms.add(english)
        idioms.add(german)

# Expand the direct translation pairs into full clusters. Each cluster is a
# connected component of the translation graph, so the result does not depend
# on the (arbitrary) iteration order of the `idioms` set. A single pass of
# pairwise unions can miss members that are more than two hops away.
visited = set()
for idiom in idioms:
    if idiom in visited:
        continue

    component = set()
    frontier = [idiom]
    while frontier:
        current = frontier.pop()
        if current in component:
            continue
        component.add(current)
        # Entries of this component still hold only direct neighbours here;
        # they are overwritten below, after the traversal has finished.
        frontier.extend(translation_clusters[current] - component)

    visited |= component
    for member in component:
        # Give every member its own copy so later in-place edits stay local.
        translation_clusters[member] = set(component)

max_length = 0

for idiom in idioms:
    encoded = tokenizer(idiom, return_tensors='pt').input_ids
    max_length = max(max_length, encoded.shape[1])

    encoder[idiom] = encoded
    # NOTE(review): torch tensors hash by object identity, so this mapping can
    # only be queried with the exact tensor object stored in `encoder`; a
    # different tensor holding the same ids will not be found. Kept as-is so
    # existing `decoder[encoder[...]]` lookups keep working.
    decoder[encoded] = idiom

assert decoder[encoder['move mountains']] == 'move mountains'

num_idioms = len(idioms)

# Row i corresponds to the i-th idiom in the iteration order of `idioms`.
# NOTE(review): that order is a set order and may differ between kernel
# restarts -- confirm nothing relies on a fixed row order.
# NOTE(review): -1 is used as padding rather than tokenizer.pad_token_id;
# confirm downstream code expects that.
idiom_tensor = torch.full(size=(num_idioms, max_length), fill_value=-1, dtype=torch.long)
for i, idiom in enumerate(idioms):
    encoded = encoder[idiom]
    # encoded has a leading dimension that is indexed with 0 here; copy its
    # token ids into the left part of the row, leaving the rest as padding.
    idiom_tensor[i, :encoded.shape[1]] = encoded[0]

In [None]:
# Hyperparameters.
# NOTE(review): EPOCHS is not referenced in the visible cells; presumably the
# number of training epochs -- confirm against the training loop.
EPOCHS = 1_000
# Output width of the linear projection layer built in Model.__init__.
LATENT_DIMENSIONS = 10 ** 2

In [None]:
class Model(nn.Module):
    """Pretrained XLM-RoBERTa plus a linear projection to LATENT_DIMENSIONS.

    Attributes:
        tokenizer: the 'xlm-roberta-base' tokenizer.
        roberta: the 'xlm-roberta-base' checkpoint loaded with a
            masked-language-modelling head.
        output_layer: linear layer mapping the encoder's hidden width to
            LATENT_DIMENSIONS features.

    No forward() is defined in this cell.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
        self.roberta = AutoModelForMaskedLM.from_pretrained('xlm-roberta-base')
        # The encoder width lives in `config.hidden_size`. `last_hidden_state`
        # is a field of model outputs, not of the config, so reading it from
        # the config raises AttributeError.
        # NOTE(review): a masked-LM head returns vocabulary logits by default;
        # feeding this layer hidden states presumably needs
        # output_hidden_states=True or a plain AutoModel -- confirm when
        # forward() is written.
        self.output_layer = nn.Linear(self.roberta.config.hidden_size, LATENT_DIMENSIONS)

