In [1]:
!pip install -r requirements.txt

from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch import nn
import torch
import csv
from random import choice
import wandb
import plotly.express as px
import numpy as np
import umap.umap_ as umap

[0m

In [4]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
# Load data
translation_clusters = {}
idioms = set()
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

with open('dataset.csv') as f:
	reader = csv.DictReader(f)
	for row in reader:
		idiom = row['english']
		translation = row['german']
		idioms.add(idiom)
		idioms.add(translation)

		if idiom in translation_clusters:
			translation_clusters[idiom].add(translation)
		else:
			translation_clusters[idiom] = {translation}

		if translation in translation_clusters:
			translation_clusters[translation].add(idiom)
		else:
			translation_clusters[translation] = {idiom}

# Convert idioms to tokenized representations
max_length = 0
encoder = {}

for idiom in idioms:
	encoded = tokenizer(idiom, return_tensors='pt', padding=True, truncation=True).input_ids
	max_length = max(max_length, encoded.shape[1])
	encoder[idiom] = encoded

num_idioms = len(idioms)
decoder = {}

# Generate idiom tensor
idiom_tensor = torch.zeros(size=(num_idioms, max_length), dtype=torch.long)
for i, idiom in enumerate(idioms):
	encoded = encoder[idiom]
	idiom_tensor[i, :encoded.shape[1]] = encoded[0]
	encoder[idiom] = idiom_tensor[i]
	decoder[tuple(idiom_tensor[i].tolist())] = idiom

assert decoder[tuple(encoder['makes me feel like'].tolist())] == 'makes me feel like'

# Convert idiom clusters to tokenized representations
translation_clusters_tokenized = {}
for idiom in translation_clusters:
	translation_clusters_tokenized[tuple(encoder[idiom].tolist())] = set()
	for match in translation_clusters[idiom]:
		if tuple(encoder[match].tolist()) == tuple(encoder[idiom].tolist()): continue
		translation_clusters_tokenized[tuple(encoder[idiom].tolist())].add(tuple(encoder[match].tolist()))

# Print the shape of idiom_tensor
print(idiom_tensor.shape)

translation_clusters = translation_clusters_tokenized


Downloading config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading sentencepiece.bpe.model:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

torch.Size([14673, 24])


In [4]:
train = idiom_tensor[:int(0.9*num_idioms)]
test = idiom_tensor[int(0.9*num_idioms):int(0.95*num_idioms)]
val = idiom_tensor[int(0.95*num_idioms):]

In [5]:
latent_dimensions = 64
device = 'cuda'
iterations = 500
learning_rate = 0.00001
batch_size = 32
batch_accumulation = 1
roberta_output_length = 250002

roberta_output_length = 250002

wandb.init(
	project="Cross-Lingual-Idiom-Sense-Clustering",
	
	# track hyperparameters and run metadata
	config={
	"learning_rate": learning_rate,
	"architecture": "BERT",
	"epochs": iterations,
	"embedding_dimensions":latent_dimensions,
	}
)


[34m[1mwandb[0m: Currently logged in as: [33mshayaan-absar[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
def tensor_to_set(tensor):
	return {tuple(d.tolist()) for d in tensor}

In [19]:
def create_batch(data):
	indexes = torch.randint(0, len(data), (batch_size,))
	batch = set()
	set_data = tensor_to_set(data)

	for i in indexes:
		idiom = data[i]

		possible_idioms = set_data.intersection(translation_clusters[tuple(idiom.tolist())])

		if len(possible_idioms) == 0:
			continue

		assert not(tuple(idiom.tolist()) in possible_idioms)

		random_cluster_mate = choice(list(possible_idioms))
		batch.add(tuple(random_cluster_mate))
		batch.add(tuple(idiom.tolist()))

	batch = torch.tensor([list(x) for x in list(batch)]).to(device)

	return batch

def get_batch(data):
	batch = torch.tensor([])

	while batch.numel() == 0:
		batch = create_batch(data)
	return batch

print(get_batch(train).shape)


torch.Size([56, 24])


In [8]:
def get_positive_sample(data):
	positive_samples = []

	for anchor in data:
		possible_positive = translation_clusters[tuple(anchor.tolist())].intersection(tensor_to_set(data))


		chosen = torch.tensor(choice(list(possible_positive)))
		positive_samples.append(chosen)

	positive_samples = torch.stack(positive_samples).to(device)

	return positive_samples


def get_negative_sample(data):
	negative_samples = []

	for anchor in data:
		possible_negative = tensor_to_set(data).difference(translation_clusters[tuple(anchor.tolist())])

		chosen = torch.tensor(choice(list(possible_negative)))
		negative_samples.append(chosen)

	negative_samples = torch.stack(negative_samples).to(device)

	return negative_samples

In [9]:
class Model(nn.Module):
	def __init__(self, pooling):
		super().__init__()
		self.pooling = pooling
		self.roberta = AutoModelForMaskedLM.from_pretrained('xlm-roberta-base')
		self.batch_norm = nn.BatchNorm1d(max_length)
		self.output_layer = nn.Linear(roberta_output_length, latent_dimensions)
		self.activation = nn.LeakyReLU()

	def forward(self, input):
		input_ids = torch.tensor(input, dtype=torch.long).to(device)
		attention_mask = torch.LongTensor(torch.ones(input.shape, dtype=torch.long)).to(device)
		roberta_logits = self.roberta(input_ids=input_ids, attention_mask=attention_mask).logits
		batch_norm_roberta_logits = self.batch_norm(roberta_logits)
		if self.pooling == 'mean': pooled = torch.mean(batch_norm_roberta_logits, dim=1)
		if self.pooling == 'max': pooled, _ = torch.max(batch_norm_roberta_logits, dim=1)
		if self.pooling == 'min': pooled, _ = torch.min(batch_norm_roberta_logits, dim=1)
		vector_representation = self.output_layer(pooled)
		activation = self.activation(vector_representation)

		return activation

In [10]:
model = Model(pooling='mean')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.1)

Downloading pytorch_model.bin:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

In [11]:
triplet_loss = nn.TripletMarginLoss()

def pass_batch(batch):
	torch.cuda.empty_cache()
	positive_samples = get_positive_sample(batch)
	negative_samples = get_negative_sample(batch)

	encodings = model(batch)
	positive_sample_encodings = model(positive_samples)
	negative_sample_encodings = model(negative_samples)

	loss = triplet_loss(encodings, positive_sample_encodings, negative_sample_encodings)
	return loss

print(f'Batch size=~{batch_size*batch_accumulation*2}')
for i in range(iterations):
	torch.cuda.empty_cache()
	loss = 0
	for j in range(batch_accumulation):
		batch = get_batch(train)
		loss += pass_batch(batch)
	loss /= batch_accumulation
	optimizer.zero_grad()
	loss.backward()
	optimizer.step()

	val_loss = pass_batch(get_batch(val))
	wandb.log({"val_loss": val_loss})

	wandb.log({"loss": loss})
wandb.finish()

Batch size=~64


  input_ids = torch.tensor(input, dtype=torch.long).to(device)


0,1
loss,▇▅▂█▁▃▄▂▁▂▄▁▁▅▇▁▁▁▄▄▂▃▃▁▂▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▃▃▅▁▁▇▂▁▁▄▁▁▃▄▂▂▁▄▄▁▃▁▁▁▁▅▁▁▅▁▃▁▁▄▄▁▆▁

0,1
loss,0.0
val_loss,0.0


In [57]:
model = Model(pooling='mean')
model.load_state_dict(torch.load('model.pt'))
model.to(device)

<All keys matched successfully>

In [115]:
batch_size=100
tds = create_batch(test)
print(tds.shape)

torch.Size([8, 24])


In [124]:
encodings = model(tds)

encodings = encodings.detach().cpu().numpy()
reducer = umap.UMAP(n_components=2)
encodings = reducer.fit_transform(encodings)

captions = [decoder[tuple(tds[i].tolist())] for i in range(tds.shape[0])]
count = 0
categories = [None for i in range(tds.shape[0])]

for i in range(tds.shape[0]):
    curr = tds[i]
    
    if categories[i] is not None:
        continue
        
    categories[i] = count
    
    for j in range(tds.shape[0]):
        if categories[j] is None:
            if tuple(tds[j].tolist()) in translation_clusters[tuple(curr.tolist())]:
                categories[j] = count
    
    count += 1

fig = px.scatter(x=encodings[:, 0], y=encodings[:, 1], color=categories, color_continuous_scale='Viridis')

fig.update_traces(marker=dict(size=15))

for i, caption in enumerate(captions):
    fig.add_annotation(
        x=encodings[i, 0],
        y=encodings[i, 1],
        text=caption,
        showarrow=True,
        arrowhead=2,
        font=dict(size=10),
    )

fig.update_layout(title="UMAP Projection on Fine-Tuned Model")
fig.show()


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).


n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1



In [122]:
roberta = AutoModelForMaskedLM.from_pretrained('xlm-roberta-base').to(device)
encodings = roberta(tds).logits
encodings = torch.mean(encodings, dim=1)

encodings = encodings.detach().cpu().numpy()
reducer = umap.UMAP(n_components=2)
encodings = reducer.fit_transform(encodings)


fig = px.scatter(x=encodings[:, 0], y=encodings[:, 1], color=categories, color_continuous_scale='Viridis')
fig.update_traces(marker=dict(size=15))

for i, caption in enumerate(captions):
    fig.add_annotation(
        x=encodings[i, 0],
        y=encodings[i, 1],
        text=caption,
        showarrow=True,
        arrowhead=2,
        font=dict(size=10),
    )

fig.update_layout(title="UMAP Projection")
fig.show()



n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1

