# Problem 2

In [11]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
import pickle 

with open('avg_embeddings_partial.pkl', 'rb') as f:
    avg_embeddings = pickle.load(f)

In [5]:
len(avg_embeddings)

15974

In [7]:
glove_filepath = "../home/schen9/glove.6B.300d-vocabulary.txt"
csv_glove_filepath = "glove.csv"

In [15]:
with open(glove_filepath, "r", encoding="utf-8") as text_file:
    glove_rows = text_file.read().strip().split("\n")

print(f"Finished reading from {glove_filepath}") 

Finished reading from ../home/schen9/glove.6B.300d-vocabulary.txt


In [16]:
import csv

with open(csv_glove_filepath, "w", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Text"]) # Header
    for i, row in enumerate(tqdm(glove_rows)):
        writer.writerow([row.strip()])

print(f"Finished writing to {csv_glove_filepath}") 

  0%|          | 0/400000 [00:00<?, ?it/s]

Finished writing to glove.csv


In [17]:
from datasets import Dataset, load_dataset

glove_dataset = load_dataset("csv",data_files=csv_glove_filepath) 
glove_dataset = glove_dataset.filter(lambda x: x["Text"] is not None) # filter out NoneTypes
glove_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/400000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text'],
        num_rows: 399997
    })
})

In [18]:
from transformers import AutoTokenizer

transformer_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

In [19]:
def tokenize(batch):
    return tokenizer(batch["Text"], truncation=True)

batch_size = 50
glove_tokens = glove_dataset.map(tokenize, batched=True, batch_size=batch_size)
glove_tokens

Map:   0%|          | 0/399997 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'input_ids', 'attention_mask'],
        num_rows: 399997
    })
})

In [22]:
glove_df = glove_tokens["train"].to_pandas()
glove_df

Unnamed: 0,Text,input_ids,attention_mask
0,the,"[0, 627, 2]","[1, 1, 1]"
1,",","[0, 6, 2]","[1, 1, 1]"
2,.,"[0, 4, 2]","[1, 1, 1]"
3,of,"[0, 1116, 2]","[1, 1, 1]"
4,to,"[0, 560, 2]","[1, 1, 1]"
...,...,...,...
399992,chanty,"[0, 40805, 219, 2]","[1, 1, 1, 1]"
399993,kronik,"[0, 330, 2839, 967, 2]","[1, 1, 1, 1, 1]"
399994,rolonda,"[0, 9396, 11192, 2]","[1, 1, 1, 1]"
399995,zsombor,"[0, 329, 29, 5223, 368, 2]","[1, 1, 1, 1, 1, 1]"


In [23]:
word_embeddings = {}

In [44]:
for word, t_vector in tqdm(zip(glove_df["Text"], glove_df["input_ids"]), total=len(glove_df)):
    embeddings = [avg_embeddings[token].to(device) for token in t_vector if token in avg_embeddings]
    w_embedding = torch.empty(embeddings[0].shape[0]).to(device)
    for emb in embeddings:
        w_embedding = torch.add(w_embedding,emb.to(device))
    word_embeddings[word] = torch.div(w_embedding, len(embeddings))

  0%|          | 0/399997 [00:00<?, ?it/s]

In [42]:
avg_embeddings[6026].shape[0]

768

In [49]:
import pickle 

with open('word_embeddings_partial.pkl', 'wb') as f:
    pickle.dump(word_embeddings, f)

In [48]:
def cosine_similarity(word1_tensor,word2_tensor):
    return torch.dot(word1_tensor,word2_tensor) / (torch.norm(word1_tensor) * (word2_tensor))

In [76]:
def word_similarity(word1,word2,eps=1):
    w1_tensor = torch.nan_to_num(word_embeddings[word1])
    w2_tensor = torch.nan_to_num(word_embeddings[word2])
    print(torch.norm(w1_tensor))
    print(torch.norm(w2_tensor))
    print(w1_tensor)
    print(w2_tensor)
    return torch.dot(w1_tensor,w2_tensor) / (torch.norm(w1_tensor) * torch.norm(w2_tensor) +eps)

In [77]:
word_similarity("kitty","kitty")

tensor(inf, device='cuda:0')
tensor(inf, device='cuda:0')
tensor([ 5.9333e+27,  2.7966e-01,  1.8903e-01, -2.9305e-01,  5.9333e+27,
        -2.3750e-01,  5.9333e+27,  5.2783e-01, -4.6367e-02,  1.9704e-01,
         0.0000e+00,  0.0000e+00,  7.0700e-02,  0.0000e+00,  7.3938e-01,
        -7.0024e-01,  4.5911e-01, -4.4229e-03,  0.0000e+00, -5.6776e-01,
         0.0000e+00,  0.0000e+00,  0.0000e+00, -5.9232e-02,  0.0000e+00,
         0.0000e+00, -2.5711e-01,  2.6586e-01, -3.6007e-01, -9.5725e-02,
        -7.0484e-02,  1.4148e-01,  0.0000e+00,  0.0000e+00,  8.6378e-01,
         3.0338e-01,  0.0000e+00,  6.3862e-02,  7.1336e-01,  8.0956e-02,
         8.1686e-01, -1.9891e-01, -1.1694e-02,  0.0000e+00,  6.2930e-02,
         0.0000e+00,  8.8650e-01,  0.0000e+00, -9.2120e-02, -3.9064e-01,
         1.7566e-01,  0.0000e+00,  7.9736e-02,  0.0000e+00, -2.0150e-02,
        -1.5637e-01,  2.0143e-01,  6.1145e-01,  0.0000e+00,  1.2409e-01,
         0.0000e+00,  5.2021e+00,  1.6437e-01, -2.9839e-01,  0.000

tensor(nan, device='cuda:0')

In [69]:
# Define two 1D tensors (vectors)
vector1 = torch.tensor([1.0, 2.0, 3.0]).to(device) # Shape: [3]
vector2 = torch.tensor([4.0, 5.0, 6.0]).to(device)  # Shape: [3]

# Compute the dot product
dot_product = torch.dot(vector1, vector2)
print(type(torch.dot(vector1, vector2)))

print("Vector 1:", vector1)
print("Vector 2:", vector2)
print("Dot Product:", dot_product)

<class 'torch.Tensor'>
Vector 1: tensor([1., 2., 3.], device='cuda:0')
Vector 2: tensor([4., 5., 6.], device='cuda:0')
Dot Product: tensor(32., device='cuda:0')


In [64]:
import torch

# Example tensor with NaN values
tensor = torch.tensor([1.0, float('nan'), 3.0, float('nan'), 5.0])

# Replace NaN values with 0
tensor_cleaned = torch.nan_to_num(tensor, nan=0.0)

print("Original Tensor:", tensor)
print("Cleaned Tensor:", tensor_cleaned)

Original Tensor: tensor([1., nan, 3., nan, 5.])
Cleaned Tensor: tensor([1., 0., 3., 0., 5.])


In [54]:
print(w_emb)

{'the': tensor([ 6.0274e+30, -3.3956e-01,  2.7980e-02, -3.9658e-02,  6.0274e+30,
         1.2447e-01,  4.5923e+22,  1.2841e+00,  8.0342e+16,  9.4723e-02,
         8.0321e+16,  1.1669e-01,  8.0330e+16, -1.8452e-01,  8.0321e+16,
         8.9204e-03,  8.0350e+16, -3.3984e-01,  8.0351e+16, -1.7192e+00,
         8.0314e+16,  1.5052e-01,  8.0314e+16,  3.8901e-01,  8.0343e+16,
         2.0620e-02,  8.0346e+16,  3.9876e-01,  8.0347e+16,  2.2550e-01,
         8.0321e+16,  7.5196e-03,  8.0349e+16,  1.3589e-01,  8.0346e+16,
         2.0335e-01,  8.0347e+16,  5.1875e-02,  8.0343e+16,  1.1650e-01,
         8.0325e+16,  8.2385e-01,  8.0321e+16, -1.4402e-01,  8.0348e+16,
        -5.0784e-02,  8.0341e+16, -1.9922e-01,  8.0349e+16, -4.6669e-01,
         8.0343e+16, -2.7125e-01,  8.0343e+16,  5.0448e-01,  8.0348e+16,
        -6.6300e-01,  8.0346e+16, -4.4401e-01,  8.0347e+16,  1.9839e-01,
         8.0321e+16, -1.0313e+00,  8.0344e+16,  5.9337e-02,  8.0346e+16,
         4.6629e-02,  8.0321e+16,  3.8148e-

In [None]:
most_similar("cactus")

In [None]:
most_similar("cake")

In [None]:
most_similar("angry")

In [None]:
most_similar("quickly")

In [None]:
most_similar("between")

In [None]:
most_similar("the")