# Problem 2

In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu = torch.device("cpu")
print(cpu)

cuda
cpu


In [2]:
import pickle 

with open('avg_embeddings.pkl', 'rb') as f:
    avg_embeddings = pickle.load(f)

In [3]:
len(avg_embeddings)

49381

In [4]:
glove_filepath = "../home/schen9/glove.6B.300d-vocabulary.txt"
csv_glove_filepath = "glove.csv"

In [5]:
with open(glove_filepath, "r", encoding="utf-8") as text_file:
    glove_rows = text_file.read().strip().split("\n")

print(f"Finished reading from {glove_filepath}") 

Finished reading from ../home/schen9/glove.6B.300d-vocabulary.txt


In [6]:
import csv

with open(csv_glove_filepath, "w", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Text"]) # Header
    for i, row in enumerate(tqdm(glove_rows)):
        writer.writerow([row.strip()])

print(f"Finished writing to {csv_glove_filepath}") 

  0%|          | 0/400000 [00:00<?, ?it/s]

Finished writing to glove.csv


In [7]:
from datasets import Dataset, load_dataset

glove_dataset = load_dataset("csv",data_files=csv_glove_filepath) 
glove_dataset = glove_dataset.filter(lambda x: x["Text"] is not None) # filter out NoneTypes
glove_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/400000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text'],
        num_rows: 399997
    })
})

In [8]:
from transformers import AutoTokenizer

transformer_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

In [9]:
def tokenize(batch):
    return tokenizer(batch["Text"], truncation=True)

batch_size = 50
glove_tokens = glove_dataset.map(tokenize, batched=True, batch_size=batch_size)
glove_tokens

Map:   0%|          | 0/399997 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'input_ids', 'attention_mask'],
        num_rows: 399997
    })
})

In [10]:
glove_df = glove_tokens["train"].to_pandas()
glove_df

Unnamed: 0,Text,input_ids,attention_mask
0,the,"[0, 627, 2]","[1, 1, 1]"
1,",","[0, 6, 2]","[1, 1, 1]"
2,.,"[0, 4, 2]","[1, 1, 1]"
3,of,"[0, 1116, 2]","[1, 1, 1]"
4,to,"[0, 560, 2]","[1, 1, 1]"
...,...,...,...
399992,chanty,"[0, 40805, 219, 2]","[1, 1, 1, 1]"
399993,kronik,"[0, 330, 2839, 967, 2]","[1, 1, 1, 1, 1]"
399994,rolonda,"[0, 9396, 11192, 2]","[1, 1, 1, 1]"
399995,zsombor,"[0, 329, 29, 5223, 368, 2]","[1, 1, 1, 1, 1, 1]"


In [11]:
word_embeddings = {}

In [12]:
for word, t_vector in tqdm(zip(glove_df["Text"], glove_df["input_ids"]), total=len(glove_df)):
    embeddings = [avg_embeddings[token].to(device) for token in t_vector if token in avg_embeddings]
    w_embedding = torch.empty(embeddings[0].shape[0]).to(device)
    for emb in embeddings:
        w_embedding = torch.add(w_embedding,emb.to(device))
    word_embeddings[word] = torch.div(w_embedding, len(embeddings))

  0%|          | 0/399997 [00:00<?, ?it/s]

In [14]:
avg_embeddings[6026].shape[0]

768

In [13]:
import pickle 

with open('word_embeddings.pkl', 'wb') as f:
    pickle.dump(word_embeddings, f)

In [None]:
# def word_similarity(word1,word2,eps=1):
#     w1_tensor = torch.nan_to_num(word_embeddings[word1])
#     w2_tensor = torch.nan_to_num(word_embeddings[word2])
#     w1_tensor = torch.nn.functional.normalize(w1_tensor, p=2, dim=0)
#     w2_tensor = torch.nn.functional.normalize(w2_tensor, p=2, dim=0)
#     print(torch.norm(w1_tensor))
#     print(torch.norm(w2_tensor))
#     print(w1_tensor)
#     print(w2_tensor)
#     return torch.dot(w1_tensor,w2_tensor)

def word_similarity(word1,word2,debug=False):
    w1_tensor = word_embeddings[word1].to(cpu)
    w2_tensor = word_embeddings[word2].to(cpu)
    # w1_tensor = torch.nn.functional.normalize(w1_tensor, p=2, dim=0)
    # w2_tensor = torch.nn.functional.normalize(w2_tensor, p=2, dim=0)
    if debug:
        print(torch.norm(w1_tensor))
        print(torch.norm(w2_tensor))
    # w1_tensor[0] = 0
    # w2_tensor[0] = 0
        print(w1_tensor[0])
        print(w1_tensor)
    # print(w2_tensor)
    return torch.dot(w1_tensor,w2_tensor) / (torch.norm(w1_tensor) * torch.norm(w2_tensor))

# def word_similarity(word1,word2,eps=1):
#     print(word_embeddings[word1])
#     w1 = torch.nan_to_num(word_embeddings[word1],nan=1).to(cpu).numpy()
#     w2 = torch.nan_to_num(word_embeddings[word2],nan=1).to(cpu).numpy()
#     print(w1)
#     return w1 @ w2

In [23]:
def most_similar(word, topn=10):
    word_similarities = []
    for w, emb in word_embeddings.items():
        if w != word:
            similarity = word_similarity(w,word)
            word_similarities.append((w,similarity))
    return sorted(word_similarities, key=lambda x: x[1])[:topn]

In [None]:
word_similarity("kitty","kitten", debug=True)

In [19]:
print(word_embeddings["the"])

tensor([-6.7794e+08, -3.5859e-01, -6.7794e+08, -8.1448e-02, -1.4206e-01,
         1.1236e-01, -5.9517e-02,  1.2287e+00,  8.7674e+19, -6.8239e-02,
         4.2910e+30,  2.4043e+22,  2.7732e+35,  2.4488e+28,  5.6780e+18,
         9.6525e+31, -5.2046e-01,  1.9356e+25,  2.6350e+31,  1.1726e+33,
         2.2765e+22,  2.6971e+20, -2.5264e-01,  3.8941e+18,  2.2474e+22,
         2.4488e+28,  2.3263e+22,  1.5150e+30,  6.1745e+27,  2.5972e-01,
        -1.7488e-01,  9.0421e-02,  2.4488e+28,  5.6780e+18,  2.3354e+22,
         1.2353e-01,  2.5726e+28,  2.1195e+28,  1.0976e+00,  3.8941e+18,
         1.5510e+33,  1.0115e+32, -1.5904e-01,  3.9561e+26,  1.3309e-02,
         2.2969e+22,  2.4488e+28,  2.3263e+22,  1.5421e-01,  1.0127e+29,
         1.5149e+30,  5.7422e+21,  6.4544e+30,  5.9522e+30, -2.7974e-03,
         6.2051e+24,  1.4408e+27,  3.9555e+26,  2.4239e+22,  2.4083e+28,
        -3.0544e-02, -1.2028e+00, -8.2060e-02,  3.7714e+32,  5.9423e+18,
         2.3354e+22,  8.6357e-02,  2.5726e+28,  5.4

In [None]:
most_similar("cactus")

In [None]:
most_similar("cake")

In [None]:
most_similar("angry")

In [None]:
most_similar("quickly")

In [None]:
most_similar("between")

In [None]:
most_similar("the")