In [1]:
!pip install transformers
!pip install torch
!pip install -U datasets

Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from transformers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp311-cp311-manylinux_2_17_x

Imports:

In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
cpu = torch.device("cpu")
print(cpu)

device(type='cuda')

# Problem 1

Set filepaths for text file and csv file:

In [2]:
# text_filepath = "../home/schen9/dataset.txt"
text_filepath = "dataset.txt"
csv_filepath = "dataset.csv"

Read in the text file and split it by double newline:

In [3]:
with open(text_filepath, "r", encoding="utf-8", errors="ignore") as text_file:
    rows = text_file.read().strip().split("\n\n")

print(f"Finished reading from {text_filepath}") 

Finished reading from ../home/schen9/dataset.txt


Write the text to the csv file:

In [4]:
import csv

with open(csv_filepath, "w", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Text"]) # Header
    for i, row in enumerate(tqdm(rows)):
        writer.writerow([row.strip()])

print(f"Finished writing to {csv_filepath}") 

  0%|          | 0/449919 [00:00<?, ?it/s]

Finished writing to dataset.csv


Load the csv onto a hugging face dataset object:

In [5]:
from datasets import Dataset, load_dataset

dataset = load_dataset("csv",data_files=csv_filepath) 
dataset = dataset.filter(lambda x: x["Text"] is not None) # filter out NoneTypes
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/449919 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text'],
        num_rows: 446203
    })
})

Load the tokenizer:

In [6]:
from transformers import AutoTokenizer

transformer_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

Tokenize all the text in batches:

In [7]:
def tokenize(batch):
    return tokenizer(batch["Text"], truncation=True)

batch_size = 50
tokens = dataset.map(tokenize, batched=True, batch_size=batch_size)
tokens

Map:   0%|          | 0/446203 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'input_ids', 'attention_mask'],
        num_rows: 446203
    })
})

Saved the tokens to a folder so they don't have to be retokenized later:

In [8]:
tokens.save_to_disk("my_tokens")

Saving the dataset (0/2 shards):   0%|          | 0/446203 [00:00<?, ? examples/s]

In [9]:
df = tokens["train"].to_pandas()
df

Unnamed: 0,Text,input_ids,attention_mask
0,The White Monkey is a 1925 American silent dra...,"[0, 133, 735, 34546, 16, 10, 36248, 470, 8454,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Preservation\nAn incomplete print of The White...,"[0, 28917, 26481, 50118, 4688, 20044, 5780, 9,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,External links,"[0, 47380, 5678, 2]","[1, 1, 1, 1]"
3,Films based on works by John Galsworthy\nFilms...,"[0, 36361, 4339, 716, 15, 1364, 30, 610, 272, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,the montane grasslands and shrublands biome\n ...,"[0, 627, 27121, 1728, 6964, 8391, 8, 15383, 17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
446198,References,"[0, 49379, 2]","[1, 1, 1]"
446199,External links,"[0, 47380, 5678, 2]","[1, 1, 1, 1]"
446200,"1929 births\n2015 deaths\nPeople from Naseby, ...","[0, 1646, 2890, 26906, 50118, 14420, 3257, 501...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
446201,Scottish female golfers\nGolfers from Edinburg...,"[0, 22041, 1173, 2182, 3524, 268, 50118, 534, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [10]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(transformer_name, output_hidden_states = True).to(device)
model.eval()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [11]:
def get_embeddings(t_vector, a_mask, model):
    token_tensor = torch.tensor([t_vector]).to(model.device)
    seg_tensor = torch.tensor([a_mask]).to(model.device)

    with torch.no_grad():
        outputs = model(input_ids=token_tensor, attention_mask=seg_tensor)
        hidden_state = outputs[1]
    token_embeddings = torch.stack(hidden_state, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)

    token_vecs = []
    for token in token_embeddings:
        sum_vec = torch.sum(token[-4:], dim=0)
        token_vecs.append(sum_vec)
    return token_vecs

# Iterate over rows in DataFrame and call the function
token_counts = {i: 0 for i in range(tokenizer.vocab_size)}
avg_embeddings = {}

for t_vector, a_mask in tqdm(zip(df["input_ids"], df["attention_mask"]),total=len(df)):
    embeddings = get_embeddings(t_vector, a_mask, model)
    for t_index, token in enumerate(t_vector):
        token_counts[token] += 1
        embedding = embeddings[t_index] # Get the corresponding embedding for the token
        if token not in avg_embeddings:
            avg_embeddings[token] = embedding
        else:
            avg_embeddings[token] = torch.add(embedding, avg_embeddings[token])
    # break  # Break after first iteration for testing

# Average the embeddings
for token in avg_embeddings.keys():
    avg_embeddings[token] = torch.div(avg_embeddings[token], token_counts[token])

  0%|          | 0/446203 [00:00<?, ?it/s]

  token_tensor = torch.tensor([t_vector]).to(model.device)


Saved the average embeddings to a pickle file so they don't have to be recomputed:

In [12]:
import pickle 

with open('avg_embeddings.pkl', 'wb') as f:
    pickle.dump(avg_embeddings, f)

# Problem 2

Load the average embeddings if they were saved earlier:

In [2]:
import pickle 

with open('avg_embeddings.pkl', 'rb') as f:
    avg_embeddings = pickle.load(f)

In [4]:
# glove_filepath = "../home/schen9/glove.6B.300d-vocabulary.txt"
glove_filepath = "glove.6B.300d-vocabulary.txt"
csv_glove_filepath = "glove.csv"

In [5]:
with open(glove_filepath, "r", encoding="utf-8") as text_file:
    glove_rows = text_file.read().strip().split("\n")

print(f"Finished reading from {glove_filepath}") 

Finished reading from ../home/schen9/glove.6B.300d-vocabulary.txt


In [6]:
import csv

with open(csv_glove_filepath, "w", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Text"]) # Header
    for i, row in enumerate(tqdm(glove_rows)):
        writer.writerow([row.strip()])

print(f"Finished writing to {csv_glove_filepath}") 

  0%|          | 0/400000 [00:00<?, ?it/s]

Finished writing to glove.csv


In [7]:
from datasets import Dataset, load_dataset

glove_dataset = load_dataset("csv",data_files=csv_glove_filepath) 
glove_dataset = glove_dataset.filter(lambda x: x["Text"] is not None) # filter out NoneTypes
glove_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/400000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text'],
        num_rows: 399997
    })
})

In [8]:
from transformers import AutoTokenizer

transformer_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

In [9]:
def tokenize(batch):
    return tokenizer(batch["Text"], truncation=True)

batch_size = 50
glove_tokens = glove_dataset.map(tokenize, batched=True, batch_size=batch_size)
glove_tokens

Map:   0%|          | 0/399997 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'input_ids', 'attention_mask'],
        num_rows: 399997
    })
})

In [10]:
glove_df = glove_tokens["train"].to_pandas()
glove_df

Unnamed: 0,Text,input_ids,attention_mask
0,the,"[0, 627, 2]","[1, 1, 1]"
1,",","[0, 6, 2]","[1, 1, 1]"
2,.,"[0, 4, 2]","[1, 1, 1]"
3,of,"[0, 1116, 2]","[1, 1, 1]"
4,to,"[0, 560, 2]","[1, 1, 1]"
...,...,...,...
399992,chanty,"[0, 40805, 219, 2]","[1, 1, 1, 1]"
399993,kronik,"[0, 330, 2839, 967, 2]","[1, 1, 1, 1, 1]"
399994,rolonda,"[0, 9396, 11192, 2]","[1, 1, 1, 1]"
399995,zsombor,"[0, 329, 29, 5223, 368, 2]","[1, 1, 1, 1, 1, 1]"


In [11]:
word_embeddings = {}

In [12]:
for word, t_vector in tqdm(zip(glove_df["Text"], glove_df["input_ids"]), total=len(glove_df)):
    embeddings = [avg_embeddings[token].to(device) for token in t_vector if token in avg_embeddings]
    w_embedding = torch.empty(embeddings[0].shape[0]).to(device)
    for emb in embeddings:
        w_embedding = torch.add(w_embedding,emb.to(device))
    word_embeddings[word] = torch.div(w_embedding, len(embeddings))

  0%|          | 0/399997 [00:00<?, ?it/s]

In [13]:
import pickle 

with open('word_embeddings.pkl', 'wb') as f:
    pickle.dump(word_embeddings, f)

In [50]:
def word_similarity(word1,word2,debug=False):
    w1 = torch.nan_to_num(word_embeddings[word1]).to(cpu)
    w2 = torch.nan_to_num(word_embeddings[word2]).to(cpu)
    w1 = torch.abs(w1) + 1
    w2 = torch.abs(w2) + 1
    w1 = torch.log(w1)
    w2 = torch.log(w2)
    w1 = torch.nan_to_num(w1)
    w2 = torch.nan_to_num(w2)
    if debug:
        print(torch.norm(w1))
        print(torch.norm(w2))
        print(w1[0])
        print(w1)
    return (torch.dot(w1,w2) / (torch.norm(w1) * torch.norm(w2))).item()

In [40]:
def most_similar(word, topn=10):
    word_similarities = []
    for w, emb in tqdm(word_embeddings.items()):
        if w != word:
            similarity = word_similarity(w,word)
            word_similarities.append((w,similarity))
    return sorted(word_similarities, key=lambda x: x[1])[:topn]

In [52]:
most_similar("cactus")

  0%|          | 0/399995 [00:00<?, ?it/s]

[('username', 0.0349513404071331),
 ('______', 0.0349513404071331),
 ('----------------', 0.0349513404071331),
 ('groupon', 0.0349513404071331),
 ('cmd', 0.0349513404071331),
 ('---------------', 0.0349513404071331),
 ('wcs', 0.0349513404071331),
 ('_____________________________', 0.0349513404071331),
 ('csv', 0.0349513404071331),
 ('thumbnails', 0.0349513404071331)]

In [53]:
most_similar("cake")

  0%|          | 0/399995 [00:00<?, ?it/s]

[('playhouse', 0.03392989560961723),
 ('hossain', 0.03767850622534752),
 ('onlookers', 0.03899753466248512),
 ('prefrontal', 0.04103730246424675),
 ('roanoke', 0.04178909212350845),
 ('bilal', 0.042268525809049606),
 ('bicol', 0.04498825594782829),
 ('scandinavians', 0.046238746494054794),
 ('nadezhda', 0.05014895275235176),
 ('antiestablishment', 0.05054926499724388)]

In [54]:
most_similar("angry")

  0%|          | 0/399995 [00:00<?, ?it/s]

[('username', 0.03433229401707649),
 ('______', 0.03433229401707649),
 ('----------------', 0.03433229401707649),
 ('groupon', 0.03433229401707649),
 ('cmd', 0.03433229401707649),
 ('---------------', 0.03433229401707649),
 ('wcs', 0.03433229401707649),
 ('_____________________________', 0.03433229401707649),
 ('csv', 0.03433229401707649),
 ('thumbnails', 0.03433229401707649)]

In [55]:
most_similar("quickly")

  0%|          | 0/399995 [00:00<?, ?it/s]

[('magnesium', 0.03176235035061836),
 ('username', 0.035526860505342484),
 ('______', 0.035526860505342484),
 ('----------------', 0.035526860505342484),
 ('groupon', 0.035526860505342484),
 ('cmd', 0.035526860505342484),
 ('---------------', 0.035526860505342484),
 ('wcs', 0.035526860505342484),
 ('_____________________________', 0.035526860505342484),
 ('csv', 0.035526860505342484)]

In [56]:
most_similar("between")

  0%|          | 0/399995 [00:00<?, ?it/s]

[('smartest', 0.023020431399345398),
 ('spectre', 0.02862287499010563),
 ('username', 0.030754514038562775),
 ('______', 0.030754514038562775),
 ('----------------', 0.030754514038562775),
 ('groupon', 0.030754514038562775),
 ('cmd', 0.030754514038562775),
 ('---------------', 0.030754514038562775),
 ('wcs', 0.030754514038562775),
 ('_____________________________', 0.030754514038562775)]

In [57]:
most_similar("the")

  0%|          | 0/399995 [00:00<?, ?it/s]

[('username', 0.036416195333004),
 ('______', 0.036416195333004),
 ('----------------', 0.036416195333004),
 ('groupon', 0.036416195333004),
 ('cmd', 0.036416195333004),
 ('---------------', 0.036416195333004),
 ('wcs', 0.036416195333004),
 ('_____________________________', 0.036416195333004),
 ('csv', 0.036416195333004),
 ('thumbnails', 0.036416195333004)]