In [1]:
!pip install transformers
!pip install torch
!pip install -U datasets



Imports:

In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

Set filepaths for text file and csv file:

In [4]:
# Input corpus (plain text) and the CSV file that is generated from it.
# An alternative corpus location is "../home/schen9/dataset.txt".
text_filepath, csv_filepath = "dataset/part_1.txt", "dataset.csv"

Read the text file and split it into rows on blank lines (double newlines):

In [6]:
# Load the whole corpus, trim surrounding whitespace, and cut it into rows
# wherever a blank line (two consecutive newlines) occurs. Bytes that are not
# valid UTF-8 are dropped silently because of errors="ignore".
with open(text_filepath, mode="r", encoding="utf-8", errors="ignore") as text_file:
    rows = (
        text_file.read()
        .strip()
        .split("\n\n")
    )

print(f"Finished reading from {text_filepath}")

Finished reading from dataset/part_1.txt


Write the text to the csv file:

In [7]:
import csv

# Write every row as a single-column CSV record under a "Text" header.
# newline="" is required by the csv module: the writer emits its own line
# terminators, and the rows contain embedded "\n" characters that must be
# written verbatim inside quoted fields rather than being translated by the
# text layer (which corrupts the file on platforms that rewrite newlines).
with open(csv_filepath, "w", encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Text"]) # Header
    # One record per row; tqdm renders the progress bar while rows are consumed.
    writer.writerows([row.strip()] for row in tqdm(rows))

print(f"Finished writing to {csv_filepath}")

  0%|          | 0/77240 [00:00<?, ?it/s]

Finished writing to dataset.csv


Load the CSV file into a Hugging Face dataset object:

In [8]:
from datasets import Dataset, load_dataset

# Read the generated CSV with the "csv" loader, then keep only the examples
# whose "Text" field is not None.
dataset = load_dataset("csv", data_files=csv_filepath).filter(
    lambda example: example["Text"] is not None
)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/77240 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text'],
        num_rows: 76583
    })
})

Load the tokenizer:

In [9]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused later when loading the model.
transformer_name = "FacebookAI/roberta-base"
# use_fast=True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    transformer_name,
    use_fast=True,
)

Tokenize all the text in batches:

In [11]:
def tokenize(batch):
    """Tokenize the "Text" entries of a batch with the module-level tokenizer.

    Truncation is enabled, so over-long texts are cut by the tokenizer rather
    than rejected. Returns whatever the tokenizer call returns for the batch.
    """
    texts = batch["Text"]
    return tokenizer(texts, truncation=True)

# Number of rows handed to `tokenize` per call during the batched map.
batch_size = 50
tokens = dataset.map(
    function=tokenize,
    batched=True,
    batch_size=batch_size,
)
tokens

Map:   0%|          | 0/76583 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'input_ids', 'attention_mask'],
        num_rows: 76583
    })
})

In [8]:
# Persist the tokenized dataset to disk at the relative path "my_tokens".
tokens.save_to_disk("my_tokens")

Saving the dataset (0/2 shards):   0%|          | 0/446203 [00:00<?, ? examples/s]

In [12]:
# Display the tokenized train split as a pandas DataFrame for inspection.
tokens["train"].to_pandas()

Unnamed: 0,Text,input_ids,attention_mask
0,The White Monkey is a 1925 American silent dra...,"[0, 133, 735, 34546, 16, 10, 36248, 470, 8454,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Preservation\nAn incomplete print of The White...,"[0, 28917, 26481, 50118, 4688, 20044, 5780, 9,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,External links,"[0, 47380, 5678, 2]","[1, 1, 1, 1]"
3,Films based on works by John Galsworthy\nFilms...,"[0, 36361, 4339, 716, 15, 1364, 30, 610, 272, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,the montane grasslands and shrublands biome\n ...,"[0, 627, 27121, 1728, 6964, 8391, 8, 15383, 17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
76578,2000s\nList of Pakistani films of 2000\nList o...,"[0, 17472, 29, 50118, 36583, 9, 9246, 3541, 9,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
76579,2010s\n List of Pakistani films of 2010\n List...,"[0, 24789, 29, 50118, 9527, 9, 9246, 3541, 9, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
76580,2020s\n List of Pakistani films of 2020\n List...,"[0, 24837, 29, 50118, 9527, 9, 9246, 3541, 9, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
76581,See also \n Cinema of Pakistan\n List of Pakis...,"[0, 19224, 67, 1437, 50118, 20036, 9, 1752, 50...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [54]:
# Materialise the tokenized train split as a DataFrame; the embedding loop
# further down iterates over its "input_ids" and "attention_mask" columns.
df = tokens["train"].to_pandas()
# Disabled experiment: store each row's input_ids as a torch tensor column.
# df["tensor"] = df["input_ids"].apply(lambda x: torch.tensor(x))
df

Unnamed: 0,Text,input_ids,attention_mask
0,The White Monkey is a 1925 American silent dra...,"[0, 133, 735, 34546, 16, 10, 36248, 470, 8454,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Preservation\nAn incomplete print of The White...,"[0, 28917, 26481, 50118, 4688, 20044, 5780, 9,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,External links,"[0, 47380, 5678, 2]","[1, 1, 1, 1]"
3,Films based on works by John Galsworthy\nFilms...,"[0, 36361, 4339, 716, 15, 1364, 30, 610, 272, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,the montane grasslands and shrublands biome\n ...,"[0, 627, 27121, 1728, 6964, 8391, 8, 15383, 17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
76578,2000s\nList of Pakistani films of 2000\nList o...,"[0, 17472, 29, 50118, 36583, 9, 9246, 3541, 9,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
76579,2010s\n List of Pakistani films of 2010\n List...,"[0, 24789, 29, 50118, 9527, 9, 9246, 3541, 9, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
76580,2020s\n List of Pakistani films of 2020\n List...,"[0, 24837, 29, 50118, 9527, 9, 9246, 3541, 9, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
76581,See also \n Cinema of Pakistan\n List of Pakis...,"[0, 19224, 67, 1437, 50118, 20036, 9, 1752, 50...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [78]:
from transformers import AutoModelForMaskedLM

# Load the masked-LM checkpoint with hidden states included in its outputs,
# move it to the selected device, and switch it to evaluation mode.
model = AutoModelForMaskedLM.from_pretrained(
    transformer_name,
    output_hidden_states=True,
)
model = model.to(device)
model.eval()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [23]:
from torch.utils.data import DataLoader

# Batches of 20 rows drawn from the tokenized train split.
dataloader = DataLoader(dataset=tokens["train"], batch_size=20)
# One initially empty list per vocabulary id.
token_embeddings = {token_id: [] for token_id in range(tokenizer.vocab_size)}

In [96]:
# Number of rows in the DataFrame; the embedding loop runs once per row.
len(df)

76583

In [None]:
def get_embeddings(t_vector, a_mask, model, num_layers=4):
    """Return one contextual embedding per token of a single sequence.

    The sequence is run through ``model`` as a batch of one with gradients
    disabled. Each token's embedding is the element-wise sum of that token's
    vectors from the last ``num_layers`` entries of the model's hidden-state
    tuple. For Hugging Face models that tuple holds the embedding output
    followed by one entry per encoder layer, so it has ``n_layers + 1``
    entries rather than ``n_layers``.

    Args:
        t_vector: 1-D sequence of token ids (list or NumPy array).
        a_mask: attention mask with the same length as ``t_vector``.
        model: callable exposing ``.device``; its output must provide the
            hidden states either as a ``hidden_states`` attribute or, for
            plain-tuple outputs, at index 1.
        num_layers: number of trailing hidden-state entries to sum. Defaults
            to 4, the value that used to be hard-coded.

    Returns:
        A list with ``len(t_vector)`` tensors, one per token, each of shape
        ``(hidden_size,)`` and located on ``model.device``.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        # A slice of [-0:] would silently select *every* layer.
        raise ValueError("num_layers must be at least 1")

    # Build (1, seq_len) tensors directly on the model's device. Converting
    # the sequence itself (instead of a one-element list wrapping it) avoids
    # torch's slow list-of-ndarrays conversion path.
    input_ids = torch.tensor(t_vector, device=model.device).unsqueeze(0)
    attention_mask = torch.tensor(a_mask, device=model.device).unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    hidden_states = getattr(outputs, "hidden_states", None)
    if hidden_states is None:
        # Plain-tuple outputs: keep the original positional lookup.
        hidden_states = outputs[1]

    # Only the layers that are actually summed get stacked:
    # (num_layers, 1, seq_len, hidden) -> sum over layers -> (seq_len, hidden).
    summed = torch.stack(tuple(hidden_states[-num_layers:]), dim=0).sum(dim=0).squeeze(0)

    # Clone each row so a caller that stores a single vector does not keep the
    # whole per-sequence tensor alive through a view.
    return [token_vec.clone() for token_vec in summed]

# For every vocabulary id, track how often the token occurs and the running
# sum of its contextual embeddings over all rows of the DataFrame.
token_counts = dict.fromkeys(range(tokenizer.vocab_size), 0)
avg_embeddings = {}

for t_vector, a_mask in tqdm(zip(df["input_ids"], df["attention_mask"]), total=len(df)):
    embeddings = get_embeddings(t_vector, a_mask, model)
    # get_embeddings yields one vector per token, aligned with t_vector.
    for token, embedding in zip(t_vector, embeddings):
        token_counts[token] += 1
        running_sum = avg_embeddings.get(token)
        if running_sum is None:
            avg_embeddings[token] = embedding
        else:
            avg_embeddings[token] = torch.add(embedding, running_sum)

# Turn each running sum into a mean by dividing by the token's occurrence count.
for token, total in avg_embeddings.items():
    avg_embeddings[token] = torch.div(total, token_counts[token])

avg_embeddings

  0%|          | 0/76583 [00:00<?, ?it/s]

In [None]:
import pickle 

# Serialise the per-token average embeddings computed above. The previous
# version passed the undefined name `dictionary`, which raised NameError.
# NOTE(review): the tensors are pickled on the device they were computed on;
# if the file must load on a machine without CUDA, move them to CPU first.
with open('avg_embeddings.pkl', 'wb') as f:
    pickle.dump(avg_embeddings, f)