In [1]:
!pip install transformers
!pip install torch
!pip install -U datasets



Imports:

In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

Set filepaths for text file and csv file:

In [4]:
# text_filepath = "../home/schen9/dataset.txt"
text_filepath = "dataset/part_1.txt"
csv_filepath = "dataset.csv"

Read in the text file and split it by double newline:

In [6]:
with open(text_filepath, "r", encoding="utf-8", errors="ignore") as text_file:
    rows = text_file.read().strip().split("\n\n")

print(f"Finished reading from {text_filepath}") 

Finished reading from dataset/part_1.txt


Write the text to the csv file:

In [7]:
import csv

with open(csv_filepath, "w", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Text"]) # Header
    for i, row in enumerate(tqdm(rows)):
        writer.writerow([row.strip()])

print(f"Finished writing to {csv_filepath}") 

  0%|          | 0/77240 [00:00<?, ?it/s]

Finished writing to dataset.csv


Load the csv onto a hugging face dataset object:

In [8]:
from datasets import Dataset, load_dataset

dataset = load_dataset("csv",data_files=csv_filepath) 
dataset = dataset.filter(lambda x: x["Text"] is not None) # filter out NoneTypes
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/77240 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text'],
        num_rows: 76583
    })
})

Load the tokenizer:

In [9]:
from transformers import AutoTokenizer

transformer_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

Tokenize all the text in batches:

In [11]:
def tokenize(batch):
    return tokenizer(batch["Text"], truncation=True)

batch_size = 50
tokens = dataset.map(tokenize, batched=True, batch_size=batch_size)
tokens

Map:   0%|          | 0/76583 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'input_ids', 'attention_mask'],
        num_rows: 76583
    })
})

In [8]:
tokens.save_to_disk("my_tokens")

Saving the dataset (0/2 shards):   0%|          | 0/446203 [00:00<?, ? examples/s]

In [12]:
tokens["train"].to_pandas()

Unnamed: 0,Text,input_ids,attention_mask
0,The White Monkey is a 1925 American silent dra...,"[0, 133, 735, 34546, 16, 10, 36248, 470, 8454,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Preservation\nAn incomplete print of The White...,"[0, 28917, 26481, 50118, 4688, 20044, 5780, 9,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,External links,"[0, 47380, 5678, 2]","[1, 1, 1, 1]"
3,Films based on works by John Galsworthy\nFilms...,"[0, 36361, 4339, 716, 15, 1364, 30, 610, 272, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,the montane grasslands and shrublands biome\n ...,"[0, 627, 27121, 1728, 6964, 8391, 8, 15383, 17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
76578,2000s\nList of Pakistani films of 2000\nList o...,"[0, 17472, 29, 50118, 36583, 9, 9246, 3541, 9,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
76579,2010s\n List of Pakistani films of 2010\n List...,"[0, 24789, 29, 50118, 9527, 9, 9246, 3541, 9, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
76580,2020s\n List of Pakistani films of 2020\n List...,"[0, 24837, 29, 50118, 9527, 9, 9246, 3541, 9, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
76581,See also \n Cinema of Pakistan\n List of Pakis...,"[0, 19224, 67, 1437, 50118, 20036, 9, 1752, 50...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [30]:
df = tokens["train"].to_pandas()
df["tensors"] = df["input_ids"].apply(lambda x: torch.tensor(x))
df["tensors"]

0        [tensor(0, dtype=torch.int32), tensor(133, dty...
1        [tensor(0, dtype=torch.int32), tensor(28917, d...
2        [tensor(0, dtype=torch.int32), tensor(47380, d...
3        [tensor(0, dtype=torch.int32), tensor(36361, d...
4        [tensor(0, dtype=torch.int32), tensor(627, dty...
                               ...                        
76578    [tensor(0, dtype=torch.int32), tensor(17472, d...
76579    [tensor(0, dtype=torch.int32), tensor(24789, d...
76580    [tensor(0, dtype=torch.int32), tensor(24837, d...
76581    [tensor(0, dtype=torch.int32), tensor(19224, d...
76582    [tensor(0, dtype=torch.int32), tensor(47380, d...
Name: tensors, Length: 76583, dtype: object

In [13]:
tokens

DatasetDict({
    train: Dataset({
        features: ['Text', 'input_ids', 'attention_mask'],
        num_rows: 76583
    })
})

In [24]:
tokenizer.encode("[")

[0, 10975, 2]

In [42]:
tokenizer.convert_ids_to_tokens(470)

'ĠAmerican'

In [14]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(transformer_name).to(device)

In [23]:
from torch.utils.data import DataLoader

dataloader = DataLoader(tokens["train"], batch_size=20)
token_embeddings = {i: [] for i in range(tokenizer.vocab_size)}

In [22]:
tokens["train"]

Dataset({
    features: ['Text', 'input_ids', 'attention_mask'],
    num_rows: 76583
})

In [16]:
token_embeddings

{0: [],
 1: [],
 2: [],
 3: [],
 4: [],
 5: [],
 6: [],
 7: [],
 8: [],
 9: [],
 10: [],
 11: [],
 12: [],
 13: [],
 14: [],
 15: [],
 16: [],
 17: [],
 18: [],
 19: [],
 20: [],
 21: [],
 22: [],
 23: [],
 24: [],
 25: [],
 26: [],
 27: [],
 28: [],
 29: [],
 30: [],
 31: [],
 32: [],
 33: [],
 34: [],
 35: [],
 36: [],
 37: [],
 38: [],
 39: [],
 40: [],
 41: [],
 42: [],
 43: [],
 44: [],
 45: [],
 46: [],
 47: [],
 48: [],
 49: [],
 50: [],
 51: [],
 52: [],
 53: [],
 54: [],
 55: [],
 56: [],
 57: [],
 58: [],
 59: [],
 60: [],
 61: [],
 62: [],
 63: [],
 64: [],
 65: [],
 66: [],
 67: [],
 68: [],
 69: [],
 70: [],
 71: [],
 72: [],
 73: [],
 74: [],
 75: [],
 76: [],
 77: [],
 78: [],
 79: [],
 80: [],
 81: [],
 82: [],
 83: [],
 84: [],
 85: [],
 86: [],
 87: [],
 88: [],
 89: [],
 90: [],
 91: [],
 92: [],
 93: [],
 94: [],
 95: [],
 96: [],
 97: [],
 98: [],
 99: [],
 100: [],
 101: [],
 102: [],
 103: [],
 104: [],
 105: [],
 106: [],
 107: [],
 108: [],
 109: [],
 110: [],


In [24]:
model.eval()
with torch.no_grad():
    for batch in tqdm(dataloader):
        # Extract input IDs and attention masks
        print(batch)
        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)
        
        # Get model outputs (last hidden state)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]
        
        # Iterate over tokens in the batch
        for i, seq in enumerate(input_ids):
            for j, token_id in enumerate(seq):
                # Skip padding tokens
                if attention_mask[i, j].item() == 0:
                    continue
                token_embeddings[token_id.item()].append(hidden_states[i, j].cpu())

  0%|          | 0/3830 [00:00<?, ?it/s]

RuntimeError: each element in list of batch should be of equal size