In [4]:
!pip install --upgrade pip
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!jupyter nbextension install --py widgetsnbextension
!pip install jupyterlab_widgets
!pip install transformers
!pip install torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117


import torch
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm
from transformers import RobertaTokenizer, RobertaModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
text = "This is a text"
model = model.to(device)
encoded_input = tokenizer(text, return_tensors='pt').to(device)
output = model(**encoded_input)


usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: dejavu events execute kernel kernelspec lab
labextension labhub migrate nbclassic nbconvert notebook run server
troubleshoot trust

Jupyter command `jupyter-nbextension` not found.
usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [-

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Load the dataset from the provided gzip file
import gzip

# Path to the dataset
dataset_path = "assignment4-dataset.txt.gz"

# Read dataset into a list of sentences
dataset_sentences = []
try:
    with gzip.open(dataset_path, 'rt') as file:
        for line in file:
            try:
                line = line.strip()
                if line:
                    dataset_sentences.append(line)
            except UnicodeDecodeError:
                continue
except EOFError:
    print("Warning: Reached unexpected end of file. Processed only valid lines.")

print(f"Loaded dataset with {len(dataset_sentences)} sentences.")

Loaded dataset with 3980290 sentences.


In [6]:
# Tokenize a subset of the dataset for demonstration purposes
subset_sentences = dataset_sentences[:100] 
tokenized_inputs = tokenizer(subset_sentences, padding=True, truncation=True, return_tensors="pt")
tokenized_inputs = {key: val.to(device) for key, val in tokenized_inputs.items()}  
print("Tokenization complete for 100 sentences.")


Tokenization complete for 100 sentences.


In [7]:
# Pass tokenized inputs through the model

with torch.no_grad():
    outputs = model(**tokenized_inputs)
contextual_embeddings = outputs.last_hidden_state 
    

In [11]:
from collections import defaultdict
import numpy as np
from tqdm import tqdm


def compute_static_embeddings(model_name, texts, device="cuda" if torch.cuda.is_available() else "cpu"):
    # Load tokenizer and model
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaModel.from_pretrained(model_name).to(device)
    model.eval() 
    
    # Dictionary to store embeddings and counts
    token_embeddings = defaultdict(list)
    
    with torch.no_grad():  
        for text in tqdm(texts, desc="Processing Texts"):
            # Tokenize the text
            encoded_input = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            input_ids = encoded_input['input_ids'].to(device)
            attention_mask = encoded_input['attention_mask'].to(device)
            
            # Generate contextualized embeddings
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            hidden_states = outputs.last_hidden_state  
            
            # Iterate over each token in the sequence
            for seq_idx in range(input_ids.size(0)):
                for token_idx in range(input_ids.size(1)):
                    token_id = input_ids[seq_idx, token_idx].item()
                    token_embedding = hidden_states[seq_idx, token_idx].cpu().numpy()
                    token_embeddings[token_id].append(token_embedding)
    
    averaged_embeddings = {
        token_id: np.mean(np.array(embeddings), axis=0)
        for token_id, embeddings in token_embeddings.items()
    }
    
    # Map back to tokens 
    token_to_static_embedding = {
        tokenizer.convert_ids_to_tokens(token_id): embedding
        for token_id, embedding in averaged_embeddings.items()
    }
    
    return token_to_static_embedding

In [None]:
import gzip

# Function to load dataset
def load_dataset(dataset_path):
    texts = []
    with gzip.open(dataset_path, 'rt', encoding='utf-8') as file:
        for line in file:
            texts.append(line.strip())  
    return texts


if __name__ == "__main__":
    # Path to the dataset
    dataset_path = "assignment4-dataset.txt.gz"
    
    # Load the dataset
    texts = load_dataset(dataset_path)
    
    print(f"Loaded {len(texts)} texts from the dataset.")
    
    # Specify the transformer encoder model
    model_name = "roberta-base"  
    
    # Compute static embeddings
    static_embeddings = compute_static_embeddings(model_name, texts)
    
    # Print a sample static embedding
    for token, embedding in list(static_embeddings.items())[:5]:
        print(f"Token: {token}, Embedding: {embedding[:5]}...")  


Loaded 4468825 texts from the dataset.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Texts:   1%|          | 45657/4468825 [05:27<8:44:40, 140.50it/s] 