In [0]:
%pip install torch

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import random

import numpy as np
import pandas as pd
import torch
from transformers import pipeline
from transformers import BertTokenizer, AutoModel

In [0]:
# Masked-language-model demo: BERT proposes candidates for the [MASK] slot.
unmasker = pipeline("fill-mask", model="bert-base-uncased")

# Adjacent string literals inside parentheses are joined at compile time,
# so neither "+" nor backslash continuations are needed.
test = (
    "Every morning last summer in Greece, "
    "I visited the [MASK] where I would swim, "
    "play in the sand, and sunbathe."
)

# Each candidate is a dict with a score, token id, token string and filled-in sentence.
result = unmasker(test)
result

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'score': 0.4774385690689087,
  'token': 3509,
  'token_str': 'beach',
  'sequence': 'every morning last summer in greece, i visited the beach where i would swim, play in the sand, and sunbathe.'},
 {'score': 0.22596196830272675,
  'token': 12212,
  'token_str': 'beaches',
  'sequence': 'every morning last summer in greece, i visited the beaches where i would swim, play in the sand, and sunbathe.'},
 {'score': 0.05403747782111168,
  'token': 4770,
  'token_str': 'pool',
  'sequence': 'every morning last summer in greece, i visited the pool where i would swim, play in the sand, and sunbathe.'},
 {'score': 0.04365471750497818,
  'token': 12679,
  'token_str': 'pools',
  'sequence': 'every morning last summer in greece, i visited the pools where i would swim, play in the sand, and sunbathe.'},
 {'score': 0.02927646040916443,
  'token': 2697,
  'token_str': 'lake',
  'sequence': 'every morning last summer in greece, i visited the lake where i would swim, play in the sand, and sunbathe.'}]

## Sentiment Data

In [0]:
# Load the labelled Fed statements; the file is tab-separated despite its .csv suffix.
df = pd.read_csv(
    "/Volumes/daz_aitraining_cat/aitraining/aitraining_volume/fed_sentiment_training.csv",
    sep="\t",
)

In [0]:
# Preview the first ten rows to check the columns and the sentiment labels
df.head(10)

Unnamed: 0,ID,text,sentiment
0,157_1,The action was taken to cushion the effects on...,dovish
1,161_2,Such trends could foster inflationary imbalanc...,hawkish
2,52_0,The Federal Open Market Committee at its meeti...,neutral
3,21_5,Although continuing favorable trends bolster l...,dovish
4,78_7,The Committee perceives that the upside and do...,neutral
5,115_6,"Nonetheless, the Committee judges that some in...",hawkish
6,160_4,"As a consequence, the pool of available worker...",hawkish
7,114_3,"Readings on core inflation have been elevated,...",hawkish
8,60_0,The Federal Open Market Committee at its meeti...,dovish
9,80_7,"The Committee judges that, on balance, the ris...",dovish


In [0]:
# Show the full, untruncated text of the row labelled 0
df.loc[0, "text"]

'The action was taken to cushion the effects on prospective economic growth in the United States of increasing weakness in foreign economies and of less accommodative financial conditions domestically'

### Tokenizing Text

In [0]:
# Load the tokenizer published with the bert-base-uncased checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Bare last expression so the notebook displays the tokenizer's repr
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [0]:
# Peek at the vocabulary: total size plus ten randomly drawn (token, id) pairs.
vocab = tokenizer.get_vocab()
print(f"Total number of tokens in vocabulary: {len(vocab)} \n---------")

# Materialise the items once instead of rebuilding the full list on every draw.
vocab_items = list(vocab.items())

# A dedicated, seeded generator makes the sample identical on every re-run
# without touching the global `random` state used elsewhere.
vocab_rng = random.Random(42)
for _ in range(10):
    word, idx = vocab_rng.choice(vocab_items)
    print(word, idx)

Total number of tokens in vocabulary: 30522 
---------
moments 5312
terrible 6659
fate 6580
personality 6180
annoyance 17466
ð 1098
1729 28449
summit 6465
cm 4642
confess 18766


In [0]:
# Encode one statement into a fixed 100-token window.
# padding="max_length" fills short inputs up to 100 tokens; truncation=True caps
# longer inputs at the same length, so the encoded length is always exactly 100.
encoded_input1 = tokenizer(df.loc[0, "text"],
                           max_length=100,
                           padding="max_length",
                           truncation=True,
                           return_tensors='pt')

print("Tokens:")
# return_tensors='pt' adds a leading batch dimension; [0] selects the single sequence
temp_tokens = encoded_input1["input_ids"][0]
print(tokenizer.convert_ids_to_tokens(temp_tokens))
print("\n------------------------------------------\n")
print("Tokens IDs:")
print(temp_tokens)

Tokens:
['[CLS]', 'the', 'action', 'was', 'taken', 'to', 'cushion', 'the', 'effects', 'on', 'prospective', 'economic', 'growth', 'in', 'the', 'united', 'states', 'of', 'increasing', 'weakness', 'in', 'foreign', 'economies', 'and', 'of', 'less', 'acc', '##om', '##mo', '##da', '##tive', 'financial', 'conditions', 'domestically', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']

------------------------------------------

Tokens IDs:
tensor([  101,  1996

### Obtaining Embeddings

In [0]:
# Load bert-base-uncased through AutoModel and configure it to return the
# per-layer hidden states and the attention weights alongside the final output.
model = AutoModel.from_pretrained("bert-base-uncased",
                                  output_hidden_states=True,
                                  output_attentions=True,
                                  # NOTE(review): eager attention is presumably requested so that
                                  # attention weights can be returned — confirm against HF docs
                                  attn_implementation="eager"
                                  )

# Print the resolved configuration to confirm the flags above took effect
print(model.config)

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_attentions": true,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [0]:
# Step 1: Get BERT output without computing gradients (inference mode)
with torch.no_grad():
    result1 = model(**encoded_input1)

# Step 2: Extract token-level embeddings from BERT's last layer
last_hidden_state = result1.last_hidden_state
print(f"Token embeddings shape: {last_hidden_state.shape}")

# Step 3: Get attention mask (1 = real token, 0 = padding)
attention_mask = encoded_input1["attention_mask"]  # [1, 100] (padded to max_length=100)
print("attention mask:")
print(attention_mask)
print("\n------------------------------------------\n")

# Step 4: Zero out padding token embeddings
# unsqueeze(-1) adds dimension: [1, 100] → [1, 100, 1]
# This allows broadcasting when multiplying with embeddings [1, 100, 768]
masked_embeddings = last_hidden_state * attention_mask.unsqueeze(-1)

# Step 5: Compute mean pooling (average of non-padding tokens)
# Numerator: sum over all 100 positions (padding positions are now zero vectors)
sum_embeddings = masked_embeddings.sum(dim=1)  # [1, 768]

# Denominator: count how many real tokens
num_real_tokens = attention_mask.sum(dim=1, keepdim=True)  # [1, 1]

# Final sentence embedding: average of real token embeddings
mean_embedding1 = sum_embeddings / num_real_tokens  # [1, 768]

print(f"Sentence embedding shape: {mean_embedding1.shape}")
print("\n------------------------------------------\n")
print("First Ten Elements of Embedding:")
print(mean_embedding1[0, :10])

Token embeddings shape: torch.Size([1, 100, 768])
attention mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

------------------------------------------

Sentence embedding shape: torch.Size([1, 768])

------------------------------------------

First Ten Elements of Embeddings:
tensor([-0.7555, -0.1470, -0.0567,  0.2348,  0.1567,  0.0438, -0.0582,  0.4445,
        -0.1379, -0.1501])


In [0]:
# %% Now scale to ALL examples in the dataset
MAX_TOKENS = 30   # short window for speed; longer statements are truncated
BATCH_SIZE = 32   # statements encoded per forward pass


def mean_pool(last_hidden_state, attention_mask):
    """Average token embeddings over the real (non-padding) positions.

    Args:
        last_hidden_state: float tensor [batch, seq_len, hidden] of token embeddings.
        attention_mask: tensor [batch, seq_len] with 1 = real token, 0 = padding.

    Returns:
        Tensor [batch, hidden] holding one sentence embedding per input row.
    """
    # [batch, seq_len] -> [batch, seq_len, 1] so the mask broadcasts over hidden dims
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    summed = (last_hidden_state * mask).sum(dim=1)
    # clamp guards against division by zero should a row contain no real tokens
    counts = mask.sum(dim=1).clamp(min=1)
    return summed / counts


texts = df["text"].tolist()
batch_embeddings = []

# Encode several statements per forward pass instead of one model call per row
for start in range(0, len(texts), BATCH_SIZE):
    encoded_input = tokenizer(texts[start:start + BATCH_SIZE],
                              max_length=MAX_TOKENS,
                              padding="max_length",
                              truncation=True,
                              return_tensors='pt').to(model.device)

    with torch.no_grad():
        result = model(**encoded_input)

    pooled = mean_pool(result.last_hidden_state, encoded_input["attention_mask"])
    # .cpu() keeps the NumPy conversion valid even if the model lives on a GPU
    batch_embeddings.append(pooled.cpu().numpy())

# Stack into matrix
embeddings_matrix = np.vstack(batch_embeddings)

# One 1-D vector per statement, in the same order as the dataframe rows
all_embeddings = list(embeddings_matrix)

print(f"Embeddings matrix shape: {embeddings_matrix.shape}")
print(f"Number of texts: {embeddings_matrix.shape[0]}")
print(f"Embedding dimension: {embeddings_matrix.shape[1]}")

# Store in dataframe
df['embedding'] = all_embeddings

Embeddings matrix shape: (1243, 768)
Number of texts: 1243
Embedding dimension: 768
