In [87]:
# ! pip install transformers
# ! pip install adjustText
# ! pip install sentence-transformers

In [1]:
%matplotlib inline
import torch
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import sys
# Raise NumPy's summarisation threshold to the maximum so printed arrays are shown in full
# instead of being abbreviated with "...".
# NOTE(review): every later print() of a large array will therefore emit all of its elements.
np.set_printoptions(threshold=sys.maxsize)


In [2]:
# Default size of every figure created below (width, height in inches).
# Presumably this large so that thousands of text labels stay legible in the saved PDFs.
plt.rcParams['figure.figsize'] = [100, 60]

In [3]:
from adjustText import adjust_text

In [4]:
from transformers import BertTokenizer, BertModel, BertForMaskedLM

In [5]:
import logging
# Show INFO-level log records from the libraries used in this notebook.
logging.basicConfig(level=logging.INFO)
# Load BERT.
model = BertModel.from_pretrained('bert-large-uncased-whole-word-masking')
# Move the model to the GPU. Later cells also send their inputs to 'cuda', so this notebook
# requires a CUDA device as written.
model.to('cuda')
# Set the model to eval mode.
model.eval()
# Load tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## BERT's vocabulary embeddings.

In [7]:
# Grab the model's input embedding layer; later cells call it with token-id tensors to get vectors.
wordembs = model.get_input_embeddings()

In [9]:
def loadLines(filename):
    """Read a text file into a NumPy array with one entry per line.

    Trailing whitespace (including the line terminator) is stripped from every
    line. A progress message is printed before and after loading.

    Args:
        filename: Path of the text file to read.

    Returns:
        A 1-D ``np.ndarray`` holding the stripped lines in file order
        (an empty array for an empty file).
    """
    print("Loading lines from file", filename)
    # The context manager closes the file even if reading fails; the previous
    # version never closed the handle.
    with open(filename, 'r') as f:
        # Collect into a list first: np.append copies the whole array on every
        # call, which made the previous loop quadratic in the number of lines.
        stripped = [line.rstrip() for line in f]
    lines = np.array(stripped)
    print("Done. ", len(lines)," lines loaded!")
    return lines

In [10]:
# Load the two word lists (one word per line) and use the "sents" list as the working vocabulary.
# NOTE(review): judging by the printed words these are project word lists, not BERT's own vocab.txt,
# so a word's line number is presumably NOT its BERT token id — confirm before indexing embeddings by position.
bertwords_sents = loadLines('vocab_sents.txt')
bertwords_captions = loadLines('vocab_captions.txt')
bertwords = bertwords_sents
vocab_size = len(bertwords)
# bertwords = loadLines('vocab.txt')

Loading lines from file vocab_sents.txt
Done.  12650  lines loaded!
Loading lines from file vocab_captions.txt
Done.  9661  lines loaded!


In [11]:
# Look up BERT's input embedding for every word of the working vocabulary and convert to numpy.
# The working word list is not BERT's own vocab.txt, so a word's position in `bertwords` is not
# its BERT token id: indexing the embedding matrix with 0..vocab_size-1 returned the vectors of
# unrelated tokens. Map every word to its real token id through the tokenizer instead.
# Words that are missing from BERT's vocabulary map to the tokenizer's unknown-token id.
# allinds = np.arange(0,model.config.vocab_size,1)
allinds = np.asarray(tokenizer.convert_tokens_to_ids(bertwords.tolist()), dtype=np.int64)
inputinds = torch.LongTensor(allinds)
# Row r of bertwordembs is the embedding of bertwords[r].
bertwordembs = wordembs(inputinds.to('cuda')).to('cpu').detach().numpy()

In [12]:
# Sanity check: one row per vocabulary word, one column per embedding dimension.
bertwordembs.shape

(12650, 1024)

In [13]:
# Determine vocabulary to use for t-SNE/visualization.
# Earlier variant, kept for reference: hard-coded index ranges chosen partially by inspection.
# bert_char_indices_to_use = np.arange(999, 1063, 1)
# bert_voc_indices_to_plot = np.append(bert_char_indices_to_use, np.arange(1996, 5932, 1))
# bert_voc_indices_to_use = np.append(bert_char_indices_to_use, np.arange(1996, 11932, 1))
# Current variant: use every word index, and pick a reproducible random sample of 4000 indices
# to label. The plotted indices are in random order, i.e. they are not a prefix of the used indices.
np.random.seed(123456)
bert_voc_indices_to_plot = np.random.permutation(vocab_size)[:4000]
bert_voc_indices_to_use = np.arange(vocab_size)

In [14]:
# Report how many words will be labelled in the plot and how many are used overall.
print(len(bert_voc_indices_to_plot))
print(len(bert_voc_indices_to_use))

4000
12650


In [15]:
# Print the selected words. The whole array is emitted because the print threshold was raised
# with np.set_printoptions at the top of the notebook.
print(bertwords[bert_voc_indices_to_use])

['multiples' 'stripe' 'crucial' 'hauling' 'anchors' 'sunflowers' 'classy'
 'flies' 'story' '1700' 'lambo' 'expansive' 'maddy' 'boss' 'feldspar'
 'promoting' 'shhhh' 'foil' 'wood' 'departs' 'chaos' 'incomplete' 'flood'
 'costume' 'omega' 'tech' 'sergeant' 'twin' 'document' 'brisk' 'warming'
 'technological' 'arcade' 'leading' 'elementary' 'wrong' 'tulips'
 'captures' 'ready' 'elder' 'express' 'barkers' 'pope' 'victors' 'piece'
 'debated' 'slightly' 'kayakers' 'forming' 'soaked' 'flirt' 'receiver'
 'ship' '80th' 'vista' 'guarding' 'protestors' 'hah' 'aka' 'happily'
 'doherty' 'must' 'abounded' 'sprawling' 'pointy' 'passageway' 'berry'
 'stink' 'direct' 'reminder' 'rustic' 'children' 'battery' 'masts'
 'planets' 'hm' 'outlines' 'credentials' 'refreshment' 'fro' 'ceremonial'
 'corroborate' 'spirit' 'gossip' 'results' 'full' 'acknowledgment'
 'gourds' 'dome' '24' 'plentiful' 'pagoda' 'abstract' 'bustling'
 'demonstrations' 'pleased' 'sidelines' 'sampled' 'battered' 'pepper'
 'disrepair' 'fo

In [143]:
# Embed the selected words. `bert_voc_indices_to_use` holds positions in `bertwords`, and that
# list is not BERT's own vocab.txt, so feeding the positions straight into the embedding layer
# returned the vectors of unrelated BERT tokens. Translate each word to its real token id first.
# Words that are missing from BERT's vocabulary map to the tokenizer's unknown-token id.
bert_token_ids_to_use = tokenizer.convert_tokens_to_ids(bertwords[bert_voc_indices_to_use].tolist())
bert_voc_indices_to_use_tensor = torch.LongTensor(bert_token_ids_to_use)
# Row r of the result is the embedding of bertwords[bert_voc_indices_to_use[r]].
bert_word_embs_to_use = wordembs(bert_voc_indices_to_use_tensor.to('cuda')).to('cpu').detach().numpy()

In [144]:
# Run t-SNE on the BERT vocabulary embeddings we selected:
# 2-D output, cosine distance, PCA initialisation, 2000 optimisation iterations.
# Row r of the result is the 2-D position of row r of bert_word_embs_to_use.
# NOTE(review): `n_iter` was renamed `max_iter` in newer scikit-learn releases — confirm against the installed version.
mytsne_words = TSNE(n_components=2,early_exaggeration=12,verbose=2,metric='cosine',init='pca',n_iter=2000)
bert_word_embs_to_use_tsne = mytsne_words.fit_transform(bert_word_embs_to_use)



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 12650 samples in 0.022s...
[t-SNE] Computed neighbors for 12650 samples in 4.001s...
[t-SNE] Computed conditional probabilities for sample 1000 / 12650
[t-SNE] Computed conditional probabilities for sample 2000 / 12650
[t-SNE] Computed conditional probabilities for sample 3000 / 12650
[t-SNE] Computed conditional probabilities for sample 4000 / 12650
[t-SNE] Computed conditional probabilities for sample 5000 / 12650
[t-SNE] Computed conditional probabilities for sample 6000 / 12650
[t-SNE] Computed conditional probabilities for sample 7000 / 12650
[t-SNE] Computed conditional probabilities for sample 8000 / 12650
[t-SNE] Computed conditional probabilities for sample 9000 / 12650
[t-SNE] Computed conditional probabilities for sample 10000 / 12650
[t-SNE] Computed conditional probabilities for sample 11000 / 12650
[t-SNE] Computed conditional probabilities for sample 12000 / 12650
[t-SNE] Computed conditional probabilities for sam

In [16]:
# Words that will be drawn as labels, in the same (random) order as bert_voc_indices_to_plot.
bert_words_to_plot = bertwords[bert_voc_indices_to_plot]
print(len(bert_words_to_plot))

4000


In [146]:
# Plot the transformed BERT vocabulary embeddings, labelling the sampled words.
do_text_adjust = True

# Row r of the t-SNE output belongs to vocabulary index bert_voc_indices_to_use[r], while the
# words to label are a random sample of vocabulary indices. Indexing the t-SNE rows with the
# loop counter (as before) drew every label at some other word's coordinates, so translate each
# sampled vocabulary index to its t-SNE row first.
tsne_row_of_voc_index = {int(voc_ind): row for row, voc_ind in enumerate(bert_voc_indices_to_use)}
plot_rows = [tsne_row_of_voc_index[int(voc_ind)] for voc_ind in bert_voc_indices_to_plot]
plot_xy = bert_word_embs_to_use_tsne[plot_rows]
assert len(plot_xy) == len(bert_words_to_plot), "one coordinate pair is needed per label"

fig = plt.figure()
# Invisible markers (s=0): they only register the points with the axes; the text labels are the plot.
# A single vectorised call replaces one scatter call per word.
plt.scatter(plot_xy[:, 0], plot_xy[:, 1], s=0)
alltexts = [
    plt.text(x, y, txt, family='sans-serif')
    for (x, y), txt in zip(plot_xy, bert_words_to_plot)
]

# Save the plot before adjusting.
plt.xlim([-151,151]); plt.ylim([-151,151])
plt.tight_layout()
plt.savefig('bert-voc-sents-tsne-viz4k-noadj.pdf', format='pdf')
if do_text_adjust:
    print('now running adjust_text')
    # Using autoalign often works better in my experience, but it can be very slow for this case, so it's false by default below:
    #numiters = adjust_text(alltexts, autoalign=True, lim=50)
    numiters = adjust_text(alltexts, autoalign=False, lim=10)
    print('done adjust text, num iterations: ', numiters)
    plt.savefig('bert-voc-sents-tsne-viz4k-adj10.pdf', format='pdf')
# Release the (very large) figure once both PDFs are written.
plt.close()

now running adjust_text
done adjust text, num iterations:  10


## DistilBERT's sentence embeddings

In [17]:
# Read one text per line. Strip only the line terminator: slicing with [:-1] also removed the
# last real character of a final line that does not end with a newline.
with open('stories_texts.txt','r') as f:
    all_story_texts = [line.rstrip('\n') for line in f]
with open('captions_texts.txt','r') as f:
    all_caption_texts = [line.rstrip('\n') for line in f]

In [18]:
from collections import Counter
import nltk, re

# Set of stop words that clean_words removes from the frequency table.
# NOTE(review): stopwords.words() is called without a language; per NLTK's corpus-reader API that
# presumably returns the stop words of every bundled language, not only English — confirm this is intended.
STOP_WORDS = set(nltk.corpus.stopwords.words())
def clean_sentence(val):
    """Remove punctuation and underscores from ``val`` and lower-case it.

    Every run of characters that are neither whitespace nor word characters
    (letters, digits), as well as every underscore, is deleted. Whitespace is
    left untouched, so word boundaries survive.

    Args:
        val: The text to normalise.

    Returns:
        The cleaned, lower-cased string.
    """
    # Raw string literal: in a plain literal "\s" and "\w" are invalid escape sequences,
    # which recent Python versions warn about at compile time.
    regex = re.compile(r'([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    return sentence
def clean_words(words):
    """Drop every stop word from a word -> count mapping, in place.

    Args:
        words: A ``dict``/``Counter`` keyed by word. It is modified in place.

    Returns:
        The same ``words`` object, without any key that is in ``STOP_WORDS``.
    """
    # Decide what to remove before mutating, so the mapping never changes while it is iterated.
    stop_keys = [key for key in words if key in STOP_WORDS]
    for key in stop_keys:
        words.pop(key)
    return words
# Word-frequency table of the story texts: join all stories, strip punctuation and lower-case,
# split on whitespace, count, then drop stop words.
word_frequencies = clean_words(Counter(clean_sentence(' '.join(all_story_texts)).split()))

In [19]:
# Show the 100 most frequent remaining words with their counts.
word_frequencies.most_common(100)

[('day', 2449),
 ('male', 2420),
 ('time', 2243),
 ('went', 2032),
 ('got', 1830),
 ('great', 1785),
 ('beach', 1689),
 ('female', 1574),
 ('location', 1563),
 ('see', 1453),
 ('family', 1449),
 ('people', 1448),
 ('fun', 1425),
 ('friends', 1408),
 ('beautiful', 1397),
 ('took', 1397),
 ('night', 1366),
 ('nt', 1347),
 ('everyone', 1334),
 ('get', 1299),
 ('birthday', 1246),
 ('party', 1188),
 ('go', 1142),
 ('many', 1109),
 ('even', 1088),
 ('like', 1074),
 ('picture', 1066),
 ('water', 1042),
 ('decided', 1019),
 ('could', 980),
 ('little', 975),
 ('around', 923),
 ('cake', 911),
 ('made', 904),
 ('good', 902),
 ('together', 896),
 ('first', 889),
 ('us', 867),
 ('fireworks', 858),
 ('really', 856),
 ('today', 843),
 ('lot', 815),
 ('saw', 814),
 ('trip', 794),
 ('ride', 768),
 ('city', 728),
 ('kids', 724),
 ('show', 704),
 ('ready', 701),
 ('view', 696),
 ('back', 692),
 ('happy', 690),
 ('came', 688),
 ('going', 682),
 ('race', 669),
 ('looked', 667),
 ('look', 664),
 ('food', 65

In [20]:
# Frequency of the word 'well' in the cleaned story texts.
word_frequencies['well']

514

In [21]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence encoder and move it to the GPU.
model_sent = SentenceTransformer('multi-qa-distilbert-cos-v1') # all-MiniLM-L6-v2
model_sent.to('cuda')

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: multi-qa-distilbert-cos-v1
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [22]:
# Encode every caption into one sentence embedding; row i belongs to all_caption_texts[i].
# Alternative input (stories instead of captions), kept for reference:
# sentence_embeddings = model_sent.encode(all_story_texts)
sentence_embeddings = model_sent.encode(all_caption_texts)

Batches:   0%|          | 0/3677 [00:00<?, ?it/s]

In [23]:
# Shuffle the caption indices reproducibly and take nested prefixes of the shuffled order:
# the first N_tsne embeddings are the t-SNE input and the first N_plot of those are set aside
# for plotting, so row i of either array belongs to caption rows[i].
np.random.seed(123456)
rows = np.arange(len(all_caption_texts))
np.random.shuffle(rows)
N_tsne = 10000; N_plot = 1000
bert_embs_sents_use = sentence_embeddings[rows[:N_tsne]]
bert_embs_sents_plt = sentence_embeddings[rows[:N_plot]]

In [25]:
# Run t-SNE on the sampled sentence (caption) embeddings:
# 2-D output, cosine distance, PCA initialisation, 2000 optimisation iterations.
# Note that the name `mytsne_words` is reused here although the inputs are sentence embeddings.
mytsne_words = TSNE(n_components=2,perplexity=30,early_exaggeration=12,verbose=2,metric='cosine',init='pca',n_iter=2000)
bert_sent_embs_to_use_tsne = mytsne_words.fit_transform(bert_embs_sents_use)



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.014s...
[t-SNE] Computed neighbors for 10000 samples in 2.029s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.201274
[t-SNE] Computed conditional probabilities in 0.587s
[t-SNE] Iteration 50: error = 94.1509399, gradient norm = 0.0002792 (50 iterations in 3.423s)
[t-SNE] It

In [26]:
# Plot the first N_plot sampled captions at their t-SNE coordinates, labelled with the caption text.
# Row i of the t-SNE output and the i-th plotted caption both refer to caption rows[i].
fig = plt.figure()
alltexts = list()
N_plot = 1000
for i, txt in enumerate(np.array(all_caption_texts)[rows[:N_plot]]):
    # Small red star at the point, plus the caption text as its label.
    plt.scatter(bert_sent_embs_to_use_tsne[i,0], bert_sent_embs_to_use_tsne[i,1], s=1, c='red', marker='*')
    currtext = plt.text(bert_sent_embs_to_use_tsne[i,0], bert_sent_embs_to_use_tsne[i,1], txt, family='sans-serif')
    alltexts.append(currtext)    
plt.tight_layout()
plt.savefig('bert-sen-captions-tsne-viz1k-noadj.pdf', format='pdf')

# Optional label de-overlapping step (disabled):
# print('now running adjust_text')
# numiters = adjust_text(alltexts, autoalign=False, lim=10)
# print('done adjust text, num iterations: ', numiters)
# plt.savefig('viz-bert-ctx-viz700-adj10.pdf', format='pdf')

plt.close()

## BERT's contextualized word embeddings

In [158]:
def loadAndTokenizeLinesAndFindKeyword(filename, keyword, maxLines):
    """Collect the tokenized lines of a file that contain ``keyword``.

    Each line is stripped of trailing whitespace, wrapped as
    ``"[CLS] <line> [SEP]"`` and tokenized with the notebook-level
    ``tokenizer``. A line is kept when ``keyword`` occurs as one of its
    tokens; reading stops as soon as ``maxLines`` lines have been kept.
    Progress messages are printed before and after loading.

    Args:
        filename: Path of the text file to read, one sentence per line.
        keyword: Token to look for in each tokenized line.
        maxLines: Stop after this many matching lines have been collected.

    Returns:
        A tuple ``(lines, keywordIndices)``: the token lists of the kept
        lines, and for each of them the position of the first occurrence of
        ``keyword``.
    """
    print("Loading lines from file", filename)
    lines = []
    keywordIndices = []
    numSkipped = 0
    # The context manager closes the file on every exit path (including the early break);
    # the previous version never closed the handle.
    with open(filename,'r') as f:
        for line in f:
            # Tokenize input
            lineForBERT = "[CLS] " + line.rstrip() + " [SEP]"
            tokenized_text = tokenizer.tokenize(lineForBERT)
            if keyword in tokenized_text:
                # list.index returns the first occurrence only.
                keywordIndex = tokenized_text.index(keyword)
                lines.append(tokenized_text)
                keywordIndices.append(keywordIndex)
                if len(lines) >= maxLines:
                    break
            else:
                # print("Keyword \"", keyword, "\" not found in line: ", tokenized_text)
                numSkipped += 1
    print("Done. ", len(lines)," lines loaded, ", numSkipped, " lines skipped.")
    return lines, keywordIndices

In [159]:
# Collect up to 15000 caption lines that contain the token "well", with the token's position in each.
keywordLines, keywordIndices = loadAndTokenizeLinesAndFindKeyword("captions_texts.txt", "well", 15000)
# Alternative inputs, kept for reference:
# keywordLines, keywordIndices = loadAndTokenizeLinesAndFindKeyword("stories_texts.txt", "well", 15000)
# keywordLines, keywordIndices = loadAndTokenizeLinesAndFindKeyword("values.books-wiki.15k.txt", "values", 15000)

Loading lines from file captions_texts.txt
Done.  416  lines loaded,  117229  lines skipped.


In [160]:
# Now we will use BERT to encode the sentences we loaded and save the embeddings from the final layer
# at the position of the keyword.
# Vectors are collected in a list and stacked once at the end: calling np.append inside the loop
# re-copied the whole matrix on every iteration.
keyword_vectors = []
# Go through all tokenized lines and keyword indices. zip() has no length, so pass total= to let
# tqdm show a real progress bar.
for tok, ind in tqdm(zip(keywordLines, keywordIndices), total=len(keywordLines)):
    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tok)
    # segments_ids will hold indices associated with the first and second sentences in BERT.
    # We just use sentence A indices for all tokens:
    segments_ids = [0] * len(tok)
    # Convert inputs to PyTorch tensors (batch of one sentence)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    # Compute hidden states without tracking gradients:
    with torch.no_grad():
        outputs = model(tokens_tensor.to('cuda'), token_type_ids=segments_tensors.to('cuda'))
    # The first element of the output holds the hidden states of the last layer of BERT.
    encoded_layers = outputs[0]
    # encoded_layers has shape (batch size, sequence length, model hidden dimension)
    assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
    # Keep the hidden state at the keyword position as a 1-D numpy vector.
    keyword_vectors.append(encoded_layers[0, ind].to('cpu').numpy())

# Same result contract as before: a float64 matrix of shape (number of lines, hidden size),
# which is (0, hidden size) when no line was loaded.
embs = np.empty((0,model.config.hidden_size), float)
if keyword_vectors:
    embs = np.vstack(keyword_vectors).astype(float)
        

416it [00:09, 44.84it/s]


In [161]:
# Sanity check: one row per loaded line, one column per hidden dimension.
embs.shape

(416, 1024)

In [233]:
# Run t-SNE on the contextualized embeddings:
# 2-D output, perplexity 50, cosine distance, PCA initialisation, 2000 optimisation iterations.
# Row i of embs_tsne is the 2-D position of row i of embs.
mytsne_tokens = TSNE(n_components=2,perplexity=50,early_exaggeration=12,verbose=2,metric='cosine',init='pca',n_iter=2000)
embs_tsne = mytsne_tokens.fit_transform(embs)



[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 416 samples in 0.001s...
[t-SNE] Computed neighbors for 416 samples in 0.027s...
[t-SNE] Computed conditional probabilities for sample 416 / 416
[t-SNE] Mean sigma: 0.257149
[t-SNE] Computed conditional probabilities in 0.100s
[t-SNE] Iteration 50: error = 53.4609833, gradient norm = 0.4045932 (50 iterations in 0.113s)
[t-SNE] Iteration 100: error = 57.4858017, gradient norm = 0.3791026 (50 iterations in 0.113s)
[t-SNE] Iteration 150: error = 58.4458733, gradient norm = 0.3735373 (50 iterations in 0.117s)
[t-SNE] Iteration 200: error = 56.1848869, gradient norm = 0.4256278 (50 iterations in 0.112s)
[t-SNE] Iteration 250: error = 57.4413300, gradient norm = 0.3926392 (50 iterations in 0.108s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.441330
[t-SNE] Iteration 300: error = -0.2040093, gradient norm = 0.0018377 (50 iterations in 0.079s)
[t-SNE] Iteration 350: error = -0.6096529, gradient norm = 0.000643

In [234]:
# Build one display label per loaded line: the keyword together with a few tokens of context
# on either side of it.
keywordWithContext = []
# Maximum number of subword units shown on each side of the keyword.
windowSize = 5
# Whether partial-word units ("##...") are glued back into whole words in the label.
mergeSubwordUnits = True
# Whether BERT boundary tokens ([CLS] / [SEP]) are removed from the label.
removeBoundaryTokens = True
for txt, ind in zip(keywordLines, keywordIndices):
    # Clamp the left edge at the start of the line; slicing clamps the right edge by itself.
    startInd = max(ind - windowSize, 0)
    currKeywordWithContext = " ".join(txt[startInd:ind + windowSize + 1])
    if mergeSubwordUnits:
        # First join continuation pieces to the preceding piece, then drop a "##" left at the window start.
        currKeywordWithContext = currKeywordWithContext.replace(" ##", "").replace("##", "")
    if removeBoundaryTokens:
        currKeywordWithContext = currKeywordWithContext.replace("[CLS] ", "").replace(" [SEP]", "")
    keywordWithContext.append(currKeywordWithContext)
    

In [237]:
# Plot the keyword+context strings at the t-SNE positions of their contextualized embeddings.
# The labels are taken from `keywordWithContext`: the name `keywordWithContextToPlot` used here
# before is not defined anywhere in the notebook and raised a NameError on a fresh kernel.
# Label i and row i of embs_tsne both come from the i-th loaded line, so they must be equally many.
assert len(keywordWithContext) == len(embs_tsne), "one t-SNE point is needed per label"
fig = plt.figure()
alltexts = list()
for i, txt in enumerate(keywordWithContext):
    # Invisible marker (s=0): it only registers the point with the axes; the label is the plot.
    plt.scatter(embs_tsne[i,0], embs_tsne[i,1], s=0)
    currtext = plt.text(embs_tsne[i,0], embs_tsne[i,1], txt, family='sans-serif')
    alltexts.append(currtext)

plt.tight_layout()
plt.savefig('bert-voc-captions-well-tsne-viz500-noadj.pdf', format='pdf')
# Optional label de-overlapping step (disabled):
# print('now running adjust_text')
# # numiters = adjust_text(alltexts, autoalign=True, lim=50)
# numiters = adjust_text(alltexts, autoalign=False, lim=10)
# print('done adjust text, num iterations: ', numiters)
# plt.savefig('bert-voc-captions-well-tsne-viz500-adj10.pdf', format='pdf')

plt.close()