### This code takes n sentences, calculates the sentence embeddings using roberta and uses PCA to reduce their dimensions

In [17]:
from transformers import RobertaModel, RobertaTokenizer
import torch 
import numpy as np

In [18]:
model = RobertaModel.from_pretrained('roberta-base')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# sentences = ["The bird chirped cheerfully in the morning light.",
#             "The bird chirped happily in the morning light.",
#             "The bird chirped joyfully in the morning light.",
#             "The bird chirped merrily in the morning light.",
#             "The bird chirped gleefully in the morning light."]


sentences = ["The sun rose over the horizon, painting the sky with hues of orange and pink.",
            "The sun ascended above the horizon, coloring the sky with shades of orange and pink.",
            "The sun emerged beyond the horizon, adorning the sky with tints of orange and pink.",
            "The sun climbed beyond the horizon, decorating the sky with tones of orange and pink.",
            "The sun appeared over the horizon, embellishing the sky with hues of orange and pink.",
            "The sun peeked above the horizon, illuminating the sky with shades of orange and pink.",
            "The sun lifted beyond the horizon, casting the sky with hues of orange and pink.",
            "The sun emerged above the horizon, drenching the sky with shades of orange and pink.",
            "The sun soared over the horizon, saturating the sky with hues of orange and pink.",
            "The sun ascended beyond the horizon, saturating the sky with shades of orange and pink.",
            "The sun peeked over the horizon, bathing the sky with hues of orange and pink.",
            "The sun climbed above the horizon, saturating the sky with tints of orange and pink.",
            "The sun rose beyond the horizon, saturating the sky with tones of orange and pink.",
            "The sun emerged over the horizon, saturating the sky with hues of orange and pink.",
            "The sun ascended above the horizon, saturating the sky with shades of orange and pink.",
            "The sun appeared beyond the horizon, saturating the sky with hues of orange and pink.",
            "The sun peeked above the horizon, saturating the sky with shades of orange and pink.",
            "The sun emerged over the horizon, saturating the sky with hues of orange and pink.",
            "The sun ascended beyond the horizon, saturating the sky with tints of orange and pink.",
            "The sun rose above the horizon, saturating the sky with tones of orange and pink."]             
sentences = sentences[:10]

In [20]:
# Example of one sentence
sentence = "She is a Machine Learning engineer from California"

In [21]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [33]:
sentence_embeddings = []

In [34]:
for sentence in sentences:
    tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
    if 50 > len(tokens):    
        tokens = tokens + ['[PAD]'] * ( - len(tokens))

    attention_mask = [1 if i != '[PAD]' else 0 for i in tokens]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    
    token_ids = torch.tensor(token_ids).unsqueeze(0)
    attention_mask = torch.tensor(attention_mask).unsqueeze(0)

    output = model(token_ids, attention_mask=attention_mask)

    with torch.no_grad():
        sentence_embeddings.append(output[1].numpy().flatten())

sentence_embeddings = np.array(sentence_embeddings, dtype=np.float32)
print(sentence_embeddings.shape)

(10, 768)


In [37]:
final_reps = output.last_hidden_state
cls_rep = final_reps[0][0].unsqueeze(0)

In [38]:
cls_rep.size()

torch.Size([1, 768])

In [12]:
variances = np.var(sentence_embeddings, axis=0)
print(variances.shape)

(768,)


In [17]:
k = 600
k_top_variances = np.sort(variances)[-k:]

In [18]:
k_top_variances.shape

(600,)

In [74]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=5)
# sentence_embeddings = pca.fit_transform(sentence_embeddings)

# print(sentence_embeddings.shape)

(5, 5)


In [75]:
# sentence_embeddings

array([[-1.6299108e-02, -1.3559462e-02, -1.1813207e-02,  1.8123059e-02,
         2.7120961e-07],
       [ 6.3303731e-02,  1.5784515e-03, -7.7138790e-03, -1.7068516e-03,
         2.7120984e-07],
       [-2.5389103e-02, -1.3099635e-02, -1.3827629e-02, -1.6427377e-02,
         2.7121001e-07],
       [-1.9713636e-03, -9.1002351e-03,  3.3946302e-02, -7.5917045e-04,
         2.7121018e-07],
       [-1.9644190e-02,  3.4180928e-02, -5.9163483e-04,  7.7036797e-04,
         2.7120950e-07]], dtype=float32)

### Running for single sentence

In [29]:
tokens = tokenizer.tokenize(sentence)
tokens

['She',
 'Ġis',
 'Ġa',
 'ĠMachine',
 'ĠLearning',
 'Ġengineer',
 'Ġfrom',
 'ĠCalifornia']

In [30]:
tokens = ['[CLS]'] + tokens + ['[SEP]']
print(tokens)

['[CLS]', 'She', 'Ġis', 'Ġa', 'ĠMachine', 'ĠLearning', 'Ġengineer', 'Ġfrom', 'ĠCalifornia', '[SEP]']


In [31]:
print(len(tokens))

10


In [32]:
if 16 > len(tokens):    
    tokens = tokens + ['[PAD]'] * (16 - len(tokens))
print(tokens)

['[CLS]', 'She', 'Ġis', 'Ġa', 'ĠMachine', 'ĠLearning', 'Ġengineer', 'Ġfrom', 'ĠCalifornia', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [33]:
attention_mask = [1 if i != '[PAD]' else 0 for i in tokens]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]


In [34]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[3, 2515, 16, 10, 14969, 13807, 8083, 31, 886, 3, 3, 3, 3, 3, 3, 3]


In [35]:
token_ids = torch.tensor(token_ids).unsqueeze(0)
attention_mask = torch.tensor(attention_mask).unsqueeze(0)

In [36]:
output = model(token_ids, attention_mask=attention_mask)
print(output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0766,  0.1293, -0.0531,  ..., -0.1126, -0.0208, -0.0780],
         [ 0.0341,  0.2211, -0.1066,  ...,  0.0218,  0.0353, -0.1388],
         [ 0.1586,  0.3804,  0.0263,  ..., -0.2466,  0.1823, -0.1107],
         ...,
         [-0.0594,  0.0846, -0.0192,  ..., -0.0142, -0.0453, -0.0783],
         [-0.0727,  0.0940, -0.0329,  ..., -0.0200, -0.0284, -0.0777],
         [-0.0909,  0.1235, -0.0270,  ..., -0.0416, -0.0494, -0.0505]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 1.7248e-01, -7.5536e-02,  7.6339e-02, -3.2871e-01,  7.1186e-02,
          4.8022e-02, -2.7413e-01,  5.1791e-01,  3.9498e-01,  1.4062e-01,
         -7.7410e-02, -1.3426e-02,  3.1057e-01,  3.2572e-01, -2.6087e-03,
          2.7001e-01,  6.5242e-02, -2.2043e-01,  1.5414e-01,  6.0698e-01,
         -2.1026e-01,  1.5004e-01,  2.3717e-01, -1.6605e-01,  1.7230e-01,
         -5.3879e-01, -4.0220e-02, -8.6488e-02, -4.3560e-02, -2.753

In [37]:
output.last_hidden_state

tensor([[[-0.0766,  0.1293, -0.0531,  ..., -0.1126, -0.0208, -0.0780],
         [ 0.0341,  0.2211, -0.1066,  ...,  0.0218,  0.0353, -0.1388],
         [ 0.1586,  0.3804,  0.0263,  ..., -0.2466,  0.1823, -0.1107],
         ...,
         [-0.0594,  0.0846, -0.0192,  ..., -0.0142, -0.0453, -0.0783],
         [-0.0727,  0.0940, -0.0329,  ..., -0.0200, -0.0284, -0.0777],
         [-0.0909,  0.1235, -0.0270,  ..., -0.0416, -0.0494, -0.0505]]],
       grad_fn=<NativeLayerNormBackward0>)

In [21]:
output[0].shape

torch.Size([1, 16, 768])

In [22]:
output[1].shape

torch.Size([1, 768])