In [2]:
# conda install conda-forge::bpemb
from bpemb import BPEmb

In [3]:
# Build the English BPEmb object from explicit local files.
# `vs` and `dim` are presumably the vocabulary size and the vector width
# (the shape of `bpemb_en.vectors` is unpacked into exactly those two
# quantities further down) -- confirm against the BPEmb documentation.
bpemb_settings = dict(
    lang="en",
    vs=10000,
    dim=100,
    cache_dir="data",
    # Model and embedding files are given by path, relative to the
    # notebook's working directory.
    model_file="../../data/bpemb/bpemb.model",
    emb_file="../../data/bpemb/bpemb_vectors.txt",
)
bpemb_en = BPEmb(**bpemb_settings)

In [4]:
# Find the position of the entry 'car' in the vocabulary list, then print
# the row of the embedding matrix stored at that position.
car_row = bpemb_en.words.index('car')
car_vector = bpemb_en.vectors[car_row]
print(car_vector)


[-0.305548 -0.325598 -0.134716 -0.078735 -0.660545  0.076211 -0.735487
  0.124533 -0.294402  0.459688  0.030137  0.174041 -0.224223  0.486189
 -0.504649 -0.459699  0.315747  0.477885  0.091398  0.427867  0.016524
 -0.076833 -0.899727  0.493158 -0.022309 -0.422785 -0.154148  0.204981
  0.379834  0.070588  0.196073 -0.368222  0.473406  0.007409  0.004303
 -0.007823 -0.19103  -0.202509  0.109878 -0.224521 -0.35741  -0.611633
  0.329958 -0.212956 -0.497499 -0.393839 -0.130101 -0.216903 -0.105595
 -0.076007 -0.483942 -0.139704 -0.161647  0.136985  0.415363 -0.360143
  0.038601 -0.078804 -0.030421  0.324129  0.223378 -0.523636 -0.048317
 -0.032248 -0.117367  0.470519  0.225816 -0.222065 -0.225007 -0.165904
 -0.334389 -0.20157   0.572352 -0.268794  0.301929 -0.005563  0.387491
  0.261031 -0.11613   0.074982 -0.008433  0.259987 -0.099893 -0.268875
 -0.054047 -0.534776 -0.111101 -0.051742  0.214114  0.04293   0.039873
 -0.453112  0.087382 -0.333201 -0.034079 -0.833045  0.155232 -1.132393
 -0.29

In [5]:
import torch

device = 'cpu'
sample_sentence = "Where can I find a pizzeria?"

# Subword pieces of the sample sentence, printed for inspection.
tokens = bpemb_en.encode(sample_sentence)
print(tokens)

# Vocabulary ids for the *same* sentence. Reusing `sample_sentence` (instead
# of repeating the literal) keeps the printed pieces and the id tensor in
# sync when the sentence is edited.
# dtype is pinned to int64 because torch.tensor() would infer float32 for an
# empty id list, which is not a valid index dtype for an embedding lookup.
token_seq = torch.tensor(
    bpemb_en.encode_ids(sample_sentence),
    dtype=torch.long,
    device=device,
)
print(token_seq)

['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?']
tensor([ 571,  280,  386, 1934,    4,   24,  248, 4339,  177, 9967])


In [6]:
# Rows / columns of the BPEmb vector table. Only the row count (vocabulary
# size) is used below; the embedding layer gets its own, smaller width.
bpemb_vocab_size, bpemb_embed_size = bpemb_en.vectors.shape
embed_dim = 12

# nn.Embedding draws its initial weights from the global torch RNG. Seeding
# it first makes the values printed below identical on every
# "Restart Kernel -> Run All" instead of changing from run to run.
torch.manual_seed(0)

# A freshly initialised lookup table: one `embed_dim`-wide row per
# vocabulary id. No BPEmb vectors are copied into it.
token_embedding = torch.nn.Embedding(bpemb_vocab_size, embed_dim).to(device)
token_embeddings = token_embedding(token_seq)

# The untrained embeddings for our sample sentence.
print("Embeddings for: ", sample_sentence)
print(token_embeddings)

Embeddings for:  Where can I find a pizzeria?
tensor([[ 0.0487, -0.0395, -0.0061, -1.3340, -1.6241,  0.9888, -0.3754, -1.3920,
         -0.4376, -1.2552, -0.1572,  0.2396],
        [ 0.4480,  0.2035,  0.3438, -0.2572,  0.5632, -0.0920, -0.5023, -1.6056,
          0.9267, -0.0363, -1.2019,  0.9280],
        [ 2.5911, -1.1953, -0.0458,  0.3786, -1.7280,  0.5371, -0.7493, -1.0576,
          0.9983,  0.5340, -1.2885, -0.6720],
        [-0.6151, -1.3542, -0.0313, -0.0998, -0.6654,  0.7459,  0.1146, -1.3325,
         -0.2532, -1.1519,  1.3841,  0.6176],
        [ 0.6978,  1.4558,  0.1363, -0.2832, -1.4323, -0.3438, -0.2230, -1.4938,
         -0.5392,  1.8805,  0.6944, -1.6687],
        [-1.0916, -0.2567, -0.6753, -0.7824, -1.8704, -0.2698,  0.5987,  0.5320,
         -0.7272,  0.2474,  0.6123, -0.5436],
        [-2.5794,  0.3079, -0.5116,  0.4259, -1.6618,  0.1653, -1.5554, -1.1099,
         -0.2248, -0.1615,  0.5103,  1.2684],
        [-0.6550, -0.1928, -0.9535, -1.9223,  1.0531, -0.5353, -0

In [7]:
# Positional information: a second lookup table indexed by the position of
# each token in the sequence (0, 1, 2, ...) rather than by its vocabulary id.
max_seq_len = 256
position_embedding = torch.nn.Embedding(max_seq_len, embed_dim).to(device)

# The table has only `max_seq_len` rows, so fail with a clear message here
# rather than with an index error inside the embedding lookup.
assert len(token_seq) <= max_seq_len, (
    f"sequence has {len(token_seq)} tokens but max_seq_len is {max_seq_len}"
)

# torch.arange builds the int64 index tensor directly on `device`; unlike
# torch.tensor(list(range(n))) it stays int64 even when n == 0.
pos_idx = torch.arange(len(token_seq), device=device)
print(pos_idx)

position_embeddings = position_embedding(pos_idx)
print("Embeddings for: ", sample_sentence)
print(position_embeddings)

# Element-wise sum of the position and token embeddings for the sentence.
# NOTE(review): the name `input` shadows the Python builtin for the rest of
# the kernel session. It is kept because later cells may reference it;
# consider renaming (e.g. `input_embeddings`) once all uses are known.
input = position_embeddings + token_embeddings

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
Embeddings for:  Where can I find a pizzeria?
tensor([[-4.6548e-01, -1.7531e+00,  1.1636e+00,  1.3516e+00,  3.2057e-01,
         -1.2374e+00,  1.6545e-01,  6.0048e-01, -4.8345e-01, -7.2730e-01,
          1.3202e+00, -1.1517e+00],
        [ 8.9537e-01,  9.5846e-01, -2.2149e-01, -1.5378e+00,  4.1604e-02,
         -1.5268e+00,  1.9332e+00, -1.0371e-01, -8.3778e-02,  6.0431e-01,
          5.0961e-02,  3.5628e-01],
        [-1.6337e+00,  6.5760e-01,  5.6695e-01, -9.3366e-01, -1.2522e+00,
          2.9134e-02,  9.2721e-01, -1.8085e+00, -7.6743e-02, -9.5599e-01,
          1.4144e+00, -3.9051e-01],
        [ 4.8702e-01,  2.9975e-01, -1.3919e+00,  2.6704e-01, -5.6298e-01,
          2.0309e-01, -2.7600e+00,  1.1450e-01, -2.3124e-01, -6.8432e-01,
         -3.2948e-01,  2.1939e+00],
        [-2.3476e-01,  7.2402e-01,  4.2270e-01, -8.9768e-02, -1.0313e-01,
         -4.7172e-01,  1.4633e+00,  8.3125e-01, -3.9950e-01,  7.6724e-01,
          5.2153e-01, -7.1896e-