In [1]:
from transformers import BertTokenizerFast  # !pip install transformers

# load bert tokenizer from huggingface
tokenizer = BertTokenizerFast.from_pretrained(
   'bert-base-german-cased'
)




In [7]:
# Print the special tokens
print("Special tokens:", tokenizer.special_tokens_map)

# Optionally, you can also print the IDs for these tokens
print("Special token IDs:", tokenizer.convert_tokens_to_ids(list(tokenizer.special_tokens_map.values())))

Special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
Special token IDs: [2, 4, 0, 3, 5]


In [2]:
contexts = ['Hallo, ich fahre nach Berlin',
            'Ich war gestern auf der Wiesn',
            'Ich bin gerade in einem ICE und es ist ungeheuer nervig wegen ein paar Typen']

# tokenize the context passage
inputs = tokenizer(
   contexts[0], padding=True, truncation=True,
   max_length=512
)
inputs.keys()


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [3]:
input_ids = inputs['input_ids']
input_ids


[3, 5850, 26910, 26918, 1169, 5137, 26897, 188, 715, 4]

In [4]:
from collections import Counter

# convert the input_ids list to a dictionary of key to frequency values
sparse_vec = dict(Counter(input_ids))
sparse_vec


{3: 1,
 5850: 1,
 26910: 1,
 26918: 1,
 1169: 1,
 5137: 1,
 26897: 1,
 188: 1,
 715: 1,
 4: 1}

In [8]:
def build_dict(input_batch):
  # store a batch of sparse embeddings
    sparse_emb = []
    # iterate through input batch
    for token_ids in input_batch:
        # convert the input_ids list to a dictionary of key to frequency values
        d = dict(Counter(token_ids))
        # remove special tokens and append sparse vectors to sparse_emb list
        sparse_emb.append({key: d[key] for key in d if key not in [2,4,0,3,5]})
    # return sparse_emb list
    return sparse_emb

def generate_sparse_vectors(context_batch):
    # create batch of input_ids
    inputs = tokenizer(
            context_batch, padding=True,
            truncation=True,
            max_length=512
    )['input_ids']
    # create sparse dictionaries
    sparse_embeds = build_dict(inputs)
    return sparse_embeds

In [9]:
generate_sparse_vectors(contexts)

[{5850: 1, 26910: 1, 26918: 1, 1169: 1, 5137: 1, 26897: 1, 188: 1, 715: 1},
 {1671: 1, 185: 1, 12656: 1, 115: 1, 21: 1, 26638: 1},
 {1671: 1,
  4058: 1,
  2023: 1,
  50: 1,
  297: 1,
  25675: 1,
  42: 1,
  229: 1,
  127: 1,
  4926: 1,
  208: 1,
  667: 1,
  20790: 1,
  80: 1,
  1026: 1,
  39: 1,
  4895: 1,
  15949: 1}]

In [25]:
embeds_list = []
for i in contexts:
    print(i)
    embeds_list.append(generate_sparse_vectors([i]))

for j in embeds_list:
    print(list(j[0].keys()))
    print(list(j[0].values()))

Hallo, ich fahre nach Berlin
Ich war gestern auf der Wiesn
Ich bin gerade in einem ICE und es ist ungeheuer nervig wegen ein paar Typen
[5850, 26910, 26918, 1169, 5137, 26897, 188, 715]
[1, 1, 1, 1, 1, 1, 1, 1]
[1671, 185, 12656, 115, 21, 26638]
[1, 1, 1, 1, 1, 1]
[1671, 4058, 2023, 50, 297, 25675, 42, 229, 127, 4926, 208, 667, 20790, 80, 1026, 39, 4895, 15949]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [10]:
[generate_sparse_vectors(i) for i in contexts]

TypeError: 'int' object is not iterable

In [None]:
def hybrid_scale(dense, sparse, alpha: float = 0.5):
    # check alpha value is in range
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    # scale sparse and dense vectors to create hybrid search vecs
    hsparse = {
        'indices': sparse['indices'],
        'values':  [v * (1 - alpha) for v in sparse['values']]
    }
    hdense = [v * alpha for v in dense]
    return hdense, hsparse


def hybrid_query(question, top_k, alpha):
   # convert the question into a sparse vector
   sparse_vec = generate_sparse_vectors([question])[0]
   # convert the question into a dense vector
   dense_vec = model.encode([question]).tolist()
   # scale alpha with hybrid_scale
   dense_vec, sparse_vec = hybrid_scale(
      dense_vec, sparse_vec, alpha
   )
   # query pinecone with the query parameters
   result = pinecone.query(
      vector=dense_vec,
      sparse_vector=sparse_vec[0],
      top_k=top_k,
      include_metadata=True
   )
   # return search results as json
   return result
