In [2]:
# !pip install transformers
from transformers import AutoModel
from transformers import AutoTokenizer

# # # load model and tokenizer
tokenizer2 = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
model2 = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)

# Load tokenizer and model 
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v3' , trust_remote_code=True)
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v3' , trust_remote_code=True)


In [4]:
some_input_text = """
Abstract
Neural information retrieval (IR) has greatly advanced search and other knowledge- intensive language tasks. While many neural IR methods encode queries and documents into single-vector representations, late interaction models produce multi-vector repre- sentations at the granularity of each token and decompose relevance modeling into scalable token-level computations. This decomposition has been shown to make late interaction more effective, but it inflates the space footprint of these models by an order of magnitude. In this work, we introduce ColBERTv2, a retriever that couples an aggressive residual compres- sion mechanism with a denoised supervision strategy to simultaneously improve the quality and space footprint of late interaction. We evaluate ColBERTv2 across a wide range of benchmarks, establishing state-of-the-art quality within and outside the training domain while reducing the space footprint of late interaction models by 6–10×.
When trained on MS MARCO Passage Rank- ing, ColBERTv2 achieves the highest MRR@10 of any standalone retriever. In addition to in-domain quality, we seek a retriever that generalizes “zero- shot” to domain-specific corpora and long-tail top- ics, ones that are often under-represented in large public training sets. To this end, we evaluate Col- BERTv2 on a wide array of out-of-domain bench- marks. These include three Wikipedia Open-QA retrieval tests and 13 diverse retrieval and semantic- similarity tasks from BEIR (Thakur et al., 2021). In addition, we introduce a new benchmark, dubbed LoTTE, for Long-Tail Topic-stratified Evaluation for IR that features 12 domain-specific search tests, spanning StackExchange communities and using queries from GooAQ (Khashabi et al., 2021). LoTTE focuses on relatively long-tail topics in its passages, unlike the Open-QA tests and many of the BEIR tasks, and evaluates models on their capacity to answer natural search queries with a practical intent, unlike many of BEIR’s semantic- similarity tasks. On 22 of 28 out-of-domain tests, ColBERTv2 achieves the highest quality, outper- forming the next best retriever by up to 8% relative gain, while using its compressed representations.
"""
tokens = tokenizer.tokenize(some_input_text)
print(len(tokens) , "tokens" , tokens)
tokens2 = tokenizer2.tokenize(some_input_text)
print(len(tokens2) , "tokens2" , tokens2)

553 tokens ['▁Abstract', '▁Ne', 'ural', '▁information', '▁re', 'trie', 'val', '▁(', 'IR', ')', '▁has', '▁great', 'ly', '▁advanced', '▁search', '▁and', '▁other', '▁knowledge', '-', '▁intensiv', 'e', '▁language', '▁task', 's', '.', '▁While', '▁many', '▁ne', 'ural', '▁IR', '▁methods', '▁en', 'code', '▁que', 'ries', '▁and', '▁documents', '▁into', '▁single', '-', 've', 'ctor', '▁representa', 'tions', ',', '▁late', '▁interaction', '▁models', '▁produce', '▁multi', '-', 've', 'ctor', '▁rep', 're', '-', '▁sent', 'ations', '▁at', '▁the', '▁granul', 'ar', 'ity', '▁of', '▁each', '▁to', 'ken', '▁and', '▁de', 'com', 'pose', '▁relevan', 'ce', '▁model', 'ing', '▁into', '▁scal', 'able', '▁to', 'ken', '-', 'level', '▁computa', 'tions', '.', '▁This', '▁de', 'com', 'position', '▁has', '▁been', '▁shown', '▁to', '▁make', '▁late', '▁interaction', '▁more', '▁effective', ',', '▁but', '▁it', '▁infla', 'tes', '▁the', '▁space', '▁foot', 'print', '▁of', '▁these', '▁models', '▁by', '▁an', '▁order', '▁of', '▁magnitu

In [5]:
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer
    :param input_text: The text snippet to split into sentences
    :param tokenizer: The tokenizer to use
    :return: A tuple containing the list of text chunks and their corresponding token spans
    """
    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    token_offsets = inputs['offset_mapping'][0]
    token_ids = inputs['input_ids'][0]
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
        and (
            token_offsets[i + 1][0] - token_offsets[i][1] > 0
            or token_ids[i + 1] == sep_id
        )
    ]
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations

def chunk_by_sentences_v3(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer, compatible with Jina v3's tokenization
    """
    inputs = tokenizer(input_text, return_tensors="pt", return_offsets_mapping=True)
    token_offsets = inputs["offset_mapping"][0]
    token_ids = inputs["input_ids"][0]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    
    # Find all potential sentence endings (periods)
    chunk_positions = []
    
    for i, (token_id, token) in enumerate(zip(token_ids, tokens)):
        # Skip if this isn't the last token
        if i >= len(token_ids) - 1:
            continue
            
        # Check if this token ends with a period
        if token.endswith('.'):
            next_token = tokens[i + 1]
            # Check if next token starts with ▁ (indicating new word)
            if next_token.startswith('▁'):
                position = (i, int(token_offsets[i][1]))
                chunk_positions.append(position)
    
    # Create chunks and span annotations
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    
    return chunks, span_annotations

def late_chunking(
    model_output: 'BatchEncoding', span_annotation: list, max_length=None
):
    token_embeddings = model_output[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
        if (
            max_length is not None
        ):  # remove annotations which go bejond the max-length of the model
            annotations = [
                (start, min(end, max_length - 1))
                for (start, end) in annotations
                if start < (max_length - 1)
            ]
        pooled_embeddings = [
            embeddings[start:end].sum(dim=0) / (end - start)
            for start, end in annotations
            if (end - start) >= 1
        ]
        pooled_embeddings = [
            embedding.detach().cpu().numpy() for embedding in pooled_embeddings
        ]
        outputs.append(pooled_embeddings)

    return outputs

input_text = """Berlin is the capital and largest city of Germany, both by area and by population. 
Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. 
The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."""

# determine chunks
chunks, span_annotations = chunk_by_sentences_v3(input_text, tokenizer)
print("chunks" , len(chunks) , chunks)
# print('Chunks:\n- "' + '"\n- "'.join(chunks) + '"')

# chunk before
embeddings_traditional_chunking = model.encode(chunks)

# chunk afterwards (context-sensitive chunked pooling)
inputs = tokenizer(input_text, return_tensors='pt')
model_output = model(**inputs)
embeddings = late_chunking(model_output, [span_annotations])[0]

import numpy as np

cos_sim = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

berlin_embedding = model.encode('Berlin')

for chunk, new_embedding, trad_embeddings in zip(chunks, embeddings, embeddings_traditional_chunking):
    print(f'similarity_new("Berlin", "{chunk}"):', cos_sim(berlin_embedding, new_embedding))
    print(f'similarity_trad("Berlin", "{chunk}"):', cos_sim(berlin_embedding, trad_embeddings))

chunks 2 ['Berlin is the capital and largest city of Germany, both by area and by population.', " \nIts more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits."]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


similarity_new("Berlin", "Berlin is the capital and largest city of Germany, both by area and by population."): 0.7929044
similarity_trad("Berlin", "Berlin is the capital and largest city of Germany, both by area and by population."): 0.8531231
similarity_new("Berlin", " 
Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits."): 0.7890661
similarity_trad("Berlin", " 
Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits."): 0.56780535


In [41]:
# Example 1: Simple text
text = "Hello world! juila"
encoded = tokenizer(text, return_offsets_mapping=True)
print(encoded)
encoded2 = tokenizer(text, return_offsets_mapping=False)
print(encoded2 )
encoded2 = tokenizer2(text, return_offsets_mapping=True)
print(encoded2 )
punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
token_offsets = encoded['offset_mapping'][0]
token_ids = encoded['input_ids'][0]
print(token_offsets)
print(token_ids)
token_offsets2 = encoded2['offset_mapping'][0]
token_ids2 = encoded2['input_ids'][0]
print(token_offsets2)
print(token_ids2)

{'input_ids': [0, 35378, 8999, 38, 1129, 2439, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 5), (6, 11), (11, 12), (13, 15), (15, 18), (0, 0)]}
{'input_ids': [0, 35378, 8999, 38, 1129, 2439, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 7592, 2088, 999, 18414, 11733, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 5), (6, 11), (11, 12), (13, 15), (15, 18), (0, 0)]}
(0, 0)
0
(0, 0)
101


In [None]:
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer
    args:
      input_text: The text snippet to split into sentences
      tokenizer: The tokenizer to use
    return: A tuple containing the list of text chunks and their corresponding token spans
    """
    inputs = tokenizer(input_text, return_tensors="pt", return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids(".")
    sep_id = tokenizer.convert_tokens_to_ids("[SEP]")
    token_offsets = inputs["offset_mapping"][0]
    token_ids = inputs["input_ids"][0]
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
        and (
            token_offsets[i + 1][0] - token_offsets[i][1] > 0
            or token_ids[i + 1] == sep_id
        )
    ]
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations


def late_chunking(model_output, span_annotation: list, max_length=None):
    token_embeddings = model_output[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
        if (
            max_length is not None
        ):  # remove annotations which go bejond the max-length of the model
            annotations = [
                (start, min(end, max_length - 1))
                for (start, end) in annotations
                if start < (max_length - 1)
            ]
        pooled_embeddings = [
            embeddings[start:end].sum(dim=0) / (end - start)
            for start, end in annotations
            if (end - start) >= 1
        ]
        pooled_embeddings = [
            embedding.detach().cpu().numpy() for embedding in pooled_embeddings
        ]
        outputs.append(pooled_embeddings)

    return outputs