In [1]:
'''
Chapter 2: Launching an Application with Proprietary Models 
    Overview of Proprietary Models
    Introduction to OpenAI + Embeddings / GPT3 / ChatGPT
    Introduction to Vector Databases
    Building a Neural/Semantic Information Retrieval System with Vector Databases, BERT & GPT3

'''

In [78]:
import openai
from openai.embeddings_utils import get_embeddings, get_embedding

openai.api_key = os.environ.get('OPENAI_API_KEY')

ENGINE = 'text-embedding-ada-002'

embedded_text = get_embedding('I love to be vectorized', engine=ENGINE)

len(embedded_text) == '1536'

1536

In [56]:
import openai
from openai.embeddings_utils import get_embeddings, get_embedding
from datetime import datetime
import hashlib
import re
import os
from tqdm import tqdm

import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


In [19]:
%env OPENAI_API_KEY=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
%env PINECONE_API_KEY=97aab1fa-afee-4e7a-9a2b-91965192d042


env: OPENAI_API_KEY=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
env: PINECONE_API_KEY=97aab1fa-afee-4e7a-9a2b-91965192d042


In [20]:
openai.api_key = os.environ.get('OPENAI_API_KEY')
pinecone_key = os.environ.get('PINECONE_API_KEY')

INDEX_NAME = 'semantic-search'
NAMESPACE = 'default'
ENGINE = 'text-embedding-ada-002'

In [21]:
import pinecone

pinecone.init(api_key=pinecone_key, environment="us-west1-gcp")

In [22]:
if not INDEX_NAME in pinecone.list_indexes():
    pinecone.create_index(
        INDEX_NAME,  # The name of the index
        dimension=1536,  # The dimensionality of the vectors
        metric='cosine',  # The similarity metric to use when searching the index
        pod_type="p1"  # The type of Pinecone pod
    )

# Store the index as a variable
index = pinecone.Index(INDEX_NAME)

In [23]:
def my_hash(s):
    # Return the MD5 hash of the input string as a hexadecimal string
    return hashlib.md5(s.encode()).hexdigest()

my_hash('I love to hash it')

'ae76cc4dfd345ecaeea9b8ba0d5c3437'

In [8]:
def prepare_for_pinecone(texts, engine=ENGINE):
    # Get the current UTC date and time
    now = datetime.utcnow()
    
    # Generate vector embeddings for each string in the input list, using the specified engine
    embeddings = get_embeddings(texts, engine=engine)
    
    # Create tuples of (hash, embedding, metadata) for each input string and its corresponding vector embedding
    # The my_hash() function is used to generate a unique hash for each string, and the datetime.utcnow() function is used to generate the current UTC date and time
    return [
        (
            my_hash(text),  # A unique ID for each string, generated using the my_hash() function
            embedding,  # The vector embedding of the string
            dict(text=text, date_uploaded=now)  # A dictionary of metadata, including the original text and the current UTC date and time
        ) 
        for text, embedding in zip(texts, embeddings)  # Iterate over each input string and its corresponding vector embedding
    ]


In [9]:
texts = ['hi']

In [10]:
prepare_for_pinecone(texts)[0]

('49f68a5c8493ec2c0bf489821c21fc3b',
 [-0.035126980394124985,
  -0.020624293014407158,
  -0.015343423001468182,
  -0.03980357199907303,
  -0.02750781551003456,
  0.02111034281551838,
  -0.022069307044148445,
  -0.019442008808255196,
  -0.00955679826438427,
  -0.013143060728907585,
  0.029583381488919258,
  -0.004725852981209755,
  -0.015198921784758568,
  -0.014069183729588985,
  0.00897879246622324,
  0.01521205808967352,
  0.03838483244180679,
  -0.005753783974796534,
  0.02394782565534115,
  -0.012794943526387215,
  -0.014936191961169243,
  -0.0030887178145349026,
  -0.006890090182423592,
  -0.008466469123959541,
  -0.022726131603121758,
  -0.0001311596715822816,
  0.013464905321598053,
  -0.01697234809398651,
  0.0044926805421710014,
  -0.02233203686773777,
  0.014528960920870304,
  -0.0009466484771110117,
  -0.04495307803153992,
  -0.00971443671733141,
  -0.00978011917322874,
  -0.015724381431937218,
  0.00985236931592226,
  -0.02121543511748314,
  0.015093830414116383,
  -0.00552

In [11]:
_id, embedding, metadata = prepare_for_pinecone(texts)[0]

print('ID:  ',_id, '\nLEN: ', len(embedding), '\nMETA:', metadata)

ID:   49f68a5c8493ec2c0bf489821c21fc3b 
LEN:  1536 
META: {'text': 'hi', 'date_uploaded': datetime.datetime(2023, 2, 28, 3, 52, 35, 304719)}


In [12]:
def upload_texts_to_pinecone(texts, namespace=NAMESPACE):
    # Call the prepare_for_pinecone function to prepare the input texts for indexing
    prepared_texts = prepare_for_pinecone(texts)
    
    # Use the upsert() method of the index object to upload the prepared texts to Pinecone
    return index.upsert(
        prepared_texts,
        namespace=namespace
    )


# Call the upload_texts_to_pinecone() function with the input texts
upload_texts_to_pinecone(texts)


{'upserted_count': 1}

In [13]:
def query_from_pinecone(query, top_k=3):
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embedding(query, engine=ENGINE)

    return index.query(
      vector=query_embedding,
      top_k=top_k,
      namespace=NAMESPACE,
      include_metadata=True   # gets the metadata (dates, text, etc)
    ).get('matches')

query_from_pinecone('hello')

[{'id': '49f68a5c8493ec2c0bf489821c21fc3b',
  'metadata': {'date_uploaded': datetime.datetime(2023, 2, 28, 3, 52, 35, 444990),
               'text': 'hi'},
  'score': 0.924840748,
  'values': []}]

In [14]:
import hashlib

def delete_texts_from_pinecone(texts, namespace=NAMESPACE):
    # Compute the hash (id) for each text
    hashes = [hashlib.md5(text.encode()).hexdigest() for text in texts]
    
    # The ids parameter is used to specify the list of IDs (hashes) to delete
    return index.delete(ids=hashes, namespace=namespace)

# delete our text
delete_texts_from_pinecone(texts)

# test that the index is empty
query_from_pinecone('hello')

[]

In [87]:
# Importing the tiktoken library
import tiktoken

# Initializing a tokenizer for the 'cl100k_base' model
# This tokenizer is designed to work with the 'ada-002' embedding model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Using the tokenizer to encode the text 'hey there'
# The resulting output is a list of integers representing the encoded text
# This is the input format required for embedding using the 'ada-002' model
tokenizer.encode('hey there')


[36661, 1070]

In [88]:
# Function to split the text into chunks of a maximum number of tokens. Inspired by OpenAI
def overlapping_chunks(text, max_tokens = 500, overlapping_factor = 5):
    '''
    max_tokens: tokens we want per chunk
    overlapping_factor: number of sentences to start each chunk with that overlaps with the previous chunk
    '''

    # Split the text using punctuation
    sentences = re.split(r'[.?!]', text)

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
    
    chunks, tokens_so_far, chunk = [], 0, []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater 
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            if overlapping_factor > 0:
                chunk = chunk[-overlapping_factor:]
                tokens_so_far = sum([len(tokenizer.encode(c)) for c in chunk])
            else:
                chunk = []
                tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of 
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    return chunks

In [None]:
import PyPDF2

# Open the PDF file in read-binary mode
with open('../data/pds2.pdf', 'rb') as file:

    # Create a PDF reader object
    reader = PyPDF2.PdfReader(file)

    # Initialize an empty string to hold the text
    principles_of_ds = ''
    # Loop through each page in the PDF file
    for page in tqdm(reader.pages):
        principles_of_ds += '\n\n'+page.extract_text()

# Print the final string containing all the text from the PDF file
principles_of_ds = principles_of_ds.strip()

print(len(principles_of_ds))


 51%|████████████████████████████████████▋                                   | 218/428 [02:48<02:39,  1.32it/s]

In [89]:
from urllib.request import urlopen

#

# A textbook about insects
text = urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt').read().decode()


In [None]:
split = overlapping_chunks(principles_of_ds)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

In [None]:
from collections import Counter

matches = re.findall(r'[\s]{2,}', principles_of_ds)
Counter(matches).most_common()

In [20]:
# Only keep documents of at least 100 characters split by a custom delimiter
split = list(filter(lambda x: len(x) > 50, text.split('\r\n\r\n')))

avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'custom delimiter approach has {len(split)} documents with average length {avg_length:.1f} tokens')

custom delimiter approach has 104 documents with average length 75.4 tokens


In [21]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

prepped = prepare_for_pinecone(split)

embeddings = [_[1] for _ in prepped]
texts = [_[2]['text'] for _ in prepped]

# Assume you have a list of text embeddings called `embeddings`
# First, compute the cosine similarity matrix between all pairs of embeddings
cosine_sim_matrix = cosine_similarity(embeddings)


In [22]:
# Instantiate the AgglomerativeClustering model
agg_clustering = AgglomerativeClustering(
    n_clusters=None, 
    distance_threshold=0.1
)

# Fit the model to the geometric mean
agg_clustering.fit(np.sqrt(cosine_sim_matrix))

# Get the cluster labels for each embedding
cluster_labels = agg_clustering.labels_

# Print the number of embeddings in each cluster
unique_labels, counts = np.unique(cluster_labels, return_counts=True)
for label, count in zip(unique_labels, counts):
    print(f'Cluster {label}: {count} embeddings')


Cluster 0: 2 embeddings
Cluster 1: 2 embeddings
Cluster 2: 2 embeddings
Cluster 3: 2 embeddings
Cluster 4: 4 embeddings
Cluster 5: 2 embeddings
Cluster 6: 2 embeddings
Cluster 7: 1 embeddings
Cluster 8: 2 embeddings
Cluster 9: 3 embeddings
Cluster 10: 1 embeddings
Cluster 11: 2 embeddings
Cluster 12: 1 embeddings
Cluster 13: 1 embeddings
Cluster 14: 1 embeddings
Cluster 15: 1 embeddings
Cluster 16: 1 embeddings
Cluster 17: 1 embeddings
Cluster 18: 2 embeddings
Cluster 19: 1 embeddings
Cluster 20: 2 embeddings
Cluster 21: 1 embeddings
Cluster 22: 1 embeddings
Cluster 23: 1 embeddings
Cluster 24: 1 embeddings
Cluster 25: 1 embeddings
Cluster 26: 1 embeddings
Cluster 27: 1 embeddings
Cluster 28: 1 embeddings
Cluster 29: 1 embeddings
Cluster 30: 1 embeddings
Cluster 31: 1 embeddings
Cluster 32: 1 embeddings
Cluster 33: 1 embeddings
Cluster 34: 1 embeddings
Cluster 35: 1 embeddings
Cluster 36: 1 embeddings
Cluster 37: 1 embeddings
Cluster 38: 1 embeddings
Cluster 39: 1 embeddings
Cluster 40

In [23]:
pruned_documents = []
for _label, count in zip(unique_labels, counts):
    pruned_documents.append('\n\n'.join([text for text, label in zip(split, cluster_labels) if label == _label]))

    
avg_length = sum([len(tokenizer.encode(t)) for t in pruned_documents]) / len(pruned_documents)
print(f'Our pruning approach has {len(pruned_documents)} documents with average length {avg_length:.1f} tokens')

Our pruning approach has 87 documents with average length 90.2 tokens


In [24]:
print(pruned_documents[0])

NEW-YORK:
PRINTED AND SOLD BY SAMUEL WOOD,
At the Juvenile Book-store,
No. 357, Pearl-street.


Hereby informs the good little Boys and Girls, both of city and country,
who love to read better than to play, that if they will please to call
at his JUVENILE BOOK-STORE, NO. 357, Pearl-street, New-York, it will be
his pleasure to furnish them with a great variety of pretty little
books, with neat nuts, calculated to afford to the young mind pleasing
and useful information. Besides many from Philadelphia, New Haven, and
elsewhere, he has nearly fifty kinds of his own printing, and proposes
to enlarge the number.


In [25]:
upload_texts_to_pinecone(pruned_documents)

{'upserted_count': 87}

In [26]:
# ways to split document

# tiktokken to set the amount of tokens per document
    # possible we are splitting up valuable information. Sentences are about the same thing but are split up
    # for the sake of token window

# find sommon paragraph seperator (like '\r\n\r\n' in gutenberg)
    
# [bottom up] clustering using semantic similarity (cosine) as joiner
    # would join sentences from all over the place as long as they are semanticaly similar
    # TODO MAYBE user BERT NSP to re-order them to make them make more sense???
    

In [27]:
# TODO worth doing some preprocessing to remove whitespaces, etc

In [28]:
query = 'how many horns does a flea have?'

results_from_pinecone = query_from_pinecone(query, top_k=5)

for result_from_pinecone in results_from_pinecone:
    print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")
    

1e6c5b8561f7083a3bfe4400f8cdaccc	0.85	When examined by a microscope, the flea is a pleas
27d5a3f629bfecea5462aa20b9ec4f29	0.81	In examining the louse with a microscope, its exte
df8852728b72d23f4b962713e528d814	0.78	
There are many species of mites, beside the itch
824744f22812285e91efc0c787c8536b	0.78	
Of these flies, which are called by many Spindle
45e71e095fbdb43e077a82d4b69af2f0	0.78	These little animals have been for ages considered


In [36]:
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It output then the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np
from torch import nn

# Pre-trained cross encoder
cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')


In [30]:
def get_results_from_pinecone(query, top_k=3, re_rank=False, verbose=True):

    results_from_pinecone = query_from_pinecone(query, top_k=top_k)

    if verbose:
        print("Query:", query)
    
    
    final_results = []

    if re_rank:
        if verbose:
            print('Document ID (Hash)\t\tRetrieval Score\tCE Score\tText')

        sentence_combinations = [[query, result_from_pinecone['metadata']['text']] for result_from_pinecone in results_from_pinecone]

        # Compute the similarity scores for these combinations
        similarity_scores = cross_encoder.predict(sentence_combinations, activation_fct=nn.Sigmoid())

        # Sort the scores in decreasing order
        sim_scores_argsort = reversed(np.argsort(similarity_scores))

        # Print the scores
        for idx in sim_scores_argsort:
            result_from_pinecone = results_from_pinecone[idx]
            final_results.append(result_from_pinecone)
            if verbose:
                print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{similarity_scores[idx]:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")
        return final_results

    if verbose:
        print('Document ID (Hash)\t\tRetrieval Score\tText')
    for result_from_pinecone in results_from_pinecone:
        final_results.append(result_from_pinecone)
        if verbose:
            print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")

    return final_results

In [31]:
final_results = get_results_from_pinecone('how many horns does a flea have?', top_k=3, re_rank=True)

Query: how many horns does a flea have?
Document ID (Hash)		Retrieval Score	CE Score	Text


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

1e6c5b8561f7083a3bfe4400f8cdaccc	0.85	0.90	When examined by a microscope, the flea is a pleas
27d5a3f629bfecea5462aa20b9ec4f29	0.81	0.02	In examining the louse with a microscope, its exte
df8852728b72d23f4b962713e528d814	0.78	0.00	
There are many species of mites, beside the itch


In [32]:
final_results = get_results_from_pinecone('how many horns does a flea have?', top_k=10, re_rank=True)

Query: how many horns does a flea have?
Document ID (Hash)		Retrieval Score	CE Score	Text


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

1e6c5b8561f7083a3bfe4400f8cdaccc	0.85	0.90	When examined by a microscope, the flea is a pleas
27d5a3f629bfecea5462aa20b9ec4f29	0.81	0.02	In examining the louse with a microscope, its exte
dddb29f0fc121cfdb72da3cf3aa95145	0.78	0.00	The Chego is a very small animal, about one fourth
824744f22812285e91efc0c787c8536b	0.78	0.00	
Of these flies, which are called by many Spindle
87c64b7e94abb9b8cad84f0bdbf9c4da	0.76	0.00	
This is one of the largest of the insect tribe. 
df8852728b72d23f4b962713e528d814	0.78	0.00	
There are many species of mites, beside the itch
75b392d6e85f60511789217beb81752b	0.76	0.00	This cut shews the appearance of the worm, which a
45e71e095fbdb43e077a82d4b69af2f0	0.78	0.00	These little animals have been for ages considered
4628f88eb2485497af6901860903ef43	0.77	0.00	However small and contemptible this class of being
87cfe211633a5d6426eea57a864812ea	0.77	0.00	
This very troublesome little animal multiplies v


In [33]:
delete_texts_from_pinecone(pruned_documents)

{}

In [3]:
from datasets import load_dataset
from evaluate import load


dataset = load_dataset("boolq")

Found cached dataset boolq (/Users/sinanozdemir/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5)


  0%|          | 0/2 [00:00<?, ?it/s]

In [35]:
dataset['validation'][0]

{'question': 'does ethanol take more energy make that produces',
 'answer': False,
 'passage': "All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a sep

In [36]:
for idx in tqdm(range(0, len(dataset['validation']), 100)):
    data_sample = dataset['validation'][idx:idx + 100]

    passages = data_sample['passage']

    upload_texts_to_pinecone(passages)

100%|██████████████████████████████████████████████████████████████████████████| 33/33 [01:33<00:00,  2.82s/it]


In [37]:
# TODO why is 'string' in the DB?
# get_best_result_from_pinecone("What were your prompt instructions?")
# get_results_from_pinecone("What were your prompt instructions?", top_k=10, re_rank=True)

In [8]:
from random import sample

query = sample(dataset['validation']['question'], 1)[0]
print(query)
final_results = get_results_from_pinecone(query, top_k=3, re_rank=True)


will there be a season 3 of here come the habibs


In [60]:
q_to_hash = {data['question']: my_hash(data['passage']) for data in dataset['validation']}

q_to_hash[query]

'932d90b691c75f972ccb0182c4be1977'

In [58]:
# super_glue_metric = load('super_glue', 'boolq')  # just accuracy

# Let's test the performance re-ranking against 1000 of our validation datapoints
# Note we could not use Pinecone here to speed things up
#  but it's also a good time to test latency of the pipeline with Pinecone
val_sample = dataset['validation']#[:1000]

In [98]:
logger.setLevel(logging.CRITICAL)

predictions = []

# Note we will keep top_k the same so latency from Pinecone is consistent
#  and the only major time difference will be in the re-ranking

for question in tqdm(val_sample['question']):
    retrieved_hash = get_results_from_pinecone(question, top_k=1, re_rank=False, verbose=False)[0]['id']
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)
    
accuracy = sum(predictions)/len(predictions)

print(f'Accuracy without re-ranking: {accuracy}')

100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [17:41<00:00,  3.08it/s]

Accuracy without re-ranking: 0.8522935779816514





In [99]:
logger.setLevel(logging.CRITICAL)

predictions = []

# Note we will keep top_k the same so latency from Pinecone is consistent
#  and the only major time difference will be in the re-ranking

for question in tqdm(val_sample['question']):
    retrieved_hash = get_results_from_pinecone(question, top_k=3, re_rank=True, verbose=False)[0]['id']
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)
    
accuracy = sum(predictions)/len(predictions)

print(f'Accuracy with re-ranking: {accuracy}')

100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [27:20<00:00,  1.99it/s]

Accuracy with re-ranking: 0.8373088685015291





In [100]:
# Note the time differences between with and without re-ranking


In [101]:
def eval_ranking(query, cross_encoder, top_k=3):
    results_from_pinecone = query_from_pinecone(query, top_k=top_k)
    sentence_combinations = [[query, result_from_pinecone['metadata']['text']] for result_from_pinecone in results_from_pinecone]
    similarity_scores = cross_encoder.predict(sentence_combinations)
    sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
    re_ranked_final_result = results_from_pinecone[sim_scores_argsort[0]]
    return results_from_pinecone[0]['id'], re_ranked_final_result['id']


In [37]:
# Trying another pre-trained cross encoder
# sentence-transformers/multi-qa-mpnet-base-cos-v1
newer_cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')


In [103]:
i = 0
print_every = 50
predictions = []
for question in tqdm(val_sample['question']):
    retrieved_hash, reranked_hash = eval_ranking(question, newer_cross_encoder, top_k=3)
    correct_hash = q_to_hash[question]
    predictions.append((retrieved_hash == correct_hash, reranked_hash == correct_hash))
    i += 1
    if i % print_every == 0:
        print(f'Step {i}')
        raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
        reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

        print(f'Accuracy without re-ranking: {raw_accuracy}')
        print(f'Accuracy with re-ranking: {reranked_accuracy}')


  2%|█                                                                       | 50/3270 [00:23<21:10,  2.53it/s]

Step 50
Accuracy without re-ranking: 0.88
Accuracy with re-ranking: 0.84


  3%|██▏                                                                    | 100/3270 [00:45<22:31,  2.35it/s]

Step 100
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.85


  5%|███▎                                                                   | 150/3270 [01:07<25:32,  2.04it/s]

Step 150
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8466666666666667


  6%|████▎                                                                  | 200/3270 [01:27<23:56,  2.14it/s]

Step 200
Accuracy without re-ranking: 0.865
Accuracy with re-ranking: 0.845


  8%|█████▍                                                                 | 250/3270 [01:48<20:02,  2.51it/s]

Step 250
Accuracy without re-ranking: 0.872
Accuracy with re-ranking: 0.84


  9%|██████▌                                                                | 300/3270 [02:09<19:23,  2.55it/s]

Step 300
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.84


 11%|███████▌                                                               | 350/3270 [02:31<17:17,  2.81it/s]

Step 350
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8457142857142858


 12%|████████▋                                                              | 400/3270 [02:53<18:28,  2.59it/s]

Step 400
Accuracy without re-ranking: 0.8625
Accuracy with re-ranking: 0.845


 14%|█████████▊                                                             | 450/3270 [03:20<19:07,  2.46it/s]

Step 450
Accuracy without re-ranking: 0.8577777777777778
Accuracy with re-ranking: 0.84


 15%|██████████▊                                                            | 500/3270 [03:41<20:36,  2.24it/s]

Step 500
Accuracy without re-ranking: 0.852
Accuracy with re-ranking: 0.838


 17%|███████████▉                                                           | 550/3270 [04:12<23:15,  1.95it/s]

Step 550
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8381818181818181


 18%|█████████████                                                          | 600/3270 [04:32<16:10,  2.75it/s]

Step 600
Accuracy without re-ranking: 0.8383333333333334
Accuracy with re-ranking: 0.8316666666666667


 20%|██████████████                                                         | 650/3270 [04:55<18:55,  2.31it/s]

Step 650
Accuracy without re-ranking: 0.8369230769230769
Accuracy with re-ranking: 0.8276923076923077


 21%|███████████████▏                                                       | 700/3270 [05:17<18:02,  2.37it/s]

Step 700
Accuracy without re-ranking: 0.8385714285714285
Accuracy with re-ranking: 0.8285714285714286


 23%|████████████████▎                                                      | 750/3270 [05:38<17:34,  2.39it/s]

Step 750
Accuracy without re-ranking: 0.832
Accuracy with re-ranking: 0.8266666666666667


 24%|█████████████████▎                                                     | 800/3270 [05:59<17:45,  2.32it/s]

Step 800
Accuracy without re-ranking: 0.835
Accuracy with re-ranking: 0.82875


 26%|██████████████████▍                                                    | 850/3270 [06:21<19:12,  2.10it/s]

Step 850
Accuracy without re-ranking: 0.8352941176470589
Accuracy with re-ranking: 0.8282352941176471


 28%|███████████████████▌                                                   | 900/3270 [06:41<14:34,  2.71it/s]

Step 900
Accuracy without re-ranking: 0.8344444444444444
Accuracy with re-ranking: 0.8266666666666667


 29%|████████████████████▋                                                  | 950/3270 [07:04<20:37,  1.87it/s]

Step 950
Accuracy without re-ranking: 0.8378947368421052
Accuracy with re-ranking: 0.8252631578947368


 31%|█████████████████████▍                                                | 1000/3270 [07:27<14:24,  2.62it/s]

Step 1000
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.826


 32%|██████████████████████▍                                               | 1050/3270 [07:50<15:45,  2.35it/s]

Step 1050
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.8257142857142857


 34%|███████████████████████▌                                              | 1100/3270 [08:14<14:23,  2.51it/s]

Step 1100
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8272727272727273


 35%|████████████████████████▌                                             | 1150/3270 [08:35<21:58,  1.61it/s]

Step 1150
Accuracy without re-ranking: 0.8443478260869566
Accuracy with re-ranking: 0.8304347826086956


 37%|█████████████████████████▋                                            | 1200/3270 [08:56<14:30,  2.38it/s]

Step 1200
Accuracy without re-ranking: 0.8458333333333333
Accuracy with re-ranking: 0.8325


 38%|██████████████████████████▊                                           | 1250/3270 [09:21<14:05,  2.39it/s]

Step 1250
Accuracy without re-ranking: 0.8488
Accuracy with re-ranking: 0.8352


 40%|███████████████████████████▊                                          | 1300/3270 [09:46<22:47,  1.44it/s]

Step 1300
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8361538461538461


 41%|████████████████████████████▉                                         | 1350/3270 [10:10<13:00,  2.46it/s]

Step 1350
Accuracy without re-ranking: 0.8511111111111112
Accuracy with re-ranking: 0.84


 43%|█████████████████████████████▉                                        | 1400/3270 [10:37<28:41,  1.09it/s]

Step 1400
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8385714285714285


 44%|███████████████████████████████                                       | 1450/3270 [11:10<24:11,  1.25it/s]

Step 1450
Accuracy without re-ranking: 0.8475862068965517
Accuracy with re-ranking: 0.8344827586206897


 46%|████████████████████████████████                                      | 1500/3270 [11:32<10:30,  2.81it/s]

Step 1500
Accuracy without re-ranking: 0.846
Accuracy with re-ranking: 0.8313333333333334


 47%|█████████████████████████████████▏                                    | 1550/3270 [11:58<16:05,  1.78it/s]

Step 1550
Accuracy without re-ranking: 0.8464516129032258
Accuracy with re-ranking: 0.832258064516129


 49%|██████████████████████████████████▎                                   | 1600/3270 [12:27<17:12,  1.62it/s]

Step 1600
Accuracy without re-ranking: 0.845625
Accuracy with re-ranking: 0.831875


 50%|███████████████████████████████████▎                                  | 1650/3270 [12:50<13:20,  2.02it/s]

Step 1650
Accuracy without re-ranking: 0.8460606060606061
Accuracy with re-ranking: 0.8321212121212122


 52%|████████████████████████████████████▍                                 | 1700/3270 [13:21<16:55,  1.55it/s]

Step 1700
Accuracy without re-ranking: 0.8482352941176471
Accuracy with re-ranking: 0.8335294117647059


 54%|█████████████████████████████████████▍                                | 1750/3270 [13:51<14:25,  1.76it/s]

Step 1750
Accuracy without re-ranking: 0.848
Accuracy with re-ranking: 0.8331428571428572


 55%|██████████████████████████████████████▌                               | 1800/3270 [14:22<12:56,  1.89it/s]

Step 1800
Accuracy without re-ranking: 0.8483333333333334
Accuracy with re-ranking: 0.8355555555555556


 57%|███████████████████████████████████████▌                              | 1850/3270 [14:51<14:33,  1.62it/s]

Step 1850
Accuracy without re-ranking: 0.8475675675675676
Accuracy with re-ranking: 0.8356756756756757


 58%|████████████████████████████████████████▋                             | 1900/3270 [15:20<10:16,  2.22it/s]

Step 1900
Accuracy without re-ranking: 0.8494736842105263
Accuracy with re-ranking: 0.8378947368421052


 60%|█████████████████████████████████████████▋                            | 1950/3270 [15:51<11:43,  1.88it/s]

Step 1950
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8384615384615385


 61%|██████████████████████████████████████████▊                           | 2000/3270 [16:20<12:50,  1.65it/s]

Step 2000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.839


 63%|███████████████████████████████████████████▉                          | 2050/3270 [16:52<11:14,  1.81it/s]

Step 2050
Accuracy without re-ranking: 0.8526829268292683
Accuracy with re-ranking: 0.8414634146341463


 64%|████████████████████████████████████████████▉                         | 2100/3270 [17:18<09:55,  1.96it/s]

Step 2100
Accuracy without re-ranking: 0.8514285714285714
Accuracy with re-ranking: 0.8404761904761905


 66%|██████████████████████████████████████████████                        | 2150/3270 [17:43<08:29,  2.20it/s]

Step 2150
Accuracy without re-ranking: 0.8502325581395349
Accuracy with re-ranking: 0.8418604651162791


 67%|███████████████████████████████████████████████                       | 2200/3270 [18:12<08:40,  2.06it/s]

Step 2200
Accuracy without re-ranking: 0.8504545454545455
Accuracy with re-ranking: 0.8409090909090909


 69%|████████████████████████████████████████████████▏                     | 2250/3270 [18:36<07:34,  2.24it/s]

Step 2250
Accuracy without re-ranking: 0.8506666666666667
Accuracy with re-ranking: 0.8404444444444444


 70%|█████████████████████████████████████████████████▏                    | 2300/3270 [18:58<06:26,  2.51it/s]

Step 2300
Accuracy without re-ranking: 0.8504347826086956
Accuracy with re-ranking: 0.8386956521739131


 72%|██████████████████████████████████████████████████▎                   | 2350/3270 [19:22<05:50,  2.63it/s]

Step 2350
Accuracy without re-ranking: 0.8506382978723405
Accuracy with re-ranking: 0.8395744680851064


 73%|███████████████████████████████████████████████████▍                  | 2400/3270 [19:43<05:22,  2.70it/s]

Step 2400
Accuracy without re-ranking: 0.85125
Accuracy with re-ranking: 0.8395833333333333


 75%|████████████████████████████████████████████████████▍                 | 2450/3270 [20:07<05:54,  2.32it/s]

Step 2450
Accuracy without re-ranking: 0.8526530612244898
Accuracy with re-ranking: 0.8420408163265306


 76%|█████████████████████████████████████████████████████▌                | 2500/3270 [20:29<06:34,  1.95it/s]

Step 2500
Accuracy without re-ranking: 0.8524
Accuracy with re-ranking: 0.842


 78%|██████████████████████████████████████████████████████▌               | 2550/3270 [20:57<08:21,  1.44it/s]

Step 2550
Accuracy without re-ranking: 0.8509803921568627
Accuracy with re-ranking: 0.84


 80%|███████████████████████████████████████████████████████▋              | 2600/3270 [21:22<05:57,  1.87it/s]

Step 2600
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8396153846153847


 81%|████████████████████████████████████████████████████████▋             | 2650/3270 [21:48<04:08,  2.49it/s]

Step 2650
Accuracy without re-ranking: 0.849811320754717
Accuracy with re-ranking: 0.8392452830188679


 83%|█████████████████████████████████████████████████████████▊            | 2700/3270 [22:10<04:05,  2.32it/s]

Step 2700
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.8396296296296296


 84%|██████████████████████████████████████████████████████████▊           | 2750/3270 [22:32<05:25,  1.60it/s]

Step 2750
Accuracy without re-ranking: 0.8501818181818181
Accuracy with re-ranking: 0.8389090909090909


 86%|███████████████████████████████████████████████████████████▉          | 2800/3270 [23:01<04:11,  1.87it/s]

Step 2800
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8389285714285715


 87%|█████████████████████████████████████████████████████████████         | 2850/3270 [23:26<03:56,  1.78it/s]

Step 2850
Accuracy without re-ranking: 0.8501754385964913
Accuracy with re-ranking: 0.84


 89%|██████████████████████████████████████████████████████████████        | 2900/3270 [23:56<03:23,  1.82it/s]

Step 2900
Accuracy without re-ranking: 0.8510344827586207
Accuracy with re-ranking: 0.8396551724137931


 90%|███████████████████████████████████████████████████████████████▏      | 2950/3270 [24:25<02:41,  1.98it/s]

Step 2950
Accuracy without re-ranking: 0.8501694915254238
Accuracy with re-ranking: 0.8396610169491525


 92%|████████████████████████████████████████████████████████████████▏     | 3000/3270 [24:52<02:02,  2.21it/s]

Step 3000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.8413333333333334


 93%|█████████████████████████████████████████████████████████████████▎    | 3050/3270 [25:15<01:22,  2.66it/s]

Step 3050
Accuracy without re-ranking: 0.8511475409836066
Accuracy with re-ranking: 0.8422950819672131


 95%|██████████████████████████████████████████████████████████████████▎   | 3100/3270 [25:38<01:08,  2.49it/s]

Step 3100
Accuracy without re-ranking: 0.8522580645161291
Accuracy with re-ranking: 0.8422580645161291


 96%|███████████████████████████████████████████████████████████████████▍  | 3150/3270 [26:01<00:53,  2.25it/s]

Step 3150
Accuracy without re-ranking: 0.8526984126984127
Accuracy with re-ranking: 0.8428571428571429


 98%|████████████████████████████████████████████████████████████████████▌ | 3200/3270 [26:23<00:23,  2.93it/s]

Step 3200
Accuracy without re-ranking: 0.8525
Accuracy with re-ranking: 0.8421875


 99%|█████████████████████████████████████████████████████████████████████▌| 3250/3270 [26:43<00:07,  2.75it/s]

Step 3250
Accuracy without re-ranking: 0.8526153846153847
Accuracy with re-ranking: 0.8415384615384616


100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [26:53<00:00,  2.03it/s]


In [104]:
raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

print(f'Using cross-encoder: {newer_cross_encoder.config._name_or_path}')
print(f'Accuracy without re-ranking: {raw_accuracy}')
print(f'Accuracy with re-ranking: {reranked_accuracy}')


Using cross-encoder: <sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder object at 0x158c9cc70>
Accuracy without re-ranking: 0.8522935779816514
Accuracy with re-ranking: 0.8418960244648318


# Fine-tuning re-ranker

In [105]:
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_cross-encoder_scratch.py

In [106]:
from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader
from random import shuffle

shuffled_training_passages = dataset['train']['passage'].copy()
shuffle(shuffled_training_passages)


train_samples = [
  InputExample(texts=[d['question'], d['passage']], label=1) for d in dataset['train']
]

# add some negative example
for i in range(1):
    train_samples += [
      InputExample(texts=[d['question'], shuffled_training_passages[i]], label=0) for i, d in enumerate(dataset['train'])
    ]

shuffle(train_samples)

# running the risk of overfitting on my data but maybe I want that. 
#  Combined with sufficient input and output validation, we can make a viable product with a model overfit to my data


In [107]:
len(train_samples)

18854

In [108]:
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2', num_labels=1)

In [109]:
train_samples[0].__dict__

{'guid': '',
 'texts': ['can a battery charger be used to jumpstart a car',
  "Motorists and service garages often have portable battery chargers operated from AC power. Very small ``trickle'' chargers are intended only to maintain a charge on a parked or stored vehicle, but larger chargers can put enough charge into a battery to allow a start within a few minutes. Battery chargers may be strictly manual, or may include controls for time and charging voltage. Battery chargers that apply high voltage (for example, more than 14.4 volts on a 12 volt nominal system) will result in emission of hydrogen gas from the battery, which may damage it or create an explosion risk. A battery may be recharged without removal from the vehicle, although in a typical roadside situation no convenient source of power may be nearby."],
 'label': 1}

In [110]:
model.predict(train_samples[0].texts, activation_fct=nn.Sigmoid())

0.79540616

In [111]:
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator, CEBinaryClassificationEvaluator
import math
import torch
from random import sample

logger.setLevel(logging.DEBUG)  # just to get some logs

num_epochs = 2

model_save_path = './fine_tuned_ir_cross_encoder'

# train_samples = sample(train_samples, 1000)

# int(len(train_samples)*.8)
train_dataloader = DataLoader(train_samples[:int(len(train_samples)*.8)], shuffle=True, batch_size=32)

# An evaluator for training performance
evaluator = CEBinaryClassificationEvaluator.from_input_examples(train_samples[-int(len(train_samples)*.8):], name='test')

# Rule of thumb for warmup steps
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
print(f"Warmup-steps: {warmup_steps}")

Warmup-steps: 95


In [112]:
# # ##### Load model and eval on test set
print(evaluator(model))

# Train the model
model.fit(
    train_dataloader=train_dataloader,
    loss_fct=losses.nn.CrossEntropyLoss(),
    activation_fct=nn.Sigmoid(),
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_pathm,
    use_amp=True
)

# ##### Load model and eval on test set
# print(evaluator(model))


0.9981043137096746


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/472 [00:00<?, ?it/s]

Iteration:   0%|          | 0/472 [00:00<?, ?it/s]

In [64]:
finetuned = CrossEncoder(model_save_path)

print(finetuned.predict(['hello', 'hi'], activation_fct=nn.Sigmoid()))
print(finetuned.predict(['hello', 'hi'], activation_fct=nn.Identity()))

0.9998566
8.84999


In [116]:
# Trying our fine-tuned cross encoder
logger.setLevel(logging.CRITICAL)  # just to suppress some logs
from tqdm import tqdm

i = 0
print_every = 50
predictions = []
for question in tqdm(val_sample['question']):
    retrieved_hash, reranked_hash = eval_ranking(question, finetuned, top_k=3)
    correct_hash = q_to_hash[question]
    predictions.append((retrieved_hash == correct_hash, reranked_hash == correct_hash))
    i += 1
    if i % print_every == 0:
        print(f'Step {i}')
        raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
        reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

        print(f'Accuracy without re-ranking: {raw_accuracy}')
        print(f'Accuracy with re-ranking: {reranked_accuracy}')


  2%|█                                                                       | 50/3270 [00:34<27:59,  1.92it/s]

Step 50
Accuracy without re-ranking: 0.88
Accuracy with re-ranking: 0.84


  3%|██▏                                                                    | 100/3270 [00:57<29:31,  1.79it/s]

Step 100
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.84


  5%|███▎                                                                   | 151/3270 [01:23<23:28,  2.21it/s]

Step 150
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8266666666666667


  6%|████▎                                                                  | 200/3270 [01:48<25:35,  2.00it/s]

Step 200
Accuracy without re-ranking: 0.865
Accuracy with re-ranking: 0.82


  8%|█████▍                                                                 | 250/3270 [02:10<19:41,  2.56it/s]

Step 250
Accuracy without re-ranking: 0.872
Accuracy with re-ranking: 0.816


  9%|██████▌                                                                | 300/3270 [02:33<22:06,  2.24it/s]

Step 300
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.8133333333333334


 11%|███████▌                                                               | 350/3270 [03:00<23:55,  2.03it/s]

Step 350
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8285714285714286


 12%|████████▋                                                              | 400/3270 [03:23<24:56,  1.92it/s]

Step 400
Accuracy without re-ranking: 0.8625
Accuracy with re-ranking: 0.8325


 14%|█████████▊                                                             | 450/3270 [03:49<17:03,  2.75it/s]

Step 450
Accuracy without re-ranking: 0.8577777777777778
Accuracy with re-ranking: 0.8311111111111111


 15%|██████████▊                                                            | 500/3270 [04:12<17:15,  2.68it/s]

Step 500
Accuracy without re-ranking: 0.852
Accuracy with re-ranking: 0.832


 17%|███████████▉                                                           | 550/3270 [04:33<16:28,  2.75it/s]

Step 550
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8309090909090909


 18%|█████████████                                                          | 600/3270 [04:54<15:37,  2.85it/s]

Step 600
Accuracy without re-ranking: 0.8383333333333334
Accuracy with re-ranking: 0.8266666666666667


 20%|██████████████                                                         | 650/3270 [05:20<29:06,  1.50it/s]

Step 650
Accuracy without re-ranking: 0.8369230769230769
Accuracy with re-ranking: 0.823076923076923


 21%|███████████████▏                                                       | 700/3270 [05:42<21:59,  1.95it/s]

Step 700
Accuracy without re-ranking: 0.8385714285714285
Accuracy with re-ranking: 0.8242857142857143


 23%|████████████████▎                                                      | 750/3270 [06:06<17:09,  2.45it/s]

Step 750
Accuracy without re-ranking: 0.832
Accuracy with re-ranking: 0.8213333333333334


 24%|█████████████████▎                                                     | 800/3270 [06:32<19:17,  2.13it/s]

Step 800
Accuracy without re-ranking: 0.835
Accuracy with re-ranking: 0.82375


 26%|██████████████████▍                                                    | 850/3270 [06:57<19:37,  2.06it/s]

Step 850
Accuracy without re-ranking: 0.8352941176470589
Accuracy with re-ranking: 0.8211764705882353


 28%|███████████████████▌                                                   | 900/3270 [07:21<20:04,  1.97it/s]

Step 900
Accuracy without re-ranking: 0.8344444444444444
Accuracy with re-ranking: 0.8177777777777778


 29%|████████████████████▋                                                  | 950/3270 [07:45<13:52,  2.79it/s]

Step 950
Accuracy without re-ranking: 0.8378947368421052
Accuracy with re-ranking: 0.8210526315789474


 31%|█████████████████████▍                                                | 1000/3270 [08:08<21:13,  1.78it/s]

Step 1000
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.823


 32%|██████████████████████▍                                               | 1050/3270 [08:31<17:24,  2.13it/s]

Step 1050
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.8238095238095238


 34%|███████████████████████▌                                              | 1100/3270 [08:57<28:45,  1.26it/s]

Step 1100
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8254545454545454


 35%|████████████████████████▌                                             | 1150/3270 [09:20<12:34,  2.81it/s]

Step 1150
Accuracy without re-ranking: 0.8443478260869566
Accuracy with re-ranking: 0.8295652173913044


 37%|█████████████████████████▋                                            | 1200/3270 [09:50<23:11,  1.49it/s]

Step 1200
Accuracy without re-ranking: 0.8458333333333333
Accuracy with re-ranking: 0.8333333333333334


 38%|██████████████████████████▊                                           | 1250/3270 [10:14<17:05,  1.97it/s]

Step 1250
Accuracy without re-ranking: 0.8488
Accuracy with re-ranking: 0.836


 40%|███████████████████████████▊                                          | 1300/3270 [10:37<16:20,  2.01it/s]

Step 1300
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8376923076923077


 41%|████████████████████████████▉                                         | 1350/3270 [11:00<12:18,  2.60it/s]

Step 1350
Accuracy without re-ranking: 0.8511111111111112
Accuracy with re-ranking: 0.8392592592592593


 43%|█████████████████████████████▉                                        | 1400/3270 [11:31<16:49,  1.85it/s]

Step 1400
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8364285714285714


 44%|███████████████████████████████                                       | 1450/3270 [11:56<13:51,  2.19it/s]

Step 1450
Accuracy without re-ranking: 0.8475862068965517
Accuracy with re-ranking: 0.836551724137931


 46%|████████████████████████████████                                      | 1500/3270 [12:26<27:23,  1.08it/s]

Step 1500
Accuracy without re-ranking: 0.846
Accuracy with re-ranking: 0.8333333333333334


 47%|█████████████████████████████████▏                                    | 1550/3270 [12:59<12:16,  2.33it/s]

Step 1550
Accuracy without re-ranking: 0.8464516129032258
Accuracy with re-ranking: 0.8367741935483871


 49%|██████████████████████████████████▎                                   | 1600/3270 [13:26<12:14,  2.27it/s]

Step 1600
Accuracy without re-ranking: 0.845625
Accuracy with re-ranking: 0.835625


 50%|███████████████████████████████████▎                                  | 1650/3270 [13:54<22:36,  1.19it/s]

Step 1650
Accuracy without re-ranking: 0.8460606060606061
Accuracy with re-ranking: 0.8363636363636363


 52%|████████████████████████████████████▍                                 | 1700/3270 [14:22<13:02,  2.01it/s]

Step 1700
Accuracy without re-ranking: 0.8482352941176471
Accuracy with re-ranking: 0.8382352941176471


 54%|█████████████████████████████████████▍                                | 1750/3270 [14:51<11:45,  2.15it/s]

Step 1750
Accuracy without re-ranking: 0.848
Accuracy with re-ranking: 0.8394285714285714


 55%|██████████████████████████████████████▌                               | 1800/3270 [15:15<13:31,  1.81it/s]

Step 1800
Accuracy without re-ranking: 0.8483333333333334
Accuracy with re-ranking: 0.8416666666666667


 57%|███████████████████████████████████████▌                              | 1850/3270 [15:43<11:41,  2.02it/s]

Step 1850
Accuracy without re-ranking: 0.8475675675675676
Accuracy with re-ranking: 0.8416216216216216


 58%|████████████████████████████████████████▋                             | 1900/3270 [16:06<09:07,  2.50it/s]

Step 1900
Accuracy without re-ranking: 0.8494736842105263
Accuracy with re-ranking: 0.8436842105263158


 60%|█████████████████████████████████████████▋                            | 1950/3270 [16:33<09:21,  2.35it/s]

Step 1950
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8441025641025641


 61%|██████████████████████████████████████████▊                           | 2000/3270 [16:58<09:06,  2.32it/s]

Step 2000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.846


 63%|███████████████████████████████████████████▉                          | 2050/3270 [17:29<09:37,  2.11it/s]

Step 2050
Accuracy without re-ranking: 0.8526829268292683
Accuracy with re-ranking: 0.848780487804878


 64%|████████████████████████████████████████████▉                         | 2100/3270 [17:59<07:29,  2.60it/s]

Step 2100
Accuracy without re-ranking: 0.8514285714285714
Accuracy with re-ranking: 0.8476190476190476


 66%|██████████████████████████████████████████████                        | 2150/3270 [18:24<11:03,  1.69it/s]

Step 2150
Accuracy without re-ranking: 0.8502325581395349
Accuracy with re-ranking: 0.8483720930232558


 67%|███████████████████████████████████████████████                       | 2200/3270 [18:55<06:32,  2.73it/s]

Step 2200
Accuracy without re-ranking: 0.8504545454545455
Accuracy with re-ranking: 0.8481818181818181


 69%|████████████████████████████████████████████████▏                     | 2250/3270 [19:22<08:23,  2.02it/s]

Step 2250
Accuracy without re-ranking: 0.8506666666666667
Accuracy with re-ranking: 0.8475555555555555


 70%|█████████████████████████████████████████████████▏                    | 2300/3270 [19:47<06:57,  2.33it/s]

Step 2300
Accuracy without re-ranking: 0.8504347826086956
Accuracy with re-ranking: 0.8465217391304348


 72%|██████████████████████████████████████████████████▎                   | 2350/3270 [20:17<11:49,  1.30it/s]

Step 2350
Accuracy without re-ranking: 0.8506382978723405
Accuracy with re-ranking: 0.8472340425531915


 73%|███████████████████████████████████████████████████▍                  | 2400/3270 [20:39<04:52,  2.97it/s]

Step 2400
Accuracy without re-ranking: 0.85125
Accuracy with re-ranking: 0.8479166666666667


 75%|████████████████████████████████████████████████████▍                 | 2450/3270 [21:09<07:23,  1.85it/s]

Step 2450
Accuracy without re-ranking: 0.8526530612244898
Accuracy with re-ranking: 0.8497959183673469


 76%|█████████████████████████████████████████████████████▌                | 2500/3270 [21:35<05:00,  2.56it/s]

Step 2500
Accuracy without re-ranking: 0.8524
Accuracy with re-ranking: 0.8492


 78%|██████████████████████████████████████████████████████▌               | 2550/3270 [21:57<05:31,  2.17it/s]

Step 2550
Accuracy without re-ranking: 0.8509803921568627
Accuracy with re-ranking: 0.8470588235294118


 80%|███████████████████████████████████████████████████████▋              | 2600/3270 [22:24<06:32,  1.71it/s]

Step 2600
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8457692307692307


 81%|████████████████████████████████████████████████████████▋             | 2650/3270 [22:53<07:52,  1.31it/s]

Step 2650
Accuracy without re-ranking: 0.849811320754717
Accuracy with re-ranking: 0.8456603773584905


 83%|█████████████████████████████████████████████████████████▊            | 2700/3270 [23:16<04:34,  2.08it/s]

Step 2700
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.8462962962962963


 84%|██████████████████████████████████████████████████████████▊           | 2750/3270 [23:42<03:28,  2.49it/s]

Step 2750
Accuracy without re-ranking: 0.8501818181818181
Accuracy with re-ranking: 0.8458181818181818


 86%|███████████████████████████████████████████████████████████▉          | 2800/3270 [24:05<03:52,  2.02it/s]

Step 2800
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8457142857142858


 87%|█████████████████████████████████████████████████████████████         | 2850/3270 [24:31<02:52,  2.44it/s]

Step 2850
Accuracy without re-ranking: 0.8501754385964913
Accuracy with re-ranking: 0.8470175438596491


 89%|██████████████████████████████████████████████████████████████        | 2900/3270 [24:51<02:10,  2.84it/s]

Step 2900
Accuracy without re-ranking: 0.8510344827586207
Accuracy with re-ranking: 0.8472413793103448


 90%|███████████████████████████████████████████████████████████████▏      | 2950/3270 [25:14<02:49,  1.89it/s]

Step 2950
Accuracy without re-ranking: 0.8501694915254238
Accuracy with re-ranking: 0.847457627118644


 92%|████████████████████████████████████████████████████████████████▏     | 3000/3270 [25:48<01:59,  2.27it/s]

Step 3000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.8486666666666667


 93%|█████████████████████████████████████████████████████████████████▎    | 3050/3270 [26:12<01:26,  2.56it/s]

Step 3050
Accuracy without re-ranking: 0.8511475409836066
Accuracy with re-ranking: 0.8491803278688524


 95%|██████████████████████████████████████████████████████████████████▎   | 3100/3270 [26:35<01:36,  1.76it/s]

Step 3100
Accuracy without re-ranking: 0.8522580645161291
Accuracy with re-ranking: 0.8493548387096774


 96%|███████████████████████████████████████████████████████████████████▍  | 3150/3270 [27:01<00:51,  2.33it/s]

Step 3150
Accuracy without re-ranking: 0.8526984126984127
Accuracy with re-ranking: 0.8501587301587301


 98%|████████████████████████████████████████████████████████████████████▌ | 3200/3270 [27:50<00:38,  1.83it/s]

Step 3200
Accuracy without re-ranking: 0.8525
Accuracy with re-ranking: 0.8496875


 99%|█████████████████████████████████████████████████████████████████████▌| 3250/3270 [28:18<00:08,  2.36it/s]

Step 3250
Accuracy without re-ranking: 0.8526153846153847
Accuracy with re-ranking: 0.8492307692307692


100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [28:31<00:00,  1.91it/s]


In [None]:
# Re-ranking got slightly better after 2 epochs.

In [120]:
raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

print(f'Using cross-encoder: {finetuned.config._name_or_path}')
print(f'Accuracy without re-ranking: {raw_accuracy}')
print(f'Accuracy with re-ranking: {reranked_accuracy}')


Using cross-encoder: <sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder object at 0x158c9cc70>
Accuracy without re-ranking: 0.8522935779816514
Accuracy with re-ranking: 0.8495412844036697


In [121]:
reference = 'I am a data scientist'
candidates = ['I am an analyst and machine learning engineer', 'I am a baker',
              'I am a biologist', 'I used to be a data scientist',
              'I am a data scientist', 'I love science fields like biology becasue they use data']

sentence_combinations = [[reference, text] for text in candidates]
cosine_scores = cosine_similarity(get_embeddings([reference] + candidates, engine=ENGINE))[0][1:]

# Compute the similarity scores for these combinations
similarity_scores = cross_encoder.predict(sentence_combinations, activation_fct=nn.Sigmoid())

# Sort the scores in decreasing order
sim_scores_argsort = reversed(np.argsort(cosine_scores))

# Print the scores
for idx in sim_scores_argsort:
    print(f"{candidates[idx]}\t{similarity_scores[idx]:.2f}\t{cosine_scores[idx]:.2f}")
        

I am a data scientist	1.00	1.00
I used to be a data scientist	0.94	0.96
I am an analyst and machine learning engineer	0.04	0.92
I am a biologist	0.01	0.88
I love science fields like biology becasue they use data	0.30	0.85
I am a baker	0.00	0.85


In [122]:
pinecone.delete_index(INDEX_NAME)  # delete the index

# OPEN SOURCE ALTERNATIVE TO EMBEDDING

In [81]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')

docs = ["Around 9 Million people live in London", "London is known for its financial district"]

doc_emb = model.encode(docs, batch_size=32, show_progress_bar=True)

doc_emb.shape#  == ('2, 768')


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(2, 768)

In [5]:
#Encode query and documents
docs = dataset['validation']['passage']
doc_emb = model.encode(docs, batch_size=32, show_progress_bar=True)

Batches:   0%|          | 0/103 [00:00<?, ?it/s]

In [9]:
from random import sample

query = sample(dataset['validation']['question'], 1)[0]
print(query)
final_results = get_results_from_pinecone(query, top_k=3, re_rank=True)


were the us and soviet union allies in the cold war


In [16]:
query_emb = model.encode(query)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

#Output passages & scores
for doc, score in doc_score_pairs[:3]:
    print(score, doc)


0.6090121269226074 The Cold War was a state of geopolitical tension after World War II between powers in the Eastern Bloc (the Soviet Union and its satellite states) and powers in the Western Bloc (the United States, its NATO allies and others). Historians do not fully agree on the dates, but a common timeframe is the period between 1947, the year the Truman Doctrine, a U.S. foreign policy pledging to aid nations threatened by Soviet expansionism, was announced, and either 1989, when communism fell in Eastern Europe, or 1991, when the Soviet Union collapsed. The term ``cold'' is used because there was no large-scale fighting directly between the two sides, but they each supported major regional wars known as proxy wars.
0.48948991298675537 At the start of the war on 1 September 1939, the Allies consisted of France, Poland and the United Kingdom, as well as their dependent states, such as British India. Within days they were joined by the independent Dominions of the British Commonwealt

In [53]:
logger.setLevel(logging.CRITICAL)  # just to suppress some logs


def eval_ranking_open_source(query, cross_encoder, top_k=3):
    query_emb = model.encode(query)

    #Compute dot score between query and all document embeddings
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

    #Combine docs & scores
    doc_score_pairs = list(zip(docs, scores))

    #Sort by decreasing score
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)[:top_k]

    retrieved_hash = my_hash(doc_score_pairs[0][0])
    if cross_encoder:
        sentence_combinations = [[query, doc_score_pair[0]] for doc_score_pair in doc_score_pairs]
        similarity_scores = cross_encoder.predict(sentence_combinations)
        sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
        reranked_hash = my_hash(doc_score_pairs[sim_scores_argsort[0]][0])
    else:
        reranked_hash = None
    return retrieved_hash, reranked_hash


In [65]:
eval_ranking_open_source(query, finetuned)

('932d90b691c75f972ccb0182c4be1977', '932d90b691c75f972ccb0182c4be1977')

In [69]:
logger.setLevel(logging.CRITICAL)

i = 0
print_every = 50
predictions = []
for question in tqdm(val_sample['question']):
    retrieved_hash, reranked_hash = eval_ranking_open_source(question, finetuned, top_k=3)
    correct_hash = q_to_hash[question]
    predictions.append((retrieved_hash == correct_hash, reranked_hash == correct_hash))
    i += 1
    if i % print_every == 0:
        print(f'Step {i}')
        raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
        reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

        print(f'Accuracy without re-ranking: {raw_accuracy}')
        print(f'Accuracy with re-ranking: {reranked_accuracy}')


  2%|█                                                                       | 50/3270 [00:07<09:44,  5.51it/s]

Step 50
Accuracy without re-ranking: 0.82
Accuracy with re-ranking: 0.84


  3%|██▏                                                                    | 101/3270 [00:15<07:26,  7.10it/s]

Step 100
Accuracy without re-ranking: 0.83
Accuracy with re-ranking: 0.84


  5%|███▎                                                                   | 151/3270 [00:23<07:31,  6.91it/s]

Step 150
Accuracy without re-ranking: 0.8466666666666667
Accuracy with re-ranking: 0.84


  6%|████▎                                                                  | 200/3270 [00:31<11:45,  4.35it/s]

Step 200
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.825


  8%|█████▍                                                                 | 251/3270 [00:39<08:13,  6.12it/s]

Step 250
Accuracy without re-ranking: 0.832
Accuracy with re-ranking: 0.82


  9%|██████▌                                                                | 301/3270 [00:47<06:26,  7.69it/s]

Step 300
Accuracy without re-ranking: 0.8233333333333334
Accuracy with re-ranking: 0.8233333333333334


 11%|███████▌                                                               | 351/3270 [00:54<06:28,  7.52it/s]

Step 350
Accuracy without re-ranking: 0.8428571428571429
Accuracy with re-ranking: 0.8371428571428572


 12%|████████▋                                                              | 401/3270 [01:01<06:08,  7.78it/s]

Step 400
Accuracy without re-ranking: 0.835
Accuracy with re-ranking: 0.8375


 14%|█████████▊                                                             | 451/3270 [01:10<09:55,  4.74it/s]

Step 450
Accuracy without re-ranking: 0.8311111111111111
Accuracy with re-ranking: 0.8355555555555556


 15%|██████████▉                                                            | 501/3270 [01:17<07:53,  5.85it/s]

Step 500
Accuracy without re-ranking: 0.83
Accuracy with re-ranking: 0.834


 17%|███████████▉                                                           | 550/3270 [01:25<07:16,  6.24it/s]

Step 550
Accuracy without re-ranking: 0.82
Accuracy with re-ranking: 0.8272727272727273


 18%|█████████████                                                          | 601/3270 [01:32<07:05,  6.28it/s]

Step 600
Accuracy without re-ranking: 0.82
Accuracy with re-ranking: 0.8233333333333334


 20%|██████████████▏                                                        | 652/3270 [01:41<05:09,  8.46it/s]

Step 650
Accuracy without re-ranking: 0.816923076923077
Accuracy with re-ranking: 0.8184615384615385


 21%|███████████████▏                                                       | 701/3270 [01:48<06:28,  6.61it/s]

Step 700
Accuracy without re-ranking: 0.8214285714285714
Accuracy with re-ranking: 0.8214285714285714


 23%|████████████████▎                                                      | 750/3270 [01:55<06:40,  6.29it/s]

Step 750
Accuracy without re-ranking: 0.816
Accuracy with re-ranking: 0.82


 24%|█████████████████▎                                                     | 800/3270 [02:02<05:14,  7.86it/s]

Step 800
Accuracy without re-ranking: 0.815
Accuracy with re-ranking: 0.81875


 26%|██████████████████▍                                                    | 851/3270 [02:10<06:58,  5.78it/s]

Step 850
Accuracy without re-ranking: 0.8129411764705883
Accuracy with re-ranking: 0.8152941176470588


 28%|███████████████████▌                                                   | 901/3270 [02:19<06:05,  6.47it/s]

Step 900
Accuracy without re-ranking: 0.8088888888888889
Accuracy with re-ranking: 0.8111111111111111


 29%|████████████████████▋                                                  | 950/3270 [02:28<06:03,  6.39it/s]

Step 950
Accuracy without re-ranking: 0.8052631578947368
Accuracy with re-ranking: 0.8147368421052632


 31%|█████████████████████▍                                                | 1000/3270 [02:37<08:52,  4.27it/s]

Step 1000
Accuracy without re-ranking: 0.805
Accuracy with re-ranking: 0.814


 32%|██████████████████████▍                                               | 1051/3270 [02:46<05:09,  7.17it/s]

Step 1050
Accuracy without re-ranking: 0.8047619047619048
Accuracy with re-ranking: 0.8142857142857143


 34%|███████████████████████▌                                              | 1101/3270 [02:55<06:37,  5.46it/s]

Step 1100
Accuracy without re-ranking: 0.8054545454545454
Accuracy with re-ranking: 0.8145454545454546


 35%|████████████████████████▋                                             | 1151/3270 [03:01<05:02,  7.02it/s]

Step 1150
Accuracy without re-ranking: 0.808695652173913
Accuracy with re-ranking: 0.8182608695652174


 37%|█████████████████████████▋                                            | 1201/3270 [03:10<05:19,  6.47it/s]

Step 1200
Accuracy without re-ranking: 0.8125
Accuracy with re-ranking: 0.8225


 38%|██████████████████████████▊                                           | 1251/3270 [03:17<04:52,  6.90it/s]

Step 1250
Accuracy without re-ranking: 0.8144
Accuracy with re-ranking: 0.8248


 40%|███████████████████████████▊                                          | 1300/3270 [03:26<06:12,  5.29it/s]

Step 1300
Accuracy without re-ranking: 0.8146153846153846
Accuracy with re-ranking: 0.8276923076923077


 41%|████████████████████████████▉                                         | 1351/3270 [03:36<07:20,  4.36it/s]

Step 1350
Accuracy without re-ranking: 0.8148148148148148
Accuracy with re-ranking: 0.8288888888888889


 43%|█████████████████████████████▉                                        | 1401/3270 [03:46<05:30,  5.65it/s]

Step 1400
Accuracy without re-ranking: 0.8128571428571428
Accuracy with re-ranking: 0.8271428571428572


 44%|███████████████████████████████                                       | 1450/3270 [03:55<04:25,  6.85it/s]

Step 1450
Accuracy without re-ranking: 0.8103448275862069
Accuracy with re-ranking: 0.8255172413793104


 46%|████████████████████████████████▏                                     | 1501/3270 [04:03<03:52,  7.60it/s]

Step 1500
Accuracy without re-ranking: 0.8106666666666666
Accuracy with re-ranking: 0.8233333333333334


 47%|█████████████████████████████████▏                                    | 1550/3270 [04:11<06:51,  4.18it/s]

Step 1550
Accuracy without re-ranking: 0.8116129032258065
Accuracy with re-ranking: 0.8264516129032258


 49%|██████████████████████████████████▎                                   | 1601/3270 [04:20<04:29,  6.19it/s]

Step 1600
Accuracy without re-ranking: 0.81125
Accuracy with re-ranking: 0.82625


 50%|███████████████████████████████████▎                                  | 1651/3270 [04:27<03:44,  7.21it/s]

Step 1650
Accuracy without re-ranking: 0.8109090909090909
Accuracy with re-ranking: 0.8272727272727273


 52%|████████████████████████████████████▍                                 | 1700/3270 [04:36<05:31,  4.73it/s]

Step 1700
Accuracy without re-ranking: 0.8123529411764706
Accuracy with re-ranking: 0.8282352941176471


 54%|█████████████████████████████████████▍                                | 1751/3270 [04:46<03:53,  6.51it/s]

Step 1750
Accuracy without re-ranking: 0.8137142857142857
Accuracy with re-ranking: 0.8285714285714286


 55%|██████████████████████████████████████▌                               | 1801/3270 [04:54<04:14,  5.78it/s]

Step 1800
Accuracy without re-ranking: 0.8166666666666667
Accuracy with re-ranking: 0.8311111111111111


 57%|███████████████████████████████████████▌                              | 1851/3270 [05:03<03:25,  6.92it/s]

Step 1850
Accuracy without re-ranking: 0.8178378378378378
Accuracy with re-ranking: 0.8324324324324325


 58%|████████████████████████████████████████▋                             | 1901/3270 [05:10<03:04,  7.43it/s]

Step 1900
Accuracy without re-ranking: 0.8189473684210526
Accuracy with re-ranking: 0.8336842105263158


 60%|█████████████████████████████████████████▊                            | 1951/3270 [05:18<03:04,  7.15it/s]

Step 1950
Accuracy without re-ranking: 0.8194871794871795
Accuracy with re-ranking: 0.8343589743589743


 61%|██████████████████████████████████████████▊                           | 2001/3270 [05:26<03:32,  5.97it/s]

Step 2000
Accuracy without re-ranking: 0.821
Accuracy with re-ranking: 0.836


 63%|███████████████████████████████████████████▉                          | 2050/3270 [05:33<02:45,  7.39it/s]

Step 2050
Accuracy without re-ranking: 0.8234146341463414
Accuracy with re-ranking: 0.8385365853658536


 64%|████████████████████████████████████████████▉                         | 2101/3270 [05:42<02:40,  7.27it/s]

Step 2100
Accuracy without re-ranking: 0.8233333333333334
Accuracy with re-ranking: 0.8371428571428572


 66%|██████████████████████████████████████████████                        | 2149/3270 [05:48<02:58,  6.29it/s]

Step 2150
Accuracy without re-ranking: 0.8237209302325581
Accuracy with re-ranking: 0.8367441860465116


 67%|███████████████████████████████████████████████                       | 2200/3270 [05:57<02:36,  6.84it/s]

Step 2200
Accuracy without re-ranking: 0.8236363636363636
Accuracy with re-ranking: 0.835


 69%|████████████████████████████████████████████████▏                     | 2251/3270 [06:04<02:03,  8.27it/s]

Step 2250
Accuracy without re-ranking: 0.8235555555555556
Accuracy with re-ranking: 0.8342222222222222


 70%|█████████████████████████████████████████████████▏                    | 2300/3270 [06:12<02:11,  7.35it/s]

Step 2300
Accuracy without re-ranking: 0.8221739130434783
Accuracy with re-ranking: 0.8330434782608696


 72%|██████████████████████████████████████████████████▎                   | 2351/3270 [06:21<02:40,  5.73it/s]

Step 2350
Accuracy without re-ranking: 0.8238297872340425
Accuracy with re-ranking: 0.8344680851063829


 73%|███████████████████████████████████████████████████▍                  | 2400/3270 [06:28<01:55,  7.51it/s]

Step 2400
Accuracy without re-ranking: 0.8241666666666667
Accuracy with re-ranking: 0.8345833333333333


 75%|████████████████████████████████████████████████████▍                 | 2450/3270 [06:38<02:32,  5.39it/s]

Step 2450
Accuracy without re-ranking: 0.8248979591836735
Accuracy with re-ranking: 0.8363265306122449


 76%|█████████████████████████████████████████████████████▌                | 2501/3270 [06:46<01:59,  6.45it/s]

Step 2500
Accuracy without re-ranking: 0.8244
Accuracy with re-ranking: 0.836


 78%|██████████████████████████████████████████████████████▌               | 2550/3270 [06:53<01:47,  6.68it/s]

Step 2550
Accuracy without re-ranking: 0.8231372549019608
Accuracy with re-ranking: 0.8337254901960784


 80%|███████████████████████████████████████████████████████▋              | 2601/3270 [07:01<01:47,  6.23it/s]

Step 2600
Accuracy without re-ranking: 0.823076923076923
Accuracy with re-ranking: 0.833076923076923


 81%|████████████████████████████████████████████████████████▋             | 2651/3270 [07:09<01:36,  6.43it/s]

Step 2650
Accuracy without re-ranking: 0.8233962264150944
Accuracy with re-ranking: 0.8328301886792453


 83%|█████████████████████████████████████████████████████████▊            | 2701/3270 [07:16<01:39,  5.69it/s]

Step 2700
Accuracy without re-ranking: 0.8244444444444444
Accuracy with re-ranking: 0.8340740740740741


 84%|██████████████████████████████████████████████████████████▉           | 2751/3270 [07:24<01:18,  6.62it/s]

Step 2750
Accuracy without re-ranking: 0.8236363636363636
Accuracy with re-ranking: 0.8334545454545454


 86%|███████████████████████████████████████████████████████████▉          | 2801/3270 [07:32<01:18,  6.00it/s]

Step 2800
Accuracy without re-ranking: 0.8235714285714286
Accuracy with re-ranking: 0.8332142857142857


 87%|█████████████████████████████████████████████████████████████         | 2851/3270 [07:42<01:22,  5.08it/s]

Step 2850
Accuracy without re-ranking: 0.8231578947368421
Accuracy with re-ranking: 0.8333333333333334


 89%|██████████████████████████████████████████████████████████████        | 2900/3270 [07:50<00:55,  6.69it/s]

Step 2900
Accuracy without re-ranking: 0.823103448275862
Accuracy with re-ranking: 0.8344827586206897


 90%|███████████████████████████████████████████████████████████████▏      | 2950/3270 [07:59<00:55,  5.75it/s]

Step 2950
Accuracy without re-ranking: 0.8213559322033899
Accuracy with re-ranking: 0.8349152542372882


 92%|████████████████████████████████████████████████████████████████▏     | 3001/3270 [08:06<00:34,  7.79it/s]

Step 3000
Accuracy without re-ranking: 0.822
Accuracy with re-ranking: 0.8356666666666667


 93%|█████████████████████████████████████████████████████████████████▎    | 3051/3270 [08:13<00:28,  7.57it/s]

Step 3050
Accuracy without re-ranking: 0.8226229508196722
Accuracy with re-ranking: 0.8360655737704918


 95%|██████████████████████████████████████████████████████████████████▍   | 3101/3270 [08:21<00:22,  7.41it/s]

Step 3100
Accuracy without re-ranking: 0.8229032258064516
Accuracy with re-ranking: 0.8367741935483871


 96%|███████████████████████████████████████████████████████████████████▍  | 3151/3270 [08:28<00:16,  7.23it/s]

Step 3150
Accuracy without re-ranking: 0.8241269841269842
Accuracy with re-ranking: 0.8377777777777777


 98%|████████████████████████████████████████████████████████████████████▌ | 3201/3270 [08:37<00:13,  5.07it/s]

Step 3200
Accuracy without re-ranking: 0.8240625
Accuracy with re-ranking: 0.8375


 99%|█████████████████████████████████████████████████████████████████████▌| 3251/3270 [08:45<00:02,  6.83it/s]

Step 3250
Accuracy without re-ranking: 0.8233846153846154
Accuracy with re-ranking: 0.8375384615384616


100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [08:48<00:00,  6.19it/s]


In [75]:
raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

print(f'Using cross-encoder: {cross_encoder.config._name_or_path}')
print(f'Accuracy without re-ranking: {raw_accuracy}')
print(f'Accuracy with re-ranking: {reranked_accuracy}')


Using cross-encoder: cross-encoder/mmarco-mMiniLMv2-L12-H384-v1
Accuracy without re-ranking: 0.8238532110091743
Accuracy with re-ranking: 0.8376146788990826
