In [None]:
# This notebook has been updated to use the latest openai package version! At the time, 1.6.1

In [1]:
'''
Chapter 2: Launching an Application with Proprietary Models 
    Overview of Proprietary Models
    Introduction to OpenAI + Embeddings / GPT3 / ChatGPT
    Introduction to Vector Databases
    Building a Neural/Semantic Information Retrieval System with Vector Databases, BERT & GPT3

'''

'\nChapter 2: Launching an Application with Proprietary Models \n    Overview of Proprietary Modelsa\n    Introduction to OpenAI + Embeddings / GPT3 / ChatGPT\n    Introduction to Vector Databases\n    Building a Neural/Semantic Information Retrieval System with Vector Databases, BERT & GPT3\n\n'

In [4]:
from openai import OpenAI
from datetime import datetime
import hashlib
import re
import os
from tqdm import tqdm
import numpy as np

import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


In [15]:
pinecone_key = os.environ.get('PINECONE_API_KEY')

if not pinecone_key:
    print(f'Pinecone key is null.')

openai_key = os.environ.get("OPENAI_API_KEY")

if not openai_key:
    print(f'OpenAI key is null.')

client = OpenAI(
    api_key=openai_key
)

INDEX_NAME = 'semantic-search'
NAMESPACE = 'default'
ENGINE = 'text-embedding-ada-002'

In [13]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=pinecone_key)

In [32]:
# helper functions to get lists of embeddings from the OpenAI API
def get_embeddings(texts, engine=ENGINE):
    response = client.embeddings.create(
        input=texts,
        model=engine
    )
    
    return [d.embedding for d in list(response.data)]

def get_embedding(text):
    return get_embeddings([text])[0]
    
len(get_embedding('hi')), len(get_embeddings(['hi', 'hello']))

(1536, 2)

In [15]:
if INDEX_NAME not in pc.list_indexes().names():
    print(f'Creating index {INDEX_NAME}')
    pc.create_index(
        name=INDEX_NAME,  # The name of the index
        dimension=1536,  # The dimensionality of the vectors for our OpenAI embedder
        metric='cosine',  # The similarity metric to use when searching the index
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Store the index as a variable
index = pc.Index(name=INDEX_NAME)

In [16]:
def my_hash(s):
    # Return the MD5 hash of the input string as a hexadecimal string
    return hashlib.md5(s.encode()).hexdigest()

my_hash('I love to hash it')

'ae76cc4dfd345ecaeea9b8ba0d5c3437'

In [17]:
def prepare_for_pinecone(texts, engine=ENGINE):
    # Get the current UTC date and time
    now = datetime.utcnow()
    
    # Generate vector embeddings for each string in the input list, using the specified engine
    embeddings = get_embeddings(texts, engine=engine)
    
    # Create tuples of (hash, embedding, metadata) for each input string and its corresponding vector embedding
    # The my_hash() function is used to generate a unique hash for each string, and the datetime.utcnow() function is used to generate the current UTC date and time
    return [
        (
            my_hash(text),  # A unique ID for each string, generated using the my_hash() function
            embedding,  # The vector embedding of the string
            dict(text=text, date_uploaded=now)  # A dictionary of metadata, including the original text and the current UTC date and time
        ) 
        for text, embedding in zip(texts, embeddings)  # Iterate over each input string and its corresponding vector embedding
    ]


In [18]:
texts = ['hi']

In [19]:
prepare_for_pinecone(texts)[0]

('49f68a5c8493ec2c0bf489821c21fc3b',
 [-0.035158250480890274,
  -0.02061408758163452,
  -0.015398157760500908,
  -0.03980923071503639,
  -0.027459172531962395,
  0.02111334539949894,
  -0.022059306502342224,
  -0.019418496638536453,
  -0.009446481242775917,
  -0.013131790794432163,
  0.02958758734166622,
  -0.004680540878325701,
  -0.015227359719574451,
  -0.014058045111596584,
  0.008966931141912937,
  0.015187944285571575,
  0.03833773359656334,
  -0.005708617623895407,
  0.023951230570673943,
  -0.012783624231815338,
  -0.014951453544199467,
  -0.0031351412180811167,
  -0.006910777185112238,
  -0.008474241942167282,
  -0.022729363292455673,
  -0.00014329023542813957,
  0.013479957357048988,
  -0.01696162298321724,
  0.0044933189637959,
  -0.02238776534795761,
  0.014544164761900902,
  -0.0009976942092180252,
  -0.044880639761686325,
  -0.009676402434706688,
  -0.009788078255951405,
  -0.01576603204011917,
  0.00986033957451582,
  -0.021218450739979744,
  0.015069698914885521,
  -0.0

In [20]:
_id, embedding, metadata = prepare_for_pinecone(texts)[0]

print('ID:  ',_id, '\nLEN: ', len(embedding), '\nMETA:', metadata)

ID:   49f68a5c8493ec2c0bf489821c21fc3b 
LEN:  1536 
META: {'text': 'hi', 'date_uploaded': datetime.datetime(2023, 7, 31, 13, 9, 40, 102999)}


In [21]:
def upload_texts_to_pinecone(texts, namespace=NAMESPACE, batch_size=None, show_progress_bar=False):
    # Call the prepare_for_pinecone function to prepare the input texts for indexing
    total_upserted = 0
    if not batch_size:
        batch_size = len(texts)

    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        batch = texts[i: i + batch_size]
        prepared_texts = prepare_for_pinecone(batch)

        # Use the upsert() method of the index object to upload the prepared texts to Pinecone
        total_upserted += index.upsert(
            prepared_texts,
            namespace=namespace
        )['upserted_count']

    return total_upserted


# Call the upload_texts_to_pinecone() function with the input texts
upload_texts_to_pinecone(texts)


1

In [50]:
def query_from_pinecone(query, top_k=3):
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embedding(query)

    return index.query(
      vector=query_embedding,
      top_k=top_k,
      namespace=NAMESPACE,
      include_metadata=True   # gets the metadata (dates, text, etc)
    ).get('matches')

query_from_pinecone('hello')

[{'id': '093601540a641d12a6f734a9fa624ce5',
  'metadata': {'date_uploaded': datetime.datetime(2023, 7, 31, 13, 17, 57, 390314),
               'text': "Alexander Graham Bell originally suggested 'ahoy-hoy' "
                       'be adopted as the standard greeting when answering a '
                       "telephone, before 'hello' (suggested by Thomas Edison) "
                       'became common.'},
  'score': 0.780635893,
  'values': []},
 {'id': '9588c26cecaaf486eae14858827a6699',
  'metadata': {'date_uploaded': datetime.datetime(2023, 7, 31, 13, 17, 45, 374073),
               'text': 'The Abbott family -- wife Evelyn, husband Lee, '
                       'congenitally deaf daughter Regan, and sons Marcus and '
                       'Beau -- silently scavenge for supplies in a deserted '
                       'town. While out in the open, the family communicates '
                       'with American Sign Language (ASL). Four-year-old Beau '
                       'is dra

In [23]:
import hashlib

def delete_texts_from_pinecone(texts, namespace=NAMESPACE):
    # Compute the hash (id) for each text
    hashes = [hashlib.md5(text.encode()).hexdigest() for text in texts]
    
    # The ids parameter is used to specify the list of IDs (hashes) to delete
    return index.delete(ids=hashes, namespace=namespace)

# delete our text
delete_texts_from_pinecone(texts)

# test that the index is empty
query_from_pinecone('hello')

[]

In [24]:
# Importing the tiktoken library
import tiktoken

# Initializing a tokenizer for the 'cl100k_base' model
# This tokenizer is designed to work with the 'ada-002' embedding model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Using the tokenizer to encode the text 'hey there'
# The resulting output is a list of integers representing the encoded text
# This is the input format required for embedding using the 'ada-002' model
tokenizer.encode('hey there')


[36661, 1070]

In [25]:
# Function to split the text into chunks of a maximum number of tokens. Inspired by OpenAI
def overlapping_chunks(text, max_tokens = 500, overlapping_factor = 5):
    '''
    max_tokens: tokens we want per chunk
    overlapping_factor: number of sentences to start each chunk with that overlaps with the previous chunk
    '''

    # Split the text using punctuation
    sentences = re.split(r'[.?!]', text)

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
    
    chunks, tokens_so_far, chunk = [], 0, []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater 
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            if overlapping_factor > 0:
                chunk = chunk[-overlapping_factor:]
                tokens_so_far = sum([len(tokenizer.encode(c)) for c in chunk])
            else:
                chunk = []
                tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of 
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks

In [26]:
import PyPDF2

# Open the PDF file in read-binary mode
with open('../data/pds2.pdf', 'rb') as file:

    # Create a PDF reader object
    reader = PyPDF2.PdfReader(file)

    # Initialize an empty string to hold the text
    principles_of_ds = ''
    # Loop through each page in the PDF file
    for page in tqdm(reader.pages):
        text = page.extract_text()
        principles_of_ds += '\n\n' + text[text.find(' ]')+2:]

# Print the final string containing all the text from the PDF file
principles_of_ds = principles_of_ds.strip()

print(len(principles_of_ds))


100%|█████████████████████████████████████████| 428/428 [07:01<00:00,  1.01it/s]

575490





In [27]:
from urllib.request import urlopen

#

# A textbook about insects
text = urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt').read().decode()


In [28]:
split = overlapping_chunks(principles_of_ds, overlapping_factor=0)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'non-overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

non-overlapping chunking approach has 286 documents with average length 474.1 tokens


In [29]:
split = overlapping_chunks(principles_of_ds)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

overlapping chunking approach has 392 documents with average length 485.2 tokens


In [30]:
# Importing the Counter and re libraries
from collections import Counter
import re

# Find all occurrences of one or more spaces in 'principles_of_ds'
matches = re.findall(r'[\s]{1,}', principles_of_ds)

# The 10 most frequent spaces that occur in the document
most_common_spaces = Counter(matches).most_common(10)

# Print the most common spaces and their frequencies
print(most_common_spaces)


[(' ', 82259), ('\n', 9220), ('  ', 1592), ('\n\n', 333), ('\n   ', 250), ('\n\n\n', 82), ('\n    ', 73), ('\n ', 46), (' \n', 39), ('     ', 34)]


In [31]:
# Only keep documents of at least 100 characters split by a custom delimiter
split = list(filter(lambda x: len(x) > 50, principles_of_ds.split('\n\n')))

avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'custom delimiter approach has {len(split)} documents with average length {avg_length:.1f} tokens')

custom delimiter approach has 426 documents with average length 316.3 tokens


In [33]:
embeddings = None
for s in tqdm(range(0, len(split), 100)):
    if embeddings is None:
        embeddings = np.array(get_embeddings(split[s:s+100], engine=ENGINE))
    else:
        embeddings = np.vstack([embeddings, np.array(get_embeddings(split[s:s+100], engine=ENGINE))])
    

100%|█████████████████████████████████████████████| 5/5 [00:05<00:00,  1.12s/it]


In [34]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Assume you have a list of text embeddings called `embeddings`
# First, compute the cosine similarity matrix between all pairs of embeddings
cosine_sim_matrix = cosine_similarity(embeddings)

# Instantiate the AgglomerativeClustering model
agg_clustering = AgglomerativeClustering(
    n_clusters=None,         # the algorithm will determine the optimal number of clusters based on the data
    distance_threshold=0.1,  # clusters will be formed until all pairwise distances between clusters are greater than 0.1
    affinity='precomputed',  # we are providing a precomputed distance matrix (1 - similarity matrix) as input
    linkage='complete'       # form clusters by iteratively merging the smallest clusters based on the maximum distance between their components
)

# Fit the model to the cosine distance matrix (1 - similarity matrix)
agg_clustering.fit(1 - cosine_sim_matrix)

# Get the cluster labels for each embedding
cluster_labels = agg_clustering.labels_

# Print the number of embeddings in each cluster
unique_labels, counts = np.unique(cluster_labels, return_counts=True)
for label, count in zip(unique_labels, counts):
    print(f'Cluster {label}: {count} embeddings')




Cluster 0: 2 embeddings
Cluster 1: 3 embeddings
Cluster 2: 4 embeddings
Cluster 3: 2 embeddings
Cluster 4: 2 embeddings
Cluster 5: 3 embeddings
Cluster 6: 3 embeddings
Cluster 7: 2 embeddings
Cluster 8: 2 embeddings
Cluster 9: 2 embeddings
Cluster 10: 2 embeddings
Cluster 11: 2 embeddings
Cluster 12: 3 embeddings
Cluster 13: 2 embeddings
Cluster 14: 3 embeddings
Cluster 15: 4 embeddings
Cluster 16: 2 embeddings
Cluster 17: 2 embeddings
Cluster 18: 2 embeddings
Cluster 19: 2 embeddings
Cluster 20: 2 embeddings
Cluster 21: 2 embeddings
Cluster 22: 2 embeddings
Cluster 23: 2 embeddings
Cluster 24: 2 embeddings
Cluster 25: 2 embeddings
Cluster 26: 2 embeddings
Cluster 27: 2 embeddings
Cluster 28: 2 embeddings
Cluster 29: 2 embeddings
Cluster 30: 2 embeddings
Cluster 31: 2 embeddings
Cluster 32: 2 embeddings
Cluster 33: 1 embeddings
Cluster 34: 2 embeddings
Cluster 35: 2 embeddings
Cluster 36: 1 embeddings
Cluster 37: 2 embeddings
Cluster 38: 2 embeddings
Cluster 39: 2 embeddings
Cluster 40

In [35]:
pruned_documents = []
for _label, count in zip(unique_labels, counts):
    pruned_documents.append('\n\n'.join([text for text, label in zip(split, cluster_labels) if label == _label]))

    
avg_length = sum([len(tokenizer.encode(t)) for t in pruned_documents]) / len(pruned_documents)
print(f'Our pruning approach has {len(pruned_documents)} documents with average length {avg_length:.1f} tokens')

Our pruning approach has 337 documents with average length 399.9 tokens


In [36]:
print(pruned_documents[0])

We get the following table:
In the preceding table, each row represents a single passenger on the ship, and, for now, we
are looking at two specific features: the sex of the individual and whether or not they
survived the sinking. For example, the first row represents a man who did not survive,
while the fourth row (with index 3, remember how Python indexes lists) represents a
female who did survive.
Let's start with some basics. We'll start by calculating the probability that any given person
on the ship survived, regardless of their gender. To do this, we'll count the number of yeses
in the Survived  column and divide this figure by the total number of rows, as shown here:
num_rows = float(titanic.shape[0]) # == 891 rows
p_survived = (titanic.Survived=="yes").sum() / num_rows # == .38
p_notsurvived = 1 - p_survived                          # == .61
Note that I only had to calculate P(Survived) , and I used the law of conjugate probabilities to
calculate P(Died)  because those two eve

In [37]:
upload_texts_to_pinecone(pruned_documents, batch_size=128)

337

In [38]:
query = 'How do z scores work?'

results_from_pinecone = query_from_pinecone(query, top_k=5)

for result_from_pinecone in results_from_pinecone:
    print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")
    

fe05666e4653e007201c336eba0b0c89	0.85	Let's begin by learning a very  important value  i
28c9f36450ba7f7a78fcf29f2e4e9909	0.85	Z-scores are an effective way to standardize  data
268922260e7eabb5ed6170d07ec4585a	0.83	The preceding code gives us this graph:
Now, our d
7da124bb8c927e0c392cf75d4bfb2c76	0.82	
Basic Statistics
This chapter will focus on the s
f90c995652f29ac9c2a4ca94f0e6809c	0.81	
We can think of this problem like as follows:
The


In [39]:
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It output then the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np
from torch import nn

# Pre-trained cross encoder
cross_encoder = CrossEncoder('nreimers/mmarco-mMiniLMv2-L12-H384-v1')


In [40]:
def get_results_from_pinecone(query, top_k=3, re_rank=False, verbose=True):

    results_from_pinecone = query_from_pinecone(query, top_k=top_k)
    if not results_from_pinecone:
        return []

    if verbose:
        print("Query:", query)
    
    
    final_results = []

    if re_rank:
        if verbose:
            print('Document ID (Hash)\t\tRetrieval Score\tCE Score\tText')

        sentence_combinations = [[query, result_from_pinecone['metadata']['text']] for result_from_pinecone in results_from_pinecone]

        # Compute the similarity scores for these combinations
        similarity_scores = cross_encoder.predict(sentence_combinations, activation_fct=nn.Sigmoid())

        # Sort the scores in decreasing order
        sim_scores_argsort = reversed(np.argsort(similarity_scores))

        # Print the scores
        for idx in sim_scores_argsort:
            result_from_pinecone = results_from_pinecone[idx]
            final_results.append(result_from_pinecone)
            if verbose:
                print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{similarity_scores[idx]:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")
        return final_results

    if verbose:
        print('Document ID (Hash)\t\tRetrieval Score\tText')
    for result_from_pinecone in results_from_pinecone:
        final_results.append(result_from_pinecone)
        if verbose:
            print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")

    return final_results

In [41]:
final_results = get_results_from_pinecone(query, top_k=3, re_rank=True)

Query: How do z scores work?
Document ID (Hash)		Retrieval Score	CE Score	Text


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

28c9f36450ba7f7a78fcf29f2e4e9909	0.85	0.99	Z-scores are an effective way to standardize  data
fe05666e4653e007201c336eba0b0c89	0.85	0.99	Let's begin by learning a very  important value  i
268922260e7eabb5ed6170d07ec4585a	0.83	0.02	The preceding code gives us this graph:
Now, our d


In [42]:
final_results = get_results_from_pinecone(query, top_k=10, re_rank=True)

Query: How do z scores work?
Document ID (Hash)		Retrieval Score	CE Score	Text


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

28c9f36450ba7f7a78fcf29f2e4e9909	0.85	0.99	Z-scores are an effective way to standardize  data
fe05666e4653e007201c336eba0b0c89	0.85	0.99	Let's begin by learning a very  important value  i
7da124bb8c927e0c392cf75d4bfb2c76	0.82	0.38	
Basic Statistics
This chapter will focus on the s
268922260e7eabb5ed6170d07ec4585a	0.83	0.02	The preceding code gives us this graph:
Now, our d
75cf0bcd29070579356e3c50e9440d94	0.80	0.01	The following is the probability distribution of o
c41c22f849c21729d2ccad5aa5da0047	0.79	0.01	Choose a significance level (usually called alpha 
e9ec4bdefba2f80157ffce04812a1cf4	0.80	0.01	
Let's look at each of the elements in this formul
5bf0b775d7e41ee065e79ef1d2ac9f2c	0.80	0.00	The empirical rule
Recall that a normal distributi
00c617088f8b57db6cc63331603a47ed	0.79	0.00	We can see that our p-value is quite large; theref
f90c995652f29ac9c2a4ca94f0e6809c	0.81	0.00	
We can think of this problem like as follows:
The


In [43]:
delete_texts_from_pinecone(pruned_documents)

{}

# BoolQ

In [44]:
from datasets import load_dataset
from evaluate import load


dataset = load_dataset("boolq")

Found cached dataset boolq (/Users/sinanozdemir/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5)


  0%|          | 0/2 [00:00<?, ?it/s]

In [45]:
dataset['validation'][0]

{'question': 'does ethanol take more energy make that produces',
 'answer': False,
 'passage': "All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a sep

In [46]:
for idx in tqdm(range(0, len(dataset['validation']), 256)):
    data_sample = dataset['validation'][idx:idx + 256]

    passages = data_sample['passage']

    upload_texts_to_pinecone(passages)

100%|███████████████████████████████████████████| 13/13 [00:39<00:00,  3.04s/it]


In [47]:
from random import sample

query = sample(dataset['validation']['question'], 1)[0]
print(query)
final_results = get_results_from_pinecone(query, top_k=3, re_rank=True)


is the barber of seville a true story
Query: is the barber of seville a true story
Document ID (Hash)		Retrieval Score	CE Score	Text


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

22433164547c459ac32cd575eb78a917	0.74	0.01	Sweeney Todd: The Demon Barber of Fleet Street is 
c3225f28c8c4cbfb81f3b01715f6d3f4	0.76	0.00	Rossini's opera recounts the events of the first o
50b66c4c465aa18ad8e8589c7ba15728	0.73	0.00	While the film was a box office hit and raised Str


In [48]:
q_to_hash = {data['question']: my_hash(data['passage']) for data in dataset['validation']}

q_to_hash[query]

'c3225f28c8c4cbfb81f3b01715f6d3f4'

In [49]:
# super_glue_metric = load('super_glue', 'boolq')  # just accuracy

# Let's test the performance re-ranking against 1000 of our validation datapoints
# Note we could not use Pinecone here to speed things up
#  but it's also a good time to test latency of the pipeline with Pinecone
val_sample = dataset['validation']#[:1000]

In [98]:
logger.setLevel(logging.CRITICAL)

predictions = []

# Note we will keep top_k the same so latency from Pinecone is consistent
#  and the only major time difference will be in the re-ranking

for question in tqdm(val_sample['question']):
    retrieved_hash = get_results_from_pinecone(question, top_k=1, re_rank=False, verbose=False)[0]['id']
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)
    
accuracy = sum(predictions)/len(predictions)

print(f'Accuracy without re-ranking: {accuracy}')

100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [17:41<00:00,  3.08it/s]

Accuracy without re-ranking: 0.8522935779816514





In [99]:
logger.setLevel(logging.CRITICAL)

predictions = []

# Note we will keep top_k the same so latency from Pinecone is consistent
#  and the only major time difference will be in the re-ranking

for question in tqdm(val_sample['question']):
    retrieved_hash = get_results_from_pinecone(question, top_k=3, re_rank=True, verbose=False)[0]['id']
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)
    
accuracy = sum(predictions)/len(predictions)

print(f'Accuracy with re-ranking: {accuracy}')

100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [27:20<00:00,  1.99it/s]

Accuracy with re-ranking: 0.8373088685015291





In [100]:
# Note the time differences between with and without re-ranking


In [362]:
def eval_ranking(query, cross_encoder, top_k=3):
    results_from_pinecone = query_from_pinecone(query, top_k=top_k)
    sentence_combinations = [[query, result_from_pinecone['metadata']['text']] for result_from_pinecone in results_from_pinecone]
    similarity_scores = cross_encoder.predict(sentence_combinations)
    sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
    re_ranked_final_result = results_from_pinecone[sim_scores_argsort[0]]
    return results_from_pinecone[0]['id'], re_ranked_final_result['id']


In [37]:
# Trying another pre-trained cross encoder
# sentence-transformers/multi-qa-mpnet-base-cos-v1
newer_cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')


In [103]:
i = 0
print_every = 50
predictions = []
for question in tqdm(val_sample['question']):
    retrieved_hash, reranked_hash = eval_ranking(question, newer_cross_encoder, top_k=3)
    correct_hash = q_to_hash[question]
    predictions.append((retrieved_hash == correct_hash, reranked_hash == correct_hash))
    i += 1
    if i % print_every == 0:
        print(f'Step {i}')
        raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
        reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

        print(f'Accuracy without re-ranking: {raw_accuracy}')
        print(f'Accuracy with re-ranking: {reranked_accuracy}')


  2%|█                                                                       | 50/3270 [00:23<21:10,  2.53it/s]

Step 50
Accuracy without re-ranking: 0.88
Accuracy with re-ranking: 0.84


  3%|██▏                                                                    | 100/3270 [00:45<22:31,  2.35it/s]

Step 100
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.85


  5%|███▎                                                                   | 150/3270 [01:07<25:32,  2.04it/s]

Step 150
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8466666666666667


  6%|████▎                                                                  | 200/3270 [01:27<23:56,  2.14it/s]

Step 200
Accuracy without re-ranking: 0.865
Accuracy with re-ranking: 0.845


  8%|█████▍                                                                 | 250/3270 [01:48<20:02,  2.51it/s]

Step 250
Accuracy without re-ranking: 0.872
Accuracy with re-ranking: 0.84


  9%|██████▌                                                                | 300/3270 [02:09<19:23,  2.55it/s]

Step 300
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.84


 11%|███████▌                                                               | 350/3270 [02:31<17:17,  2.81it/s]

Step 350
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8457142857142858


 12%|████████▋                                                              | 400/3270 [02:53<18:28,  2.59it/s]

Step 400
Accuracy without re-ranking: 0.8625
Accuracy with re-ranking: 0.845


 14%|█████████▊                                                             | 450/3270 [03:20<19:07,  2.46it/s]

Step 450
Accuracy without re-ranking: 0.8577777777777778
Accuracy with re-ranking: 0.84


 15%|██████████▊                                                            | 500/3270 [03:41<20:36,  2.24it/s]

Step 500
Accuracy without re-ranking: 0.852
Accuracy with re-ranking: 0.838


 17%|███████████▉                                                           | 550/3270 [04:12<23:15,  1.95it/s]

Step 550
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8381818181818181


 18%|█████████████                                                          | 600/3270 [04:32<16:10,  2.75it/s]

Step 600
Accuracy without re-ranking: 0.8383333333333334
Accuracy with re-ranking: 0.8316666666666667


 20%|██████████████                                                         | 650/3270 [04:55<18:55,  2.31it/s]

Step 650
Accuracy without re-ranking: 0.8369230769230769
Accuracy with re-ranking: 0.8276923076923077


 21%|███████████████▏                                                       | 700/3270 [05:17<18:02,  2.37it/s]

Step 700
Accuracy without re-ranking: 0.8385714285714285
Accuracy with re-ranking: 0.8285714285714286


 23%|████████████████▎                                                      | 750/3270 [05:38<17:34,  2.39it/s]

Step 750
Accuracy without re-ranking: 0.832
Accuracy with re-ranking: 0.8266666666666667


 24%|█████████████████▎                                                     | 800/3270 [05:59<17:45,  2.32it/s]

Step 800
Accuracy without re-ranking: 0.835
Accuracy with re-ranking: 0.82875


 26%|██████████████████▍                                                    | 850/3270 [06:21<19:12,  2.10it/s]

Step 850
Accuracy without re-ranking: 0.8352941176470589
Accuracy with re-ranking: 0.8282352941176471


 28%|███████████████████▌                                                   | 900/3270 [06:41<14:34,  2.71it/s]

Step 900
Accuracy without re-ranking: 0.8344444444444444
Accuracy with re-ranking: 0.8266666666666667


 29%|████████████████████▋                                                  | 950/3270 [07:04<20:37,  1.87it/s]

Step 950
Accuracy without re-ranking: 0.8378947368421052
Accuracy with re-ranking: 0.8252631578947368


 31%|█████████████████████▍                                                | 1000/3270 [07:27<14:24,  2.62it/s]

Step 1000
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.826


 32%|██████████████████████▍                                               | 1050/3270 [07:50<15:45,  2.35it/s]

Step 1050
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.8257142857142857


 34%|███████████████████████▌                                              | 1100/3270 [08:14<14:23,  2.51it/s]

Step 1100
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8272727272727273


 35%|████████████████████████▌                                             | 1150/3270 [08:35<21:58,  1.61it/s]

Step 1150
Accuracy without re-ranking: 0.8443478260869566
Accuracy with re-ranking: 0.8304347826086956


 37%|█████████████████████████▋                                            | 1200/3270 [08:56<14:30,  2.38it/s]

Step 1200
Accuracy without re-ranking: 0.8458333333333333
Accuracy with re-ranking: 0.8325


 38%|██████████████████████████▊                                           | 1250/3270 [09:21<14:05,  2.39it/s]

Step 1250
Accuracy without re-ranking: 0.8488
Accuracy with re-ranking: 0.8352


 40%|███████████████████████████▊                                          | 1300/3270 [09:46<22:47,  1.44it/s]

Step 1300
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8361538461538461


 41%|████████████████████████████▉                                         | 1350/3270 [10:10<13:00,  2.46it/s]

Step 1350
Accuracy without re-ranking: 0.8511111111111112
Accuracy with re-ranking: 0.84


 43%|█████████████████████████████▉                                        | 1400/3270 [10:37<28:41,  1.09it/s]

Step 1400
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8385714285714285


 44%|███████████████████████████████                                       | 1450/3270 [11:10<24:11,  1.25it/s]

Step 1450
Accuracy without re-ranking: 0.8475862068965517
Accuracy with re-ranking: 0.8344827586206897


 46%|████████████████████████████████                                      | 1500/3270 [11:32<10:30,  2.81it/s]

Step 1500
Accuracy without re-ranking: 0.846
Accuracy with re-ranking: 0.8313333333333334


 47%|█████████████████████████████████▏                                    | 1550/3270 [11:58<16:05,  1.78it/s]

Step 1550
Accuracy without re-ranking: 0.8464516129032258
Accuracy with re-ranking: 0.832258064516129


 49%|██████████████████████████████████▎                                   | 1600/3270 [12:27<17:12,  1.62it/s]

Step 1600
Accuracy without re-ranking: 0.845625
Accuracy with re-ranking: 0.831875


 50%|███████████████████████████████████▎                                  | 1650/3270 [12:50<13:20,  2.02it/s]

Step 1650
Accuracy without re-ranking: 0.8460606060606061
Accuracy with re-ranking: 0.8321212121212122


 52%|████████████████████████████████████▍                                 | 1700/3270 [13:21<16:55,  1.55it/s]

Step 1700
Accuracy without re-ranking: 0.8482352941176471
Accuracy with re-ranking: 0.8335294117647059


 54%|█████████████████████████████████████▍                                | 1750/3270 [13:51<14:25,  1.76it/s]

Step 1750
Accuracy without re-ranking: 0.848
Accuracy with re-ranking: 0.8331428571428572


 55%|██████████████████████████████████████▌                               | 1800/3270 [14:22<12:56,  1.89it/s]

Step 1800
Accuracy without re-ranking: 0.8483333333333334
Accuracy with re-ranking: 0.8355555555555556


 57%|███████████████████████████████████████▌                              | 1850/3270 [14:51<14:33,  1.62it/s]

Step 1850
Accuracy without re-ranking: 0.8475675675675676
Accuracy with re-ranking: 0.8356756756756757


 58%|████████████████████████████████████████▋                             | 1900/3270 [15:20<10:16,  2.22it/s]

Step 1900
Accuracy without re-ranking: 0.8494736842105263
Accuracy with re-ranking: 0.8378947368421052


 60%|█████████████████████████████████████████▋                            | 1950/3270 [15:51<11:43,  1.88it/s]

Step 1950
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8384615384615385


 61%|██████████████████████████████████████████▊                           | 2000/3270 [16:20<12:50,  1.65it/s]

Step 2000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.839


 63%|███████████████████████████████████████████▉                          | 2050/3270 [16:52<11:14,  1.81it/s]

Step 2050
Accuracy without re-ranking: 0.8526829268292683
Accuracy with re-ranking: 0.8414634146341463


 64%|████████████████████████████████████████████▉                         | 2100/3270 [17:18<09:55,  1.96it/s]

Step 2100
Accuracy without re-ranking: 0.8514285714285714
Accuracy with re-ranking: 0.8404761904761905


 66%|██████████████████████████████████████████████                        | 2150/3270 [17:43<08:29,  2.20it/s]

Step 2150
Accuracy without re-ranking: 0.8502325581395349
Accuracy with re-ranking: 0.8418604651162791


 67%|███████████████████████████████████████████████                       | 2200/3270 [18:12<08:40,  2.06it/s]

Step 2200
Accuracy without re-ranking: 0.8504545454545455
Accuracy with re-ranking: 0.8409090909090909


 69%|████████████████████████████████████████████████▏                     | 2250/3270 [18:36<07:34,  2.24it/s]

Step 2250
Accuracy without re-ranking: 0.8506666666666667
Accuracy with re-ranking: 0.8404444444444444


 70%|█████████████████████████████████████████████████▏                    | 2300/3270 [18:58<06:26,  2.51it/s]

Step 2300
Accuracy without re-ranking: 0.8504347826086956
Accuracy with re-ranking: 0.8386956521739131


 72%|██████████████████████████████████████████████████▎                   | 2350/3270 [19:22<05:50,  2.63it/s]

Step 2350
Accuracy without re-ranking: 0.8506382978723405
Accuracy with re-ranking: 0.8395744680851064


 73%|███████████████████████████████████████████████████▍                  | 2400/3270 [19:43<05:22,  2.70it/s]

Step 2400
Accuracy without re-ranking: 0.85125
Accuracy with re-ranking: 0.8395833333333333


 75%|████████████████████████████████████████████████████▍                 | 2450/3270 [20:07<05:54,  2.32it/s]

Step 2450
Accuracy without re-ranking: 0.8526530612244898
Accuracy with re-ranking: 0.8420408163265306


 76%|█████████████████████████████████████████████████████▌                | 2500/3270 [20:29<06:34,  1.95it/s]

Step 2500
Accuracy without re-ranking: 0.8524
Accuracy with re-ranking: 0.842


 78%|██████████████████████████████████████████████████████▌               | 2550/3270 [20:57<08:21,  1.44it/s]

Step 2550
Accuracy without re-ranking: 0.8509803921568627
Accuracy with re-ranking: 0.84


 80%|███████████████████████████████████████████████████████▋              | 2600/3270 [21:22<05:57,  1.87it/s]

Step 2600
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8396153846153847


 81%|████████████████████████████████████████████████████████▋             | 2650/3270 [21:48<04:08,  2.49it/s]

Step 2650
Accuracy without re-ranking: 0.849811320754717
Accuracy with re-ranking: 0.8392452830188679


 83%|█████████████████████████████████████████████████████████▊            | 2700/3270 [22:10<04:05,  2.32it/s]

Step 2700
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.8396296296296296


 84%|██████████████████████████████████████████████████████████▊           | 2750/3270 [22:32<05:25,  1.60it/s]

Step 2750
Accuracy without re-ranking: 0.8501818181818181
Accuracy with re-ranking: 0.8389090909090909


 86%|███████████████████████████████████████████████████████████▉          | 2800/3270 [23:01<04:11,  1.87it/s]

Step 2800
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8389285714285715


 87%|█████████████████████████████████████████████████████████████         | 2850/3270 [23:26<03:56,  1.78it/s]

Step 2850
Accuracy without re-ranking: 0.8501754385964913
Accuracy with re-ranking: 0.84


 89%|██████████████████████████████████████████████████████████████        | 2900/3270 [23:56<03:23,  1.82it/s]

Step 2900
Accuracy without re-ranking: 0.8510344827586207
Accuracy with re-ranking: 0.8396551724137931


 90%|███████████████████████████████████████████████████████████████▏      | 2950/3270 [24:25<02:41,  1.98it/s]

Step 2950
Accuracy without re-ranking: 0.8501694915254238
Accuracy with re-ranking: 0.8396610169491525


 92%|████████████████████████████████████████████████████████████████▏     | 3000/3270 [24:52<02:02,  2.21it/s]

Step 3000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.8413333333333334


 93%|█████████████████████████████████████████████████████████████████▎    | 3050/3270 [25:15<01:22,  2.66it/s]

Step 3050
Accuracy without re-ranking: 0.8511475409836066
Accuracy with re-ranking: 0.8422950819672131


 95%|██████████████████████████████████████████████████████████████████▎   | 3100/3270 [25:38<01:08,  2.49it/s]

Step 3100
Accuracy without re-ranking: 0.8522580645161291
Accuracy with re-ranking: 0.8422580645161291


 96%|███████████████████████████████████████████████████████████████████▍  | 3150/3270 [26:01<00:53,  2.25it/s]

Step 3150
Accuracy without re-ranking: 0.8526984126984127
Accuracy with re-ranking: 0.8428571428571429


 98%|████████████████████████████████████████████████████████████████████▌ | 3200/3270 [26:23<00:23,  2.93it/s]

Step 3200
Accuracy without re-ranking: 0.8525
Accuracy with re-ranking: 0.8421875


 99%|█████████████████████████████████████████████████████████████████████▌| 3250/3270 [26:43<00:07,  2.75it/s]

Step 3250
Accuracy without re-ranking: 0.8526153846153847
Accuracy with re-ranking: 0.8415384615384616


100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [26:53<00:00,  2.03it/s]


In [104]:
raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

print(f'Using cross-encoder: {newer_cross_encoder.config._name_or_path}')
print(f'Accuracy without re-ranking: {raw_accuracy}')
print(f'Accuracy with re-ranking: {reranked_accuracy}')


Using cross-encoder: <sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder object at 0x158c9cc70>
Accuracy without re-ranking: 0.8522935779816514
Accuracy with re-ranking: 0.8418960244648318


# Fine-tuning re-ranker

In [105]:
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_cross-encoder_scratch.py

In [52]:
dataset['train'][0]

{'question': 'do iran and afghanistan speak the same language',
 'answer': True,
 'passage': 'Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.'}

In [54]:
dataset['train'][1]

{'question': 'do good samaritan laws protect those who help at an accident',
 'answer': True,
 'passage': "Good Samaritan laws offer legal protection to people who give reasonable assistance to those who are, or who they believe to be, injured, ill, in peril, or otherwise incapacitated. The protection is intended to reduce bystanders' hesitation to assist, for fear of being sued or prosecuted for unintentional injury or wrongful death. An example of such a law in common-law areas of Canada: a good Samaritan doctrine is a legal principle that prevents a rescuer who has voluntarily helped a victim in distress from being successfully sued for wrongdoing. Its purpose is to keep people from being reluctant to help a stranger in need for fear of legal repercussions should they make some mistake in treatment. By contrast, a duty to rescue law requires people to offer assistance and holds those who fail to do so liable."}

In [349]:
from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader
from random import shuffle

shuffled_training_passages = dataset['train']['passage'].copy()
shuffle(shuffled_training_passages)


train_samples = [
  InputExample(texts=[d['question'], d['passage']], label=1) for d in dataset['train']
]

# add some negative examples
train_samples += [
  InputExample(texts=[d['question'], shuffled_training_passages[i]], label=0) for i, d in enumerate(dataset['train'])
]

shuffle(train_samples)

# running the risk of overfitting on my data but maybe I want that. 
#  Combined with sufficient input and output validation, we can make a viable product with a model overfit to my data


In [350]:
len(train_samples)

18854

In [351]:
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2', num_labels=1)

In [352]:
train_samples[0].__dict__

{'guid': '',
 'texts': ['is 44 special the same as 44 mag',
  'Sleeping while on duty or sleeping on the job refers to falling asleep while on the time clock or equivalent, or else while responsible for performing some active or passive job duty. While in some jobs, this is a minor transgression or not even worthy of sanctioning, in other workplaces, this is considered gross misconduct and may be grounds for disciplinary action, including possible termination of employment. Recently however, there has been a movement in support of sleeping, or napping at work, with scientific studies highlighting health and productivity benefits, and over 6% of employers in some countries providing facilities to do so. In some types of work, such as firefighting or live-in caregiving, sleeping at least part of the shift may be an expected part of paid work time. While some employees who sleep while on duty in violation do so intentionally and hope not to get caught, others intend in good faith to stay 

In [353]:
model.predict(train_samples[0].texts, activation_fct=nn.Sigmoid())

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

1.3383447e-05

In [354]:
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator, CEBinaryClassificationEvaluator
import math
import torch
from random import sample

logger.setLevel(logging.DEBUG)  # just to get some logs

num_epochs = 2

model_save_path = './fine_tuned_ir_cross_encoder'

# train_samples = sample(train_samples, 1000)

# int(len(train_samples)*.8)
train_dataloader = DataLoader(train_samples[:int(len(train_samples)*.8)], shuffle=True, batch_size=32)

# An evaluator for training performance
evaluator = CEBinaryClassificationEvaluator.from_input_examples(train_samples[-int(len(train_samples)*.8):], name='test')

# Rule of thumb for warmup steps
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
print(f"Warmup-steps: {warmup_steps}")

Warmup-steps: 95


In [358]:
# # ##### Load model and eval on test set
# print(evaluator(model))

# Train the model
model.fit(
    train_dataloader=train_dataloader,
    loss_fct=losses.nn.CrossEntropyLoss(),
    activation_fct=nn.Sigmoid(),
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=True
)

# ##### Load model and eval on test set
# print(evaluator(model))




Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/472 [00:00<?, ?it/s]



Iteration:   0%|          | 0/472 [00:00<?, ?it/s]



In [None]:
# run the more fine tuned version on open source as well to match??
# depends if it does better here

In [359]:
finetuned = CrossEncoder(model_save_path)

print(finetuned.predict(['hello', 'hi'], activation_fct=nn.Sigmoid()))
print(finetuned.predict(['hello', 'hi'], activation_fct=nn.Identity()))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.9999926


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

11.819853


In [363]:
# Trying our fine-tuned cross encoder
logger.setLevel(logging.CRITICAL)  # just to suppress some logs
from tqdm import tqdm

i = 0
print_every = 50
predictions = []
for question in tqdm(val_sample['question']):
    retrieved_hash, reranked_hash = eval_ranking(question, finetuned, top_k=3)
    correct_hash = q_to_hash[question]
    predictions.append((retrieved_hash == correct_hash, reranked_hash == correct_hash))
    i += 1
    if i % print_every == 0:
        print(f'Step {i}')
        raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
        reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

        print(f'Accuracy without re-ranking: {raw_accuracy}')
        print(f'Accuracy with re-ranking: {reranked_accuracy}')


  2%|█                                                                       | 51/3270 [00:44<19:37,  2.73it/s]

Step 50
Accuracy without re-ranking: 0.88
Accuracy with re-ranking: 0.82


  3%|██▏                                                                    | 100/3270 [01:07<23:44,  2.23it/s]

Step 100
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.84


  5%|███▎                                                                   | 150/3270 [01:32<24:54,  2.09it/s]

Step 150
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8466666666666667


  6%|████▎                                                                  | 200/3270 [02:06<30:03,  1.70it/s]

Step 200
Accuracy without re-ranking: 0.865
Accuracy with re-ranking: 0.85


  8%|█████▍                                                                 | 250/3270 [02:32<22:50,  2.20it/s]

Step 250
Accuracy without re-ranking: 0.872
Accuracy with re-ranking: 0.832


  9%|██████▌                                                                | 300/3270 [02:56<25:25,  1.95it/s]

Step 300
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.8266666666666667


 11%|███████▌                                                               | 350/3270 [03:24<27:41,  1.76it/s]

Step 350
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8342857142857143


 12%|████████▋                                                              | 400/3270 [03:58<28:05,  1.70it/s]

Step 400
Accuracy without re-ranking: 0.8625
Accuracy with re-ranking: 0.835


 14%|█████████▊                                                             | 450/3270 [04:23<29:22,  1.60it/s]

Step 450
Accuracy without re-ranking: 0.8577777777777778
Accuracy with re-ranking: 0.8333333333333334


 15%|██████████▊                                                            | 500/3270 [04:53<29:04,  1.59it/s]

Step 500
Accuracy without re-ranking: 0.852
Accuracy with re-ranking: 0.832


 17%|███████████▉                                                           | 550/3270 [05:17<20:42,  2.19it/s]

Step 550
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8290909090909091


 18%|█████████████                                                          | 600/3270 [05:39<17:47,  2.50it/s]

Step 600
Accuracy without re-ranking: 0.8383333333333334
Accuracy with re-ranking: 0.8266666666666667


 20%|██████████████                                                         | 650/3270 [06:06<23:15,  1.88it/s]

Step 650
Accuracy without re-ranking: 0.8369230769230769
Accuracy with re-ranking: 0.8276923076923077


 21%|███████████████▏                                                       | 700/3270 [06:31<16:14,  2.64it/s]

Step 700
Accuracy without re-ranking: 0.8385714285714285
Accuracy with re-ranking: 0.8285714285714286


 23%|████████████████▎                                                      | 750/3270 [07:09<21:55,  1.92it/s]

Step 750
Accuracy without re-ranking: 0.832
Accuracy with re-ranking: 0.8226666666666667


 24%|█████████████████▎                                                     | 800/3270 [07:28<15:52,  2.59it/s]

Step 800
Accuracy without re-ranking: 0.835
Accuracy with re-ranking: 0.82625


 26%|██████████████████▍                                                    | 851/3270 [07:56<14:30,  2.78it/s]

Step 850
Accuracy without re-ranking: 0.8352941176470589
Accuracy with re-ranking: 0.8270588235294117


 28%|███████████████████▌                                                   | 900/3270 [08:19<16:32,  2.39it/s]

Step 900
Accuracy without re-ranking: 0.8344444444444444
Accuracy with re-ranking: 0.8233333333333334


 29%|████████████████████▋                                                  | 950/3270 [08:48<16:46,  2.31it/s]

Step 950
Accuracy without re-ranking: 0.8378947368421052
Accuracy with re-ranking: 0.8210526315789474


 31%|█████████████████████▍                                                | 1000/3270 [09:09<15:09,  2.50it/s]

Step 1000
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.822


 32%|██████████████████████▍                                               | 1050/3270 [09:33<13:29,  2.74it/s]

Step 1050
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.82


 34%|███████████████████████▌                                              | 1100/3270 [09:56<14:04,  2.57it/s]

Step 1100
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8209090909090909


 35%|████████████████████████▌                                             | 1150/3270 [10:16<19:39,  1.80it/s]

Step 1150
Accuracy without re-ranking: 0.8443478260869566
Accuracy with re-ranking: 0.8234782608695652


 37%|█████████████████████████▋                                            | 1200/3270 [10:38<21:26,  1.61it/s]

Step 1200
Accuracy without re-ranking: 0.8458333333333333
Accuracy with re-ranking: 0.8266666666666667


 38%|██████████████████████████▊                                           | 1250/3270 [11:02<09:46,  3.44it/s]

Step 1250
Accuracy without re-ranking: 0.8488
Accuracy with re-ranking: 0.8288


 40%|███████████████████████████▊                                          | 1300/3270 [11:20<10:53,  3.01it/s]

Step 1300
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8292307692307692


 41%|████████████████████████████▉                                         | 1350/3270 [11:37<19:39,  1.63it/s]

Step 1350
Accuracy without re-ranking: 0.8511111111111112
Accuracy with re-ranking: 0.8318518518518518


 43%|█████████████████████████████▉                                        | 1400/3270 [12:26<15:02,  2.07it/s]

Step 1400
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8307142857142857


 44%|███████████████████████████████                                       | 1450/3270 [12:48<09:38,  3.15it/s]

Step 1450
Accuracy without re-ranking: 0.8475862068965517
Accuracy with re-ranking: 0.8289655172413793


 46%|████████████████████████████████                                      | 1500/3270 [13:21<12:32,  2.35it/s]

Step 1500
Accuracy without re-ranking: 0.846
Accuracy with re-ranking: 0.828


 47%|█████████████████████████████████▏                                    | 1550/3270 [13:54<13:40,  2.10it/s]

Step 1550
Accuracy without re-ranking: 0.8464516129032258
Accuracy with re-ranking: 0.8316129032258065


 49%|██████████████████████████████████▎                                   | 1600/3270 [14:17<13:13,  2.10it/s]

Step 1600
Accuracy without re-ranking: 0.845625
Accuracy with re-ranking: 0.83


 50%|███████████████████████████████████▎                                  | 1650/3270 [14:43<08:55,  3.03it/s]

Step 1650
Accuracy without re-ranking: 0.8460606060606061
Accuracy with re-ranking: 0.8303030303030303


 52%|████████████████████████████████████▍                                 | 1700/3270 [15:05<06:57,  3.76it/s]

Step 1700
Accuracy without re-ranking: 0.8482352941176471
Accuracy with re-ranking: 0.8305882352941176


 54%|█████████████████████████████████████▍                                | 1750/3270 [15:24<07:15,  3.49it/s]

Step 1750
Accuracy without re-ranking: 0.848
Accuracy with re-ranking: 0.832


 55%|██████████████████████████████████████▌                               | 1800/3270 [15:44<08:02,  3.05it/s]

Step 1800
Accuracy without re-ranking: 0.8483333333333334
Accuracy with re-ranking: 0.8333333333333334


 57%|███████████████████████████████████████▌                              | 1850/3270 [16:06<09:23,  2.52it/s]

Step 1850
Accuracy without re-ranking: 0.8475675675675676
Accuracy with re-ranking: 0.8345945945945946


 58%|████████████████████████████████████████▋                             | 1900/3270 [16:25<08:27,  2.70it/s]

Step 1900
Accuracy without re-ranking: 0.8494736842105263
Accuracy with re-ranking: 0.8373684210526315


 60%|█████████████████████████████████████████▋                            | 1950/3270 [16:42<08:18,  2.65it/s]

Step 1950
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8374358974358974


 61%|██████████████████████████████████████████▊                           | 2000/3270 [17:03<07:24,  2.86it/s]

Step 2000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.838


 63%|███████████████████████████████████████████▉                          | 2050/3270 [17:18<06:03,  3.36it/s]

Step 2050
Accuracy without re-ranking: 0.8526829268292683
Accuracy with re-ranking: 0.8409756097560975


 64%|████████████████████████████████████████████▉                         | 2100/3270 [17:50<08:31,  2.29it/s]

Step 2100
Accuracy without re-ranking: 0.8514285714285714
Accuracy with re-ranking: 0.84


 66%|██████████████████████████████████████████████                        | 2150/3270 [18:25<11:22,  1.64it/s]

Step 2150
Accuracy without re-ranking: 0.8502325581395349
Accuracy with re-ranking: 0.8404651162790697


 67%|███████████████████████████████████████████████                       | 2200/3270 [18:56<06:31,  2.73it/s]

Step 2200
Accuracy without re-ranking: 0.8504545454545455
Accuracy with re-ranking: 0.8413636363636363


 69%|██████████████████████████████████████████████▊                     | 2250/3270 [19:53<1:04:46,  3.81s/it]

Step 2250
Accuracy without re-ranking: 0.8506666666666667
Accuracy with re-ranking: 0.8408888888888889


 70%|█████████████████████████████████████████████████▏                    | 2300/3270 [20:55<10:53,  1.48it/s]

Step 2300
Accuracy without re-ranking: 0.8504347826086956
Accuracy with re-ranking: 0.84


 72%|██████████████████████████████████████████████████▎                   | 2350/3270 [21:17<04:13,  3.64it/s]

Step 2350
Accuracy without re-ranking: 0.8506382978723405
Accuracy with re-ranking: 0.8408510638297872


 73%|███████████████████████████████████████████████████▍                  | 2400/3270 [21:36<03:47,  3.82it/s]

Step 2400
Accuracy without re-ranking: 0.85125
Accuracy with re-ranking: 0.8416666666666667


 75%|████████████████████████████████████████████████████▍                 | 2450/3270 [21:55<05:10,  2.64it/s]

Step 2450
Accuracy without re-ranking: 0.8526530612244898
Accuracy with re-ranking: 0.843265306122449


 76%|█████████████████████████████████████████████████████▌                | 2500/3270 [22:18<03:46,  3.40it/s]

Step 2500
Accuracy without re-ranking: 0.8524
Accuracy with re-ranking: 0.8428


 78%|██████████████████████████████████████████████████████▌               | 2550/3270 [22:49<05:34,  2.15it/s]

Step 2550
Accuracy without re-ranking: 0.8509803921568627
Accuracy with re-ranking: 0.84


 80%|███████████████████████████████████████████████████████▋              | 2600/3270 [23:17<10:46,  1.04it/s]

Step 2600
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.84


 81%|████████████████████████████████████████████████████████▋             | 2650/3270 [23:44<04:25,  2.33it/s]

Step 2650
Accuracy without re-ranking: 0.849811320754717
Accuracy with re-ranking: 0.8392452830188679


 83%|█████████████████████████████████████████████████████████▊            | 2701/3270 [24:09<03:46,  2.51it/s]

Step 2700
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.8392592592592593


 84%|██████████████████████████████████████████████████████████▊           | 2750/3270 [24:32<03:55,  2.21it/s]

Step 2750
Accuracy without re-ranking: 0.8501818181818181
Accuracy with re-ranking: 0.8385454545454546


 86%|███████████████████████████████████████████████████████████▉          | 2800/3270 [24:59<03:24,  2.30it/s]

Step 2800
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8382142857142857


 87%|█████████████████████████████████████████████████████████████         | 2850/3270 [25:23<03:55,  1.78it/s]

Step 2850
Accuracy without re-ranking: 0.8501754385964913
Accuracy with re-ranking: 0.8378947368421052


 89%|██████████████████████████████████████████████████████████████        | 2900/3270 [25:42<03:53,  1.59it/s]

Step 2900
Accuracy without re-ranking: 0.8510344827586207
Accuracy with re-ranking: 0.8379310344827586


 90%|███████████████████████████████████████████████████████████████▏      | 2950/3270 [26:01<02:22,  2.24it/s]

Step 2950
Accuracy without re-ranking: 0.8501694915254238
Accuracy with re-ranking: 0.8372881355932204


 92%|████████████████████████████████████████████████████████████████▏     | 3000/3270 [26:15<01:27,  3.08it/s]

Step 3000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.8393333333333334


 93%|█████████████████████████████████████████████████████████████████▎    | 3050/3270 [26:31<00:55,  3.94it/s]

Step 3050
Accuracy without re-ranking: 0.8511475409836066
Accuracy with re-ranking: 0.839344262295082


 95%|██████████████████████████████████████████████████████████████████▎   | 3100/3270 [26:50<00:49,  3.41it/s]

Step 3100
Accuracy without re-ranking: 0.8522580645161291
Accuracy with re-ranking: 0.8390322580645161


 96%|███████████████████████████████████████████████████████████████████▍  | 3150/3270 [27:07<00:33,  3.57it/s]

Step 3150
Accuracy without re-ranking: 0.8526984126984127
Accuracy with re-ranking: 0.8396825396825397


 98%|████████████████████████████████████████████████████████████████████▌ | 3201/3270 [27:31<00:20,  3.36it/s]

Step 3200
Accuracy without re-ranking: 0.8525
Accuracy with re-ranking: 0.83875


 99%|█████████████████████████████████████████████████████████████████████▌| 3250/3270 [27:50<00:05,  3.50it/s]

Step 3250
Accuracy without re-ranking: 0.8526153846153847
Accuracy with re-ranking: 0.8384615384615385


100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [27:57<00:00,  1.95it/s]


In [364]:
# Re-ranking got slightly better after 2 epochs.

In [120]:
raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

print(f'Using cross-encoder: {finetuned.config._name_or_path}')
print(f'Accuracy without re-ranking: {raw_accuracy}')
print(f'Accuracy with re-ranking: {reranked_accuracy}')


Using cross-encoder: <sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder object at 0x158c9cc70>
Accuracy without re-ranking: 0.8522935779816514
Accuracy with re-ranking: 0.8495412844036697


In [122]:
# pinecone.delete_index(INDEX_NAME)  # delete the index

# OPEN SOURCE ALTERNATIVE TO EMBEDDING

In [81]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')

docs = ["Around 9 Million people live in London", "London is known for its financial district"]

doc_emb = model.encode(docs, batch_size=32, show_progress_bar=True)

doc_emb.shape#  == ('2, 768')


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(2, 768)

In [379]:
#Encode query and documents
docs = dataset['validation']['passage']
doc_emb = model.encode(docs, batch_size=32, show_progress_bar=True)

100%|██████████████████████████████████████████████████████████████████████████| 33/33 [00:29<00:00,  1.11it/s]


In [380]:
from random import sample

query = sample(dataset['validation']['question'], 1)[0]
print(query)
final_results = get_results_from_pinecone(query, top_k=3, re_rank=True)


did greg oden win a championship with miami
Query: did greg oden win a championship with miami
Document ID (Hash)		Retrieval Score	CE Score	Text
dc737eb4be84a308b945fb697a67f922	0.82	0.35	On January 15, 2014, Oden made his long-awaited re
275270d2171ffe4d024015ab4ea3343b	0.78	0.05	Selected 5th overall in the 2003 NBA draft by the 
ed060ec9bb1129f14f06186a2c5d29b4	0.78	0.02	The 2012 NBA Finals was the championship series of


In [381]:
query_emb = model.encode(query)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

#Output passages & scores
for doc, score in doc_score_pairs[:3]:
    print(score, doc)


0.8174281120809472 On January 15, 2014, Oden made his long-awaited return to the court. In his first regular season game since December 2009, he recorded 6 points and 2 rebounds in 8 minutes of game time in a 114--97 loss to the Washington Wizards. On February 23, 2014, Oden made his first start since December 2009 in the Heat's 93--79 win over the Chicago Bulls. The Heat made the 2014 NBA Finals where they faced the San Antonio Spurs. They went on to lose the series in five games.
0.7841596747078865 Selected 5th overall in the 2003 NBA draft by the Miami Heat, Wade quickly emerged as a productive player on a youthful Miami Heat team and averaged 16.2 points on 46.5% shooting with averages of 4.0 rebounds and 4.5 assists per game. Wade is one of only four Marquette University players to be drafted in the first round; his is the highest draft selection in school history. After a 5--15 start, the Heat would gradually improve and finish 42--40 to qualify for the NBA playoffs. He further d

In [382]:
logger.setLevel(logging.CRITICAL)  # just to suppress some logs


def eval_ranking_open_source(query, cross_encoder, top_k=3):
    query_emb = np.array(get_embedding(query, engine=ENGINE))

    #Compute dot score between query and all document embeddings
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

    #Combine docs & scores
    doc_score_pairs = list(zip(docs, scores))

    #Sort by decreasing score
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)[:top_k]

    retrieved_hash = my_hash(doc_score_pairs[0][0])
    if cross_encoder:
        sentence_combinations = [[query, doc_score_pair[0]] for doc_score_pair in doc_score_pairs]
        similarity_scores = cross_encoder.predict(sentence_combinations)
        sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
        reranked_hash = my_hash(doc_score_pairs[sim_scores_argsort[0]][0])
    else:
        reranked_hash = None
    return retrieved_hash, reranked_hash


In [383]:
eval_ranking_open_source(query, finetuned)

('dc737eb4be84a308b945fb697a67f922', 'dc737eb4be84a308b945fb697a67f922')

In [384]:
logger.setLevel(logging.CRITICAL)

i = 0
print_every = 50
predictions = []
for question in tqdm(val_sample['question']):
    retrieved_hash, reranked_hash = eval_ranking_open_source(question, finetuned, top_k=3)
    correct_hash = q_to_hash[question]
    predictions.append((retrieved_hash == correct_hash, reranked_hash == correct_hash))
    i += 1
    if i % print_every == 0:
        print(f'Step {i}')
        raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
        reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

        print(f'Accuracy without re-ranking: {raw_accuracy}')
        print(f'Accuracy with re-ranking: {reranked_accuracy}')


  2%|█                                                                       | 51/3270 [00:16<11:01,  4.86it/s]

Step 50
Accuracy without re-ranking: 0.88
Accuracy with re-ranking: 0.84


  3%|██▏                                                                    | 100/3270 [00:31<14:50,  3.56it/s]

Step 100
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.85


  5%|███▎                                                                   | 151/3270 [00:49<10:45,  4.83it/s]

Step 150
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8533333333333334


  6%|████▎                                                                  | 200/3270 [01:09<15:19,  3.34it/s]

Step 200
Accuracy without re-ranking: 0.865
Accuracy with re-ranking: 0.855


  8%|█████▍                                                                 | 250/3270 [01:21<13:01,  3.86it/s]

Step 250
Accuracy without re-ranking: 0.872
Accuracy with re-ranking: 0.844


  9%|██████▌                                                                | 301/3270 [01:35<11:34,  4.27it/s]

Step 300
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.8366666666666667


 11%|███████▌                                                               | 350/3270 [01:49<16:06,  3.02it/s]

Step 350
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8457142857142858


 12%|████████▋                                                              | 401/3270 [02:00<08:31,  5.61it/s]

Step 400
Accuracy without re-ranking: 0.8625
Accuracy with re-ranking: 0.8475


 14%|█████████▊                                                             | 451/3270 [02:16<09:10,  5.12it/s]

Step 450
Accuracy without re-ranking: 0.8577777777777778
Accuracy with re-ranking: 0.8466666666666667


 15%|██████████▊                                                            | 500/3270 [02:34<12:59,  3.55it/s]

Step 500
Accuracy without re-ranking: 0.852
Accuracy with re-ranking: 0.844


 17%|███████████▉                                                           | 551/3270 [02:51<12:22,  3.66it/s]

Step 550
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8381818181818181


 18%|█████████████                                                          | 600/3270 [03:03<09:46,  4.55it/s]

Step 600
Accuracy without re-ranking: 0.8383333333333334
Accuracy with re-ranking: 0.8366666666666667


 20%|██████████████▏                                                        | 651/3270 [03:17<11:08,  3.92it/s]

Step 650
Accuracy without re-ranking: 0.8369230769230769
Accuracy with re-ranking: 0.8369230769230769


 21%|███████████████▏                                                       | 701/3270 [03:30<15:07,  2.83it/s]

Step 700
Accuracy without re-ranking: 0.8385714285714285
Accuracy with re-ranking: 0.8371428571428572


 23%|████████████████▎                                                      | 751/3270 [03:42<08:56,  4.69it/s]

Step 750
Accuracy without re-ranking: 0.832
Accuracy with re-ranking: 0.8306666666666667


 24%|█████████████████▎                                                     | 800/3270 [03:58<09:16,  4.44it/s]

Step 800
Accuracy without re-ranking: 0.835
Accuracy with re-ranking: 0.83375


 26%|██████████████████▍                                                    | 851/3270 [04:13<12:11,  3.31it/s]

Step 850
Accuracy without re-ranking: 0.8352941176470589
Accuracy with re-ranking: 0.8341176470588235


 28%|███████████████████▌                                                   | 900/3270 [04:25<15:00,  2.63it/s]

Step 900
Accuracy without re-ranking: 0.8344444444444444
Accuracy with re-ranking: 0.83


 29%|████████████████████▋                                                  | 950/3270 [04:39<09:11,  4.21it/s]

Step 950
Accuracy without re-ranking: 0.8378947368421052
Accuracy with re-ranking: 0.8294736842105264


 31%|█████████████████████▍                                                | 1001/3270 [04:53<12:52,  2.94it/s]

Step 1000
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.83


 32%|██████████████████████▍                                               | 1050/3270 [05:06<08:25,  4.39it/s]

Step 1050
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.8276190476190476


 34%|███████████████████████▌                                              | 1100/3270 [05:19<10:14,  3.53it/s]

Step 1100
Accuracy without re-ranking: 0.8418181818181818
Accuracy with re-ranking: 0.8281818181818181


 35%|████████████████████████▌                                             | 1150/3270 [05:33<21:36,  1.63it/s]

Step 1150
Accuracy without re-ranking: 0.8443478260869566
Accuracy with re-ranking: 0.8295652173913044


 37%|█████████████████████████▋                                            | 1200/3270 [05:51<12:57,  2.66it/s]

Step 1200
Accuracy without re-ranking: 0.8458333333333333
Accuracy with re-ranking: 0.8325


 38%|██████████████████████████▊                                           | 1251/3270 [06:08<06:36,  5.09it/s]

Step 1250
Accuracy without re-ranking: 0.8488
Accuracy with re-ranking: 0.8344


 40%|███████████████████████████▊                                          | 1301/3270 [06:21<08:24,  3.90it/s]

Step 1300
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8369230769230769


 41%|████████████████████████████▉                                         | 1351/3270 [06:35<07:35,  4.21it/s]

Step 1350
Accuracy without re-ranking: 0.8511111111111112
Accuracy with re-ranking: 0.8385185185185186


 43%|█████████████████████████████▉                                        | 1401/3270 [06:49<09:22,  3.32it/s]

Step 1400
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8371428571428572


 44%|███████████████████████████████                                       | 1450/3270 [07:05<06:45,  4.48it/s]

Step 1450
Accuracy without re-ranking: 0.8475862068965517
Accuracy with re-ranking: 0.8358620689655173


 46%|████████████████████████████████                                      | 1500/3270 [07:16<05:52,  5.03it/s]

Step 1500
Accuracy without re-ranking: 0.846
Accuracy with re-ranking: 0.834


 47%|█████████████████████████████████▏                                    | 1550/3270 [07:29<06:51,  4.18it/s]

Step 1550
Accuracy without re-ranking: 0.8464516129032258
Accuracy with re-ranking: 0.8374193548387097


 49%|██████████████████████████████████▎                                   | 1600/3270 [07:43<05:22,  5.19it/s]

Step 1600
Accuracy without re-ranking: 0.845625
Accuracy with re-ranking: 0.835625


 50%|███████████████████████████████████▎                                  | 1650/3270 [07:58<21:24,  1.26it/s]

Step 1650
Accuracy without re-ranking: 0.8460606060606061
Accuracy with re-ranking: 0.8357575757575758


 52%|████████████████████████████████████▍                                 | 1700/3270 [08:13<07:15,  3.61it/s]

Step 1700
Accuracy without re-ranking: 0.8482352941176471
Accuracy with re-ranking: 0.8358823529411765


 54%|█████████████████████████████████████▍                                | 1751/3270 [08:30<05:48,  4.36it/s]

Step 1750
Accuracy without re-ranking: 0.848
Accuracy with re-ranking: 0.8371428571428572


 55%|██████████████████████████████████████▌                               | 1800/3270 [08:45<06:40,  3.67it/s]

Step 1800
Accuracy without re-ranking: 0.8483333333333334
Accuracy with re-ranking: 0.8383333333333334


 57%|███████████████████████████████████████▌                              | 1850/3270 [08:58<11:07,  2.13it/s]

Step 1850
Accuracy without re-ranking: 0.8475675675675676
Accuracy with re-ranking: 0.8394594594594594


 58%|████████████████████████████████████████▋                             | 1901/3270 [09:12<05:18,  4.30it/s]

Step 1900
Accuracy without re-ranking: 0.8494736842105263
Accuracy with re-ranking: 0.8415789473684211


 60%|█████████████████████████████████████████▊                            | 1951/3270 [09:28<03:51,  5.69it/s]

Step 1950
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.841025641025641


 61%|██████████████████████████████████████████▊                           | 2001/3270 [09:41<04:26,  4.77it/s]

Step 2000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.8415


 63%|███████████████████████████████████████████▉                          | 2050/3270 [09:52<04:36,  4.41it/s]

Step 2050
Accuracy without re-ranking: 0.8526829268292683
Accuracy with re-ranking: 0.844390243902439


 64%|████████████████████████████████████████████▉                         | 2101/3270 [10:10<04:09,  4.68it/s]

Step 2100
Accuracy without re-ranking: 0.8514285714285714
Accuracy with re-ranking: 0.8433333333333334


 66%|██████████████████████████████████████████████                        | 2150/3270 [10:20<04:18,  4.33it/s]

Step 2150
Accuracy without re-ranking: 0.8502325581395349
Accuracy with re-ranking: 0.8437209302325581


 67%|███████████████████████████████████████████████                       | 2200/3270 [10:34<03:47,  4.71it/s]

Step 2200
Accuracy without re-ranking: 0.8504545454545455
Accuracy with re-ranking: 0.8445454545454546


 69%|████████████████████████████████████████████████▏                     | 2251/3270 [10:49<03:06,  5.45it/s]

Step 2250
Accuracy without re-ranking: 0.8506666666666667
Accuracy with re-ranking: 0.8444444444444444


 70%|█████████████████████████████████████████████████▏                    | 2300/3270 [11:03<04:24,  3.66it/s]

Step 2300
Accuracy without re-ranking: 0.8504347826086956
Accuracy with re-ranking: 0.8434782608695652


 72%|██████████████████████████████████████████████████▎                   | 2350/3270 [11:20<05:43,  2.68it/s]

Step 2350
Accuracy without re-ranking: 0.8506382978723405
Accuracy with re-ranking: 0.8442553191489361


 73%|███████████████████████████████████████████████████▍                  | 2400/3270 [11:38<03:53,  3.73it/s]

Step 2400
Accuracy without re-ranking: 0.85125
Accuracy with re-ranking: 0.845


 75%|████████████████████████████████████████████████████▍                 | 2450/3270 [11:49<03:46,  3.62it/s]

Step 2450
Accuracy without re-ranking: 0.8526530612244898
Accuracy with re-ranking: 0.846530612244898


 76%|█████████████████████████████████████████████████████▌                | 2501/3270 [12:03<08:41,  1.47it/s]

Step 2500
Accuracy without re-ranking: 0.8524
Accuracy with re-ranking: 0.846


 78%|██████████████████████████████████████████████████████▌               | 2550/3270 [12:15<02:05,  5.73it/s]

Step 2550
Accuracy without re-ranking: 0.8509803921568627
Accuracy with re-ranking: 0.8431372549019608


 80%|███████████████████████████████████████████████████████▋              | 2600/3270 [12:29<03:24,  3.28it/s]

Step 2600
Accuracy without re-ranking: 0.8492307692307692
Accuracy with re-ranking: 0.8430769230769231


 81%|████████████████████████████████████████████████████████▋             | 2650/3270 [12:43<02:06,  4.89it/s]

Step 2650
Accuracy without re-ranking: 0.849811320754717
Accuracy with re-ranking: 0.8422641509433962


 83%|█████████████████████████████████████████████████████████▊            | 2700/3270 [13:02<03:49,  2.48it/s]

Step 2700
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.8422222222222222


 84%|██████████████████████████████████████████████████████████▊           | 2750/3270 [13:15<02:16,  3.81it/s]

Step 2750
Accuracy without re-ranking: 0.8501818181818181
Accuracy with re-ranking: 0.8414545454545455


 86%|███████████████████████████████████████████████████████████▉          | 2801/3270 [13:36<01:48,  4.34it/s]

Step 2800
Accuracy without re-ranking: 0.8492857142857143
Accuracy with re-ranking: 0.8410714285714286


 87%|█████████████████████████████████████████████████████████████         | 2851/3270 [13:47<01:30,  4.61it/s]

Step 2850
Accuracy without re-ranking: 0.8501754385964913
Accuracy with re-ranking: 0.8417543859649123


 89%|██████████████████████████████████████████████████████████████        | 2900/3270 [14:00<01:12,  5.10it/s]

Step 2900
Accuracy without re-ranking: 0.8510344827586207
Accuracy with re-ranking: 0.8420689655172414


 90%|███████████████████████████████████████████████████████████████▏      | 2951/3270 [14:14<01:44,  3.06it/s]

Step 2950
Accuracy without re-ranking: 0.8501694915254238
Accuracy with re-ranking: 0.8410169491525423


 92%|████████████████████████████████████████████████████████████████▏     | 3001/3270 [14:26<00:49,  5.39it/s]

Step 3000
Accuracy without re-ranking: 0.851
Accuracy with re-ranking: 0.8426666666666667


 93%|█████████████████████████████████████████████████████████████████▎    | 3051/3270 [14:39<00:47,  4.66it/s]

Step 3050
Accuracy without re-ranking: 0.8511475409836066
Accuracy with re-ranking: 0.8426229508196721


 95%|██████████████████████████████████████████████████████████████████▎   | 3100/3270 [14:53<00:36,  4.63it/s]

Step 3100
Accuracy without re-ranking: 0.8522580645161291
Accuracy with re-ranking: 0.8425806451612903


 96%|███████████████████████████████████████████████████████████████████▍  | 3150/3270 [15:06<00:24,  4.81it/s]

Step 3150
Accuracy without re-ranking: 0.8526984126984127
Accuracy with re-ranking: 0.8431746031746031


 98%|████████████████████████████████████████████████████████████████████▌ | 3201/3270 [15:21<00:17,  4.06it/s]

Step 3200
Accuracy without re-ranking: 0.8525
Accuracy with re-ranking: 0.8425


 99%|█████████████████████████████████████████████████████████████████████▌| 3251/3270 [15:31<00:03,  5.11it/s]

Step 3250
Accuracy without re-ranking: 0.8529230769230769
Accuracy with re-ranking: 0.8427692307692307


100%|██████████████████████████████████████████████████████████████████████| 3270/3270 [15:35<00:00,  3.49it/s]


In [385]:
raw_accuracy = sum([p[0] for p in predictions])/len(predictions)
reranked_accuracy = sum([p[1] for p in predictions])/len(predictions)

print(f'Using cross-encoder: {finetuned.config._name_or_path}')
print(f'Accuracy without re-ranking: {raw_accuracy}')
print(f'Accuracy with re-ranking: {reranked_accuracy}')


Using cross-encoder: cross-encoder/mmarco-mMiniLMv2-L12-H384-v1
Accuracy without re-ranking: 0.8525993883792049
Accuracy with re-ranking: 0.8434250764525993
