## IN THIS NOTEBOOK, WE WILL EXPLORE COLBERT AS A RERANKER AND RETRIEVER IN LOCAL MODE.

* If you want to build a server from your local ColBERT index, please refer to the [ColBERT `server.py` example](https://github.com/stanford-futuredata/ColBERT/blob/main/server.py).

In [2]:
from colbert.infra.config import ColBERTConfig

In [3]:
import os

# Optional debugging aid: this environment variable can be set before ColBERT is used.
os.environ.update({'COLBERT_LOAD_TORCH_EXTENSION_VERBOSE': "True"})

## Let's review the ColBERT config class (`ColBERTConfig`)

In [1]:
# You can view the different attributes of the ColBERT config by uncommenting the lines below
# for k,v in ColBERTConfig().__dict__.items():
#     print(f"{k} --> {v}")

In [3]:
# Toy corpus of short sayings used throughout this notebook.
# The order matters: the `pid` values shown in the recorded outputs are
# positions in this list (e.g. pid 33 is 'No pain, no gain.').
passages = [
    "It's a piece of cake.",
    "Don't put off until tomorrow what you can do today.",
    'To kill two birds with one stone.',
    'Actions speak louder than words.',
    'Honesty is the best policy.',
    'If you want something done right, do it yourself.',
    'The best things in life are free.',
    "Don't count your chickens before they hatch.",
    'She sells seashells by the seashore.',
    'Practice makes perfect.',
    "Where there's a will, there's a way.",
    'Absence makes the heart grow fonder.',
    'When the going gets tough, the tough get going.',
    'A journey of a thousand miles begins with a single step.',
    "You can't have your cake and eat it too.",
    "If you can't beat them, join them.",
    'Keep your friends close and your enemies closer.',
    "Don't put all your eggs in one basket.",
    "All's fair in love and war.",
    'Every dog has its day.',
    'All good things must come to an end.',
    'Once bitten, twice shy.',
    "The apple doesn't fall far from the tree.",
    'A penny saved is a penny earned.',
    "Don't bite the hand that feeds you.",
    'You reap what you sow.',
    'An apple a day keeps the doctor away.',
    "One man's trash is another man's treasure.",
    'The squeaky wheel gets the grease.',
    'A picture is worth a thousand words.',
    'Fortune favors the bold.',
    'Practice what you preach.',
    'A watched pot never boils.',
    'No pain, no gain.',
    "You can't make an omelet without breaking eggs.",
    "There's no place like home.",
    'Ask and you shall receive.',
    'Let sleeping dogs lie.',
    'If the shoe fits, wear it.',
    'Every cloud has a silver lining.',
    'Look before you leap.',
    'The more, the merrier.',
    'The grass is always greener on the other side.',
    'Beauty is only skin deep.',
    "Two wrongs don't make a right.",
    'Beauty is in the eye of the beholder.',
    'Necessity is the mother of invention.',
    'Out of sight, out of mind.',
    'Patience is a virtue.',
    'Curiosity killed the cat.',
    "If at first you don't succeed, try, try again.",
    "Beggars can't be choosers.",
    'Too many cooks spoil the broth.',
    'Easy come, easy go.',
    "Don't cry over spilled milk.",
    "There's no such thing as a free lunch.",
    'A bird in the hand is worth two in the bush.',
    'Good things come to those who wait.',
    'The quick brown fox jumps over the lazy dog.',
    'It takes two to tango.',
    'A friend in need is a friend indeed.',
    'Like father, like son.',
    'Let bygones be bygones.',
    'Kill two birds with one stone.',
    'A penny for your thoughts.',
    'I am the master of my fate, I am the captain of my soul.',
    'The pen is mightier than the sword.',
    'When in Rome, do as the Romans do.',
    "Rome wasn't built in a day.",
    "You can't judge a book by its cover.",
    "It's raining cats and dogs.",
    'Make hay while the sun shines.',
    "It's better to be safe than sorry.",
    'The early bird catches the worm.',
    'To be or not to be, that is the question.',
    'Better late than never.',
]

## This tutorial runs from the `examples/integrations/tutorials` folder, hence we need to add the repository root to the system path so that `dspy` can be imported

* If you have installed the dspy package, you don't need to run the cell below.

In [4]:
import sys

# Add the directory three levels up (relative to the working directory)
# to the import search path.
sys.path.extend(["../../.."])

## COLBERT AS RETRIEVER

In [7]:
import dspy

# Start from a default ColBERT configuration and set the index name,
# experiment name and checkpoint used by the retriever.
colbert_config = ColBERTConfig()
colbert_config.index_name = "Colbert-RM"
colbert_config.experiment = "Colbert-Experiment"
colbert_config.checkpoint = "colbert-ir/colbertv2.0"

# Local ColBERT retriever over the `passages` list.
# NOTE(review): load_only=False presumably builds the index instead of only
# loading an existing one -- confirm against the dspy implementation.
colbert_retriever = dspy.ColBERTv2RetrieverLocal(
    passages=passages,
    colbert_config=colbert_config,
    load_only=False,
)

In [8]:
# Configure DSPy to use the local ColBERT retriever as its retrieval model (`rm`).
dspy.settings.configure(rm=colbert_retriever)

# Retrieval module created with k=5; despite its name, `retrieved_docs` is the
# callable used to run queries, not a list of documents.
retrieved_docs = dspy.Retrieve(k=5)

In [9]:
# Run the retrieval module on a single query string.
pred = retrieved_docs(
    "What is the meaning of life?"
)




#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What is the meaning of life?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([ 101,    1, 2054, 2003, 1996, 3574, 1997, 2166, 1029,  102,  103,  103,
         103,  103,  103,  103,  103,  103,  103,  103,  103,  103,  103,  103,
         103,  103,  103,  103,  103,  103,  103,  103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')



In [10]:
# Display the result of the single-query retrieval.
pred

Prediction(
    score=[nan, nan, nan, nan, nan],
    pid=[33, 6, 47, 74, 48],
    passages=['No pain, no gain.', 'The best things in life are free.', 'Out of sight, out of mind.', 'To be or not to be, that is the question.', 'Patience is a virtue.']
)

In [11]:
# Run the retrieval module on a list of queries in one call.
# NOTE(review): by_prob=False presumably turns off probability-based merging of
# the per-query results -- confirm against the dspy.Retrieve documentation.
multiple_pred = retrieved_docs(
    ["What is the meaning of life?","Meaning of pain?"],by_prob=False
)



In [12]:
# Display the results of the multi-query retrieval.
multiple_pred

[Prediction(
     score=[nan, nan, nan, nan, nan],
     pid=[33, 6, 47, 74, 48],
     passages=['No pain, no gain.', 'The best things in life are free.', 'Out of sight, out of mind.', 'To be or not to be, that is the question.', 'Patience is a virtue.']
 ),
 Prediction(
     score=[nan, nan, nan, nan, nan],
     pid=[16, 0, 47, 74, 26],
     passages=['Keep your friends close and your enemies closer.', "It's a piece of cake.", 'Out of sight, out of mind.', 'To be or not to be, that is the question.', 'An apple a day keeps the doctor away.']
 )]

## COLBERT AS RERANKER

In [13]:
# Rebind `colbert_config` to a fresh default configuration for the reranker
# and give it its own index name.
colbert_config = ColBERTConfig()
colbert_config.index_name = 'colbert-ir-index'

# Local ColBERT reranker using the same checkpoint name as the retriever above.
colbert_reranker = dspy.ColBERTv2RerankerLocal(
    colbert_config=colbert_config,
    checkpoint='colbert-ir/colbertv2.0',
)

In [14]:
# Configure DSPy with both the retriever (`rm`) and the reranker (`reranker`).
dspy.settings.configure(rm=colbert_retriever,reranker=colbert_reranker)

# Retrieve-then-rerank module created with k=5.
retrieve_rerank = dspy.RetrieveThenRerank(k=5)

In [15]:
# Run retrieve-then-rerank on a list of queries; this rebinds `pred`.
pred = retrieve_rerank(
    ["What is the meaning of life?","Meaning of pain?"]
)

In [16]:
# Display the reranked results.
pred

[Prediction(
     score=[nan, nan, nan, nan, nan],
     pid=[6, 48, 74, 47, 33],
     rerank_score=[15.8359375, 14.2109375, 12.5703125, 11.7890625, 9.1796875],
     passages=['The best things in life are free.', 'Patience is a virtue.', 'To be or not to be, that is the question.', 'Out of sight, out of mind.', 'No pain, no gain.']
 ),
 Prediction(
     score=[nan, nan, nan, nan, nan],
     pid=[33, 0, 47, 74, 16],
     rerank_score=[19.828125, 12.2890625, 11.171875, 9.09375, 6.8984375],
     passages=['No pain, no gain.', "It's a piece of cake.", 'Out of sight, out of mind.', 'To be or not to be, that is the question.', 'Keep your friends close and your enemies closer.']
 )]

## YOU CAN ALSO USE THE COLBERT RERANKER AS A STANDALONE MODEL

In [None]:
# !pip install tabulate

In [17]:
import numpy as np
import tabulate

# Score a subset of the passages against one query by calling the reranker directly.
scores_arr = colbert_reranker(
    "What is the meaning of life and pain?",
    # Pass a subset of passages
    passages[:10],
)

# One [sentence, score] row per scored passage, highest score first
# (argsort is ascending, so the index order is reversed).
tabulate_data = [
    [passages[idx], scores_arr[idx]]
    for idx in np.argsort(scores_arr)[::-1]
]

# The headers must be an ordered sequence. A set literal has no defined
# iteration order, so the labels could end up over the wrong columns.
table = tabulate.tabulate(
    tabulate_data,
    tablefmt="html",
    headers=["sentence", "score"],
)

In [19]:
from IPython.display import HTML, display
# Render the HTML string held in `table` inline in the notebook.
display(HTML(table))

sentence,score
The best things in life are free.,12.5156
It's a piece of cake.,10.0
Practice makes perfect.,8.27344
Honesty is the best policy.,7.57422
To kill two birds with one stone.,7.51953
Actions speak louder than words.,7.05469
"If you want something done right, do it yourself.",6.52344
Don't put off until tomorrow what you can do today.,3.78711
She sells seashells by the seashore.,2.77148
Don't count your chickens before they hatch.,1.82227
