# Dense Retrieval

## Install Requirements

In [12]:
!pip install -q -r requirements.txt # remove -q to see the logs

# Cohere API key
Create a `.env` file and set the `COHERE_API_KEY` variable in it to your Cohere API key.

In [13]:
import os
from dotenv import load_dotenv
# Load the variables defined in the local ".env" file (python-dotenv)
load_dotenv(".env")

# Read the Cohere API key from the environment; os.getenv returns None when the variable is unset
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

# Imports

In [1]:
import cohere
import numpy as np
import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex

# Interstellar

In [2]:
text = """
Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. 
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. 
Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.
 
Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007. 
Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. 
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. 
Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. 
Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects.
 
Interstellar premiered on October 26, 2014, in Los Angeles. 
In the United States, it was first released on film stock, expanding to venues using digital projectors. 
The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014. 
It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight. 
It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics. Since its premiere, Interstellar gained a cult following,[5] and now is regarded by many sci-fi experts as one of the best science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades"""

In [3]:
# Cut the passage into sentence-sized pieces at every period
split_mark = "."
splitted_text = text.split(sep=split_mark)

In [4]:
# Strip surrounding whitespace and drop fragments that are empty after stripping
# (e.g. the trailing piece produced when the text ends with the split mark),
# so blank strings are never sent to the embedding model.
cleaned_texts = [stripped for stripped in (t.strip() for t in splitted_text) if stripped]
cleaned_texts[:2]

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan',
 'It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine']

In [5]:
# Keep the sentences in a NumPy array so `search` can index them with a list of neighbor ids
texts = np.array(cleaned_texts)

In [6]:
# save texts
np.save("texts.npy", texts)

## Create Cohere connection

In [22]:
cohere_client = cohere.Client(COHERE_API_KEY)

In [23]:
# Get embeddings: a single embed call for all sentences; `response` holds the returned embeddings
response = cohere_client.embed(texts=cleaned_texts).embeddings
 
# Convert to a NumPy array and print its shape as a sanity check
embeds = np.array(response)
print(embeds.shape)

(15, 4096)


# Save the embeddings (Optional)

In [24]:
np.save("interstellar_embeds.npy", embeds)

# Build the search Index

In [None]:
# Load embeds (Optional)
embeds = np.load("interstellar_embeds.npy")

In [25]:
print(embeds.shape) # make sure the loading is correct

(15, 4096)


In [26]:
# Create the search index, pass the size of embedding and search metric
search_index = AnnoyIndex(embeds.shape[1], 'angular')

In [28]:
# Register every embedding in the index, using its row position as the item id
for item_id, vector in enumerate(embeds):
    search_index.add_item(item_id, vector)
print("[INFO] Add the embeddings are added to search index")

[INFO] Add the embeddings are added to search index


In [29]:
# After building no more items can be added
search_index.build(n_trees=10, n_jobs=4) 

True

In [30]:
# Save the search_index
search_index.save('interstellar_search_index.ann')

True

In [50]:
def search(query: str, n_neighbors: int = 3, include_distances: bool = True) -> pd.DataFrame:
  """Return the indexed sentences closest to ``query``.

  Uses the notebook-level ``cohere_client``, ``search_index`` and ``texts``
  objects created in earlier cells.

  Args:
    query: Free-text question to embed and look up.
    n_neighbors: Maximum number of neighbors to retrieve from the index.
    include_distances: When True the result has a ``distance`` column;
      when False only the ``texts`` column is returned.

  Returns:
    A DataFrame with a ``texts`` column and, if requested, a ``distance``
    column, in the order returned by the index.
  """
  # 1. Get the query's embedding
  query_embed = cohere_client.embed(texts=[query]).embeddings[0]

  # 2. Retrieve the nearest neighbors.
  # get_nns_by_vector returns (ids, distances) only when include_distances is
  # True; otherwise it returns the bare id list, so unpacking two values
  # unconditionally would raise (or silently mis-assign when exactly two ids
  # come back).
  neighbors = search_index.get_nns_by_vector(query_embed,
                                             n=n_neighbors,
                                             include_distances=include_distances)
  if include_distances:
    similar_item_ids, distances = neighbors
  else:
    similar_item_ids, distances = neighbors, None

  # 3. Format the results to pandas (a list index selects one row per id)
  data = {'texts': texts[list(similar_item_ids)]}
  if distances is not None:
    data['distance'] = list(distances)

  return pd.DataFrame(data=data)

# Do some searches

In [51]:
query = "How much did the film make?"
search(query)

Unnamed: 0,texts,distance
0,The film had a worldwide gross over $677 milli...,0.815881
1,"It stars Matthew McConaughey, Anne Hathaway, J...",1.066906
2,"In the United States, it was first released on...",1.086965


In [48]:
query = "How was the movie released"
search(query, n_neighbors=1)

Unnamed: 0,texts,distance
0,"In the United States, it was first released on...",0.874286


# Caveats

In [49]:
query = "Who is Pooya Mohammadi?"
search(query, n_neighbors=1)

Unnamed: 0,texts,distance
0,Caltech theoretical physicist and 2017 Nobel l...,1.276832


As you can see, when the answer is not in the embedded text the search still returns a result, and that result is wrong. To avoid this, we can add a distance threshold to the search function so that matches that are too far from the query are discarded.

In [58]:
def search(query: str, n_neighbors: int = 3, include_distances: bool = True,
           t_distance: float = 1.0) -> pd.DataFrame:
  """Return the indexed sentences closest to ``query``, dropping weak matches.

  Uses the notebook-level ``cohere_client``, ``search_index`` and ``texts``
  objects created in earlier cells.

  Args:
    query: Free-text question to embed and look up.
    n_neighbors: Maximum number of neighbors to retrieve from the index.
    include_distances: When True the result has a ``distance`` column;
      when False only the ``texts`` column is returned.
    t_distance: Distance threshold. Only neighbors whose distance is
      strictly below this value are kept. The value is expressed in the
      metric the index was created with.

  Returns:
    A DataFrame of the retained neighbors in index order. It is empty (and an
    [INFO] message is printed) when no neighbor is below the threshold.
  """
  # 1. Get the query's embedding
  query_embed = cohere_client.embed(texts=[query]).embeddings[0]

  # 2. Retrieve the nearest neighbors. Distances are always requested because
  # the threshold needs them, whatever the caller asked to see in the output.
  similar_item_ids, distances = search_index.get_nns_by_vector(query_embed,
                                                               n=n_neighbors,
                                                               include_distances=True)

  # 3. Keep only the neighbors that are close enough to the query
  filtered_output = [(id_, distance)
                     for id_, distance in zip(similar_item_ids, distances)
                     if distance < t_distance]
  if filtered_output:
    # Index with lists: a tuple of several ids would be read by NumPy as a
    # multi-axis index and raise IndexError on the 1-D `texts` array.
    kept_ids = [id_ for id_, _ in filtered_output]
    kept_distances = [distance for _, distance in filtered_output]
    results = pd.DataFrame(data={'texts': texts[kept_ids],
                                 'distance': kept_distances})
  else:
    print(f"[INFO] No results found with t: {t_distance}, setting results to empty Dataframe")
    results = pd.DataFrame(data={"texts": [], "distance": []})

  # 4. Honor include_distances only in the shape of the returned frame
  if not include_distances:
    results = results[['texts']]

  return results

In [59]:
query = "How was the movie released"
search(query, n_neighbors=1)

Unnamed: 0,texts,distance
0,"In the United States, it was first released on...",0.874286


In [60]:
query = "Who is Pooya Mohammadi?"
search(query, n_neighbors=1)

[INFO] No results found with t: 1.0, setting results to empty Dataframe


Unnamed: 0,texts,distance


*_:)_*