# Dense Retrieval

In [1]:
!pip install Annoy

Collecting Annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
     ---------------------------------------- 0.0/647.5 kB ? eta -:--:--
     ------------------------------ ------- 524.3/647.5 kB 3.4 MB/s eta 0:00:01
     -------------------------------------- 647.5/647.5 kB 2.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: Annoy
  Building wheel for Annoy (setup.py): started
  Building wheel for Annoy (setup.py): finished with status 'done'
  Created wheel for Annoy: filename=annoy-1.17.3-cp39-cp39-win_amd64.whl size=52380 sha256=9ac745a2f8548ea7f87c72e26a016ff19bc146fbf79e22408e7c9d2282ce1a36
  Stored in directory: c:\users\saleena das\appdata\local\pip\cache\wheels\09\a9\54\37478e65995fe712f7da465749da9ddb21db6b1a599d591ac7
Successfully built Annoy
Installing collected packages: Annoy
Successfully installed Annoy-1.17.3



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import sys
# Print the version string of the interpreter backing this kernel.
print(sys.version)

3.9.20 (main, Oct  3 2024, 07:38:01) [MSC v.1929 64 bit (AMD64)]


In [2]:
import annoy
# Print the file path of the annoy package that the kernel actually imported.
print(annoy.__file__)

d:\Anaconda\envs\tf\lib\site-packages\annoy\__init__.py


In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Load the .env file located by find_dotenv(); the return value is bound to `_`
# so the cell displays nothing.
_ = load_dotenv(find_dotenv())

In [2]:
import cohere
import os
# Build the Cohere client from the COHERE_API_KEY environment variable.
# os.environ[...] raises KeyError when the variable is not set.
co = cohere.Client(os.environ['COHERE_API_KEY'])

In [3]:
import weaviate
import os
# API-key credentials object built from the WEAVIATE_API_KEY environment variable
# (KeyError if unset).
# NOTE(review): `auth_config` is not referenced by any other visible cell; the
# connections below build their credentials with Auth.api_key(...) instead.
auth_config = weaviate.auth.AuthApiKey(
    api_key=os.environ['WEAVIATE_API_KEY'])

In [None]:
from weaviate import connect_to_weaviate_cloud
from weaviate.classes.init import Auth
import os

# Connectivity smoke test: open a connection to the cluster named by
# WEAVIATE_URL, report whether it is ready, then release the connection.
client = connect_to_weaviate_cloud(
    cluster_url=os.getenv("WEAVIATE_URL"),
    auth_credentials=Auth.api_key(os.getenv("WEAVIATE_API_KEY"))
)

try:
    print(client.is_ready())
finally:
    # Close the connection even when the readiness probe raises, so a failed
    # check does not leave an open connection behind in the kernel.
    # NOTE(review): `client` is closed once this cell finishes; any later cell
    # that uses `client` needs a fresh connection first.
    client.close()


True


## Vector Database for Semantic Search

In [15]:
def dense_retrieval(query,
                    results_lang='en',
                    properties = ["text", "title", "url", "views", "lang", "_additional {distance}"],
                    num_results=5):
    """Near-text search over the "Articles" class, restricted to one language.

    Args:
        query: Free-text query, passed as the single near-text concept.
        results_lang: Value the `lang` property must equal.
        properties: Property names requested for every returned object.
        num_results: Maximum number of objects to return.

    Returns:
        The list found under response['data']['Get']['Articles'].

    NOTE(review): this builder-style call chain (client.query.get(...).do())
    looks like the weaviate-client v3 query API, while the notebook connects
    with connect_to_weaviate_cloud; confirm which client version is intended.
    This definition is also shadowed by later cells that redefine
    dense_retrieval.
    """
    # Keep only objects whose `lang` property equals the requested language.
    language_filter = dict(path=["lang"], operator="Equal", valueString=results_lang)

    query_builder = client.query.get("Articles", properties)
    query_builder = query_builder.with_near_text({"concepts": [query]})
    query_builder = query_builder.with_where(language_filter)
    query_builder = query_builder.with_limit(num_results)

    return query_builder.do()['data']['Get']['Articles']

In [24]:
from weaviate.collections.classes.filters import Filter
from weaviate.classes.query import MetadataQuery

def dense_retrieval(query,
                    results_lang='en',
                    num_results=5):
    """Near-text search over the "Articles" collection, restricted to one language.

    Args:
        query: Free-text query to search for.
        results_lang: Value the `lang` property must equal.
        num_results: Maximum number of objects to return.

    Returns:
        A list of dicts, one per hit, with the keys "title", "text", "url",
        "views", "lang" (taken from the object's properties, None when a
        property is absent) and "distance" (taken from the object's metadata).
    """
    articles = client.collections.get("Articles")

    # Filters are built through the Filter.by_property(...) factory; calling
    # Filter(path=..., operator=..., value_text=...) directly is not a supported
    # way to construct a filter in the collections API.
    lang_filter = Filter.by_property("lang").equal(results_lang)

    response = articles.query.near_text(
        query=query,
        filters=lang_filter,
        return_metadata=MetadataQuery(distance=True),  # needed for obj.metadata.distance
        limit=num_results
    )

    result = []
    for obj in response.objects:
        result.append({
            "title": obj.properties.get("title"),
            "text": obj.properties.get("text"),
            "url": obj.properties.get("url"),
            "views": obj.properties.get("views"),
            "lang": obj.properties.get("lang"),
            "distance": obj.metadata.distance
        })

    return result

In [27]:
from weaviate.collections.classes.filters import Filter
from weaviate.classes.query import MetadataQuery

def dense_retrieval(query, results_lang='en', num_results=5):
    """Return the closest "Articles" objects to `query` for a single language.

    Args:
        query: Free-text query to search for.
        results_lang: Value the `lang` property must equal.
        num_results: Maximum number of objects to return.

    Returns:
        A list of dicts with the keys "title", "text", "url", "views", "lang"
        (read from each object's properties, None when missing) followed by
        "distance" (read from the object's metadata).
    """
    response = client.collections.get("Articles").query.near_text(
        query=query,
        filters=Filter.by_property("lang").equal(results_lang),
        return_metadata=MetadataQuery(distance=True),
        limit=num_results,
    )

    # Property names copied into every result row, in output order.
    property_names = ("title", "text", "url", "views", "lang")

    return [
        {
            **{name: hit.properties.get(name) for name in property_names},
            "distance": hit.metadata.distance,
        }
        for hit in response.objects
    ]

In [28]:
from utils import print_result

### Basic Query

In [36]:
from weaviate.classes.config import Property, DataType, Configure

# Creating a collection whose name already exists is rejected by the server, so
# re-running this cell (or running the notebook again against the same cluster)
# would fail. Reuse the existing collection when it is already there.
if client.collections.exists("Articles"):
    articles_collection = client.collections.get("Articles")
else:
    articles_collection = client.collections.create(
        name="Articles",
        properties=[
            Property(name="title", data_type=DataType.TEXT),
            Property(name="text", data_type=DataType.TEXT),
            Property(name="url", data_type=DataType.TEXT),
            Property(name="views", data_type=DataType.INT),
            Property(name="lang", data_type=DataType.TEXT),
        ],
        # Objects are vectorized server-side by the text2vec-openai module.
        vectorizer_config=Configure.Vectorizer.text2vec_openai(),
    )

# Display the collection handle as the cell result.
articles_collection


<weaviate.collections.collection.sync.Collection at 0x172b5849f10>

In [43]:
from dotenv import load_dotenv
import os
load_dotenv()
# The notebook reads the OpenAI key under two spellings (OPENAI_APIKEY here,
# OPENAI_API_KEY in the connection-settings cell). Try the original name first so
# existing .env files behave as before, then fall back to the other spelling so
# the two cells cannot silently disagree.
openai_api_key = os.getenv("OPENAI_APIKEY") or os.getenv("OPENAI_API_KEY")

In [None]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import os

# Connection settings are read from the environment; each value is None when
# the corresponding variable is not set.
weaviate_url, weaviate_key, openai_key = map(
    os.getenv, ("WEAVIATE_URL", "WEAVIATE_API_KEY", "OPENAI_API_KEY")
)

# Extra request header that carries the OpenAI key to the cluster.
headers = {"X-OpenAI-Api-Key": openai_key}

In [85]:
# Connect with the OpenAI header attached so the cluster can call the
# text2vec-openai vectorizer on our behalf.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_key),
    headers=headers
)

from weaviate.classes.config import Configure, DataType

# Creating a collection that already exists is rejected by the server, so only
# create "DemoCollection" when it is missing; this keeps the cell re-runnable.
if not client.collections.exists("DemoCollection"):
    client.collections.create(
        name="DemoCollection",
        properties=[
            {"name": "title", "data_type": DataType.TEXT}
        ],
        vectorizer_config=Configure.Vectorizer.text2vec_openai(
            model="text-embedding-3-large",
            dimensions=1024
        )
    )

# The connection is deliberately left open: the cells that follow call
# client.collections.get("DemoCollection") and query through it, which fails on
# a closed client. Call client.close() once, after the last cell that uses it.

In [90]:
# Handle to "DemoCollection"; later cells reuse this `collection` variable.
collection = client.collections.get("DemoCollection")

# Insert five single-property objects; the cell result is the batch return value.
# NOTE(review): no explicit UUIDs are supplied, so re-running this cell presumably
# inserts the same five objects again; confirm before re-executing.
collection.data.insert_many([
    {"title": "Hamlet was written by William Shakespeare."},
    {"title": "The play Hamlet is one of Shakespeare’s most famous works."},
    {"title": "Shakespeare authored many plays including Hamlet, Macbeth, and Othello."},
    {"title": "Hamlet explores themes of revenge, madness, and mortality."},
    {"title": "Romeo and Juliet is another famous tragedy by Shakespeare."}
])

BatchObjectReturn(_all_responses=[UUID('e350a2ec-169e-45cf-8bde-a88377d1ea63'), UUID('d705943f-5f90-4dee-beb2-95a1be958196'), UUID('01d4e2de-acba-438c-9666-f95b6d0be79c'), UUID('54df2def-a378-49a0-abe1-e0c78a7135a0'), UUID('28592e98-9de4-40eb-b330-9c2ed3a07b43')], elapsed_seconds=6.69828462600708, errors={}, uuids={0: UUID('e350a2ec-169e-45cf-8bde-a88377d1ea63'), 1: UUID('d705943f-5f90-4dee-beb2-95a1be958196'), 2: UUID('01d4e2de-acba-438c-9666-f95b6d0be79c'), 3: UUID('54df2def-a378-49a0-abe1-e0c78a7135a0'), 4: UUID('28592e98-9de4-40eb-b330-9c2ed3a07b43')}, has_errors=False)

In [92]:
# No earlier cell defines `results`, so on a fresh kernel this cell raised
# NameError. Run the query here so the cell is self-contained.
# NOTE(review): the query text is taken from the later "Who wrote Hamlet?" cell,
# whose ranking matches the output recorded for this cell; confirm it is the
# intended query.
results = collection.query.near_text(
    query="Who wrote Hamlet?",
    limit=5
)

print("Top results:")
for i, obj in enumerate(results.objects, 1):
    print(f"{i}. {obj.properties['title']}")

Top results:
1. Hamlet was written by William Shakespeare.
2. The play Hamlet is one of Shakespeare’s most famous works.
3. Shakespeare authored many plays including Hamlet, Macbeth, and Othello.
4. Hamlet explores themes of revenge, madness, and mortality.
5. Romeo and Juliet is another famous tragedy by Shakespeare.


In [37]:
# Print the mapping returned by list_all() for the collections on the cluster.
print(client.collections.list_all())

{'Articles': _CollectionConfigSimple(name='Articles', description=None, generative_config=None, properties=[_Property(name='title', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_property_name=True), vectorizer='text2vec-openai', vectorizer_configs=None), _Property(name='text', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_property_name=True), vectorizer='text2vec-openai', vectorizer_configs=None), _Property(name='url', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tok

In [93]:
def semantic_search(query, k=5):
    """Print the titles of the `k` nearest "DemoCollection" objects to `query`.

    Args:
        query: Free-text query passed to the collection's near-text search.
        k: Maximum number of objects to fetch and print.

    Returns:
        None. Each hit is printed as "<rank>. <title>", with ranks starting at 1.
    """
    demo_collection = client.collections.get("DemoCollection")
    hits = demo_collection.query.near_text(query=query, limit=k).objects

    for rank, hit in enumerate(hits, start=1):
        print(f"{rank}. {hit.properties['title']}")

In [94]:
# Print the ranked titles for a question about the play's subject matter.
semantic_search("What is Hamlet about?")

1. Hamlet explores themes of revenge, madness, and mortality.
2. The play Hamlet is one of Shakespeare’s most famous works.
3. Hamlet was written by William Shakespeare.
4. Shakespeare authored many plays including Hamlet, Macbeth, and Othello.
5. Romeo and Juliet is another famous tragedy by Shakespeare.


In [96]:
# Near-text query on `collection`, asking for each hit's distance in the returned
# metadata; the next cell reads `results`.
results = collection.query.near_text(
    query="Who wrote Hamlet?",
    limit=5,
    return_metadata=["distance"]
)

In [97]:
# Print each hit with a score computed as 1 - distance.
# NOTE(review): reading this as a similarity assumes the collection uses cosine
# distance; confirm the configured metric.
for i, obj in enumerate(results.objects, 1):
    # Fall back to a placeholder when an object has no "title" property.
    title = obj.properties.get("title", "[No title]")
    distance = obj.metadata.distance
    print(f"{i}. {title} (score: {1 - distance:.4f})")

1. Hamlet was written by William Shakespeare. (score: 0.6778)
2. The play Hamlet is one of Shakespeare’s most famous works. (score: 0.5670)
3. Shakespeare authored many plays including Hamlet, Macbeth, and Othello. (score: 0.5435)
4. Hamlet explores themes of revenge, madness, and mortality. (score: 0.5048)
5. Romeo and Juliet is another famous tragedy by Shakespeare. (score: 0.3859)


### Medium Query

In [100]:
# Three extra single-property objects used by the Canada queries below.
docs = [
    {"title": "Ottawa is the capital of Canada."},
    {"title": "Canada has 10 provinces and 3 territories."},
    {"title": "Toronto is the largest city in Canada."},
]

# NOTE(review): no UUIDs are supplied, so re-running this cell presumably inserts
# the same objects again; confirm before re-executing.
collection.data.insert_many(docs)

BatchObjectReturn(_all_responses=[UUID('28428526-3831-43c6-a888-e7f478eab041'), UUID('e261c654-5e10-48d0-853b-07655aeb3705'), UUID('f13b54e7-0638-4baa-9ef9-2fe0c2261571')], elapsed_seconds=1.245448112487793, errors={}, uuids={0: UUID('28428526-3831-43c6-a888-e7f478eab041'), 1: UUID('e261c654-5e10-48d0-853b-07655aeb3705'), 2: UUID('f13b54e7-0638-4baa-9ef9-2fe0c2261571')}, has_errors=False)

In [101]:
# Near-text query limited to three hits, with distances requested in the metadata.
results = collection.query.near_text(
    query="What is the capital of Canada?",
    limit=3,
    return_metadata=["distance"]
)

# Print rank, title and a score computed as 1 - distance.
for i, obj in enumerate(results.objects, 1):
    print(f"{i}. {obj.properties['title']} (score: {1 - obj.metadata.distance:.4f})")

1. Ottawa is the capital of Canada. (score: 0.6313)
2. Toronto is the largest city in Canada. (score: 0.3942)
3. Canada has 10 provinces and 3 territories. (score: 0.3849)


In [106]:
query = "What is the capital of Canada?"

# Same question through the collection's bm25 query instead of near_text, for
# comparison with the previous cell.
results = client.collections.get("DemoCollection").query.bm25(
    query=query,
    limit=5
)

# Print rank and title only; no metadata was requested for this query.
for i, obj in enumerate(results.objects, 1):
    print(f"{i}. {obj.properties['title']}")

1. Ottawa is the capital of Canada.
2. Toronto is the largest city in Canada.
3. Canada has 10 provinces and 3 territories.


### Complicated Query

In [108]:
# A longer document with two properties, "title" and "content".
# NOTE(review): the visible cell that creates "DemoCollection" declares only a
# "title" property; "content" presumably relies on the server adding the property
# automatically. Confirm.
new_document = {
    "title": "Robert Wadlow - The Tallest Person in History",
    "content": (
        "Robert Wadlow, also known as the Alton Giant, was the tallest person in recorded history. "
        "He was born in 1918 and stood 8 feet 11.1 inches (2.72 meters) tall. "
        "His height was due to a condition called gigantism, caused by an overproduction of growth hormone. "
        "Wadlow passed away in 1940 at the age of 22."
    )
}

# Insert the single object; the cell result is the value returned by insert().
client.collections.get("DemoCollection").data.insert(new_document)

UUID('264a5796-1eb7-49fc-aad0-b9461e283d8a')

In [109]:
# Near-text query limited to three hits, with distances requested in the metadata.
results = collection.query.near_text(
    query="Tallest person in history?",
    limit=3,
    return_metadata=["distance"]
)

# Print rank, title and a score computed as 1 - distance.
for i, obj in enumerate(results.objects, 1):
    print(f"{i}. {obj.properties['title']} (score: {1 - obj.metadata.distance:.4f})")

1. Robert Wadlow - The Tallest Person in History (score: 0.6983)
2. Toronto is the largest city in Canada. (score: 0.1431)
3. The play Hamlet is one of Shakespeare’s most famous works. (score: 0.1259)


In [110]:
# Same topic through the collection's bm25 query, for comparison with the
# near_text result in the previous cell.
results = collection.query.bm25(
    query="Tallest person in history",
    limit=3
)

# Print rank and title of every returned object.
for i, obj in enumerate(results.objects, 1):
    print(f"{i}. {obj.properties['title']}")

1. Robert Wadlow - The Tallest Person in History


In [144]:
# Hybrid query with an Arabic query string against the English documents.
# NOTE(review): alpha=0.5 presumably weights the keyword and vector parts of the
# hybrid search equally; confirm against the client documentation.
results = collection.query.hybrid(
    query="أطول رجل في التاريخ",
    alpha=0.5,
    limit=3
)

# Print rank and title of every returned object.
for i, obj in enumerate(results.objects, 1):
    print(f"{i}. {obj.properties['title']}")

1. Robert Wadlow - The Tallest Person in History
2. The play Hamlet is one of Shakespeare’s most famous works.
3. Hamlet was written by William Shakespeare.


### Building Semantic Search from Scratch

In [121]:
from annoy import AnnoyIndex
import numpy as np
import pandas as pd
import re

In [127]:
text = """
Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.
Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007.
Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm.
Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles.
Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects.

Interstellar premiered on October 26, 2014, in Los Angeles.
In the United States, it was first released on film stock, expanding to venues using digital projectors.
The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014.
It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight.
It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics. Since its premiere, Interstellar gained a cult following,[5] and now is regarded by many sci-fi experts as one of the best science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades"""

### Chunking

In [128]:
# Sentence-level chunks: split on '.' and trim surrounding spaces and newlines.
texts = np.array([chunk.strip(' \n') for chunk in text.split('.')])

In [129]:
# Display the sentence-level chunks.
texts

array(['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan',
       'It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine',
       'Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind',
       'Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007',
       'Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar',
       'Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm',
       'Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles',

In [130]:
# Paragraph-level chunks: split on blank lines and trim surrounding spaces and
# newlines.
texts = np.array([paragraph.strip(' \n') for paragraph in text.split('\n\n')])

In [131]:
# Display the paragraph-level chunks.
texts

array(['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.\nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.\nSet in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.',
       'Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007.\nCaltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.\nCinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm.\nPrincipal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles.\nInterstellar uses extensive practical 

In [132]:
# Return to sentence-level chunks (split on '.', trimmed) for the rest of the
# notebook, replacing the paragraph-level array built above.
texts = np.array([sentence.strip(' \n') for sentence in text.split('.')])

In [133]:
title = 'Interstellar (film)'

# Prepend the article title to every chunk so each one carries its context.
# `texts` is rebuilt from itself, so a plain prefix would be added again on every
# re-run of this cell; chunks that already start with the prefix are left
# unchanged to keep the cell idempotent.
title_prefix = f"{title} "
texts = np.array([
    t if t.startswith(title_prefix) else f"{title_prefix}{t}"
    for t in texts
])

In [134]:
# Display the title-prefixed chunks.
texts

array(['Interstellar (film) Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan',
       'Interstellar (film) It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine',
       'Interstellar (film) Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind',
       'Interstellar (film) Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007',
       'Interstellar (film) Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar',
       'Interstellar (film) Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format

### Get the embeddings

In [135]:
# Embed every chunk with the Cohere client. Despite its name, `response` holds
# the `.embeddings` attribute of the embed response, not the response object.
# NOTE(review): no model is specified, so the client's default is used; confirm
# this is intended.
response = co.embed(
    texts=texts.tolist()
).embeddings

d:\Anaconda\envs\tf\lib\site-packages\cohere\core\unchecked_base_model.py:165: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  if inner_type.__fields__[metadata.discriminant].default == objects_discriminant:


In [136]:
# Convert the embeddings to a NumPy array and display its shape.
embeds = np.array(response)
embeds.shape

(15, 4096)

### Create the search index

In [137]:
# Annoy index with the 'angular' metric, sized to the embedding width.
search_index = AnnoyIndex(embeds.shape[1], 'angular')

# Item ids are the row positions in `embeds`, which line up with `texts`.
for i, vector in enumerate(embeds):
    search_index.add_item(i, vector)

search_index.build(10) # 10 trees
# Save the built index to disk; the call's return value is the cell output.
search_index.save('test.ann')

True

In [138]:
pd.set_option('display.max_colwidth', None)

def search(query, num_results=3):
    """Find the chunks in `texts` nearest to `query` in the Annoy index.

    Args:
        query: Free-text query; it is embedded with the same Cohere client call
            used for the chunks.
        num_results: Number of nearest neighbours to fetch. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        A pandas DataFrame with a 'texts' column (the matching chunks) and a
        'distance' column (the distances reported by the index). The matching
        chunks are also printed.
    """
    # co.embed returns one embedding per input text; only one text is sent.
    query_embed = co.embed(texts=[query]).embeddings

    # With include_distances=True the index returns a pair: (item ids, distances).
    neighbor_ids, distances = search_index.get_nns_by_vector(
        query_embed[0],
        num_results,
        include_distances=True,
    )

    # Item ids are positions in `texts`, so the id list indexes the array directly.
    matched_texts = texts[neighbor_ids]
    results = pd.DataFrame(data={'texts': matched_texts,
                                 'distance': distances})

    print(matched_texts)
    return results

In [139]:
query = "How much did the film make?"
search(query)

['Interstellar (film) The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014'
 'Interstellar (film) Interstellar premiered on October 26, 2014, in Los Angeles'
 'Interstellar (film) In the United States, it was first released on film stock, expanding to venues using digital projectors']


Unnamed: 0,texts,distance
0,"Interstellar (film) The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014",1.019056
1,"Interstellar (film) Interstellar premiered on October 26, 2014, in Los Angeles",1.144951
2,"Interstellar (film) In the United States, it was first released on film stock, expanding to venues using digital projectors",1.167268
