In [1]:
!pip install cohere 
!pip install weaviate-client Annoy
# !pip install newspaper3k

Collecting cohere
  Downloading cohere-5.18.0-py3-none-any.whl.metadata (3.4 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.12.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (5.7 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.4.20250913-py3-none-any.whl.metadata (2.0 kB)
Downloading cohere-5.18.0-py3-none-any.whl (295 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.4/295.4 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading fastavro-1.12.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[

In [2]:
def print_result(result):
    """Print every item of ``result`` as plain ``key:value`` lines.

    Each item is introduced by an ``item <index>`` header; every field is
    followed by a blank line, and one extra blank line closes the item.
    ``result`` is expected to be an iterable of dict-like records.
    """
    for index, record in enumerate(result):
        print(f'item {index}')
        for field, value in record.items():
            # The trailing "\n" yields the blank separator line after each field.
            print(f"{field}:{value}\n")
        print()


### Revised version
def keyword_search(query, 
                   client,
                   results_lang='en', 
                   properties = ["title","url","text"],
                   num_results=3):
    """Run a BM25 keyword search over the "Articles" collection.

    Works with both generations of the Weaviate Python client:

    * v4 clients (objects exposing ``client.collections``, e.g. the one
      returned by ``weaviate.connect_to_weaviate_cloud``) are queried through
      ``collection.query.bm25``;
    * legacy v3 clients (objects exposing ``client.query``) keep using the
      original GraphQL builder chain.

    Parameters
    ----------
    query : str
        Keyword query text.
    client :
        A connected Weaviate client (v4 or legacy v3).
    results_lang : str, default 'en'
        Only objects whose ``lang`` property equals this value are returned.
    properties : list of str
        Names of the properties to return for each hit.
    num_results : int, default 3
        Maximum number of hits.

    Returns
    -------
    list of dict
        One dict of the requested properties per hit.
    """
    # Work on a copy so the shared default list is never handed to the client.
    properties = list(properties)

    if hasattr(client, "collections"):
        # v4 API. Imported locally so this helper does not depend on another
        # cell having imported Filter first.
        from weaviate.classes.query import Filter

        response = client.collections.get("Articles").query.bm25(
            query=query,
            filters=Filter.by_property("lang").equal(results_lang),
            return_properties=properties,
            limit=num_results,
        )
        return [dict(obj.properties) for obj in response.objects]

    # Legacy v3 API: GraphQL-style builder returning a nested dict.
    where_filter = {
        "path": ["lang"],
        "operator": "Equal",
        "valueString": results_lang
    }

    response = (
        client.query.get("Articles", properties)
        .with_bm25(query=query)
        .with_where(where_filter)
        .with_limit(num_results)
        .do()
    )

    return response['data']['Get']['Articles']

In [3]:
from kaggle_secrets import UserSecretsClient
# Read every credential from the Kaggle secret store in one pass, so no key is
# ever hard-coded in the notebook. The order of names matches the targets.
user_secrets = UserSecretsClient()
cohere_api_key, gemini_api_key, weaviate_api_key, weaviate_api_url = (
    user_secrets.get_secret(secret_name)
    for secret_name in (
        "COHERE_API_KEY",
        "GOOGLE_API_KEY",
        "WEAVIATE_API_KEY",
        "WEAVIATE_API_URL",
    )
)

In [4]:
import cohere
# Cohere SDK client, authenticated with the key loaded from the secret store;
# used further down to embed both the text chunks and the search queries.
co = cohere.Client(cohere_api_key)

In [5]:
import weaviate
# Show which weaviate-client version is installed.
# NOTE(review): the connection and query code below appears to rely on the v4 client API — confirm before changing the version.
print(weaviate.__version__)

4.16.9


In [6]:
import weaviate
# API-key credentials object handed to the Weaviate connection helper below.
auth_config = weaviate.auth.AuthApiKey(api_key=weaviate_api_key)

In [7]:
# Connect to the Weaviate Cloud cluster. The Cohere key is forwarded as an
# extra request header (presumably so Weaviate can call Cohere on our behalf
# for text vectorization — confirm against the cluster's module config).
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_api_url,
    auth_credentials=auth_config,
    headers={"X-Cohere-Api-Key": cohere_api_key},
)

In [8]:
# Health check: the cell output should be True before any query is attempted.
client.is_ready() #check if True

True

## Part 1: Vector Database for Semantic Search

In [9]:
from weaviate.classes.query import Filter, MetadataQuery

def dense_retrieval(query, 
                    results_lang='en', 
                    properties=["text", "title", "url", "views", "lang"],
                    num_results=5):
    """Semantic (near-text) search over the "Articles" collection.

    Uses the module-level ``client``. Hits are restricted to objects whose
    ``lang`` property equals ``results_lang``; at most ``num_results`` are
    returned.

    Returns a list of dicts: the requested ``properties`` of each hit plus a
    ``"distance"`` entry taken from the hit's metadata.
    """
    # Change "Articles" here if the collection is named differently.
    articles = client.collections.get("Articles")

    # Language restriction applied on the server side.
    lang_filter = Filter.by_property("lang").equal(results_lang)

    response = articles.query.near_text(
        query=query,
        filters=lang_filter,
        return_properties=properties,
        return_metadata=MetadataQuery(distance=True),
        limit=num_results,
    )

    # Flatten each hit into one plain dict, with the distance added last.
    results = []
    for hit in response.objects:
        row = dict(hit.properties)
        row["distance"] = hit.metadata.distance
        results.append(row)
    return results

In [10]:
# List the collections defined on the connected cluster. An empty result means
# there is no "Articles" collection for the search helpers above to query.
print(client.collections.list_all())

{}


### Basic Query

In [11]:
# query = "Who wrote Hamlet?"
# dense_retrieval_results = dense_retrieval(query)
# print_result(dense_retrieval_results)

### Medium Query

In [12]:
# query = "What is the capital of Canada?"
# dense_retrieval_results = dense_retrieval(query)
# print_result(dense_retrieval_results)

In [13]:
# query = "What is the capital of Canada?"
# keyword_search_results = keyword_search(query, client)
# print_result(keyword_search_results)

### Complicated Query

In [15]:
# query = "Tallest person in history?"
# keyword_search_results = keyword_search(query, client)
# print_result(keyword_search_results)

In [16]:
# query = "Tallest person in history"
# dense_retrieval_results = dense_retrieval(query)
# print_result(dense_retrieval_results)

In [17]:
# query = "أطول رجل في التاريخ"
# dense_retrieval_results = dense_retrieval(query)
# print_result(dense_retrieval_results)

In [18]:
# query = "film about a time travel paradox"
# dense_retrieval_results = dense_retrieval(query)
# print_result(dense_retrieval_results)

## Part 2: Building Semantic Search from Scratch

### Get the text archive:

In [19]:
from annoy import AnnoyIndex
import numpy as np
import pandas as pd
import re

In [20]:
# Toy corpus for the from-scratch search: one article as a single string, with
# paragraphs separated by blank lines. It is chunked in the cells that follow.
text = """
Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.
Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007.
Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm.
Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles.
Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects.

Interstellar premiered on October 26, 2014, in Los Angeles.
In the United States, it was first released on film stock, expanding to venues using digital projectors.
The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014.
It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight.
It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics. Since its premiere, Interstellar gained a cult following,[5] and now is regarded by many sci-fi experts as one of the best science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades"""

### Chunking: 

In [21]:
# Split into a list of sentences
texts = text.split('.')

# Clean up to remove empty spaces and new lines
texts = np.array([t.strip(' \n') for t in texts])

In [22]:
texts

array(['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan',
       'It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine',
       'Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind',
       'Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007',
       'Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar',
       'Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm',
       'Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles',

In [23]:
# Split into a list of paragraphs
texts = text.split('\n\n')

# Clean up to remove empty spaces and new lines
texts = np.array([t.strip(' \n') for t in texts])

In [24]:
texts

array(['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.\nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.\nSet in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.',
       'Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007.\nCaltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.\nCinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm.\nPrincipal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles.\nInterstellar uses extensive practical 

In [25]:
# Split into a list of sentences
texts = text.split('.')

# Clean up to remove empty spaces and new lines
texts = np.array([t.strip(' \n') for t in texts])

In [26]:
title = 'Interstellar (film)'

# Prefix every chunk with the article title so each chunk carries its context
# when embedded. The startswith guard keeps this cell idempotent: re-running it
# without re-running the chunking cell no longer stacks the title repeatedly.
# (A chunk that genuinely begins with "<title> " is left as-is.)
texts = np.array([
    t if t.startswith(f"{title} ") else f"{title} {t}"
    for t in texts
])

In [27]:
texts

array(['Interstellar (film) Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan',
       'Interstellar (film) It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine',
       'Interstellar (film) Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind',
       'Interstellar (film) Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007',
       'Interstellar (film) Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar',
       'Interstellar (film) Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format

### Get the embeddings:

In [28]:
# Embed all chunks in a single request. Despite its name, `response` holds only
# the `.embeddings` attribute of the API response, not the response object.
# NOTE(review): no model is passed here, so whatever default applies is used — consider pinning one.
response = co.embed(
    texts=texts.tolist()
).embeddings

/usr/local/lib/python3.11/dist-packages/cohere/core/unchecked_base_model.py:163: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  if inner_type.__fields__[metadata.discriminant].default == objects_discriminant:


In [29]:
# Stack the embeddings into one 2-D array; the displayed shape is
# (number of chunks, embedding dimension).
embeds = np.array(response)
embeds.shape

(15, 4096)

### Create the search index:

In [30]:
# Annoy index using angular distance, sized to the embedding dimensionality.
search_index = AnnoyIndex(embeds.shape[1], 'angular')
# Register each embedding under its row number, so an id returned by the index
# can be used directly to look up the matching entry of `texts`.
for i, vector in enumerate(embeds):
    search_index.add_item(i, vector)

search_index.build(10) # 10 trees
search_index.save('test.ann')

True

In [31]:
# Show full chunk text in the results table instead of truncating it.
pd.set_option('display.max_colwidth', None)

def search(query, num_results=3, verbose=True):
    """Return the indexed text chunks nearest to ``query``.

    The query is embedded with the module-level Cohere client ``co`` and
    looked up in the module-level Annoy index ``search_index``; the ids the
    index returns are used to select entries of the module-level ``texts``.

    Parameters
    ----------
    query : str
        Free-text question.
    num_results : int, default 3
        Number of nearest neighbours to retrieve.
    verbose : bool, default True
        When True, also print the matched texts before returning (this is the
        function's original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``texts`` (matched chunks) and ``distance`` (the distances
        reported by the index), in the order the index returned them.
    """
    # Get the query's embedding (a single text in, so take the first vector)
    query_embed = co.embed(texts=[query]).embeddings[0]

    # Retrieve the nearest neighbors: ids and distances come back as two
    # parallel lists because include_distances=True.
    item_ids, distances = search_index.get_nns_by_vector(
        query_embed,
        num_results,
        include_distances=True,
    )

    # Format the results
    matched_texts = texts[item_ids]
    results = pd.DataFrame(data={'texts': matched_texts,
                                 'distance': distances})

    if verbose:
        print(matched_texts)

    return results

In [32]:
# Example question; the corpus answers it in the sentence about the film's
# worldwide gross, which the output below ranks first.
query = "How much did the film make?"
search(query)

['Interstellar (film) The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014'
 'Interstellar (film) Interstellar premiered on October 26, 2014, in Los Angeles'
 'Interstellar (film) In the United States, it was first released on film stock, expanding to venues using digital projectors']


Unnamed: 0,texts,distance
0,"Interstellar (film) The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014",1.019055
1,"Interstellar (film) Interstellar premiered on October 26, 2014, in Los Angeles",1.144951
2,"Interstellar (film) In the United States, it was first released on film stock, expanding to venues using digital projectors",1.167268
