#### Thursday, January 18, 2024

This code is from the book "Vector Search for Practitioners with Elastic" by Bahaaldine Azarmi and Jeff Vestal.

https://github.com/PacktPublishing/Vector-Search-for-Practitioners-with-Elastic/blob/main/chapter2/knn-search.ipynb

This code requires the "elasticsearch" container to be running.

Every cell below ran successfully when this notebook was last executed.

In [1]:
import getpass
import os

import numpy as np
import torch
from elasticsearch import Elasticsearch
from transformers import AutoTokenizer, AutoModel

In [2]:
# Connection settings for the Elasticsearch container.
# Every value can be overridden with an environment variable, so neither the
# secret nor a machine-specific address has to be saved in the notebook.

# Base URL of the Elasticsearch node. The container IP is not stable; an
# earlier run of this notebook used "https://172.19.0.3:9200".
esHost = os.environ.get("ES_HOST", "https://172.19.0.2:9200")

# Password for the 'elastic' user generated by Elasticsearch.
# Taken from ELASTIC_PASSWORD when set; otherwise the user is prompted, so the
# password never appears in the notebook source or its outputs.
# NOTE: a password that was ever committed in plain text should be rotated.
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD") or getpass.getpass(
    "Password for the 'elastic' user: "
)

# Path to the CA certificate file handed to the client as `ca_certs`.
path2cert = os.environ.get("ES_CA_CERT", "/elasticsearch/http_ca.crt")

In [3]:
# Build the Elasticsearch client: HTTPS endpoint, CA certificate file for the
# TLS connection, and basic authentication as the built-in 'elastic' user.
elastic_credentials = ("elastic", ELASTIC_PASSWORD)
esClient = Elasticsearch(
    esHost,
    ca_certs=path2cert,
    basic_auth=elastic_credentials,
)

In [4]:
# Connectivity check: ask the cluster for its basic info. Getting a response
# (rather than an exception) shows that the host, certificate and credentials
# configured above are accepted.
esClient.info()

ObjectApiResponse({'name': '74fc4cf508d8', 'cluster_name': 'docker-cluster', 'cluster_uuid': '3138ncAQQAKAhwWvcDQbHQ', 'version': {'number': '8.11.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '76013fa76dcbf144c886990c6290715f5dc2ae20', 'build_date': '2023-12-05T10:03:47.729926671Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
# Mapping for the jokes index: one dense-vector field that stores the
# embedding of each joke and is indexed so it can be queried with kNN search.

# Number of dimensions of the stored vectors. It has to match the size of the
# vectors produced by the embedding model used in this notebook
# (bert-base-uncased, whose hidden states have 768 dimensions).
EMBEDDING_DIMS = 768

mapping = {
    'properties': {
        'embedding': {
            'type': 'dense_vector',
            'dims': EMBEDDING_DIMS,
            # A real boolean, not the string 'true': it serialises to JSON
            # `true` and does not rely on lenient string parsing by the server.
            'index': True,
            # Vectors are compared by cosine similarity at search time.
            'similarity': 'cosine',
        }
    }
}


In [6]:
# (Re)create the index with the defined mapping.
# A plain create fails once 'jokes-index' exists, which breaks re-running the
# notebook. Dropping the index first (a no-op when it is missing, thanks to
# ignore_unavailable) makes the cell repeatable and guarantees that the
# current mapping is the one in effect.
# WARNING: this deletes every document previously stored in 'jokes-index'.
esClient.indices.delete(index='jokes-index', ignore_unavailable=True)

# `mappings=` is the dedicated keyword for the index mapping in the 8.x client,
# replacing the catch-all `body=` argument.
esClient.indices.create(index='jokes-index', mappings=mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'jokes-index'})

In [7]:
# The joke corpus, written as (text, category) pairs and expanded below into
# the list of dicts that the rest of the notebook works with.
# NOTE(review): the "fake noodle" joke is listed twice verbatim and the tomato
# joke appears under two categories, so a search can return the same text
# more than once.
_JOKE_ROWS = [
    ("Why do cats make terrible storytellers? Because they only have one tail.", "cat"),
    ("What did the cat say when he lost all his money? I am paw.", "cat"),
    ("Why don't cats play poker in the jungle? Too many cheetahs.", "cat"),
    ("Why did the tomato turn red? Because it saw the salad dressing!", "vegetable"),
    ("Why did the scarecrow win an award? Because he was outstanding in his field.", "farm"),
    ("Why did the hipster burn his tongue? Because he drank his coffee before it was cool.", "hipster"),
    ("Why did the tomato turn red? Because it saw the salad dressing!", "food"),
    ("Why did the scarecrow win an award? Because he was out-standing in his field!", "puns"),
    ("What do you call a fake noodle? An impasta!", "food"),
    ("What do you call a belt made out of watches? A waist of time!", "puns"),
    ("Why did the math book look sad? Because it had too many problems!", "math"),
    ("Why did the gym close down? It just didn't work out!", "exercise"),
    ("Why don't scientists trust atoms? Because they make up everything!", "science"),
    ("What do you call a fake noodle? An impasta!", "food"),
    ("Why did the chicken cross the playground? To get to the other slide!", "kids"),
    ("Why did the frog call his insurance company? He had a jump in his car!", "puns"),
]

# One dict per joke, with the 'text' key first and 'category' second.
jokes = [
    {'text': joke_text, 'category': joke_category}
    for joke_text, joke_category in _JOKE_ROWS
]

In [9]:
# List the Hugging Face hub cache directory to see which models are already
# available locally (shell command, run through IPython's `!` escape).
!ls /root/.cache/huggingface/hub

models--NousResearch--Llama-2-7b-chat-hf
models--bert-base-uncased
models--ehartford--samantha-mistral-7b
models--mistralai--Mistral-7B-Instruct-v0.2
models--mlabonne--NeuralBeagle14-7B
models--teknium--OpenHermes-2.5-Mistral-7B
models--unsloth--mistral-7b-bnb-4bit
version.txt
version_diffusers_cache.txt


In [None]:
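# Reference only: shell commands (not executed by this notebook) for copying the
# cached bert-base-uncased model between the host and the `hfpt_Dec14` container.
# Host -> container:
# docker cp /home/rob/Data3/huggingface/transformers/models--bert-base-uncased hfpt_Dec14://root/.cache/huggingface/hub
# Container -> host:
# docker cp hfpt_Dec14://root/.cache/huggingface/hub/models--bert-base-uncased  /home/rob/Data3/huggingface/transformers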

In [10]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
# The checkpoint name is kept in one place so both objects always match.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
# Attach a BERT embedding to every joke, stored under the 'embedding' key.
def _embed_text(text):
    """Return the embedding of `text` as a plain Python list.

    The text is tokenized, passed through the model, and the last hidden
    state is averaged over dimension 1 before the leading dimension is
    squeezed away.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        pooled = model(**encoded).last_hidden_state.mean(dim=1).squeeze(0).numpy()
        return pooled.tolist()


for joke in jokes:
    joke['embedding'] = _embed_text(joke['text'])

In [12]:
# Index the jokes in Elasticsearch.
# Each document gets a deterministic id (its position in `jokes`), so running
# this cell again overwrites the same documents instead of adding duplicates
# under new auto-generated ids.
# `document=` is the dedicated keyword for the document source in the 8.x
# client, replacing the catch-all `body=` argument.
for doc_id, joke in enumerate(jokes):
    esClient.index(index='jokes-index', id=str(doc_id), document=joke)

# Newly indexed documents only become searchable after a refresh. Forcing one
# here ensures that a search issued right after this cell (e.g. during
# "Run All") already sees every joke. The trailing ';' hides the response.
esClient.indices.refresh(index='jokes-index');

In [13]:
# Build the query vector: embed the query text with the same tokenizer/model
# pipeline and pooling that was applied to the jokes.
query = "What do you get when you cross a snowman and a shark?"
inputs = tokenizer(query, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():  # inference only, no gradients needed
    hidden_states = model(**inputs).last_hidden_state
    output = hidden_states.mean(dim=1).squeeze(0).numpy()
# Kept as a NumPy array; it is converted to a list when the search is built.
query_vector = output


In [14]:
# Assemble the kNN search request.
# The knn clause targets the 'embedding' field with the query vector and asks
# for the 3 nearest neighbours (k) out of 100 candidates (num_candidates).
knn_clause = {
    "field": "embedding",
    "query_vector": query_vector.tolist(),
    "k": 3,
    "num_candidates": 100,
}

# Full request body: the knn clause plus the list of fields to return.
search = {
    "knn": knn_clause,
    "fields": ["text"],
}


In [15]:
# Run the kNN search and print the text of each returned joke, in the order
# the hits come back.
response = esClient.search(index='jokes-index', body=search)
matched_hits = response['hits']['hits']
for hit in matched_hits:
    joke_text = hit['_source']['text']
    print(f"Joke: {joke_text}")


Joke: What do you call a fake noodle? An impasta!
Joke: What do you call a fake noodle? An impasta!
Joke: What did the cat say when he lost all his money? I am paw.
