In [1]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import faiss

In [2]:
### ====================================================
### Fundamentals of Vector Databases
### ====================================================

### ====================================================
### 1. Embeddings:
###    Definition and purpose:
###    Transform raw data (words) into numerical vectors.
### ====================================================

### Next step: load spaCy's medium English model (en_core_web_md), which provides 300-d word vectors.

In [3]:
# Load spaCy's medium English pipeline; its word vectors are 300-dimensional.
# NOTE(review): the recorded outputs further down show identical vectors for
# "apple"/"banana" and for "cat"/"dog" (cosine similarity 1.0000, distance 0.0000).
# Presumably this model maps several words onto shared vector rows; if distinct
# per-word vectors are needed, "en_core_web_lg" is the likely alternative — confirm.
SPACY_MODEL_NAME = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL_NAME)

In [4]:
# Sample vocabulary that is embedded, compared and indexed in the cells below.
words = [
    "apple",
    "banana",
    "cat",
    "dog",
    "computer",
    "python",
    "data",
    "science",
    "machine",
    "learning",
]

In [5]:
# Look up the spaCy vector of every sample word and stack the results into one
# float32 matrix with a row per word (this matrix is later added to the FAISS index).
word_vectors = []
for word in words:
    word_vectors.append(nlp(word).vector)
embeddings = np.array(word_vectors, dtype="float32")

print("=== Embeddings for Sample Words ===")
for row, word in enumerate(words):
    # Only the leading five components are shown to keep the output short.
    preview = embeddings[row, :5]
    print(f"{word}: {preview} ...")

=== Embeddings for Sample Words ===
apple: [-0.6334   0.18981 -0.53544 -0.52658 -0.30001] ...
banana: [-0.6334   0.18981 -0.53544 -0.52658 -0.30001] ...
cat: [-0.72483   0.42538   0.025489 -0.39807   0.037463] ...
dog: [-0.72483   0.42538   0.025489 -0.39807   0.037463] ...
computer: [-0.65942   0.048198  0.3459   -0.57023   0.090037] ...
python: [-0.6037    -0.31122    0.29572   -0.0011134  0.31605  ] ...
data: [-0.60261  0.11757  0.2091   0.16977 -0.2427 ] ...
science: [-0.79222   0.69891  -0.033084  0.1249   -0.038876] ...
machine: [-0.72883    0.20718   -0.0033379 -0.0027673 -0.17204  ] ...
learning: [-0.9261   0.36204 -0.15093 -0.37449 -0.42103] ...


In [6]:
# ====================================================
# 2. Similarity Search:
#    - Key metrics: cosine similarity, Euclidean (L2) distance.
#    - How similarity measures help in finding “close” vectors.
# ====================================================

In [7]:
# The pair of words whose embeddings are compared in this section.
word1, word2 = "apple", "banana"

In [8]:
# Embed both words and reshape each vector into a single-row 2-D array,
# which is the layout handed to cosine_similarity in the next step.
vec1, vec2 = (nlp(term).vector.reshape(1, -1) for term in (word1, word2))


In [9]:
# Cosine similarity of the two row vectors. The call yields a 1x1 matrix,
# so extract its single entry.
similarity_matrix = cosine_similarity(vec1, vec2)
cos_sim = similarity_matrix[0, 0]

In [10]:
# Euclidean (L2) distance: the norm of the element-wise difference of the two vectors.
difference = vec1 - vec2
euclidean = np.linalg.norm(difference)

In [11]:
# Report both metrics for the chosen word pair, four decimals each.
# The trailing "\n" on the last line leaves a blank line after the report.
report_lines = (
    "=== Similarity Search ===",
    f"Similarity between '{word1}' and '{word2}':",
    f"Cosine Similarity: {cos_sim:.4f}",
    f"Euclidean Distance: {euclidean:.4f}\n",
)
print(*report_lines, sep="\n")

=== Similarity Search ===
Similarity between 'apple' and 'banana':
Cosine Similarity: 1.0000
Euclidean Distance: 0.0000



In [12]:
# ====================================================
# 3. Indexing Techniques:
#    Overview of indexing libraries and algorithms (FAISS, Annoy, HNSW).
#    Benefits of approximate nearest neighbor (ANN) search.
# ====================================================

In [13]:
# FAISS is the library used for this demonstration.
# The index is constructed from the vector dimensionality, i.e. the number of
# columns of the embedding matrix (expected to be 300 for spaCy's en_core_web_md).
d = np.shape(embeddings)[1]

In [14]:
# Build a flat FAISS index that compares vectors by L2 (Euclidean) distance
# and add every sample embedding to it.
# NOTE(review): IndexFlatL2 appears to be FAISS's exhaustive (exact) index rather
# than an approximate one — confirm against the FAISS documentation.
index = faiss.IndexFlatL2(d)
index.add(embeddings)
print("=== FAISS Index ===", "FAISS index created with sample embeddings.", sep="\n")

=== FAISS Index ===
FAISS index created with sample embeddings.


In [15]:
# Query setup: look for the 3 stored vectors closest to the embedding of "computer".
query_word = "computer"
k = 3  # how many nearest neighbors to retrieve

# The vector is reshaped into a single-row 2-D array before being passed to index.search.
query_doc = nlp(query_word)
query_vec = query_doc.vector.reshape(1, -1)

In [16]:
# Ask the index for the k stored vectors closest to the query vector.
# index.search returns two arrays with one row per query; only row 0 is used
# because a single query vector was passed.
#
# IndexFlatL2 reports *squared* Euclidean distances (no square root is taken),
# so the values are labelled as such. Use np.sqrt(distances) to obtain true
# Euclidean distances comparable to the np.linalg.norm result computed earlier.
#
# The query word is itself part of the indexed vocabulary, so the first hit is
# the word itself at distance 0.
distances, indices = index.search(query_vec, k)
print(f"\nNearest neighbors for '{query_word}':")
for rank, (idx, squared_distance) in enumerate(zip(indices[0], distances[0]), start=1):
    if idx < 0:
        # FAISS pads the result with -1 when fewer than k vectors are available;
        # without this guard words[-1] would silently print the last word.
        break
    print(f"{rank}. {words[idx]} (Squared L2 distance: {squared_distance:.4f})")


Nearest neighbors for 'computer':
1. computer (Distance: 0.0000)
2. machine (Distance: 41.2062)
3. learning (Distance: 50.7203)


In [None]:
# ====================================================
# 4. Popular Tools & Comparison:
#    - Popular Tools: FAISS, Pinecone, Milvus, Qdrant.
#    - Differences between vector databases and traditional relational databases.
# ====================================================
#
# Popular Tools:
#   • FAISS: An efficient similarity search library developed by Facebook.
#   • Pinecone: A fully managed vector database service.
#   • Milvus: An open-source vector database optimized for scalable similarity search.
#   • Qdrant: A high-performance vector database designed for integration and ease-of-use.
#
# Differences:
#   • Vector databases are specialized to store and search high-dimensional vectors,
#     making them ideal for similarity search in unstructured data (e.g., images, text).
#   • Traditional relational databases excel at structured, tabular data and exact-match queries.
#
# These comments serve as a conceptual overview. You can expand upon these points during your lecture.
