<a href="https://colab.research.google.com/github/shivag/cs145-public/blob/main/Hashing_Algos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install pybloom_live h3 faiss-cpu whoosh

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 KB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: whoosh
Successfully installed whoosh-2.7.4


In [7]:
import h3
# lat/lngs from SF mission, Stanford NVidia, Packard, Gates Buildings
locations = [(37.788022, -122.399797), (37.428226, -122.174722), 
             (37.429749, -122.1735490), (37.429761, -122.173290)]
for zoom in [8, 12]: # zoomlevels 8 and 12 (imagine the +/- in map zoom)
  for l in locations:
    cell_id = h3.geo_to_h3(l[0], l[1], zoom)
    print("@zoom[", zoom, "]:", cell_id)

@zoom[ 8 ]: 88283082a1fffff
@zoom[ 8 ]: 8828347417fffff
@zoom[ 8 ]: 8828347417fffff
@zoom[ 8 ]: 8828347417fffff
@zoom[ 12 ]: 8c283082a06bbff
@zoom[ 12 ]: 8c28347416257ff
@zoom[ 12 ]: 8c2834741602bff
@zoom[ 12 ]: 8c28347416021ff


In [8]:
from pybloom_live import BloomFilter

# Create a Bloom filter with 1000 items and a 0.1% false positive rate
bf = BloomFilter(capacity=1000, error_rate=0.001)

# Add some integers to the Bloom filter
bf.add(42); bf.add(30); bf.add(50)

# Check if each integer in the list is in the Bloom filter
integer_list = [42, 50, 100, 32]
for i in integer_list:
  if i in bf:
    print(f"{i} may be in the Bloom filter")
  else:
    print(f"{i} is definitely not in the Bloom filter")

42 may be in the Bloom filter
50 may be in the Bloom filter
100 is definitely not in the Bloom filter
32 is definitely not in the Bloom filter


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [12]:
import numpy as np, faiss
# Generate 1000 points in a 10-dimensional space
X = np.random.rand(1000, 10).astype('float32')

# Build an LSH index for the points
index = faiss.IndexLSH(10, 8)
index.add(X)

# Find the 5 nearest neighbors (distances and points) of xq, a new 10-dim point
xq = np.random.rand(10).astype('float32') 
D, I = index.search(np.expand_dims(xq, axis=0), k=5)

# Print the indices and distances of the nearest neighbors
print("Nearest neighbor of ", xq, " is:", I, " with distance: ", D)

Nearest neighbor of  [0.9562943  0.4509454  0.75692314 0.96686363 0.79955685 0.2184114
 0.5928665  0.6337663  0.41091037 0.99143934]  is: [[21 49 57 59 79]]  with distance:  [[0. 0. 0. 0. 0.]]


In [13]:
import faiss
import numpy as np
from PIL import Image

d = 256 # number of dimensions in the feature vector
# Map an image to a k-dimensions
def normed(fname):
  # Load the image
  img = Image.open(fname)
  img = img.resize((16, 16))
  img = np.array(img)
  # Convert the image to feature vector
  x = img.reshape(-1, d).astype('float32')
  faiss.normalize_L2(x) # Normalize the feature vector
  return x

# Build an index for the feature vectors
index = faiss.IndexFlatL2(d)
index.add(normed('stanford-logo.png'))
for img in ["stanford-logo.png", "stanford-logo2.jpg", "stanford-logo3.png"]:
  index.add(normed(img))
# Query an example image
query_x = normed('stanford-logo3.png')
D, I = index.search(query_x, k=2)
print(D)

[[0.         0.27528164]
 [0.         0.13537623]
 [0.         0.13537623]
 [0.         0.18201266]]


In [18]:
import os
from whoosh.fields import TEXT, Schema
from whoosh.index import create_in
from whoosh.qparser import QueryParser

# Define the index schema
schema = Schema(content=TEXT)
os.mkdir("my_index_dir")
# Create a new index
ix = create_in("my_index_dir", schema)

# Add some documents to the index
with ix.writer() as writer:
  writer.add_document(content="crazy in love - beyonce")
  writer.add_document(content="crazy love - van morrison")
  writer.add_document(content="crazy -- aerosmith")

# Search for documents containing the word "fox"
qries = ["crazy", "aerosmith", "crazy love"]
for q in qries:
  with ix.searcher() as searcher:
    results = searcher.search(QueryParser("content", schema).parse(q))
    print(results)


<Top 3 Results for Term('content', 'crazy') runtime=0.0009647299998505332>
<Top 1 Results for Term('content', 'aerosmith') runtime=0.00040564400023868075>
<Top 2 Results for And([Term('content', 'crazy'), Term('content', 'love')]) runtime=0.0009455250001337845>
