# Praktikum 3

Install hnswlib terlebih dahulu.

In [None]:
%pip install hnswlib

Note: you may need to restart the kernel to use updated packages.


Percobaan berikut akan membandingkan exact NN dengan HNSW pada 1000 data 2D.

In [None]:
import hnswlib
import numpy as np
import time
from sklearn.neighbors import NearestNeighbors

# ===========================
# 1. Generate random 2D data
# ===========================
num_elements = 1000
dim = 2
data = np.random.random((num_elements, dim)).astype(np.float32)

# Query point
query = np.array([[0.5, 0.5]], dtype=np.float32)
k = 5  # number of nearest neighbours to retrieve

# ===========================
# 2. Exact NN (Brute Force)
# ===========================
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn.fit(data)

# perf_counter() is a high-resolution clock meant for measuring short
# intervals; time.time() can be too coarse for sub-millisecond queries.
start = time.perf_counter()
distances, indices = nn.kneighbors(query)
end = time.perf_counter()

print("=== Exact NN ===")
print("Indices:", indices)
print("Distances:", distances)
print("Waktu:", end - start, "detik")

# ===========================
# 3. HNSW
# ===========================
# Create the HNSW index
p = hnswlib.Index(space='l2', dim=dim)

# Maximum number of elements the index can hold
p.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Insert the data
p.add_items(data)

# Search-time parameter: speed vs accuracy trade-off
p.set_ef(50)

start = time.perf_counter()
labels, distances = p.knn_query(query, k=k)
end = time.perf_counter()

# hnswlib's 'l2' space reports *squared* L2 distances. Take the square root
# (outside the timed region) so the values are directly comparable with the
# Euclidean distances printed for the exact search above.
distances = np.sqrt(distances)

print("\n=== HNSW ===")
print("Indices:", labels)
print("Distances:", distances)
print("Waktu:", end - start, "detik")


=== Exact NN ===
Indices: [[642 206 622 386 961]]
Distances: [[0.00571344 0.01733941 0.0183886  0.02030767 0.02526056]]
Waktu: 0.006781101226806641 detik

=== HNSW ===
Indices: [[642 206 622 386 961]]
Distances: [[3.2643413e-05 3.0065523e-04 3.3814073e-04 4.1240122e-04 6.3809589e-04]]
Waktu: 0.00012636184692382812 detik


Lakukan percobaan dengan metric distance yang berbeda, 1.000 vs 1 juta data, serta data 2D vs 5D. Catat hasilnya pada tabel yang Anda buat sendiri seperti pada praktikum 1.

### Tabel eksperimen: Exact NN vs HNSW

Konfigurasi:
- Distance metrics: Euclidean (l2), Cosine (angular)
- Jumlah data: 1,000 dan 1,000,000
- Dimensi: 2D dan 5D
- K = 5 tetangga, HNSW: M=16, ef_construction=100, ef=50

Catatan:
- Waktu HNSW yang dilaporkan adalah waktu query saja (build index tidak dihitung), seperti praktik umum ANN.
- Exact untuk cosine menggunakan cosine distance (kecil = lebih mirip).
- Untuk 1 juta data, perhitungan Exact dilakukan per-chunk agar hemat memori.

In [None]:
import numpy as np
import pandas as pd
import time
import hnswlib
from sklearn.metrics import pairwise_distances

# Benchmark exact (brute-force) nearest-neighbour search against HNSW over a
# grid of distance metrics, dimensions and dataset sizes. One summary row per
# configuration is collected in `rows`.
np.random.seed(42)

# (hnswlib space name, label shown in the results table)
METRICS = [("l2", "Euclidean"), ("cosine", "Angular")]
# hnswlib space name -> metric name passed to sklearn's pairwise_distances
EXACT_METRIC = {"l2": "euclidean", "cosine": "cosine"}
DIMS = [2, 5]

# Lower this if the machine cannot handle one million points
N_LARGE = 1_000_000
SIZES = [1_000, N_LARGE]
CHUNK = 100_000

K = 5
M = 16
EF_CONSTRUCTION = 100
EF = 50

rows = []

for space, label in METRICS:
    for d in DIMS:
        for n in SIZES:
            X = np.random.rand(n, d).astype(np.float32)
            q = np.random.rand(1, d).astype(np.float32)

            # Exact search, computed chunk by chunk to bound memory use.
            # perf_counter() is used because the measured intervals are far
            # below the resolution time.time() offers on some platforms.
            start = time.perf_counter()
            dists = np.concatenate([
                pairwise_distances(q, X[i:i + CHUNK], metric=EXACT_METRIC[space])[0]
                for i in range(0, n, CHUNK)
            ])
            # Only the K smallest distances are needed: select them with a
            # partial partition, then order just those K candidates, instead
            # of fully sorting every distance.
            kth = min(K, n) - 1
            nearest = np.argpartition(dists, kth)[:K]
            idx_exact = nearest[np.argsort(dists[nearest])].tolist()
            t_exact = time.perf_counter() - start

            # HNSW: index construction is not timed, only the query is
            index = hnswlib.Index(space=space, dim=d)
            index.init_index(max_elements=n, ef_construction=EF_CONSTRUCTION, M=M)
            index.add_items(X)
            index.set_ef(EF)

            start = time.perf_counter()
            labels, _ = index.knn_query(q, k=K)
            t_hnsw = time.perf_counter() - start
            idx_hnsw = labels[0].tolist()

            rows.append({
                "Distance Metrics": label,
                "Dimensi": d,
                "Jumlah data": f"{n:,}",
                "Index": f"HNSW(M={M}, ef={EF})",
                "Hasil Index terdekat ENN vs HNSW": f"{idx_exact}, {idx_hnsw}",
                "Waktu komputasi Vs": f"{round(t_exact, 6)}, {round(t_hnsw, 6)}"
            })

pd.DataFrame(rows)

Unnamed: 0,Distance Metrics,Dimensi,Jumlah data,Index,Hasil Index terdekat ENN vs HNSW,Waktu komputasi Vs
0,Euclidean,2,1000,"HNSW(M=16, ef=50)","[112, 535, 777, 246, 763], [112, 535, 777, 246...","0.004298, 4.2e-05"
1,Euclidean,2,1000000,"HNSW(M=16, ef=50)","[337023, 831197, 628699, 309892, 615834], [337...","0.08252, 4.6e-05"
2,Euclidean,5,1000,"HNSW(M=16, ef=50)","[468, 52, 885, 292, 929], [468, 52, 885, 292, ...","0.000729, 5.1e-05"
3,Euclidean,5,1000000,"HNSW(M=16, ef=50)","[112836, 698526, 151916, 220675, 883190], [112...","0.093704, 5.6e-05"
4,Angular,2,1000,"HNSW(M=16, ef=50)","[908, 906, 428, 410, 608], [908, 906, 428, 410...","0.000726, 3.4e-05"
5,Angular,2,1000000,"HNSW(M=16, ef=50)","[878753, 930577, 698177, 658808, 45117], [3347...","0.080009, 7e-05"
6,Angular,5,1000,"HNSW(M=16, ef=50)","[299, 927, 904, 455, 969], [299, 927, 904, 455...","0.000728, 5.5e-05"
7,Angular,5,1000000,"HNSW(M=16, ef=50)","[891953, 967055, 32877, 182792, 732813], [8919...","0.089084, 7e-05"


In [None]:
# Ringkasan kecepatan: hitung speedup Exact/HNSW
import pandas as pd

def parse_times(val: str):
    """Split an ``"exact, hnsw"`` timing string into two floats.

    The value is converted with ``str()`` and split on commas; the first two
    fields are parsed as floats and any further fields are ignored.

    Returns:
        ``(exact_seconds, hnsw_seconds)`` on success, or ``(None, None)``
        when fewer than two fields are present or a field is not a number.
    """
    fields = str(val).split(',')
    if len(fields) < 2:
        return None, None
    try:
        exact_s = float(fields[0].strip())
        hnsw_s = float(fields[1].strip())
    except ValueError:
        return None, None
    return exact_s, hnsw_s

# Turn the "exact, hnsw" timing strings back into numeric columns and derive
# the speed-up of HNSW over the exact search.
_df = pd.DataFrame(rows)

# parse_times always returns a 2-tuple, so it can be used directly.
_times = [parse_times(v) for v in _df["Waktu komputasi Vs"]]
_exact = [t[0] for t in _times]
_hnsw = [t[1] for t in _times]

# Build explicit float64 columns so unparsable timings (None) become NaN
# rather than producing object-dtype columns that cannot be divided.
_df["Exact (s)"] = pd.Series(_exact, index=_df.index, dtype="float64")
_df["HNSW (s)"] = pd.Series(_hnsw, index=_df.index, dtype="float64")
_df["Speedup (Exact/HNSW)"] = (_df["Exact (s)"] / _df["HNSW (s)"]).round(2)

cols = [
    "Distance Metrics", "Dimensi", "Jumlah data", "Index",
    "Hasil Index terdekat ENN vs HNSW", "Exact (s)", "HNSW (s)", "Speedup (Exact/HNSW)"
]

_df[cols]

Unnamed: 0,Distance Metrics,Dimensi,Jumlah data,Index,Hasil Index terdekat ENN vs HNSW,Exact (s),HNSW (s),Speedup (Exact/HNSW)
0,Euclidean,2,1000,"HNSW(M=16, ef=50)","[112, 535, 777, 246, 763], [112, 535, 777, 246...",0.004298,4.2e-05,102.33
1,Euclidean,2,1000000,"HNSW(M=16, ef=50)","[337023, 831197, 628699, 309892, 615834], [337...",0.08252,4.6e-05,1793.91
2,Euclidean,5,1000,"HNSW(M=16, ef=50)","[468, 52, 885, 292, 929], [468, 52, 885, 292, ...",0.000729,5.1e-05,14.29
3,Euclidean,5,1000000,"HNSW(M=16, ef=50)","[112836, 698526, 151916, 220675, 883190], [112...",0.093704,5.6e-05,1673.29
4,Angular,2,1000,"HNSW(M=16, ef=50)","[908, 906, 428, 410, 608], [908, 906, 428, 410...",0.000726,3.4e-05,21.35
5,Angular,2,1000000,"HNSW(M=16, ef=50)","[878753, 930577, 698177, 658808, 45117], [3347...",0.080009,7e-05,1142.99
6,Angular,5,1000,"HNSW(M=16, ef=50)","[299, 927, 904, 455, 969], [299, 927, 904, 455...",0.000728,5.5e-05,13.24
7,Angular,5,1000000,"HNSW(M=16, ef=50)","[891953, 967055, 32877, 182792, 732813], [8919...",0.089084,7e-05,1272.63
