# Praktikum 4

Pada percobaan kali ini, kita akan melihat perbedaan ketiga model yang telah dibahas dan membandingkan hasilnya.

In [None]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
# Fixed seed so that reruns generate the same data and query point.
np.random.seed(123)

n_data = 1_000_000   # try 100_000 first if RAM is limited
dim = 5
X = np.random.random((n_data, dim)).astype(np.float32)

# Single query point; each index is asked for its k nearest neighbours.
query = np.random.random((1, dim)).astype(np.float32)
k = 10


def _report(build_time, query_time, first_neighbors):
    """Print the build time, query time and leading neighbour ids of one index."""
    print("Build time:", build_time, "detik")
    print("Query time:", query_time, "detik")
    print("Neighbors:", first_neighbors, "...")


# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

# time.perf_counter() is used for all timings: the measured intervals are
# sub-millisecond, which a wall clock such as time.time() may not resolve.
start = time.perf_counter()
for i, vec in enumerate(X):
    ann_index.add_item(i, vec)
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# With include_distances=True the result is a pair (ids, distances).
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

_report(build_time, query_time, neighbors[0][:5])

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

start = time.perf_counter()
faiss_index.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

_report(build_time, query_time, indices[0][:5])

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# init_index is deliberately inside the timed region, as part of the build.
start = time.perf_counter()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time ef is set outside the timed region.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

_report(build_time, query_time, labels[0][:5])


=== Annoy ===
Build time: 2.668334484100342 detik
Query time: 0.0001990795135498047 detik
Neighbors: [968151, 819015, 152491, 752066, 63130] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.003208160400390625 detik
Query time: 0.0034418106079101562 detik
Neighbors: [968151 819015 152491 752066  63130] ...

=== HNSW (hnswlib) ===
Build time: 11.284440755844116 detik
Query time: 0.00013637542724609375 detik
Neighbors: [968151 819015 152491 752066  63130] ...


### Tabel eksperimen: Perbandingan metric distance (Praktikum 4)

Konfigurasi singkat:
- Algoritma: Annoy, FAISS, HNSW
- Metrics: Euclidean (L2) dan Cosine (Angular)
- Dataset: N=100,000, dim=5 (sesuaikan jika sumber daya terbatas)
- K=10 tetangga

Catatan:
- Build time dan Query time dilaporkan terpisah.
- FAISS cosine: gunakan normalisasi vektor + IndexFlatIP sebagai pendekatan umum untuk cosine similarity.
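- Annoy `angular` menghitung sqrt(2(1 − cos θ)), yaitu jarak Euclidean antar-vektor yang telah dinormalisasi; nilainya berbeda dari cosine distance, tetapi urutan tetangganya sama. HNSW mendukung 'l2' dan 'cosine' secara langsung.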

In [None]:
import numpy as np
import pandas as pd
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# Fixed seed so that reruns generate the same data and query point.
np.random.seed(123)

N = 100_000   # number of data points; change if needed
D = 5         # vector dimensionality
K = 10        # neighbours requested per query
TREES = 10    # Annoy: value passed to build()
EF = 50       # HNSW: value passed to set_ef()
M = 16        # HNSW: M passed to init_index()
EF_CONS = 200 # HNSW: ef_construction passed to init_index()

# Random data and a single query point
X = np.random.random((N, D)).astype(np.float32)
q = np.random.random((1, D)).astype(np.float32)

results = []


def _record(algorithm, metric, build_s, query_s, neighbors):
    """Append one benchmark row to ``results``.

    Timings are rounded to 4 decimals; ``neighbors`` is stored as given.
    """
    results.append({
        "Algoritma": algorithm,
        "Metric": metric,
        "Build (s)": round(build_s, 4),
        "Query (s)": round(query_s, 4),
        "Neighbors": neighbors,
    })


# time.perf_counter() is used for all timings: the measured intervals are
# sub-millisecond, which a wall clock such as time.time() may not resolve.

# ===== Annoy (Euclidean) =====
ann_eu = AnnoyIndex(D, 'euclidean')
start = time.perf_counter()
for i, vec in enumerate(X):
    ann_eu.add_item(i, vec)
ann_eu.build(TREES)
build = time.perf_counter() - start
start = time.perf_counter()
idx = ann_eu.get_nns_by_vector(q[0], K)
query_t = time.perf_counter() - start
_record("Annoy", "Euclidean", build, query_t, idx[:5])

# ===== Annoy (Angular = Cosine) =====
ann_cos = AnnoyIndex(D, 'angular')
start = time.perf_counter()
for i, vec in enumerate(X):
    ann_cos.add_item(i, vec)
ann_cos.build(TREES)
build = time.perf_counter() - start
start = time.perf_counter()
idx = ann_cos.get_nns_by_vector(q[0], K)
query_t = time.perf_counter() - start
_record("Annoy", "Cosine", build, query_t, idx[:5])

# ===== FAISS (L2) =====
faiss_l2 = faiss.IndexFlatL2(D)
start = time.perf_counter()
faiss_l2.add(X)
build = time.perf_counter() - start
start = time.perf_counter()
D_l2, I_l2 = faiss_l2.search(q, K)
query_t = time.perf_counter() - start
_record("FAISS", "Euclidean", build, query_t, I_l2[0][:5].tolist())

# ===== FAISS (Cosine via normalized IP) =====
# Normalize vectors to unit length and use inner product; the small epsilon
# guards against division by zero for an all-zero row.
Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)
qn = q / (np.linalg.norm(q, axis=1, keepdims=True) + 1e-12)
faiss_ip = faiss.IndexFlatIP(D)
start = time.perf_counter()
faiss_ip.add(Xn)
build = time.perf_counter() - start
start = time.perf_counter()
S_ip, I_ip = faiss_ip.search(qn, K)
query_t = time.perf_counter() - start
# For cosine distance, higher IP means closer; we report neighbors as-is
_record("FAISS", "Cosine", build, query_t, I_ip[0][:5].tolist())

# ===== HNSW (l2) =====
h_l2 = hnswlib.Index(space='l2', dim=D)
# init_index is deliberately inside the timed region, as part of the build.
start = time.perf_counter()
h_l2.init_index(max_elements=N, ef_construction=EF_CONS, M=M)
h_l2.add_items(X)
build = time.perf_counter() - start
h_l2.set_ef(EF)
start = time.perf_counter()
L_l2, _ = h_l2.knn_query(q, k=K)
query_t = time.perf_counter() - start
_record("HNSW", "Euclidean", build, query_t, L_l2[0][:5].tolist())

# ===== HNSW (cosine) =====
h_cs = hnswlib.Index(space='cosine', dim=D)
start = time.perf_counter()
h_cs.init_index(max_elements=N, ef_construction=EF_CONS, M=M)
h_cs.add_items(X)
build = time.perf_counter() - start
h_cs.set_ef(EF)
start = time.perf_counter()
L_cs, _ = h_cs.knn_query(q, k=K)
query_t = time.perf_counter() - start
_record("HNSW", "Cosine", build, query_t, L_cs[0][:5].tolist())

# Last expression of the cell: rendered as the result table in a notebook.
pd.DataFrame(results)

Unnamed: 0,Algoritma,Metric,Build (s),Query (s),Neighbors
0,Annoy,Euclidean,0.245,0.0002,"[97876, 23795, 6300, 35090, 69415]"
1,Annoy,Cosine,0.2984,0.0001,"[92706, 78796, 15225, 36016, 20140]"
2,FAISS,Euclidean,0.0005,0.0003,"[97876, 23795, 6300, 35090, 69415]"
3,FAISS,Cosine,0.0006,0.0006,"[92706, 78796, 15225, 36016, 20140]"
4,HNSW,Euclidean,0.6514,0.0001,"[97876, 23795, 6300, 35090, 69415]"
5,HNSW,Cosine,0.5913,0.0001,"[92706, 78796, 15225, 36016, 20140]"
