### Tugas 1

#### Import Library

In [None]:
!pip install annoy



In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
!pip install hnswlib

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp312-cp312-linux_x86_64.whl size=2528147 sha256=54f540dd17a0d38a23e450c849d2d627cb8240bc67d8dd15728a700695553e8d
  Stored in directory: /root/.cache/pip/wheels/ac/39/b3/cbd7f9cbb76501d2d5fbc84956e70d0b94e788aac87bda465e
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


In [None]:
import numpy as np
import time
import annoy
import faiss
import hnswlib
import pandas as pd

#### Membuat Dataset

In [None]:
# Each entry is (number of points, number of dimensions) for one synthetic dataset.
configs = [
    (1000, 2),     # 1000 / 2D
    (1000, 5),     # 1000 / 5D
    (1000000, 2),  # 1000000 / 2D
    (1000000, 5),  # 1000000 / 5D
]


def _make_dataset(seed, n_points, n_dims):
  """Build one benchmark dataset of uniform random float32 values in [0, 100).

  The global NumPy generator is re-seeded first, then the data points are
  drawn before the 100 query points, so the result is reproducible per seed.
  Returns a dict with keys "data", "queries", "n_points", "n_dims", "info".
  """
  np.random.seed(seed)
  points = (np.random.rand(n_points, n_dims) * 100).astype(np.float32)
  query_points = (np.random.rand(100, n_dims) * 100).astype(np.float32)
  return {
      "data": points,
      "queries": query_points,
      "n_points": n_points,
      "n_dims": n_dims,
      "info": f"{n_points} points, {n_dims}D"
  }


# Dataset number `offset` uses seed 42 + offset.
all_datasets = [
    _make_dataset(42 + offset, n_points, n_dims)
    for offset, (n_points, n_dims) in enumerate(configs)
]

#### Benchmark

In [None]:
# Number of nearest neighbours requested from every index for each query.
k = 3

# Benchmark rows are appended here, one dict per (dataset, metric) pair.
results = list()

In [None]:
# Benchmark ANNOY, FAISS and HNSW on every synthetic dataset.
# For each library two numbers are recorded:
#   * build time in seconds (index creation + insertion of all points)
#   * mean query time in milliseconds over all query points
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# intended for measuring short intervals.

# Start from an empty list so that re-running this cell does not append
# duplicate rows to the summary table.
results.clear()

for dataset in all_datasets:
  data = dataset["data"]
  queries = dataset["queries"]
  n_points = dataset["n_points"]
  n_dims = dataset["n_dims"]
  info = dataset["info"]

  num_queries = len(queries)

  # ANNOY: items are inserted one by one, then the forest is built.
  time_start = time.perf_counter()
  annoy_index = annoy.AnnoyIndex(n_dims, 'euclidean')
  for i in range(n_points):
    annoy_index.add_item(i, data[i])
  annoy_index.build(3)
  build_time_annoy = time.perf_counter() - time_start

  # ANNOY is queried one vector at a time (FAISS/HNSW below get the whole batch).
  time_start = time.perf_counter()
  for q in queries:
    annoy_index.get_nns_by_vector(q, k)
  query_time_annoy = (time.perf_counter() - time_start) / num_queries * 1000

  # FAISS
  time_start = time.perf_counter()
  faiss_index = faiss.IndexFlatL2(n_dims)
  faiss_index.add(data)
  build_time_faiss = time.perf_counter() - time_start

  time_start = time.perf_counter()
  faiss_index.search(queries, k)
  query_time_faiss = (time.perf_counter() - time_start) / num_queries * 1000

  # HNSW
  time_start = time.perf_counter()
  hnsw_index = hnswlib.Index(space='l2', dim=n_dims)
  hnsw_index.init_index(max_elements=n_points, ef_construction=200, M=16)
  hnsw_index.add_items(data)
  build_time_hnsw = time.perf_counter() - time_start

  time_start = time.perf_counter()
  hnsw_index.knn_query(queries, k=k)
  query_time_hnsw = (time.perf_counter() - time_start) / num_queries * 1000

  results.append({
      "Config": info,
      "Metric": "Build Time (s)",
      "ANNOY": f"{build_time_annoy:.4f}",
      "FAISS": f"{build_time_faiss:.4f}",
      "HNSW": f"{build_time_hnsw:.4f}",
  })

  results.append({
      "Config": info,
      "Metric": "Query Time (ms)",
      "ANNOY": f"{query_time_annoy:.4f}",
      "FAISS": f"{query_time_faiss:.4f}",
      # Fixed: this row previously repeated build_time_hnsw, so the table
      # showed the HNSW build time in the query-time row.
      "HNSW": f"{query_time_hnsw:.4f}",
  })

df = pd.DataFrame(results)
print("\n\n--- Hasil Akhir Benchmark ---")
print(df.to_string(index=False))



--- Hasil Akhir Benchmark ---
            Config          Metric   ANNOY  FAISS     HNSW
   1000 points, 2D  Build Time (s)  0.0122 0.0001   0.1146
   1000 points, 2D Query Time (ms)  0.0072 0.0736   0.1146
   1000 points, 5D  Build Time (s)  0.0151 0.0001   0.1240
   1000 points, 5D Query Time (ms)  0.0067 0.0655   0.1240
1000000 points, 2D  Build Time (s) 10.0013 0.0115 114.1987
1000000 points, 2D Query Time (ms)  0.0135 1.6826 114.1987
1000000 points, 5D  Build Time (s)  8.3714 0.0258 185.9874
1000000 points, 5D Query Time (ms)  0.0129 1.4594 185.9874


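Berdasarkan hasil akhir benchmark ketiga metode, HNSW memiliki waktu persiapan (build) paling lambat dan FAISS paling cepat, sedangkan ANNOY memiliki waktu pencarian (query) paling cepat. Catatan: pada tabel di atas, nilai HNSW pada baris Query Time identik dengan nilai Build Time-nya, sehingga waktu pencarian HNSW tidak dapat disimpulkan dari tabel tersebut.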

### Tugas 2

#### Instalasi Library

In [None]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m645.1/647.5 kB[0m [31m94.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp312-cp312-linux_x86_64.whl size=551809 sha256=38bae4dfcc40696a583992547d43b5e63d7e7a34db012805353adf61be031eee
  Stored in directory: /root/.cache/pip/wheels/db/b9/53/a3b2d1fe1743abadddec6aa541294b24fdbc39d7800bc57311
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
!pip install hnswlib

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp312-cp312-linux_x86_64.whl size=2528146 sha256=81f26a6c454516efccba59139efcc22f3523e0cca8b4a8cbac519fec49b98433
  Stored in directory: /root/.cache/pip/wheels/ac/39/b3/cbd7f9cbb76501d2d5fbc84956e70d0b94e788aac87bda465e
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


In [None]:
import pandas as pd
import numpy as np
import time
import faiss
from annoy import AnnoyIndex
import hnswlib
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

#### Load Datasets

In [None]:
# Read the song table with the Python parsing engine; on_bad_lines='warn'
# asks pandas to warn about malformed rows instead of raising.
df = pd.read_csv('songs_with_attributes_and_lyrics.csv', on_bad_lines='warn', engine='python')

# Audio attribute columns that make up the search space.
features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]
X = df[features].values

# Standardise every feature column, then cast the matrix to float32.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X).astype('float32')

# Search settings shared by all methods in the cells below.
k = 10  # number of nearest neighbours
num_items, num_features = X_scaled.shape

# The first row of the table is used as the query song.
query_index = 0
query_vector = X_scaled[query_index].reshape(1, -1)
# NOTE(review): the saved output shows the fallback label 'Track 0', so the
# CSV may not contain a 'track_name' column - confirm the actual column name.
query_track_name = df.iloc[query_index].get('track_name', f'Track {query_index}')

print(f"Mencari {k} lagu terdekat untuk: '{query_track_name}'")
print(f"Total data: {num_items} lagu")

# Per-method timings and neighbour ids are collected here by the later cells.
results = {}


  df = pd.read_csv('songs_with_attributes_and_lyrics.csv', on_bad_lines='warn', engine='python')


Mencari 10 lagu terdekat untuk: 'Track 0'
Total data: 478118 lagu


#### Exact NN

In [None]:
# Exact brute-force baseline. k + 1 neighbours are requested because the query
# vector is itself a row of X_scaled and is expected among its own neighbours.
# time.perf_counter() is a monotonic clock suited to timing short intervals.
start_time = time.perf_counter()
nn_exact = NearestNeighbors(n_neighbors=k + 1, algorithm='brute', metric='euclidean')
nn_exact.fit(X_scaled)
fit_time = time.perf_counter() - start_time

start_time = time.perf_counter()
distances, indices = nn_exact.kneighbors(query_vector)
query_time = time.perf_counter() - start_time

# Remove the query row by id instead of assuming it is always hit #0:
# duplicate rows or floating-point rounding can rank another row first.
exact_hits = indices[0]
exact_indices = exact_hits[exact_hits != query_index][:k]
results['Exact NN'] = {'build_time': fit_time, 'query_time': query_time, 'indices': exact_indices}

#### ANNOY

In [None]:
# ANNOY index over the standardised features (Euclidean metric, 10 trees).
# AnnoyIndex comes from the notebook's import cell (`from annoy import AnnoyIndex`),
# so no extra import is needed here.
start_time = time.perf_counter()
annoy_index = AnnoyIndex(num_features, 'euclidean')
for i in range(num_items):
  annoy_index.add_item(i, X_scaled[i])
annoy_index.build(10)
fit_time = time.perf_counter() - start_time

# Ask for k + 1 hits because the query vector is itself an indexed item.
start_time = time.perf_counter()
annoy_hits = annoy_index.get_nns_by_vector(query_vector[0], k + 1)
query_time = time.perf_counter() - start_time

# Remove the query item by id instead of dropping position 0: an approximate
# search (or duplicate rows) does not guarantee the query itself comes first.
annoy_indices = np.array([item for item in annoy_hits if item != query_index][:k], dtype=int)
results['ANNOY'] = {'build_time': fit_time, 'query_time': query_time, 'indices': annoy_indices}

#### FAISS

In [None]:
# FAISS flat L2 index: "building" is just adding the vectors to the index.
start_time = time.perf_counter()
faiss_index = faiss.IndexFlatL2(num_features)
faiss_index.add(X_scaled)
build_time = time.perf_counter() - start_time

# Ask for k + 1 hits because the query vector is itself an indexed row.
start_time = time.perf_counter()
distances, faiss_indices = faiss_index.search(query_vector, k + 1)
query_time = time.perf_counter() - start_time

# Remove the query row by id instead of assuming it is always hit #0:
# duplicate rows can tie at distance zero and be ranked first.
faiss_hits = faiss_indices[0]
faiss_indices = faiss_hits[faiss_hits != query_index][:k]
results['FAISS'] = {'build_time': build_time, 'query_time': query_time, 'indices': faiss_indices}

#### HNSW

In [None]:
# HNSW graph index (squared-L2 space); row numbers are used as item labels.
start_time = time.perf_counter()
hnsw_index = hnswlib.Index(space='l2', dim=num_features)
hnsw_index.init_index(max_elements=num_items, ef_construction=200, M=16)
hnsw_index.add_items(X_scaled, np.arange(num_items))
build_time = time.perf_counter() - start_time

# Search-time parameter; set outside the timed region so that only the
# query itself is measured.
hnsw_index.set_ef(50)

# Ask for k + 1 hits because the query vector is itself an indexed item.
start_time = time.perf_counter()
hnsw_labels, distances = hnsw_index.knn_query(query_vector, k=k + 1)
query_time = time.perf_counter() - start_time

# knn_query returns arrays of shape (n_queries, k + 1). The previous
# `hnsw_indices[1:]` sliced the query axis and produced an empty array;
# take row 0 (the single query) and then remove the query item by id.
hnsw_hits = hnsw_labels[0]
hnsw_indices = hnsw_hits[hnsw_hits != query_index][:k]
results['HNSW'] = {'build_time': build_time, 'query_time': query_time, 'indices': hnsw_indices}

#### Hasil Perbandingan

In [None]:
# Print a fixed-width comparison table: one row per method stored in `results`,
# with build and query times shown to six decimal places.
banner = "=" * 60
print(banner)
print("HASIL PERBANDINGAN")
print(banner)

print(f"{'Metode':<15} | {'Waktu Build (s)':<18} | {'Waktu Query (s)':<18} |")
print("-" * 60)

for method, stats in results.items():
    # Format to six decimals and left-align in an 18-character column.
    print(f"{method:<15} | {stats['build_time']:<18.6f} | {stats['query_time']:<18.6f} |")

HASIL PERBANDINGAN
Metode          | Waktu Build (s)    | Waktu Query (s)    |
------------------------------------------------------------
Exact NN        | 0.010652           | 0.021496           |
ANNOY           | 9.630796           | 0.000214           |
FAISS           | 0.021497           | 0.004399           |
HNSW            | 93.667323          | 0.000674           |


Perbandingan keempat metode menunjukkan bahwa meskipun HNSW dan ANNOY merupakan metode yang paling lambat pada tahap persiapan (build), keduanya merupakan metode yang paling unggul dalam kecepatan pencarian (query).