# Embeddings

In [None]:
# Quietly (-q) install sentence-transformers, h5py and torch into the runtime,
# then run nvidia-smi to report the NVIDIA GPU state of the machine.
!pip install -q sentence-transformers h5py torch
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive. The google.colab module is used here,
# so this cell is expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import h5py

# Load the all-MiniLM-L6-v2 sentence-transformers model and place it on the
# CUDA device. The device is hard-coded.
# NOTE(review): presumably this raises on a runtime without a CUDA GPU - confirm.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda")
# Report the size of the vectors the loaded model produces.
print(f"Model loaded, dim: {embedder.get_sentence_embedding_dimension()}")

In [None]:
def generate_embeddings(input_file, output_file):
    """Embed every document of a TSV file and store ids and vectors as HDF5.

    Each input line is expected to hold a document id and its text separated
    by a single tab. After stripping surrounding whitespace, a line that does
    not split into exactly two parts is skipped; the number of skipped lines
    is reported. Texts are encoded with the module-level ``embedder`` using
    ``batch_size=256`` and ``normalize_embeddings=True``.

    The output file contains two datasets:

    * ``id`` - fixed-width byte strings holding the UTF-8 encoded ids. The
      width is at least 20 bytes and grows to fit the longest id, so ids are
      never truncated.
    * ``embedding`` - the embeddings cast to float32, one row per document.

    Args:
        input_file: Path of the tab-separated input file, read as UTF-8.
        output_file: Path of the HDF5 file to write. It is opened in ``'w'``
            mode, so an existing file is replaced.
    """
    ids = []
    texts = []
    skipped = 0
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale encoding.
    with open(input_file, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                ids.append(parts[0])
                texts.append(parts[1])
            else:
                skipped += 1

    print(f"{input_file}: {len(ids):,} docs")
    if skipped:
        # Surface dropped lines instead of discarding them silently.
        print(f"  skipped {skipped:,} line(s) without an id<TAB>text pair")

    # Encode ids to bytes up front: a fixed 'S20' dtype would silently cut
    # ids longer than 20 bytes and fail on non-ASCII ids. Keep 20 as the
    # minimum width so files with short ids keep the same dtype as before.
    id_bytes = [doc_id.encode("utf-8") for doc_id in ids]
    id_width = max(20, max(map(len, id_bytes), default=0))

    embeddings = embedder.encode(texts, batch_size=256, show_progress_bar=True, normalize_embeddings=True)

    with h5py.File(output_file, 'w') as f:
        f.create_dataset('id', data=np.array(id_bytes, dtype=f'S{id_width}'))
        f.create_dataset('embedding', data=embeddings.astype(np.float32))

    print(f"  -> {output_file} {embeddings.shape}")

In [None]:
# Drive folder that holds the input TSV files and receives the HDF5 outputs.
base = '/content/drive/MyDrive/hqf_de'

# Each entry pairs an input TSV file name with the HDF5 file name that its
# embeddings are written to; both are relative to `base`.
files = [
    ('collection_100k.tsv', 'embeddings_original.h5'),
    ('expanded_100k.tsv', 'embeddings_expanded.h5'),
    ('validated_100k.tsv', 'embeddings_validated.h5'),
    ('doc2query_100k.tsv', 'embeddings_doc2query.h5'),
]

# Process the pairs in list order, one embedding run per input file.
for doc_file, emb_file in files:
    source_path = f"{base}/{doc_file}"
    target_path = f"{base}/{emb_file}"
    generate_embeddings(source_path, target_path)

print("\nDone!")