## Indexing Wikipedia Chunk Embeddings
* How much can I scale indexing on my desktop computer?
* Using 64GB system memory and 24GB GPU memory

In [1]:
import gc
import math
import os
os.environ["TOKENIZERS_PARALLELISM"] = 'false'
import pickle

import faiss
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from torch.utils.data import DataLoader
import transformers
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

# Handle for the CUDA GPU device.
device = torch.device("cuda")

In [2]:
# Directory with the Wikipedia document shards and directory with the
# chunk-embedding shards derived from them.
documents_path = '/home/stefanwebb/data/wikimedia/wikipedia/20231101.en'
embeddings_path = '/home/stefanwebb/embeddings/wikimedia/wikipedia/20231101.en'

# Both directories are read through the same 41 Parquet shard names.
files = ["train-%05d-of-00041.parquet" % shard for shard in range(41)]

# Number of rows requested per batch when streaming a shard.
batch_size = 1024


### Count How Many Documents

In [4]:
# Total number of rows across the document shards, taken from each
# Parquet file's metadata.
count_documents = sum(
    pq.ParquetFile(os.path.join(documents_path, shard)).metadata.num_rows
    for shard in files
)
print('Documents:', count_documents)

Documents: 6407814


### Count How Many Chunks

In [5]:
# Total number of rows across the embedding shards (each row is counted as
# one chunk), taken from each Parquet file's metadata.
rows_per_shard = [
    pq.ParquetFile(os.path.join(embeddings_path, shard)).metadata.num_rows
    for shard in files
]
count_chunks = sum(rows_per_shard)
print('Chunks', count_chunks)

Chunks 49522046


In [6]:
# Mean number of chunks per document, shown to two decimals.
avg_chunks_per_doc = count_chunks / count_documents
print('Avg chunks/doc:', round(avg_chunks_per_doc, 2))

Avg chunks/doc: 7.73


### Memory Requirements
* We clearly can't keep raw embeddings in 64GB system RAM (although my motherboard can hold up to 128GB)
* How about we deal with the text of the chunks separately and cast the embeddings to float16?

In [7]:
# 384 values per embedding at 4 bytes per value (presumably float32).
bytes_per_embedding = 4 * 384
# Size of all chunk embeddings in decimal gigabytes.
raw_embeddings_gb = count_chunks * bytes_per_embedding / 10**9
print('Memory for raw embeddings:', round(raw_embeddings_gb, 2), 'GB')

Memory for raw embeddings: 76.07 GB


### Separate Embeddings and Chunks
* Taking an incremental approach, which is easier for debugging.
* First, extract and concatenate the embeddings from each Parquet file.
* Separately, save the document chunks to another file.
* This step takes about 3 hours on my machine.

In [None]:
# For every embedding shard, write the embeddings as one float16 matrix (.npy)
# and the chunk texts as a pickled list (.pkl) next to the source Parquet file.
for idx, file in enumerate(files):
    print(f"File {idx} of 41")

    fullpath = os.path.join(embeddings_path, file)
    dataset = load_dataset("parquet", data_files={'train': fullpath}, streaming=True, batch_size=batch_size)

    # Stream the shard batch by batch, collecting both columns.
    embeddings, chunks = [], []
    for batch in dataset['train'].iter(batch_size=batch_size):
        # Store embeddings as float16 to reduce their memory footprint.
        embeddings.append(np.array(batch['embeddings']).astype(np.float16))
        chunks.extend(batch['chunks'])

    # Stack the per-batch arrays into a single matrix for this shard.
    embedding_matrix = np.concatenate(embeddings, axis=0)
    embeddings_file = os.path.join(embeddings_path, f'embeddings-{idx:05d}-of-00041.npy')
    with open(embeddings_file, 'wb') as out:
        np.save(out, embedding_matrix)

    chunks_file = os.path.join(embeddings_path, f'chunks-{idx:05d}-of-00041.pkl')
    with open(chunks_file, 'wb') as out:
        pickle.dump(chunks, out)

    # Drop this shard's data before moving on to the next one.
    del embedding_matrix, embeddings, chunks

### Sample 10% of data
* Need a smaller subset for training FAISS index on GPU
* This kept crashing due to an out of memory error, so I've moved it to a standalone script, `subsample_embeddings.py`

In [8]:
# Load the subsampled embeddings (presumably written by `subsample_embeddings.py`).
embeddings_file = os.path.join(embeddings_path, 'subset-embeddings.npy')
Xsubset = np.load(embeddings_file)

In [9]:
# Shape of the subsample, and its row count as a percentage of all chunks.
n_subset = Xsubset.shape[0]
Xsubset.shape, round(n_subset / count_chunks * 100, 2)

((4952183, 384), 10.0)

### Train FAISS Index
* 10% from each file should be representative of the entire dataset
* TODO: Calculate maximum fraction of data I can fit into GPU memory 

In [10]:
# The clustering step below is moved onto the GPUs, so require at least one.
assert faiss.get_num_gpus() > 0

# Dimensionality of the input embeddings passed to the index factory.
embed_dim = 384
# Factory strings of the kind the FAISS docs recommend for this number of
# embeddings, kept for reference only (they are NOT the index built below):
# index = faiss.index_factory(embed_dim, "IVF16384_HNSW32,Flat")
# index = faiss.index_factory(embed_dim, "PCA128,IVF16384_HNSW32,Flat")


# Number of IVF lists: 8 * sqrt(number of training vectors), truncated to int.
# NOTE(review): this is derived from the subsample size, not from the number of
# vectors that are later added to the index (count_chunks) -- confirm intended.
n_ivf = int(8 * np.sqrt(Xsubset.shape[0]))
# Index actually used: per the FAISS index-factory syntax, a PCA transform to
# 64 dimensions followed by an IVF index with n_ivf lists and uncompressed
# ("Flat") vector storage, compared with the L2 metric.
index = faiss.index_factory(embed_dim, f'PCA64,IVF{n_ivf},Flat', faiss.METRIC_L2)

# Swap the IVF sub-index's clustering index for a flat L2 index placed on all
# GPUs. index_ivf.d is the sub-index's own dimension (presumably the post-PCA
# dimension -- TODO confirm).
index_ivf = faiss.extract_index_ivf(index)
clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(index_ivf.d))
index_ivf.clustering_index = clustering_index
# NOTE(review): FAISS operates on float32 matrices; the dtype of Xsubset is not
# visible here -- confirm it is float32 or that the wrapper converts it.
index.train(Xsubset)

In [11]:
# Display the number of IVF lists derived from the subsample size.
n_ivf

17802

In [11]:
# Save the trained index before any vectors have been added to it.
empty_index_file = "wikipedia-en-simplifying-empty.index"
faiss.write_index(index, empty_index_file)

### Add Embeddings to Index
* Index file ends up being `13GB`, which is not bad considering the size of the raw embeddings is much larger!
* Also, this fits into GPU memory

In [12]:
# Populate the trained index with every embedding shard, loading one shard at
# a time so that only a single shard is held in memory.
print('Adding embeddings to index...')
for idx, file in enumerate(files):
    print(f"File {idx} of 41")
    embeddings_file = os.path.join(embeddings_path, f'embeddings-{idx:05d}-of-00041.npy')
    # The shards were saved as float16, but FAISS works on contiguous float32
    # matrices; convert explicitly instead of relying on the Python wrapper to
    # coerce the dtype (which depends on the installed FAISS version).
    X = np.ascontiguousarray(np.load(embeddings_file), dtype=np.float32)
    index.add(X)
    # Free the shard before loading the next one.
    del X
    gc.collect()

Adding embeddings to index...
File 0 of 41
File 1 of 41
File 2 of 41
File 3 of 41
File 4 of 41
File 5 of 41
File 6 of 41
File 7 of 41
File 8 of 41
File 9 of 41
File 10 of 41
File 11 of 41
File 12 of 41
File 13 of 41
File 14 of 41
File 15 of 41
File 16 of 41
File 17 of 41
File 18 of 41
File 19 of 41
File 20 of 41
File 21 of 41
File 22 of 41
File 23 of 41
File 24 of 41
File 25 of 41
File 26 of 41
File 27 of 41
File 28 of 41
File 29 of 41
File 30 of 41
File 31 of 41
File 32 of 41
File 33 of 41
File 34 of 41
File 35 of 41
File 36 of 41
File 37 of 41
File 38 of 41
File 39 of 41
File 40 of 41


In [13]:
# Persist the index now that all embedding shards have been added.
populated_index_file = "wikipedia-en-simplifying.index"
faiss.write_index(index, populated_index_file)

In [14]:
# Sanity check: display whether the index reports itself as trained.
index.is_trained

True

### Fin
We have a scalable index over all of Wikipedia we can use for RAG inference!