In [18]:
import sqlite3
import numpy as np
import faiss
import openai
import torch
import transformers
from transformers import BertTokenizer, BertModel
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoModel, AutoTokenizer
from concurrent.futures import ProcessPoolExecutor, as_completed

In [19]:
#conda install -c conda-forge PyTorch

In [27]:
def fetch_document_ids(db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT goid FROM subset_table")  # Adjust based on your table schema
    ids = [row[0] for row in cursor.fetchall()]
    conn.close()
    return ids

In [28]:
db_path = 'subset_data.db'
document_ids = fetch_document_ids(db_path)

In [120]:
def fetch_documents_by_ids(db_path, document_ids):
    documents = []
    # Open a single connection
    with sqlite3.connect(db_path) as conn:
        for doc_id in document_ids:
            cursor = conn.cursor()
            cursor.execute("SELECT text FROM subset_table WHERE goid = ?", (doc_id,))
            result = cursor.fetchone()
            if result:
                documents.append(result[0])
            else:
                documents.append(None)
    return documents

In [259]:
def create_hnsw_faiss_index(encoded_docs, M=16, efConstruction=200):
    """
    Create an HNSW FAISS index.

    Parameters:
    - encoded_docs: numpy array of vectors to be indexed.
    - M: The number of bi-directional links created for each element in the index. Higher values lead to higher accuracy but also higher memory consumption.
    - efConstruction: Controls the size of the dynamic list for the construction phase, affecting index construction speed and quality.

    Returns:
    - A trained FAISS HNSW index.
    """
    dimension = encoded_docs.shape[1]
    
    # Create an HNSW index
    index = faiss.IndexHNSWFlat(dimension, M, faiss.METRIC_L2)
    index.hnsw.efConstruction = efConstruction
    
    # No need to check if the index is trained since HNSW does not require explicit training
    index.add(encoded_docs)
    
    # Optionally, you can set efSearch parameter (size of the dynamic list for searching) after adding the data
    # For example, index.hnsw.efSearch = 64
    
    return index

In [383]:
embeddings_loaded = np.load('embeddings_docs_bert_base.npy', allow_pickle=True)

In [384]:
fa_index = create_quantized_faiss_index(embeddings_loaded)

In [196]:
def create_quantized_faiss_index(encoded_docs):
    dimension = encoded_docs.shape[1]
    nlist = min(len(encoded_docs), 2048)
    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFPQ(quantizer, dimension, nlist, 256, 8) 
    index.nprobe = 20
    assert not index.is_trained
    index.train(encoded_docs)
    index.add(encoded_docs)
    return index

In [246]:
def create_flat_faiss_index(encoded_docs):
    dimension = encoded_docs.shape[1]
    # Create a flat L2 index
    index = faiss.IndexFlatL2(dimension)
    # No need to train a flat index, so we can directly add the documents
    index.add(encoded_docs)
    return index

In [249]:
index = create_flat_faiss_index(embeddings_loaded)

FAISS generation took 0.00 seconds for 10000 samples with distilibert


In [200]:
faiss.write_index(index, "saved_index_subset_flat_512.index")

In [201]:
index_in = faiss.read_index("saved_index_subset_flat_512.index")

In [362]:
def generate_single_embedding(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=256)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:,0,:].cpu().numpy()
    return embedding

_vector = generate_single_embedding(text, model, tokenizer)
faiss.normalize_L2(_vector)

In [389]:
query = "What is World War 2?"

def search_faiss(query, faiss_index, k=5):
    faiss_index.nprobe = 100
    encoded_query = generate_single_embedding(query, model, tokenizer)
    distances, indices = faiss_index.search(encoded_query, k)
    return [i for i in indices[0] if i < len(documents)]

In [390]:
search_faiss(" Those of Lansdowne, Lawrence and Sheppard Placed in Arlington Receiving Vault.   Special to The New York Times.   WASHINGTON, Sept. 5", fa_index )

[-1, -1, -1, -1, -1]