In [None]:
#import statements
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import openai
import torch
import transformers
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer

In [None]:
#get document IDs from SQLite DB
def fetch_document_ids(db_path):
    """Return every ``goid`` stored in ``subset_table``.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        list: The ``goid`` values, in the order SQLite yields them.

    Raises:
        sqlite3.Error: If the query fails (for example, the table is missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        # Adjust the table/column names here if your schema differs.
        cursor = conn.execute("SELECT goid FROM subset_table")
        return [row[0] for row in cursor]
    finally:
        # Always release the connection, including when the query raises.
        conn.close()
# Path to the SQLite database file used by the fetch helpers in this notebook.
db_path = 'subset_data.db'
# Load all goid values from subset_table up front; this runs the query when the cell executes.
document_ids = fetch_document_ids(db_path)

In [None]:
# Fetch the document texts from the SQLite DB for the retrieved document IDs and store them in a list
def fetch_documents_by_ids(db_path, document_ids):
    """Fetch the ``text`` column of ``subset_table`` for each requested ``goid``.

    Args:
        db_path: Path to the SQLite database file.
        document_ids: Iterable of ``goid`` values to look up.

    Returns:
        list: One entry per requested ID, in input order. Each entry is the
        matching row's ``text`` value, or ``None`` when no row matches.

    Raises:
        sqlite3.Error: If a query fails.
    """
    query = "SELECT text FROM subset_table WHERE goid = ?"
    # sqlite3's connection context manager only commits/rolls back; it does not
    # close the connection, so close it explicitly.
    conn = sqlite3.connect(db_path)
    try:
        # A single cursor is reused for every lookup.
        cursor = conn.cursor()
        documents = []
        for doc_id in document_ids:
            row = cursor.execute(query, (doc_id,)).fetchone()
            # Keep a placeholder for missing IDs so positions stay aligned with document_ids.
            documents.append(row[0] if row else None)
        return documents
    finally:
        conn.close()

# documents[i] holds the text for document_ids[i], or None when no row matched that ID.
documents = fetch_documents_by_ids(db_path, document_ids)

In [None]:
# Load the DistilBERT tokenizer and model (note: the exact same methodology was replicated for the other embedding models)
# NOTE(review): 'tokenizer' and 'model' look like placeholder names/paths — confirm they resolve to the intended checkpoint.
tokenizer = AutoTokenizer.from_pretrained('tokenizer')
model = AutoModel.from_pretrained('model')

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Move the model onto the selected device
model = model.to(device)  

In [None]:
#function to generate embeddings via batch processing
def generate_embeddings(texts, model, tokenizer, batch_size=256):
    """Embed ``texts`` in batches and return one vector per text.

    Each text is tokenized (padded, truncated to 512 tokens) and passed through
    ``model``; the hidden state at sequence position 0 is used as the embedding
    (for BERT-style tokenizers this is presumably the [CLS] position).

    Args:
        texts: Sliceable sequence of strings to embed.
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        batch_size: Number of texts per forward pass.

    Returns:
        list: One 1-D numpy array per input text, in input order. Empty input
        yields an empty list.
    """
    model.eval()  # Put the model in evaluation mode

    # Send inputs to wherever the model's weights live rather than relying on a
    # notebook-level `device` global that may be undefined or out of date.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = torch.device('cpu')

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = model(**inputs)
            # Move the embeddings back to the CPU so they can be handled as numpy arrays.
            embeddings.extend(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return embeddings

In [None]:
#time the embeddings for comparison between embedding models
# Number of documents embedded in the timed run; keep it fixed so timings are
# comparable across embedding models.
N_SAMPLES = 100000
docs_to_embed = documents[:N_SAMPLES]

# perf_counter is a monotonic, high-resolution clock intended for measuring elapsed time.
start_time = time.perf_counter()

# Generate embeddings for the selected documents with the loaded model
embeddings = generate_embeddings(docs_to_embed, model, tokenizer)

end_time = time.perf_counter()
duration = end_time - start_time
# Report the number of documents actually embedded, which can be smaller than
# N_SAMPLES when fewer documents were loaded.
print(f"Embedding generation took {duration:.2f} seconds for {len(embeddings)} samples.")

In [None]:
# Save the document embeddings to disk so they can be loaded again later
np.save('embeddings_docs_bert_base.npy', embeddings)

In [None]:
#generate one embedding for the query
def generate_single_embedding(text, model, tokenizer, max_length=256):
    """Embed a single text (e.g. a query) with the given model.

    Args:
        text: The string to embed.
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        max_length: Truncation length in tokens passed to the tokenizer.

    Returns:
        numpy.ndarray: The hidden state at sequence position 0 for every row the
        tokenizer produced (a single row for a single string), as a 2-D array.
    """
    # Match generate_embeddings: make sure layers such as dropout are in inference mode.
    model.eval()

    # Send inputs to wherever the model's weights live rather than relying on a
    # notebook-level `device` global that may be undefined or out of date.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = torch.device('cpu')

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()