In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import torch

from openai import OpenAI
import os

from config import model_experiments

In [2]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment before any
# client is created (presumably this is where the OpenAI API key comes
# from — TODO confirm).
load_dotenv()

True

In [3]:
# Read the preprocessed records and derive the identifier columns in one chain.
data = pd.read_json('../data/preprocessed/combined_data.json').assign(
    # unique record key built as "<question_id>_<label>"
    rec_id=lambda frame: frame['question_id'].astype(str) + '_' + frame['label'].astype(str),
    # filter key built as "<db_id>-<label>"
    metadata_filter=lambda frame: frame['db_id'].astype(str) + '-' + frame['label'].astype(str),
    # the row index exposed as an ordinary column
    index_id=lambda frame: frame.index,
)
# Second name bound to the very same DataFrame object (an alias, not a copy).
data_0 = data

In [4]:
# Keep only the experiment names whose config entry is marked as type "open-ai".
open_ai_models = [
    name
    for name, spec in model_experiments.items()
    if spec['type'] == 'open-ai'
]
open_ai_models

['text-embedding-3-small', 'text-embedding-3-large', 'text-embedding-ada-002']

In [5]:
# Module-level API client shared by the embedding helper functions in this notebook.
# No credentials are passed explicitly; presumably they are read from the
# environment (e.g. OPENAI_API_KEY) — TODO confirm.
client = OpenAI()

In [6]:
def get_embedding_openai(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single input.

    Args:
        text: Value forwarded as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    response = client.embeddings.create(input=text, model=model)
    first_item = response.data[0]
    return first_item.embedding

In [7]:
def get_embeddings_openai(list_of_sentences, model="text-embedding-3-small", batch_size=256):
    """Embed many sentences and return them stacked as a tensor.

    The sentences are sent to the embeddings endpoint in batches instead of
    one request per sentence, which removes most of the HTTP round-trips.

    Args:
        list_of_sentences: Iterable of inputs to embed (a list or a pandas
            Series both work; it is materialised with ``list(...)``).
        model: Name of the embedding model to request.
        batch_size: Maximum number of sentences sent per request. Must be a
            positive integer; keep it within the API's per-request limits.

    Returns:
        ``torch.tensor`` built from the embeddings, in the same order as the
        input sentences. An empty input yields an empty tensor.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    sentences = list(list_of_sentences)
    embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=model)
        # Each returned item carries the position of its input within the
        # batch; sort on it so the output order never depends on response order.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)
    # return and convert to tensor
    return torch.tensor(embeddings)

In [64]:
# OpenAI embedding pricing per 1k tokens (as announced when this notebook was written):
#   text-embedding-3-large: $0.00013
#   text-embedding-3-small: $0.00002 (a 5x reduction compared to text-embedding-ada-002 at $0.0001)

In [12]:
# Sequentially create and save the embeddings as tensors: one .pt file per
# (model, column) pair, followed by the total elapsed wall-clock time.
import os
import time

# Create the output directory before any embedding request is made:
# torch.save does not create missing parent directories, so without this a
# missing folder would only surface after the API calls have already finished.
os.makedirs('../data/embeddings', exist_ok=True)

start = time.time()
for model in open_ai_models:
    for column_name in ['question', 'evidence', 'SQL']:
        embeddings = get_embeddings_openai(data[column_name], model=model)
        torch.save(embeddings, f'../data/embeddings/emb_{column_name}_{model}.pt')
        print(f'emb_{column_name}_{model}.pt have been saved')
print('############################################')
print(f'Elapsed time: {time.time() - start}')

KeyboardInterrupt: 

In [16]:
import time
from concurrent.futures import ThreadPoolExecutor
import torch

def process_column_model(column_name, model, df=data):
    """Embed one DataFrame column with one model and save the tensor to disk.

    Args:
        column_name: Column of ``df`` whose values are embedded.
        model: Embedding model name; it is also part of the output file name.
        df: DataFrame to read the column from. The default is bound to the
            ``data`` object that existed when this function was defined, so
            rebinding ``data`` afterwards does not change the default.

    Side effects:
        Writes ``../data/embeddings/emb_<column_name>_<model>.pt`` with
        ``torch.save`` and prints a confirmation line.
    """
    out_dir = '../data/embeddings'
    # Create the target directory before requesting embeddings: torch.save
    # does not create missing parent directories, and failing only after the
    # API calls would throw their results away. exist_ok=True keeps this
    # harmless when several worker threads reach this line at once.
    os.makedirs(out_dir, exist_ok=True)
    embeddings = get_embeddings_openai(df[column_name], model=model)
    torch.save(embeddings, f'{out_dir}/emb_{column_name}_{model}.pt')
    print(f'emb_{column_name}_{model}.pt have been saved')

def run(df=data):
    """Embed every (model, column) combination on a thread pool and report timing.

    Args:
        df: DataFrame handed to ``process_column_model`` for each task.

    One task is submitted per model in ``open_ai_models`` and per column in
    ``question``, ``evidence`` and ``SQL``. Results are awaited in submission
    order, so an exception raised inside a task is re-raised here.
    """
    started_at = time.time()
    column_names = ['question', 'evidence', 'SQL']

    # max_workers=None leaves the pool size to the executor's default.
    with ThreadPoolExecutor(max_workers=None) as pool:
        pending = [
            pool.submit(process_column_model, column, model_name, df)
            for model_name in open_ai_models
            for column in column_names
        ]

        # Block until every task has finished.
        for task in pending:
            task.result()

    print('############################################')
    print(f'Elapsed time: {time.time() - started_at}')


run(df=data)

emb_SQL_text-embedding-ada-002.pt have been saved
emb_evidence_text-embedding-ada-002.pt have been saved
emb_question_text-embedding-ada-002.pt have been saved
emb_evidence_text-embedding-ada-002.pt have been saved
emb_question_text-embedding-ada-002.pt have been saved
emb_SQL_text-embedding-ada-002.pt have been saved
emb_question_text-embedding-3-small.pt have been saved
emb_SQL_text-embedding-3-small.pt have been saved
emb_evidence_text-embedding-3-small.pt have been saved
emb_SQL_text-embedding-3-small.pt have been saved
emb_evidence_text-embedding-3-small.pt have been saved
emb_question_text-embedding-3-small.pt have been saved
emb_question_text-embedding-3-large.pt have been saved
emb_evidence_text-embedding-3-large.pt have been saved
emb_SQL_text-embedding-3-large.pt have been saved
emb_evidence_text-embedding-3-large.pt have been saved
emb_question_text-embedding-3-large.pt have been saved
emb_SQL_text-embedding-3-large.pt have been saved
########################################