In [1]:
import pandas as pd
import numpy as np
import torch

import cohere
import os

from config import model_experiments

In [2]:
from dotenv import load_dotenv

# Run python-dotenv's loader first so that any variables it provides are
# visible in the process environment before the key is looked up.
load_dotenv()

# Resolves to None (rather than raising) when COHERE_API_KEY is not set.
cohere_api_key = os.environ.get("COHERE_API_KEY")

In [3]:
# Load the combined, preprocessed records and derive the helper columns in one chain:
#   rec_id          -> "<question_id>_<label>", intended as a per-record identifier
#   metadata_filter -> "<db_id>-<label>"
#   index_id        -> the row index of the loaded frame
data = pd.read_json('../data/preprocessed/combined_data.json').assign(
    rec_id=lambda frame: frame['question_id'].astype(str) + '_' + frame['label'].astype(str),
    metadata_filter=lambda frame: frame['db_id'].astype(str) + '-' + frame['label'].astype(str),
    index_id=lambda frame: frame.index,
)

# data_0 is a second name bound to the same DataFrame object (not a copy).
data_0 = data

In [4]:
# Names of all entries in the experiment config whose "type" is "cohere".
cohere_models = [
    name
    for name, spec in model_experiments.items()
    if spec['type'] == 'cohere'
]
cohere_models

['embed-english-v3.0']

In [5]:
# Cohere client used for every embedding request in this notebook, built from
# the API key read from the environment.
co = cohere.Client(cohere_api_key)

# The texts being embedded are the passages we search over, so the embedding
# helper requests them with input_type="search_document".


In [6]:
def get_embeddings_cohere(model_name, sentences, input_type="search_document"):
    """Embed texts with a Cohere model and return the vectors as a torch tensor.

    Args:
        model_name: Name of the Cohere embedding model passed to ``co.embed``.
        sentences: The texts to embed. Any iterable of strings is accepted; a
            single bare string is treated as one text rather than being split
            into characters.
        input_type: Value forwarded as ``input_type`` to ``co.embed``. Defaults
            to ``"search_document"``, the value this notebook uses for passages
            that will be searched over. Pass a different value (for example
            ``"search_query"``; see the Cohere embed documentation) when
            embedding text that plays another role.

    Returns:
        A ``torch.Tensor`` built from ``response.embeddings`` (expected to hold
        one vector per input text). An empty tensor is returned, without
        calling the API, when there are no texts.
    """
    # A bare string is iterable; wrap it so it is embedded as a single text.
    if isinstance(sentences, str):
        texts = [sentences]
    else:
        texts = list(sentences)

    # Nothing to embed: skip the network round-trip entirely.
    if not texts:
        return torch.tensor([])

    response = co.embed(texts=texts,
                        model=model_name,
                        input_type=input_type)
    return torch.tensor(response.embeddings)

In [7]:
# Embed each text column with every configured Cohere model and write the
# resulting tensor to disk, one file per (column, model) pair.
embeddings_dir = '../data/embeddings'

# torch.save does not create missing parent directories, so make sure the
# target folder exists before any embedding requests are issued.
os.makedirs(embeddings_dir, exist_ok=True)

for model in cohere_models:
    # Keep only the last "/"-separated segment of the model name so the file
    # name never contains a path separator.
    model_short_name = model.split('/')[-1]
    for column_name in ['question', 'evidence', 'SQL']:
        embeddings = get_embeddings_cohere(model, data[column_name].tolist())
        torch.save(embeddings, f'{embeddings_dir}/emb_{column_name}_{model_short_name}.pt')
        print(f'emb_{column_name}_{model_short_name}.pt have been saved')

emb_question_embed-english-v3.0.pt have been saved
emb_evidence_embed-english-v3.0.pt have been saved
emb_SQL_embed-english-v3.0.pt have been saved
