Load libraries.

In [3]:
import rag_functions as rf

In [None]:
import os
import chromadb
from chromadb.utils import embedding_functions
import google.auth
from chromadb.api.types import (
    Documents,
    EmbeddingFunction,
)
from vertexai.language_models import TextEmbeddingModel
import vertexai


Load all data: the documents, the questions, and the list of right answers. Segment the documents into chunks.

In [None]:
# Folder holding the source documents, read with rf.load_texts.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATA_FOLDER_PATH = "/Users/sladkydrevo/opt/baka/dataset/texts"
texts = rf.load_texts(DATA_FOLDER_PATH)

In [None]:
# Segment the loaded texts into overlapping chunks.
# NOTE(review): the unit of chunk_size/overlap (tokens, words, characters) is
# defined inside rf.chunk_texts — confirm there.
chunk_counts, chunk_data = rf.chunk_texts(texts, chunk_size=128, overlap=10)

In [None]:
# Split chunk_data into chunk names and chunk texts; later these are passed to
# the tester as Chroma ids and documents respectively.
chunk_names, text_chunks = rf.split_dict_data(chunk_data) 

In [None]:
# Folder holding the question files, read with the same loader as the documents.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
QUESTIONS_FOLDER_PATH = "/Users/sladkydrevo/opt/baka/dataset/questions"
questions_data = rf.load_texts(QUESTIONS_FOLDER_PATH)

In [None]:
# Convert the raw loaded question data into the structure used downstream
# (presumably a dict keyed by question name — confirm in rf.convert_questions_dict).
questions = rf.convert_questions_dict(questions_data)

In [None]:
# Split the questions into names and texts; the texts are later used as the
# query texts sent to each Chroma collection.
question_names, question_texts = rf.split_dict_data(questions) 

In [None]:
# File listing the expected (right) answers that retrieval results are ranked against.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
right_answers_path = "/Users/sladkydrevo/opt/baka/right_answers.txt"
right_answers = rf.load_right_answers(right_answers_path)

In [None]:
# Path of the CSV file that collects the per-model results.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
results_path = "/Users/sladkydrevo/opt/baka/MODELS_RESULTS.csv"

Declare a class for testing sentence transformers and other embedding models.

In [None]:
class ChromaModelTester:
    """Benchmark embedding models on one fixed retrieval task using ChromaDB.

    For every model the tester rebuilds a collection from ``documents``, queries
    it with ``query_texts``, scores the retrieved chunks against
    ``right_answers`` with helpers from ``rf``, appends the scores to the CSV at
    ``results_path`` and keeps the intermediate data in ``self.results``.

    Args:
        col_name: Name of the Chroma collection that is recreated for each model.
        documents: Chunk texts to embed and store.
        ids: Ids of the chunks, passed together with ``documents`` to ``upsert``.
        query_texts: Question texts used as queries.
        question_names: Names of the questions, passed together with
            ``query_texts`` to ``rf.get_top_n_from_db``.
        right_answers: Expected answers passed to ``rf.get_rank_table``.
        results_path: CSV file the scores are written to.
        n_results: Number of hits requested per query.
        questions: Optional question collection passed to
            ``rf.get_top_accuracies``. When omitted, the notebook-level
            ``questions`` variable is used, as before.
    """

    def __init__(self, col_name, documents, ids, query_texts, question_names, right_answers, results_path, n_results=5, questions=None):
        self.col_name = col_name
        self.documents = documents
        self.ids = ids
        self.query_texts = query_texts
        self.question_names = question_names
        self.right_answers = right_answers
        self.results_path = results_path
        self.n_results = n_results
        self.questions = questions
        self.chroma_client = chromadb.Client()
        # model_name -> dict with the raw outputs and every derived score.
        self.results = {}

    def _test_model(self, model_name, ef):
        """Embed, query and score the dataset with one embedding function.

        Args:
            model_name: Label used in log messages, the CSV and ``self.results``.
            ef: Embedding function handed to the Chroma collection.
        """
        # Start from an empty collection so embeddings of the previous model do
        # not leak into this run. A ValueError from the delete is ignored
        # (presumably raised when the collection does not exist yet — confirm
        # for the installed chromadb version).
        try:
            self.chroma_client.delete_collection(self.col_name)
        except ValueError:
            pass

        print(f"Testing model {model_name}...")
        collection = self.chroma_client.create_collection(name=self.col_name, embedding_function=ef)
        collection.upsert(
            documents=self.documents,
            ids=self.ids
        )
        print("Documents embedded and inserted to the collection.")

        outputs = collection.query(
            query_texts=self.query_texts,
            n_results=self.n_results
        )

        # Use the data this instance was built with instead of notebook globals,
        # so the scoring always matches the queries that were actually sent.
        answers = rf.get_top_n_from_db(outputs, self.question_names, self.query_texts, n=self.n_results)
        rank_table = rf.get_rank_table(answers, self.right_answers, self.n_results)
        match_results = rf.get_match_count(rank_table)
        # Backward-compatible fallback: callers that do not pass ``questions``
        # keep relying on the notebook-level variable of that name.
        scoring_questions = self.questions if self.questions is not None else questions
        tops = rf.get_top_accuracies(match_results, scoring_questions)
        rf.write_to_csv_top_5(self.results_path, results=tops, model_name=model_name)

        self.results[model_name] = {
            "model_name" : model_name,
            "outputs": outputs,
            "answers": answers,
            "match_results": match_results,
            "tops": tops
        }

    def test_sentence_transformers(self, sentence_transformers_list):
        """Test every sentence-transformers model named in the given list.

        A failure of one model is printed and does not stop the remaining ones.
        """
        for model_name in sentence_transformers_list:
            try:
                ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)
                self._test_model(model_name, ef)
            except Exception as e:
                print(f"Failed to test {model_name}: {e}")

    def test_api_models(self, efs_dict):
        """Test every model in ``efs_dict``.

        Args:
            efs_dict: Maps a model label to a zero-argument callable that
                returns the embedding function; it is called right before the
                model is tested. A failure of one model is printed and does not
                stop the remaining ones.
        """
        for model_name, ef in efs_dict.items():
            try:
                self._test_model(model_name, ef())
            except Exception as e:
                print(f"Failed to test {model_name}: {e}")
        

List sentence transformers for testing.

In [None]:
# Model names handed one by one to SentenceTransformerEmbeddingFunction by the tester.
# NOTE(review): the list mixes checkpoints with and without "multilingual" in
# their name — confirm all of them suit the dataset language.
sentence_transformers = [
    "all-MiniLM-L6-v2",
    "distiluse-base-multilingual-cased-v1",
    "distiluse-base-multilingual-cased-v2",
    "multi-qa-MiniLM-L6-cos-v1",
    "multi-qa-distilbert-cos-v1",
    "multi-qa-mpnet-base-dot-v1",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-multilingual-mpnet-base-v2"
]

Define embedding functions for other models and put them into a dictionary.

In [None]:
class VertexAIEmbeddingFunction(EmbeddingFunction[Documents]):
    """Chroma embedding function backed by a Vertex AI text embedding model.

    Args:
        project_id: Google Cloud project used both as quota project for the
            default credentials and for ``vertexai.init``.
        location: Vertex AI location passed to ``vertexai.init``.
        model_name: Name given to ``TextEmbeddingModel.from_pretrained``.
        dimensionality: Stored on the instance; it is not used anywhere in this
            class.
    """

    def __init__(self, project_id, location, model_name, dimensionality=3072):
        credentials, _default_project = google.auth.default(quota_project_id=project_id)
        vertexai.init(project=project_id, location=location, credentials=credentials)

        self.model = TextEmbeddingModel.from_pretrained(model_name)
        self.dimensionality = dimensionality

    def __call__(self, texts):
        """Return one embedding per text, requesting each text separately."""
        # One request per text: every call sends a single-element list and the
        # first (only) result's values are kept.
        return [self.model.get_embeddings([text])[0].values for text in texts]

In [None]:
def cohere_embedding(model_name):
    """Build Chroma's Cohere embedding function for ``model_name``.

    The API key is read from the ``COHERE_API_KEY`` environment variable; a
    missing variable raises ``KeyError``.
    """
    return embedding_functions.CohereEmbeddingFunction(
        model_name=model_name,
        api_key=os.environ["COHERE_API_KEY"],
    )

In [None]:
def vertex_embedding(model_name, project_id="nodal-vigil-455211-t6", location="us-central1"):
    """Build a ``VertexAIEmbeddingFunction`` for ``model_name``.

    Args:
        model_name: Vertex AI embedding model name.
        project_id: Google Cloud project to use. Defaults to the project this
            notebook was originally run against.
        location: Vertex AI location. Defaults to ``"us-central1"``.

    Returns:
        The constructed embedding function.
    """
    return VertexAIEmbeddingFunction(
        project_id=project_id,
        location=location,
        model_name=model_name,
    )

In [None]:
def genai_embedding(model_name):
    """Build Chroma's Google Generative AI embedding function for ``model_name``.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable; a
    missing variable raises ``KeyError``.
    """
    return embedding_functions.GoogleGenerativeAiEmbeddingFunction(
        model_name=model_name,
        api_key=os.environ["GOOGLE_API_KEY"],
    )

In [None]:
def jina_embedding(model_name):
    """Build Chroma's Jina embedding function for ``model_name``.

    The API key is read from the ``JINA_API_KEY`` environment variable; a
    missing variable raises ``KeyError``.
    """
    return embedding_functions.JinaEmbeddingFunction(
        model_name=model_name,
        api_key=os.environ["JINA_API_KEY"],
    )

In [None]:
def ollama_embedding(model_name, url="http://127.0.0.1:11434/api/embeddings"):
    """Build Chroma's Ollama embedding function for ``model_name``.

    Args:
        model_name: Name of the model served by Ollama.
        url: Embeddings endpoint of the Ollama server. Defaults to the local
            endpoint this notebook originally used.

    Returns:
        The constructed embedding function.
    """
    return embedding_functions.OllamaEmbeddingFunction(
        url=url,
        model_name=model_name,
    )

In [None]:
def openai_embedding(model_name):
    """Build Chroma's OpenAI embedding function for ``model_name``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable; a
    missing variable raises ``KeyError``.
    """
    return embedding_functions.OpenAIEmbeddingFunction(
        model_name=model_name,
        api_key=os.environ["OPENAI_API_KEY"],
    )

In [None]:
# Maps a display label to a zero-argument factory. Each embedding function is
# only constructed when the tester calls the factory, so a missing API key or
# unreachable service affects just that one model.
# Commented-out entries are models that failed; the trailing note gives the reason.
embedding_models = {
    "MixedBreadAI: mxbai-embed-large" : lambda: ollama_embedding("mxbai-embed-large"),
    "Snowflake: snowflake-arctic-embed" : lambda: ollama_embedding("snowflake-arctic-embed"),
    "NomicAI: nomic-embed-text" : lambda: ollama_embedding("nomic-embed-text"),
    #"Llama: granite-embedding" : lambda: ollama_embedding("granite-embedding"), # list index out of range?
    "Cohere: embed-multilingual-v2.0" : lambda: cohere_embedding("embed-multilingual-v2.0"),    
    "Cohere: embed-multilingual-v3.0" : lambda: cohere_embedding("embed-multilingual-v3.0"),
    "Cohere: embed-multilingual-light-v3.0" : lambda: cohere_embedding("embed-multilingual-light-v3.0"),
    "Jina: jina-clip-v2" : lambda: jina_embedding("jina-clip-v2"),
    "Jina: jina-embeddings-v3" : lambda: jina_embedding("jina-embeddings-v3"),
    #"Jina: jina-colbert-v2" : lambda: jina_embedding("jina-colbert-v2"), # not available to use for /v1/embeddings
    #"Jina: jina-reranker-v2-base-multilingual" : lambda: jina_embedding("jina-reranker-v2-base-multilingual"), # not available to use for /v1/embeddings
    "Google: text-embedding-004" : lambda: genai_embedding("models/text-embedding-004"),
    "Google Vertex AI: text-multilingual-embedding-002" : lambda: vertex_embedding("text-multilingual-embedding-002"),
    #"Google Vertex AI: text-embedding-large-exp-03-07" : lambda: vertex_embedding("text-embedding-large-exp-03-07"), # the requests-per-minute quota does not fit
    "OpenAI: text-embedding-ada-002" : lambda: openai_embedding("text-embedding-ada-002"),
    "OpenAI: text-embedding-3-small" : lambda: openai_embedding("text-embedding-3-small"),
    "OpenAI: text-embedding-3-large" : lambda: openai_embedding("text-embedding-3-large"),
}

In [None]:
# Build one tester that is shared by all model runs below. The results file is
# taken from the `results_path` variable defined in the data-loading section
# instead of repeating the path literal here.
tester = ChromaModelTester(
    col_name="dataset",
    documents=text_chunks,
    ids=chunk_names,
    query_texts=question_texts,
    question_names=question_names,
    right_answers=right_answers,
    results_path=results_path,
)

Test multilingual sentence-transformer models.

In [None]:
# Run the benchmark for every listed sentence-transformers model; a model that
# fails is reported with a printed message and the loop continues.
tester.test_sentence_transformers(sentence_transformers)

Testing model all-MiniLM-L6-v2...
Documents embedded and inserted to the collection.
Results successfully recorded. Model name: all-MiniLM-L6-v2 Results: [0.15, 0.25, 0.3]


Test multilingual embedding models through their APIs.

In [None]:
# Run the benchmark for every API-backed model; each factory in the dictionary
# is called just before its model is tested, and failures are printed per model.
tester.test_api_models(embedding_models)