In [9]:
import os
import config
import tools.rag_functions as rf

In [10]:
models_path = "models/models_test.json"
documents_path = "dataset/texts"
qa_eval_set_path = "dataset/qa/qa_eval_set.json"

In [11]:
embedding_models = rf.list_models(models_path, models_type="embedding_models")
generative_models = rf.list_models(models_path, models_type="generative_models")

In [12]:
qa_eval_set = rf.load_json(qa_eval_set_path)

In [13]:
texts = rf.load_texts(documents_path)


In [14]:
chunk_data = rf.chunk_texts(texts, chunk_size=128, overlap=10)
chunk_names, text_chunks = rf.dict_to_kv_lists(chunk_data) 

In [15]:
import chromadb
from chromadb import Settings

class Vectorizator:
    def __init__(self, documents, ids):
        self.documents = documents
        self.ids = ids
        self.client = chromadb.PersistentClient(
            path="chroma_data/",
            settings=Settings(allow_reset=True)
        )
        
    def get_or_load_model_collection(self, model_family, embedding_model_name):
        collection_name = f"collection_by_{embedding_model_name}"
        ef = rf.universal_ef(model_family, embedding_model_name)            
        try:
            self.load_collection(collection_name, ef)

        except Exception:
            print(f"Collection for this model doesn't exist, database will be populated instead.")
            self.populate_db(collection_name, ef)

    def populate_db(self, collection_name, ef):
        try:
            self.collection = self.client.create_collection(
                name=collection_name, 
                embedding_function=ef()
            )
            self.collection.upsert(
                documents=self.documents,
                ids=self.ids
            )
            print(f"Documents successfully embedded and saved to a collection.")

        except Exception as err:
            print(f"An unknown error occurred while saving:\n{err}")
            raise
             
    def load_collection(self, collection_name, ef):
        self.collection = self.client.get_collection(
            name=collection_name,
            embedding_function=ef()
        )
        print(f"Documents successfully loaded as a collection.")


    def clear(self, whole=False, embedding_model_name=None):
        if whole:
            self.client.reset()
        else:
            try:
                collection_name = f"collection_by_{embedding_model_name}"
                self.client.delete_collection(name=collection_name)
            except Exception as err:
                print(f"An error occurred:\n{err}")

            
    def get_results(self, question, n_results=3):
        outputs = self.collection.query(
            query_texts=[question], 
            n_results=n_results
        )
        return outputs

In [16]:
vectorizator = Vectorizator(
    documents=text_chunks,
    ids=chunk_names
)

In [17]:
from model_config.clients_api import clients

class GenerativeModelClient:
    def __init__(self, provider):
        for provider_name, client_function in clients:
            if provider == provider_name:
                client = client_function()
                client = clients[provider]()
        self.client = client
        return client

    def generate_answer(self, prompt, model_name):
        response = self.client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
        )
        self.answer = response.choices[0].message.content
        return self.answer

In [None]:
{
    "em1" : {
        "llama" : {
            {
                "question1" : answer_eval_results, 
                "question2" : answer_eval_results
            }
        },
        "gpt" : {
            {
                "question1" : answer_eval_results, 
                "question2" : answer_eval_results
            }
        }
    },
    "em2" : {
        "llama" : {
            {
                "question1" : answer_eval_results, 
                "question2" : answer_eval_results
            }
        },
        "gpt" : {
            {
                "question1" : answer_eval_results, 
                "question2" : answer_eval_results
            }
        }
    }
}


In [29]:
print(embedding_models)

{'sentence_transformers_embedding': ['all-MiniLM-L6-v2', 'paraphrase-multilingual-mpnet-base-v2']}


In [32]:
for model_family in embedding_models:
    for embedding_model_name in embedding_models[model_family]:
        print(embedding_model_name)

all-MiniLM-L6-v2
paraphrase-multilingual-mpnet-base-v2


In [33]:
from model_config.prompt_template import inject_prompt
from tests.similarity_calculating_test import SimilarityCalculator

all_combs_results = {}


for model_family in embedding_models:
    for embedding_model_name in embedding_models[model_family]:
        print(f"Working on {embedding_model_name}...")
        try:
            vectorizator.get_or_load_model_collection(model_family, embedding_model_name)
        except Exception as err:
            print(err)
            break # continue with next model

        all_combs_results[embedding_model_name] = {}
        for provider in generative_models:
            try:
                chatbot = GenerativeModelClient(provider)
            except Exception as err:
                print(err)
                break # continue with next provider, probably something wrong with API

            for generative_model_name in provider:
                
                all_combs_results[embedding_model_name][generative_model_name] = {}
                if generation_failed:
                        break
                for i in range(len(qa_eval_set)):
                    question = qa_eval_set[i]["question"]
                    expected_answer = qa_eval_set[i]["answer"]

                    outputs = vectorizator.get_results(question)
                    context = "\n\n".join(outputs["documents"][0])
                    prompt = inject_prompt(context, question)
                    try:
                        generated_answer = chatbot.generate_answer(prompt, generative_model_name)
                    except Exception as err:
                        print(err)
                        generation_failed = True
                        # delete the generative_model_name from the dict
                        break
                    
                    similaritator = SimilarityCalculator(expected_answer, generated_answer)
                    answer_eval_results = similaritator.compare_texts()
                    all_combs_results[embedding_model_name][generative_model_name][question] = answer_eval_results
            
        

Working on sentence_transformers_embedding...
Collection for this model doesn't exist, database will be populated instead.
Documents successfully embedded and saved to a collection.
too many values to unpack (expected 2)
Working on sentence_transformers_embedding...
Documents successfully loaded as a collection.
too many values to unpack (expected 2)
