In [1]:
import os
import config
import tools.rag_functions as rf

In [2]:
models_path = "models/models_test.json"
documents_path = "dataset/texts"
qa_eval_set_path = "dataset/qa/qa_eval_set.json"

In [3]:
embedding_models = rf.list_models(models_path, models_type="embedding_models")
generative_models = rf.list_models(models_path, models_type="generative_models")

In [4]:
qa_eval_set = rf.load_json(qa_eval_set_path)

In [5]:
texts = rf.load_texts(documents_path)


In [6]:
chunk_data = rf.chunk_texts(texts, chunk_size=128, overlap=10)
chunk_names, text_chunks = rf.dict_to_kv_lists(chunk_data) 

In [7]:
import chromadb
from chromadb import Settings

class Vectorizator:
    def __init__(self, documents, ids):
        self.documents = documents
        self.ids = ids
        self.client = chromadb.PersistentClient(
            path="chroma_data/",
            settings=Settings(allow_reset=True)
        )
        
    def get_or_load_model_collection(self, model_family, embedding_model_name):
        collection_name = f"collection_by_{embedding_model_name}"
        ef = rf.universal_ef(model_family, embedding_model_name)            
        try:
            self.load_collection(collection_name, ef)

        except Exception:
            print(f"Collection for this model doesn't exist, database will be populated instead.")
            self.populate_db(collection_name, ef)

    def populate_db(self, collection_name, ef):
        try:
            self.collection = self.client.create_collection(
                name=collection_name, 
                embedding_function=ef()
            )
            self.collection.upsert(
                documents=self.documents,
                ids=self.ids
            )
            print(f"Documents successfully embedded and saved to a collection.")

        except Exception as err:
            print(f"An unknown error occurred while saving:\n{err}")
            raise
             
    def load_collection(self, collection_name, ef):
        self.collection = self.client.get_collection(
            name=collection_name,
            embedding_function=ef()
        )
        print(f"Documents successfully loaded as a collection.")


    def clear(self, whole=False, embedding_model_name=None):
        if whole:
            self.client.reset()
        else:
            try:
                collection_name = f"collection_by_{embedding_model_name}"
                self.client.delete_collection(name=collection_name)
            except Exception as err:
                print(f"An error occurred:\n{err}")

            
    def get_results(self, question, n_results=3):
        outputs = self.collection.query(
            query_texts=[question], 
            n_results=n_results
        )
        return outputs

In [8]:
vectorizator = Vectorizator(
    documents=text_chunks,
    ids=chunk_names
)

In [20]:
from model_config.clients_api import clients

class GenerativeModelClient:
    def __init__(self, provider):
        for provider_name, client_function in clients.items():
            if provider == provider_name:
                print("Provider found")
                client = client_function()
                print("Client created")
                break
        else:
            raise ValueError(f"Unknown provider: {provider}")
        
        self.client = client

    def generate_answer(self, prompt, model_name):
        response = self.client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
        )
        self.answer = response.choices[0].message.content
        return self.answer

In [None]:
{
    "em1" : {
        "llama" : {
            {
                "question1" : answer_eval_results, 
                "question2" : answer_eval_results
            }
        },
        "gpt" : {
            {
                "question1" : answer_eval_results, 
                "question2" : answer_eval_results
            }
        }
    },
    "em2" : {
        "llama" : {
            {
                "question1" : answer_eval_results, 
                "question2" : answer_eval_results
            }
        },
        "gpt" : {
            {
                "question1" : answer_eval_results, 
                "question2" : answer_eval_results
            }
        }
    }
}


In [40]:
from model_config.prompt_template import inject_prompt
from tests.similarity_calculating_test import SimilarityCalculator

all_combs_results = {}


for model_family in embedding_models:
    for embedding_model_name in embedding_models[model_family]:

        all_combs_results[embedding_model_name] = {}
        for provider in generative_models.keys():
            print(provider)
            

            for generative_model_name in generative_models[provider]:
                print(generative_model_name)
                
                all_combs_results[embedding_model_name][generative_model_name] = {}
             
                for i in range(len(qa_eval_set)):
                    
                    all_combs_results[embedding_model_name][generative_model_name]["a"] = "b"
           

all_combs_results      
        

together
meta-llama/Llama-3.3-70B-Instruct-Turbo-Free
together
meta-llama/Llama-3.3-70B-Instruct-Turbo-Free


{'all-MiniLM-L6-v2': {'meta-llama/Llama-3.3-70B-Instruct-Turbo-Free': {'a': 'b'}},
 'paraphrase-multilingual-mpnet-base-v2': {'meta-llama/Llama-3.3-70B-Instruct-Turbo-Free': {'a': 'b'}}}

In [30]:
embedding_models

{'sentence_transformers_embedding': ['all-MiniLM-L6-v2',
  'paraphrase-multilingual-mpnet-base-v2']}

In [31]:
generative_models

{'together': ['meta-llama/Llama-3.3-70B-Instruct-Turbo-Free']}

In [43]:
from model_config.prompt_template import inject_prompt
from tests.similarity_calculating_test import SimilarityCalculator

all_combs_results = {}


for model_family in embedding_models:
    for embedding_model_name in embedding_models[model_family]:
        print(f"Working on {embedding_model_name}...")
        try:
            vectorizator.get_or_load_model_collection(model_family, embedding_model_name)
            print("Collection done.")
        except Exception as err:
            print(err)
            break # continue with next model

        all_combs_results[embedding_model_name] = {}
        for provider in generative_models.keys():
            try:
                chatbot = GenerativeModelClient(provider)
                print("API success...")
            except Exception as err:
                print(f"{err}\n – something went wrong while initializing the client, trying next provider...")
                continue

            for generative_model_name in generative_models[provider]:
                
                all_combs_results[embedding_model_name][generative_model_name] = {}
                print("Assigned generative model...")
             
                for i in range(len(qa_eval_set)):
                    question = qa_eval_set[i]["question"]
                    expected_answer = qa_eval_set[i]["answer"]

                    outputs = vectorizator.get_results(question)
                    context = "\n\n".join(outputs["documents"][0])
                    prompt = inject_prompt(context, question)
                    try:
                        generated_answer = chatbot.generate_answer(prompt, generative_model_name)
                    except Exception as err:
                        print(f"{err}\n – generation failed with {generative_model_name} – skipping rest of this provider")
                        generation_failed = True
                        break
                    
                    similaritator = SimilarityCalculator(expected_answer, generated_answer)
                    answer_eval_results = similaritator.compare_texts()
                    all_combs_results[embedding_model_name][generative_model_name][question] = answer_eval_results
                    print(all_combs_results)
                
                if generation_failed:
                    break
            
        

Working on all-MiniLM-L6-v2...
Documents successfully loaded as a collection.
Collection done.
Provider found
Client created
API success...
Assigned generative model...
{'all-MiniLM-L6-v2': {'meta-llama/Llama-3.3-70B-Instruct-Turbo-Free': {'Jaký byl výsledek měření teploty na povrchu objektu na okraji Sluneční soustavy a co to naznačuje?': {'cosine': array([[0.54393011]]), 'levenshtein': 0.3120879120879121, 'dice': 0.3, 'word_difference': 0.8235294117647058}}}}
{'all-MiniLM-L6-v2': {'meta-llama/Llama-3.3-70B-Instruct-Turbo-Free': {'Jaký byl výsledek měření teploty na povrchu objektu na okraji Sluneční soustavy a co to naznačuje?': {'cosine': array([[0.54393011]]), 'levenshtein': 0.3120879120879121, 'dice': 0.3, 'word_difference': 0.8235294117647058}, 'Existují nějaké známé přírodní jevy, které by tajemný signál u Proximy Centauri mohly způsobit?': {'cosine': array([[0.936775]]), 'levenshtein': 0.5466666666666666, 'dice': 0.8, 'word_difference': 0.33333333333333337}}}}
{'all-MiniLM-L6-v

In [48]:
all_combs_results

{'all-MiniLM-L6-v2': {'meta-llama/Llama-3.3-70B-Instruct-Turbo-Free': {'Jaký byl výsledek měření teploty na povrchu objektu na okraji Sluneční soustavy a co to naznačuje?': {'cosine': array([[0.54393011]]),
    'levenshtein': 0.3120879120879121,
    'dice': 0.3,
    'word_difference': 0.8235294117647058},
   'Existují nějaké známé přírodní jevy, které by tajemný signál u Proximy Centauri mohly způsobit?': {'cosine': array([[0.936775]]),
    'levenshtein': 0.5466666666666666,
    'dice': 0.8,
    'word_difference': 0.33333333333333337},
   'Jaké chemické sloučeniny byly na povrchu jednoho z měsíců Sluneční soustavy detekovány?': {'cosine': array([[0.54955892]]),
    'levenshtein': 0.351931330472103,
    'dice': 0.3333333333333333,
    'word_difference': 0.8},
   'Jakým teleskopem budou vědci zkoumat podrobněji světelné impulzy vycházející z oblasti černé díry v galaxii M87?': {'cosine': array([[0.65362689]]),
    'levenshtein': 0.3457943925233645,
    'dice': 0.2033898305084746,
    'wo

In [45]:
rf.save_json(all_combs_results, "tests/test_results.json") 

TypeError: Object of type ndarray is not JSON serializable