In [11]:
import os
import config
import tools.rag_functions as rf

In [12]:
# Relative input locations consumed by the loading cells below.
models_path = "models/models_test.json"  # read by rf.list_models
documents_path = "dataset/texts"  # read by rf.load_texts
qa_eval_set_path = "dataset/qa/qa_eval_set.json"  # read by rf.load_json

In [13]:
# Read the embedding and generative model groups from the same models file.
# NOTE(review): the return shape of rf.list_models is not visible here; the
# evaluation loop iterates over both results, so confirm what iteration yields.
embedding_models = rf.list_models(models_path, models_type="embedding_models")
generative_models = rf.list_models(models_path, models_type="generative_models")

In [14]:
# Evaluation set; the evaluation loop reads the "question" and "answer" keys
# of each entry.
qa_eval_set = rf.load_json(qa_eval_set_path)

In [15]:
# Source documents to be chunked and embedded.
# NOTE(review): presumably one entry per file under documents_path — confirm
# against rf.load_texts.
texts = rf.load_texts(documents_path)


In [16]:
# Split the documents into chunks, then separate the result into two parallel
# lists: chunk_names become the collection ids and text_chunks the documents
# when the Vectorizator is built.
# NOTE(review): the units of chunk_size/overlap (characters, words, tokens)
# are defined by rf.chunk_texts — confirm there.
chunk_data = rf.chunk_texts(texts, chunk_size=128, overlap=10)
chunk_names, text_chunks = rf.dict_to_kv_lists(chunk_data) 

In [17]:
import chromadb
from chromadb import Settings

class Vectorizator:
    """Keep text chunks in a persistent Chroma database, one collection per embedding model.

    Attributes:
        documents: chunk texts that get embedded when a collection is populated.
        ids: chunk identifiers, parallel to ``documents``.
        client: persistent Chroma client storing its data under ``chroma_data/``.
        collection: the currently active collection, or ``None`` until one has
            been loaded or populated.
    """

    def __init__(self, documents, ids):
        """Remember the chunks and open the persistent Chroma client."""
        self.documents = documents
        self.ids = ids
        # Nothing is active until a collection is loaded or populated;
        # get_results checks this instead of failing with AttributeError.
        self.collection = None
        self.client = chromadb.PersistentClient(
            path="chroma_data/",
            settings=Settings(allow_reset=True)
        )

    @staticmethod
    def _collection_name(embedding_model_name):
        """Return the collection name used for the given embedding model."""
        return f"collection_by_{embedding_model_name}"

    def get_or_load_model_collection(self, model_family, embedding_model_name):
        """Activate the collection for an embedding model, creating it if needed.

        Tries to load the existing collection first; if loading fails for any
        reason the database is populated from ``self.documents`` instead.
        """
        collection_name = self._collection_name(embedding_model_name)
        ef = rf.universal_ef(model_family, embedding_model_name)
        try:
            self.load_collection(collection_name, ef)

        except Exception as err:
            # Kept broad on purpose: any load failure falls back to populating.
            # The cause is printed so a real error (e.g. a failing embedding
            # function) is not mistaken for a missing collection.
            print("Collection for this model doesn't exist, database will be populated instead.")
            print(f"(load failed with: {err})")
            self.populate_db(collection_name, ef)

    def populate_db(self, collection_name, ef):
        """Create ``collection_name`` and embed all documents into it.

        Args:
            collection_name: name of the collection to create.
            ef: zero-argument callable returning the embedding function.

        Raises:
            Exception: whatever the client raises; the error is printed and
                re-raised.
        """
        try:
            collection = self.client.create_collection(
                name=collection_name,
                embedding_function=ef()
            )
            try:
                collection.upsert(
                    documents=self.documents,
                    ids=self.ids
                )
            except Exception:
                # Do not leave an empty collection behind: the next
                # load_collection would succeed on it and every query would
                # silently return nothing.
                self.client.delete_collection(name=collection_name)
                raise
            # Only expose the collection once it actually holds the documents.
            self.collection = collection
            print("Documents successfully embedded and saved to a collection.")

        except Exception as err:
            print(f"An unknown error occurred while saving:\n{err}")
            raise

    def load_collection(self, collection_name, ef):
        """Load an existing collection and make it the active one.

        Args:
            collection_name: name of the collection to load.
            ef: zero-argument callable returning the embedding function.
        """
        self.collection = self.client.get_collection(
            name=collection_name,
            embedding_function=ef()
        )
        print("Documents successfully loaded as a collection.")

    def clear(self, whole=False, embedding_model_name=None):
        """Delete stored data.

        Args:
            whole: when true, reset the entire database.
            embedding_model_name: model whose collection is deleted when
                ``whole`` is false.

        Errors while deleting a single collection are printed, not raised.
        """
        if whole:
            self.client.reset()
            # The previously active collection no longer exists.
            self.collection = None
            return

        if embedding_model_name is None:
            # Without a model name the call would target "collection_by_None".
            print("An error occurred:\nembedding_model_name is required unless whole=True")
            return

        try:
            self.client.delete_collection(
                name=self._collection_name(embedding_model_name)
            )
        except Exception as err:
            print(f"An error occurred:\n{err}")

    def get_results(self, question, n_results=3):
        """Query the active collection for the chunks closest to ``question``.

        Args:
            question: query text.
            n_results: number of results to request.

        Returns:
            The raw result of the collection's ``query`` call.

        Raises:
            RuntimeError: if no collection has been loaded or populated yet.
        """
        collection = getattr(self, "collection", None)
        if collection is None:
            raise RuntimeError(
                "No collection is loaded; call get_or_load_model_collection() first."
            )
        return collection.query(
            query_texts=[question],
            n_results=n_results
        )

In [18]:
# Build the vector store over the chunk texts, keyed by their chunk names.
vectorizator = Vectorizator(documents=text_chunks, ids=chunk_names)

In [19]:
from model_config.clients_api import clients

class GenerativeModelClient:
    """Thin wrapper around one provider's chat-completions client.

    Attributes:
        provider: the provider key this client was built for.
        client: object returned by the provider's factory from ``clients``.
        answer: text of the most recent answer, or ``None`` before the first call.
    """

    def __init__(self, provider):
        """Build the API client registered for ``provider``.

        Args:
            provider: key identifying the provider in ``clients``.

        Raises:
            ValueError: if ``provider`` has no entry in ``clients``.
        """
        # dict() accepts both a mapping and an iterable of (name, factory)
        # pairs, so either shape of ``clients`` works.
        client_factories = dict(clients)
        if provider not in client_factories:
            known = ", ".join(map(str, client_factories))
            raise ValueError(
                f"Unknown provider {provider!r}; available providers: {known}"
            )
        self.provider = provider
        # Call the factory exactly once.
        self.client = client_factories[provider]()
        self.answer = None
        # __init__ must not return a value; doing so raises TypeError.

    def generate_answer(self, prompt, model_name):
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: full prompt text.
            model_name: model identifier passed to the provider.

        Returns:
            The content of the first choice's message; also stored in
            ``self.answer``.
        """
        response = self.client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
        )
        self.answer = response.choices[0].message.content
        return self.answer

In [24]:

from sklearn.metrics.pairwise import cosine_similarity
import Levenshtein

class SimilarityCalculator:
    """Distance measures for comparing an expected answer with a generated one.

    Both implemented measures are distances: 0 means the texts are identical
    under that measure, larger values mean less similar.
    """

    def __init__(self):
        pass

    def levenshtein_distance(self, x, y):
        """Return the Levenshtein distance divided by the longer text's length.

        Two empty strings are identical, so their distance is 0.0 (this also
        avoids dividing by zero).
        """
        longest = max(len(x), len(y))
        if longest == 0:
            return 0.0
        return Levenshtein.distance(x, y) / longest

    def cosine_disimilarity(self, x, y):
        """Return ``1 - cosine similarity`` of the two texts' embeddings as a float.

        NOTE(review): assumes rf.vectorize_text returns one embedding per text,
        either as a flat vector or as a single-row 2-D array — confirm.
        """
        import numpy as np

        # sklearn's cosine_similarity requires 2-D input; atleast_2d turns a
        # flat vector into one row and leaves an existing 2-D array untouched.
        vec_x = np.atleast_2d(rf.vectorize_text(x))
        vec_y = np.atleast_2d(rf.vectorize_text(y))
        return float(1 - cosine_similarity(vec_x, vec_y)[0, 0])

    def bow(self, x, y):
        """Placeholder for a bag-of-words comparison; not implemented, returns None."""
        pass

    def compare_texts(self, text1, text2):
        """Compare two texts with every implemented measure.

        Returns:
            dict with keys ``"cosine"`` and ``"levenshtein"``.
        """
        cosine = self.cosine_disimilarity(text1, text2)
        levenshtein = self.levenshtein_distance(text1, text2)
        return {"cosine" : cosine, "levenshtein" : levenshtein}


In [25]:
# Shared calculator instance used by the demo cell and the evaluation loop.
similaritator = SimilarityCalculator()


In [30]:
# Two Czech sample texts for trying out the similarity measures.
# text1 is single-quoted so the double quotes around "Kus Zdi" stay in the
# string; written as ""Kus Zdi"" inside a double-quoted literal they were
# parsed as adjacent string literals and the quotes were silently dropped.
text1 = 'Několik faktů naznačuje, že "Kus Zdi" by mohl být umělého původu: jeho podivný geometrický tvar připomínající stavební blok, neobvyklý pohyb, který není v souladu s gravitačními interakcemi, přítomnost velmi vzácných prvků, které se dosud nikdy nenašly v přirozených tělesech Sluneční soustavy, a povrch, který odráží světlo neobvyklým způsobem, podobně jako syntetické materiály.'
text2 = "Černá díra v galaxii T57 se chová jinak než všechny známé černé díry, protože namísto typického vyzařování rentgenového záření z okolního horkého plynu vyzařuje tajemné elektromagnetické vlny, které naznačují, že černá díra vyzařuje teplo způsobem, který by mohl souviset s neznámým zdrojem energie."

In [29]:
# Normalized edit distance between the two sample texts: the raw Levenshtein
# distance divided by the length of the longer text.
similaritator.levenshtein_distance(text1, text2)

0.0

In [94]:
from model_config.prompt_template import inject_prompt

In [None]:


# Embed a short Czech sample sentence and show the resulting vector.
# The bare name vectorize_text is not defined by any visible cell; the helper
# is reached through tools.rag_functions (imported as rf), as elsewhere in
# this notebook.
embeddings = rf.vectorize_text("ahoj jak se máš")

print(embeddings)


[-0.029941560700535774, -0.008570948615670204, -0.0069284201599657536, -0.00587124191224575, -0.003542256774380803, 0.018035888671875, -0.03595825284719467, 0.027344733476638794, 0.011309679597616196, 0.025102663785219193, 0.00022150191944092512, -0.029175283387303352, -0.028451576828956604, 0.024265434592962265, -0.0028522529173642397, 0.008272952400147915, 0.013523368164896965, -0.0010137205244973302, -0.03715023770928383, -0.03561768680810928, 0.007655673660337925, -0.018092649057507515, -0.02480466663837433, 0.013303418643772602, -0.003714314429089427, 0.003934264183044434, -0.02693321369588375, 0.025031711906194687, -0.021924033761024475, -0.005917360540479422, 0.012962850742042065, 0.015197825618088245, 0.01975291594862938, 0.007300916127860546, 0.06482134759426117, -0.0027422779239714146, 0.014885637909173965, 0.008890231139957905, -0.0038739554584026337, 0.02419448271393776, 0.011941147968173027, 0.011990814469754696, -0.038938216865062714, 0.024180293083190918, 0.0038278368301

In [None]:
# Evaluate every (embedding model, generative model) combination on the QA set
# and keep the similarity scores instead of discarding them.
# NOTE(review): assumes rf.list_models returns a mapping of
# {family or provider: [model names]} — confirm against tools/rag_functions.
# Iterating the family/provider key itself would walk over its characters.
evaluation_results = []

for model_family, embedding_model_names in embedding_models.items():

    for embedding_model_name in embedding_model_names:
        # Switch the vector store to the collection built with this embedding model.
        vectorizator.get_or_load_model_collection(model_family, embedding_model_name)

        for provider, generative_model_names in generative_models.items():
            # One API client per provider, reused for all of its models; the
            # model name is passed per request, not to the constructor.
            chatbot = GenerativeModelClient(provider)

            for generative_model_name in generative_model_names:

                for qa_pair in qa_eval_set:
                    question = qa_pair["question"]
                    expected_answer = qa_pair["answer"]

                    outputs = vectorizator.get_results(question)
                    # outputs["documents"][0] holds the retrieved chunks for
                    # the single query text that was sent.
                    context = "\n\n".join(outputs["documents"][0])
                    prompt = inject_prompt(context, question)
                    generated_answer = chatbot.generate_answer(prompt, generative_model_name)
                    scores = similaritator.compare_texts(expected_answer, generated_answer)

                    evaluation_results.append(
                        {
                            "embedding_model": embedding_model_name,
                            "provider": provider,
                            "generative_model": generative_model_name,
                            "question": question,
                            "expected_answer": expected_answer,
                            "generated_answer": generated_answer,
                            "scores": scores,
                        }
                    )
            
        