In [1]:
import os
import sys

# Root directory of the project. Set the RAG_PROJECT_ROOT environment variable
# to run this notebook on another machine; the original local path is kept as
# the fallback so existing setups behave exactly as before.
project_root = os.environ.get(
    'RAG_PROJECT_ROOT',
    '/Users/rodolfocacacho/Documents/Documents/MAI/Master Thesis/Code/rag_project',
)
# Switch the working directory to the project root.
os.chdir(project_root)
# Add the root directory to sys.path so the project packages imported below
# (config, utils, testing) can be resolved.
if project_root not in sys.path:
    sys.path.append(project_root)

from config import (CONFIG_SQL_DB,DB_NAME,
                    SQL_EVAL_QAS_TABLE_SCHEMA,
                    SQL_EVAL_QAS_TABLE, 
                    EMBEDDING_MODEL,EMBEDDING_MODEL_API,
                    EMBEDDING_MODEL_EMB_TASK,
                    TEST_RESULTS_TABLE,SQL_EVAL_CHUNKS_TABLE,
                    SQL_PROMPTS_TABLE)
from utils.MySQLDB_manager import MySQLDB
from testing.modules.evaluating_modules import RAGEvaluator
import json

# Identifier of the test run to evaluate; passed to RAGEvaluator as test_name.
TEST_NAME = "testGerman250"

# Database handle built from the project configuration.
# NOTE(review): presumably wraps a MySQL connection — confirm in utils.MySQLDB_manager.
sql_con = MySQLDB(CONFIG_SQL_DB,DB_NAME)

# Evaluator for the selected test run; all table names come from config.
ragEval = RAGEvaluator(sql_con=sql_con,
                       test_name=TEST_NAME,
                       test_table_name=TEST_RESULTS_TABLE,
                       qas_table_name=SQL_EVAL_QAS_TABLE,
                       chunks_eval_table_name=SQL_EVAL_CHUNKS_TABLE,
                       prompts_table_name=SQL_PROMPTS_TABLE)

# Results frame exposed by the evaluator; later cells pass it to evaluate_context.
df_results = ragEval.data_df

# Quick inspection of the frame: first rows, column names, and shape.
print(df_results.head(5))
print(df_results.columns)
print(df_results.shape)

# Print the report produced by the evaluator.
print(ragEval.generate_report())


   id_question  id_sample type_question  \
0            1          1       Factual   
1            2          1    Procedural   
2            3          1    Analytical   
3            6          2       Factual   
4            8          2    Procedural   

                                            question  \
0  Welche Dokumente müssen bei der Beantragung de...   
1  Welche Rollen spielen Fachunternehmer und Ener...   
2  Welche Voraussetzungen müssen für den Einkomme...   
3  Welche Anforderungen werden an Heizsysteme ges...   
4  Welche Änderungen wurden im Vergleich zur vorh...   

                                     expected_answer  clarity  specificity  \
0  Bei der Beantragung des Klimageschwindigkeits-...        5            5   
1  Für geförderte Maßnahmen müssen Fachunternehme...        5            5   
2  1. Die bonusberechtigten selbstnutzenden Eigen...        5            5   
3  1. Die Anforderungen an Heizsysteme im Rahmen ...        5            5   
4  Die neue Ri

In [56]:
def get_adj_ids(id, last_id, dif=1):
    """
    Get all adjacent IDs within a given difference from the original ID.

    IDs have the form "base.n", where n is an integer index. Only the text
    after the LAST dot is treated as the index, so a base that itself contains
    dots (e.g. "doc.v1.5") is handled correctly.

    Args:
        id (str): The current ID in the format "base.cid".
        last_id (str): The last ID in the format "base.lid"; lid is the
            highest index that may be returned.
        dif (int): The difference to consider for adjacent IDs. Values <= 0
            yield an empty list.

    Returns:
        list: Adjacent IDs "base.n" in ascending order of n, for every n in
        [cid - dif, cid + dif] with 0 <= n <= lid and n != cid. The original
        ID itself is never included.

    Raises:
        ValueError: If either ID has no '.' separator or its index part is
            not an integer.
    """
    # Split on the last dot only, so dots inside the base part are preserved.
    base, cid = id.rsplit('.', 1)
    _, lid = last_id.rsplit('.', 1)

    # Convert the index parts to integers
    cid = int(cid)
    lid = int(lid)

    # Clamp the window to the valid index range [0, lid] up front instead of
    # filtering every offset; the resulting IDs are the same.
    first = max(0, cid - dif)
    last = min(lid, cid + dif)

    return [f"{base}.{adj_cid}"
            for adj_cid in range(first, last + 1)
            if adj_cid != cid]


def evaluate_context(df):
    """
    Print how often the source chunk of each row shows up in its context lists.

    For every row the following columns are read:
        - 'id': ID of the original chunk, in the "base.n" format expected by
          get_adj_ids.
        - 'metadata': JSON string whose 'last_id' entry bounds the adjacent IDs.
        - 'context_ids': JSON-encoded collection of IDs (counted as "Used").
        - 'context_ids_total': JSON-encoded collection of IDs (counted as
          "Retrieved").

    Four rates (fraction of rows) are printed:
        - Used / Retrieved: the original chunk ID itself is in the list.
        - Used Ext / Retrieved Ext: the original chunk ID or one of its
          immediate neighbours (get_adj_ids with dif=1) is in the list.

    An empty frame prints "nan" for every rate instead of raising
    ZeroDivisionError.

    Args:
        df (pandas.DataFrame): Frame with the columns listed above.

    Returns:
        int: Always 0; the rates are only printed.
    """
    total_rows = len(df)

    used_count = 0
    retrieved_count = 0

    used_exp_count = 0
    retrieved_exp_count = 0

    for _, row in df.iterrows():
        metadata = json.loads(row['metadata'])
        end_chunk = metadata['last_id']
        context_ids = json.loads(row['context_ids'])
        total_context_ids = json.loads(row['context_ids_total'])
        original_chunk_id = row['id']

        # Accepted IDs for the "Ext" rates: the immediate neighbours plus the
        # original chunk itself.
        adj_context = get_adj_ids(original_chunk_id, end_chunk, dif=1)
        adj_context.append(original_chunk_id)

        # Booleans add up as 0/1, so each counter is a number of matching rows.
        used_count += original_chunk_id in context_ids
        retrieved_count += original_chunk_id in total_context_ids
        used_exp_count += any(item in adj_context for item in context_ids)
        retrieved_exp_count += any(item in adj_context for item in total_context_ids)

    if total_rows:
        used_rate = used_count / total_rows
        used_exp_rate = used_exp_count / total_rows
        retrieved_rate = retrieved_count / total_rows
        retrieved_exp_rate = retrieved_exp_count / total_rows
    else:
        # No rows: the rates are undefined, so report nan rather than dividing by zero.
        used_rate = used_exp_rate = retrieved_rate = retrieved_exp_rate = float('nan')

    print(f"Used: {used_rate} Used Ext: {used_exp_rate}\nRetrieved: {retrieved_rate} Retrieved Ext: {retrieved_exp_rate}")

    return 0


# Print the context hit rates for the loaded test run; evaluate_context only
# prints the rates and returns the constant 0, which is what `res` holds.
res = evaluate_context(df_results)
print(res)

Used: 0.13164556962025317 Used Ext: 0.2
Retrieved: 0.21772151898734177 Retrieved Ext: 0.29620253164556964
0


In [34]:
# Manual check of get_adj_ids: neighbours of "0.5" within +/-4, followed by
# the original ID itself at the end of the list.
adjies = get_adj_ids("0.5", "0.100", 4) + ['0.5']
print(adjies)

['0.1', '0.2', '0.3', '0.4', '0.6', '0.7', '0.8', '0.9', '0.5']


In [None]:
# Recorded evaluate_context output per test run (notes kept as comments so
# this cell remains valid Python when the notebook is run top to bottom).
#
# testJinaV3-250
# Used: 0.027848101265822784 Used Ext: 0.0379746835443038
# Retrieved: 0.0810126582278481 Retrieved Ext: 0.15443037974683543
#
# testJinaV3-500
# Used: 0.41265822784810124 Used Ext: 0.5341772151898734
# Retrieved: 0.579746835443038 Retrieved Ext: 0.6481012658227848
#
# testJinaDE-250
# Used: 0.13670886075949368 Used Ext: 0.18734177215189873
# Retrieved: 0.23037974683544304 Retrieved Ext: 0.3392405063291139
#
# testJinaDE-500
# Used: 0.2 Used Ext: 0.25569620253164554
# Retrieved: 0.2911392405063291 Retrieved Ext: 0.33164556962025316
#
# testGerman-250
# Used: 0.13164556962025317 Used Ext: 0.2
# Retrieved: 0.21772151898734177 Retrieved Ext: 0.29620253164556964

In [2]:
# RAG CONFIGURATIONS

# Token limit and suffix; both become part of INDEX_NAME below.
MAX_TOKENS = 500
SUFFIX = 'clean'

# Per-model settings, keyed by the model identifier ("org/name").
EMBEDDING_MODELS = {
    "jinaai/jina-embeddings-v2-base-de": {
        "dimension": 768,
        "embed_task": None,
        "retrieve_task": None,
        "api_usage": False,
        "instruction": None,
    },
    "jinaai/jina-embeddings-v3": {
        "dimension": 1024,
        "embed_task": "retrieval.passage",
        "retrieve_task": "retrieval.query",
        "api_usage": False,
        "instruction": None,
    },
    "aari1995/German_Semantic_V3": {
        "dimension": 1024,
        "embed_task": None,
        "retrieve_task": None,
        "api_usage": False,
        "instruction": None,
    },
    "intfloat/multilingual-e5-large-instruct": {
        "dimension": 1024,
        "embed_task": None,
        "retrieve_task": None,
        "api_usage": False,
        "instruction": "Given a query, retrieve relevant information from the available documents",
    },
}

# Model selected for this run; every derived constant below follows from it.
EMBEDDING_MODEL = "intfloat/multilingual-e5-large-instruct"

# Short model name: the part after "org/", lower-cased, with '_' turned into '-'.
EMBEDDING_MODEL_NAME = EMBEDDING_MODEL.split("/")[1].lower().replace('_', '-')

# Unpack the selected model's settings in one step.
(
    EMBEDDING_MODEL_DIM,
    EMBEDDING_MODEL_API,
    EMBEDDING_MODEL_RET_TASK,
    EMBEDDING_MODEL_EMB_TASK,
    EMBEDDING_MODEL_INSTRUCTION,
) = [
    EMBEDDING_MODELS[EMBEDDING_MODEL][setting]
    for setting in ("dimension", "api_usage", "retrieve_task", "embed_task", "instruction")
]

# Index name of the form "<model-name>-<suffix>-<max-tokens>".
INDEX_NAME = '-'.join([EMBEDDING_MODEL_NAME, SUFFIX, str(MAX_TOKENS)])

from utils.embedding_handler import EmbeddingHandler

# Embedding handler for the selected model; the model's api_usage flag and
# embed_task setting are passed through, with no API function supplied.
# NOTE(review): handler behaviour lives in utils.embedding_handler — not visible here.
embed = EmbeddingHandler(EMBEDDING_MODEL,EMBEDDING_MODEL_API,
                         api_function=None,task=EMBEDDING_MODEL_EMB_TASK)


In [3]:
# Single German test query to embed.
texts_test = ['Warum ist es wichtig, die aktuelle Fassung des Merkblattes bei der Antragstellung zu beachten?']

# Embed the query with the selected model's retrieve-task and instruction settings.
# NOTE(review): presumably returns one vector per input text — confirm in EmbeddingHandler.
vrs = embed.embed_texts(texts=texts_test,
                        task=EMBEDDING_MODEL_RET_TASK,
                        instruction=EMBEDDING_MODEL_INSTRUCTION)

In [8]:
# Show the first embedding as a comma-separated list of its components.
print("Dense Vector (comma-separated):", ", ".join(str(component) for component in vrs[0]))

Dense Vector (comma-separated): 0.006305352784693241, 0.04737638682126999, -0.00479593500494957, -0.04191368445754051, 0.04937692731618881, -0.03234514221549034, 0.01948515512049198, 0.050661902874708176, 0.03793444111943245, -0.03438062593340874, -0.002762113232165575, -0.00847641285508871, -0.022910330444574356, 0.018040426075458527, -0.0632224828004837, -0.030978092923760414, -0.0003475877456367016, 0.039048708975315094, -0.0029321350157260895, 0.013067902997136116, 0.004978557582944632, -0.024805041030049324, 0.043208010494709015, -0.013060971163213253, -0.02221596986055374, 0.0088179437443614, -0.03943951800465584, -0.02259860187768936, 0.007700707297772169, -0.059736259281635284, -0.000718893192242831, 0.015109227038919926, -0.04136116802692413, -0.029274383559823036, -0.01343005895614624, 0.01362552773207426, 0.031014982610940933, 0.04483432695269585, -0.0534488819539547, 0.04011570289731026, 0.0038349253591150045, 0.024467557668685913, 0.011514240875840187, -0.0082340557128191,