In [3]:
from llama_index.core.schema import NodeRelationship
from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.core.schema import TextNode
from llama_index.core.evaluation import RetrieverEvaluator, EmbeddingQAFinetuneDataset
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer
from dotenv import load_dotenv

load_dotenv(override=True)

True

In [4]:
# embedding_id = "intfloat/multilingual-e5-small"
embedding_id = "intfloat/multilingual-e5-large"
tokenizer = AutoTokenizer.from_pretrained(embedding_id)


In [5]:

ds1 = load_dataset("rasdani/germanrag-positives", split="train")
ds2 = load_dataset("rasdani/germanrag-positives-queries", split="train")
# ds1 = ds1.select(range(1500))
# ds2 = ds2.select(range(1500))


In [6]:

docs1 = [Document(id_=i, text=example["contexts"][example["positive_ctx_idx"]]) for i, example in enumerate(ds1)]
docs2 = [Document(id_=i, text=example["contexts"][example["positive_ctx_idx"]]) for i, example in enumerate(ds2)]
print(len(docs1), len(docs2))

embedding_id = "intfloat/multilingual-e5-small" # model with smallest token limit so far
tokenizer = AutoTokenizer.from_pretrained(embedding_id)
chunk_size = 512
splitter = TokenTextSplitter(chunk_size=10000, chunk_overlap=0)


nodes1 = splitter.get_nodes_from_documents(docs1)
nodes2 = splitter.get_nodes_from_documents(docs2)

print(len(nodes1), len(nodes2))
# by default, the node ids are set to random uuids. To ensure same id's per run, we manually set them.
for idx, node in enumerate(nodes1):
    node.id_ = idx

for idx, node in enumerate(nodes2):
    node.id_ = idx

2243 2243
2243 2243


In [7]:
embedding_config = {
    "query_instruction": "query: ",
    "text_instruction": "passage: ",
    "pooling": "mean",
    "trust_remote_code": False,
}

Settings.embed_model = HuggingFaceEmbedding(
    model_name=embedding_id, 
    embed_batch_size=4, 
    query_instruction=embedding_config["query_instruction"], 
    text_instruction=embedding_config["text_instruction"], 
    # pooling=embedding_config["pooling"],
    trust_remote_code=embedding_config["trust_remote_code"],
)

# embed_model = CohereEmbedding(
#     model_name="embed-multilingual-v3.0",
#     input_type="search_query",
    
# )


In [8]:
vector_index1 = VectorStoreIndex(nodes1, show_progress=True)
vector_index2 = VectorStoreIndex(nodes2, show_progress=True)
# vector_index.storage_context.persist(persist_dir=persist_dir)



Generating embeddings: 100%|██████████| 2048/2048 [00:59<00:00, 34.25it/s]
Generating embeddings: 100%|██████████| 195/195 [00:07<00:00, 26.97it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:29<00:00, 69.63it/s] 
Generating embeddings: 100%|██████████| 195/195 [00:03<00:00, 57.81it/s]


In [9]:
# vector_index1.storage_context.persist(persist_dir="index1")
# vector_index2.storage_context.persist(persist_dir="index2")



In [10]:
# from llama_index.core import StorageContext, load_index_from_storage

# # Load both indices if they exist, otherwise create them
# PERSIST_DIR1 = "index1"
# PERSIST_DIR2 = "index2"

# storage_context1 = StorageContext.from_defaults(persist_dir=PERSIST_DIR1)
# vector_index1 = load_index_from_storage(storage_context1)

# storage_context2 = StorageContext.from_defaults(persist_dir=PERSIST_DIR2)
# vector_index2 = load_index_from_storage(storage_context2)


In [11]:
retriever1 = vector_index1.as_retriever(similarity_top_k=10)
retriever2 = vector_index2.as_retriever(similarity_top_k=10)

In [12]:
retrieval_evaluator1 = RetrieverEvaluator.from_metric_names(metric_names=["mrr"], retriever=retriever1)
retrieval_evaluator2 = RetrieverEvaluator.from_metric_names(metric_names=["mrr"], retriever=retriever2)



In [13]:
ds1, ds2

(Dataset({
     features: ['contexts', 'question', 'answer', 'positive_ctx_idx'],
     num_rows: 2243
 }),
 Dataset({
     features: ['contexts', 'question', 'answer', 'positive_ctx_idx', 'model_name', 'query'],
     num_rows: 2243
 }))

In [15]:
from typing import Dict, List

def prepare_embedding_qa_dataset(ds, query_key="question") -> EmbeddingQAFinetuneDataset:
    """Prepare a dataset for Embedding QA Finetuning.

    Args:
        ds: The dataset loaded from Hugging Face datasets.

    Returns:
        An instance of EmbeddingQAFinetuneDataset.
    """
    # Initialize containers for the EmbeddingQAFinetuneDataset fields
    queries: Dict[str, str] = {}
    corpus: Dict[str, str] = {}
    relevant_docs: Dict[str, List[str]] = {}

    # Populate the containers with data from the dataset
    for i, entry in enumerate(ds):
        query_id = str(i)  # Assuming unique IDs are not provided, using index as ID
        doc_id = str(i)  # Use 'qid' if available, else use index
        query = entry[query_key]
        doc = entry["contexts"][entry["positive_ctx_idx"]]

        # Update queries and corpus dictionaries
        queries[query_id] = query
        corpus[doc_id] = doc

        # Update relevant_docs dictionary
        if query_id not in relevant_docs:
            relevant_docs[query_id] = [doc_id]
        else:
            relevant_docs[query_id].append(doc_id)

    return EmbeddingQAFinetuneDataset(queries=queries, corpus=corpus, relevant_docs=relevant_docs, mode="text")



In [16]:
eval_dataset1 = prepare_embedding_qa_dataset(ds1, query_key="question")
eval_dataset2 = prepare_embedding_qa_dataset(ds2, query_key="query")



In [17]:
print(eval_dataset1.queries["7"])
print(eval_dataset1.corpus["7"])
print(eval_dataset1.relevant_docs["7"])



Wer wurde von Intel als USB Rockstar bezeichnet?
Universal_Serial_Bus

== Kurioses ==
Auch ausgefallene Geräte sind auf den Markt gekommen, die USB vorwiegend zur Stromversorgung nutzen. Beispielsweise gibt es USB-Wärmeplatten, mit denen etwa eine Kaffeetasse über die USB-Schnittstelle warmgehalten werden kann, USB-Lampen für Notebooks, um die Tastatur zu beleuchten, USB-Tastatur-Staubsauger, USB-Ventilatoren, Rotoren mit LED-Lichteffekten, USB-Weihnachtsbäume oder beheizbare USB-Handschuhe, USB-Butt-Plugs, USB-Hörgerätetrockner und USB-Pantoffeln.
Ajay Bhatt wurde aus der Gruppe der Entwickler des USB-Standards besonders hervorgehoben, als er in dem Werbespot ''Ajay Bhatt – The Real USB Rock Star!'' des Unternehmens Intel als Rockstar porträtiert wurde.
['7']


In [18]:
results1 = await retrieval_evaluator1.aevaluate_dataset(eval_dataset1)
results2 = await retrieval_evaluator2.aevaluate_dataset(eval_dataset2)


In [19]:
results1[0].metric_dict["mrr"].score

1.0

In [20]:
results1[1]

RetrievalEvalResult(query='Wann werden Kleinspannungsglühlampen aufgrund ihrer Vorteile eingesetzt?', expected_ids=['1'], expected_texts=None, retrieved_ids=['1', '729', '561', '755', '801', '2166', '1244', '1324', '1853', '1266'], retrieved_texts=['Glühlampe\n\n=== Lichtmodulation ===\nAufgrund der thermischen Trägheit des Glühfadens weisen auch an netz- bzw. niederfrequenter Wechselspannung betriebene Glühlampen nur geringe Schwankungen der Helligkeit auf. Die Helligkeitsmodulation mit der doppelten Betriebsfrequenz ist umso stärker, je dünner der Glühfaden ist. Sie ist also besonders bei Lampen geringer Leistung für Netzspannung ausgeprägt und beträgt bei einer Glühlampe 15\xa0W und 230\xa0V etwa 30 %.\nInsbesondere Kleinspannungsglühlampen gelten aufgrund ihrer dicken, thermisch trägen Glühdrähte als flimmerfrei – ein Vorteil bei der Beleuchtung von rotierenden Maschinen. Glühlampen mit sehr dünnem Glühfaden für Betriebsströme von weniger als 0,1\xa0A können mit Frequenzen bis zu e

In [21]:
len(results1), len(results2)



(2243, 2243)

In [22]:

# Extract scores from the results
scores1 = [result.metric_dict["mrr"].score for result in results1]
scores2 = [result.metric_dict["mrr"].score for result in results2]


In [23]:
average_mrr1 = sum(scores1) / len(scores1)
average_mrr2 = sum(scores2) / len(scores2)

print(f"Average MRR for dataset 1: {average_mrr1}")
print(f"Average MRR for dataset 2: {average_mrr2}")


Average MRR for dataset 1: 0.8992527015264418
Average MRR for dataset 2: 0.9663514001231345


In [24]:
import numpy as np
from scipy.stats import pearsonr, spearmanr

# Compute Pearson correlation
pearson_corr, _ = pearsonr(scores1, scores2)
print(f"Pearson correlation between the two sets of scores: {pearson_corr}")

# Compute Spearman correlation
spearman_corr, _ = spearmanr(scores1, scores2)
print(f"Spearman correlation between the two sets of scores: {spearman_corr}")


Pearson correlation between the two sets of scores: -0.01464660011642985
Spearman correlation between the two sets of scores: -0.01796824233492936


In [26]:
for i, (score1, score2) in enumerate(zip(scores1, scores2)):
    print(f"Score 1: {score1}, Score 2: {score2}")



Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 0.3333333333333333, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 0.5
Score 1: 1.0, Score 2: 1.0
Score 1: 0.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 0.5, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 0.5, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 0.5, Score 2: 0.5
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 0.5, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0, Score 2: 1.0
Score 1: 1.0,

Variability is low, which might screw up correlations coefficients.

In [38]:
results1[0].retrieved_ids

['0', '305', '1833', '179', '2185', '1349', '1704', '1135', '852', '1897']

In [39]:
results2[0].retrieved_ids



['0', '1729', '622', '1866', '999', '453', '511', '823', '1835', '2187']

In [40]:
# Initialize lists to store integer IDs across all results
all_ids_1 = []
all_ids_2 = []

# Loop through each result pair and extract integer IDs
for result1, result2 in zip(results1, results2):
    retrieved_ids_1 = result1.retrieved_ids
    retrieved_ids_2 = result2.retrieved_ids
    
    # Filter IDs to include only those present in both sets for correlation calculation
    common_ids_1 = [int(id_) for id_ in retrieved_ids_1 if id_ in retrieved_ids_2]
    common_ids_2 = [int(id_) for id_ in retrieved_ids_2 if id_ in retrieved_ids_1]
    print(common_ids_1, common_ids_2)
    
    # Append the integer IDs from this pair to the overall lists
    all_ids_1.extend(common_ids_1)
    all_ids_2.extend(common_ids_2)

# Calculate Pearson correlation for the integer IDs across all results
pearson_corr, _ = pearsonr(all_ids_1, all_ids_2)
print(f"Pearson correlation across all matching IDs: {pearson_corr}")



[0] [0]
[1] [1]
[2] [2]
[3] [3]
[4] [4]
[5] [5]
[] []
[7] [7]
[8] [8]
[9] [9]
[10] [10]
[11] [11]
[12, 922] [12, 922]
[13] [13]
[14] [14]
[15] [15]
[16] [16]
[17] [17]
[18] [18]
[19] [19]
[20] [20]
[21] [21]
[22] [22]
[23] [23]
[24] [24]
[25] [25]
[26] [26]
[27] [27]
[28] [28]
[29] [29]
[30] [30]
[31] [31]
[32] [32]
[33] [33]
[34] [34]
[35] [35]
[36] [36]
[37] [37]
[38] [38]
[39] [39]
[40] [40]
[41] [41]
[42] [42]
[43, 1790] [43, 1790]
[44] [44]
[45] [45]
[46] [46]
[47, 787] [47, 787]
[48] [48]
[49] [49]
[] []
[51] [51]
[] []
[53] [53]
[54] [54]
[55] [55]
[56] [56]
[57] [57]
[58] [58]
[59] [59]
[60] [60]
[61] [61]
[62] [62]
[63] [63]
[64] [64]
[65] [65]
[66] [66]
[67] [67]
[68] [68]
[69, 684] [69, 684]
[70] [70]
[71] [71]
[72] [72]
[73] [73]
[74] [74]
[75] [75]
[76] [76]
[77] [77]
[78] [78]
[79] [79]
[80] [80]
[81, 859] [81, 859]
[82] [82]
[83] [83]
[84] [84]
[85] [85]
[86] [86]
[87] [87]
[88] [88]
[89] [89]
[90] [90]
[91] [91]
[92] [92]
[93, 876] [93, 876]
[94] [94]
[95] [95]
[96] [96

In [43]:
retrieved_ids_1 = [result.retrieved_ids for result in results1]
retrieved_ids_2 = [result.retrieved_ids for result in results2]

In [46]:
# flatten lists
flat_list1 = [item for sublist in retrieved_ids_1 for item in sublist]
flat_list2 = [item for sublist in retrieved_ids_2 for item in sublist]
flat_list1



['0',
 '305',
 '1833',
 '179',
 '2185',
 '1349',
 '1704',
 '1135',
 '852',
 '1897',
 '1',
 '729',
 '561',
 '755',
 '801',
 '2166',
 '1244',
 '1324',
 '1853',
 '1266',
 '2173',
 '1671',
 '2',
 '1179',
 '819',
 '763',
 '673',
 '1429',
 '1433',
 '1242',
 '3',
 '616',
 '1261',
 '1452',
 '1210',
 '820',
 '1087',
 '221',
 '1097',
 '1043',
 '4',
 '529',
 '55',
 '2111',
 '110',
 '940',
 '1055',
 '417',
 '1249',
 '620',
 '5',
 '132',
 '234',
 '1235',
 '530',
 '1807',
 '1204',
 '1160',
 '399',
 '177',
 '1432',
 '1156',
 '1711',
 '163',
 '281',
 '626',
 '175',
 '215',
 '1390',
 '1003',
 '7',
 '811',
 '1409',
 '1931',
 '1254',
 '1596',
 '2001',
 '2032',
 '183',
 '1032',
 '8',
 '1915',
 '458',
 '52',
 '1885',
 '1351',
 '1056',
 '56',
 '1942',
 '2103',
 '9',
 '1027',
 '1629',
 '389',
 '1024',
 '678',
 '1208',
 '438',
 '894',
 '382',
 '10',
 '889',
 '1652',
 '50',
 '1594',
 '2158',
 '2176',
 '1514',
 '1558',
 '283',
 '11',
 '586',
 '1666',
 '2001',
 '1505',
 '1545',
 '2201',
 '1771',
 '628',
 '827',


In [48]:
flat_list1 = [int(item) for item in flat_list1]
flat_list2 = [int(item) for item in flat_list2]


In [49]:
# Calculate Pearson correlation
pearson_corr, _ = pearsonr(flat_list1, flat_list2)

# Calculate Spearman correlation
spearman_corr, _ = spearmanr(flat_list1, flat_list2)

print(f"Pearson Correlation: {pearson_corr}")
print(f"Spearman Correlation: {spearman_corr}")

Pearson Correlation: 0.08455256836823186
Spearman Correlation: 0.08463176933961304


In [52]:
agree_count = 0
for item1, item2 in zip(flat_list1, flat_list2):
    if item1 == item2:
        agree_count += 1
        print(f"{item1} - {item2}")
print(f"Total agreements: {agree_count} out of {len(flat_list1)}, ratio: {agree_count / len(flat_list1)}")


0 - 0
1 - 1
3 - 3
5 - 5
7 - 7
8 - 8
9 - 9
10 - 10
11 - 11
12 - 12
922 - 922
13 - 13
14 - 14
15 - 15
16 - 16
18 - 18
19 - 19
21 - 21
22 - 22
23 - 23
24 - 24
26 - 26
27 - 27
28 - 28
29 - 29
30 - 30
31 - 31
32 - 32
33 - 33
34 - 34
35 - 35
36 - 36
37 - 37
39 - 39
40 - 40
41 - 41
42 - 42
43 - 43
44 - 44
45 - 45
46 - 46
47 - 47
48 - 48
49 - 49
51 - 51
53 - 53
54 - 54
56 - 56
57 - 57
58 - 58
59 - 59
60 - 60
61 - 61
62 - 62
63 - 63
64 - 64
65 - 65
66 - 66
68 - 68
71 - 71
72 - 72
73 - 73
74 - 74
75 - 75
76 - 76
78 - 78
79 - 79
80 - 80
82 - 82
83 - 83
84 - 84
86 - 86
87 - 87
88 - 88
90 - 90
91 - 91
92 - 92
94 - 94
96 - 96
98 - 98
99 - 99
100 - 100
101 - 101
102 - 102
103 - 103
104 - 104
105 - 105
106 - 106
107 - 107
108 - 108
109 - 109
110 - 110
111 - 111
492 - 492
114 - 114
115 - 115
116 - 116
117 - 117
119 - 119
120 - 120
121 - 121
122 - 122
123 - 123
124 - 124
125 - 125
126 - 126
127 - 127
128 - 128
129 - 129
130 - 130
131 - 131
132 - 132
133 - 133
135 - 135
136 - 136
137 - 137
138 - 138
140 