In [12]:
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
import psutil
import json

In [13]:
v_a = model.encode("Testing 123")
v_b = model.encode("Testing 153")

t = v_a.dot(v_b)

In [14]:
type(t)

numpy.float32

In [4]:
df_rekt = pd.read_csv('../datasets/web3isgoinggreat_dataset.csv', index_col=0)
df_rekt = df_rekt.dropna()
df_rekt = df_rekt[['Hack' in i for i  in  df_rekt.tags]]
df_rekt = df_rekt.reset_index()
df_rekt = df_rekt.drop(columns=['index'])
df_rekt['id'] = df_rekt.index

In [10]:
documents = df_rekt.to_dict(orient='records')
doc_idx = {d['id']: d for d in documents}

In [4]:
df_gt = pd.read_csv('../datasets/ground-truth-retrieval_web3.csv')

In [5]:
ground_truth = df_gt.to_dict(orient='records')

In [5]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [7]:
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "date": {"type": "text"},
            "summary": {"type": "text"},
            "tags": {"type": "keyword"},
            "id": {"type": "keyword"},
            "title_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "date_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "tags_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "summary_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "all_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "rekt-knowledgebase"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7ff210230b20>: Failed to establish a new connection: [Errno 111] Connection refused))

In [8]:
for doc in tqdm(documents):
    title = doc['title']
    summary = doc['summary']
    doc['all_vector'] = model.encode(title + ' ' + summary)
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████| 261/261 [00:06<00:00, 41.61it/s]


In [9]:
def elastic_search_hybrid(query): 
    v_q = model.encode(query)
    
    knn_query = {
        "field": "all_vec",
        "query_vector": v_q,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.8
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["title", "date", "summary", "tags"],
                    "type": "best_fields",
                    "boost": 0.2,
                }
            }
        }
    }

    response = es_client.search(
        index=index_name,
        query=keyword_query,
        knn=knn_query,
        size=5
    )

    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [10]:
def question_hybrid(q):
    question = q['question']
    return elastic_search_hybrid(question)

In [11]:
def build_prompt(query, search_results):

    prompt_template = """
    You're a assistant that informs the user on the latest cryptocurrency hacks and exploits. Answer the QUESTION based on the CONTEXT from our crytocurrency hacks and exploits database.
    Use only the facts from the CONTEXT when answering the QUESTION. Do not include any reference to the CONTEXT in your answer.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    
    entry_template = """
    title: {title}
    date: {date}
    summary: {summary}
    tags: {tags}
    """.strip()
    
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [2]:
def llm(prompt, model):
    client = OpenAI(
        base_url='http://localhost:11434/v1/',
        api_key='ollama',
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [53]:
def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results

In [14]:
def process_record(rec, model="gemma2:2b"):
    answer_llm = rag(rec, model=model)
    
    doc_id = rec['id']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['summary']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'id': doc_id,
        'question': rec['question'],
    }

In [15]:
def rag(query, model):
    search_results = question_hybrid(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt, model)
    return answer

In [20]:
pool = ThreadPoolExecutor(max_workers=psutil.cpu_count())
results_gemma = map_progress(pool, ground_truth, process_record)

100%|███████████████████████████████████████████████████████████████████████████████| 1305/1305 [22:49<00:00,  1.05s/it]


In [92]:
json_gemma = json.loads(json.dumps(results_gemma))

In [94]:
with open('../datasets/results-gemma.json', 'w') as f:
    json.dump(json_gemma, f)

In [16]:
def process_record(rec, model="phi3"):
    answer_llm = rag(rec, model=model)
    
    doc_id = rec['id']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['summary']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'id': doc_id,
        'question': rec['question'],
    }

In [17]:
pool = ThreadPoolExecutor(max_workers=psutil.cpu_count())
results_phi3 = map_progress(pool, ground_truth, process_record)

100%|███████████████████████████████████████████████████████████████████████████████| 1305/1305 [56:36<00:00,  2.60s/it]


In [64]:
json_phi3 = json.loads(json.dumps(results_phi3))

In [65]:
with open('../datasets/results-phi3.json', 'w') as f:
    json.dump(json_phi3, f)

In [98]:
def process_record(rec, model="mistral"):
    answer_llm = rag(rec, model=model)
    
    doc_id = rec['id']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['summary']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'id': doc_id,
        'question': rec['question'],
    }

In [99]:
pool = ThreadPoolExecutor(max_workers=psutil.cpu_count())
results_mistral = map_progress(pool, ground_truth, process_record)

100%|█████████████████████████████████████████████████████████████████████████████| 1305/1305 [1:00:20<00:00,  2.77s/it]


In [100]:
json_mistral = json.loads(json.dumps(results_mistral))

In [101]:
with open('../datasets/results-mistral.json', 'w') as f:
    json.dump(json_mistral, f)

In [106]:
def process_record(rec, model="llama3.1"):
    answer_llm = rag(rec, model=model)
    
    doc_id = rec['id']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['summary']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'id': doc_id,
        'question': rec['question'],
    }

In [107]:
pool = ThreadPoolExecutor(max_workers=psutil.cpu_count())
results_llama = map_progress(pool, ground_truth, process_record)

100%|█████████████████████████████████████████████████████████████████████████████| 1305/1305 [3:26:03<00:00,  9.47s/it]


In [108]:
json_llama = json.loads(json.dumps(results_llama))

In [109]:
with open('../datasets/results-llama.json', 'w') as f:
    json.dump(json_llama, f)

In [8]:
def compute_similarity(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = model.encode(answer_llm)
    v_orig = model.encode(answer_orig)
    
    return v_llm.dot(v_orig)

## Cosine Similarity

### Gemma

In [13]:
df_gemma = pd.read_json('../datasets/results-gemma.json')

In [14]:
dict_gemma = df_gemma.to_dict(orient='records')

In [15]:
similarity = []

for record in tqdm(dict_gemma):
   sim = compute_similarity(record)
   similarity.append(sim)

100%|███████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:23<00:00, 54.90it/s]


In [16]:
df_gemma['cosine'] = similarity
df_gemma['cosine'].describe()

count    1305.000000
mean        0.632725
std         0.169203
min         0.061967
25%         0.523001
50%         0.663565
75%         0.762764
max         0.959846
Name: cosine, dtype: float64

### Phi3

In [17]:
df_phi3 = pd.read_json('../datasets/results-phi3.json')

In [18]:
dict_phi3 = df_phi3.to_dict(orient='records')

In [19]:
similarity = []

for record in tqdm(dict_phi3):
   sim = compute_similarity(record)
   similarity.append(sim)

100%|███████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:22<00:00, 58.50it/s]


In [20]:
df_phi3['cosine'] = similarity
df_phi3['cosine'].describe()

count    1305.000000
mean        0.054209
std         0.090633
min        -0.116379
25%        -0.000040
50%         0.046980
75%         0.086350
max         0.811990
Name: cosine, dtype: float64

### Mistral

In [21]:
df_mistral = pd.read_json('../datasets/results-mistral.json')

In [22]:
dict_mistral = df_mistral.to_dict(orient='records')

In [23]:
similarity = []

for record in tqdm(dict_mistral):
   sim = compute_similarity(record)
   similarity.append(sim)

100%|███████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:22<00:00, 58.22it/s]


In [24]:
df_mistral['cosine'] = similarity
df_mistral['cosine'].describe()

count    1305.000000
mean        0.656022
std         0.148491
min         0.081942
25%         0.566109
50%         0.677183
75%         0.767298
max         0.941107
Name: cosine, dtype: float64

### Llama

In [25]:
df_llama = pd.read_json('../datasets/results-llama.json')

In [26]:
dict_llama = df_llama.to_dict(orient='records')

In [27]:
similarity = []

for record in tqdm(dict_llama):
   sim = compute_similarity(record)
   similarity.append(sim)

100%|███████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:33<00:00, 38.97it/s]


In [28]:
df_llama['cosine'] = similarity
df_llama['cosine'].describe()

count    1305.000000
mean        0.658443
std         0.162653
min         0.006543
25%         0.560586
50%         0.692744
75%         0.779472
max         0.952975
Name: cosine, dtype: float64

## LLM as a Judge

In [29]:
judge_prompt_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

### Gemma

In [32]:
df_gemma_sample = df_gemma.sample(n=250, random_state=1)

In [33]:
samples_gemma = df_gemma_sample.to_dict(orient='records')

In [34]:
evaluations = []

for record in tqdm(samples_gemma):
    prompt = judge_prompt_template.format(**record)
    evaluation = llm(prompt, model='mistral')
    evaluations.append(evaluation)

100%|█████████████████████████████████████████████████████████████████████████████████| 250/250 [31:55<00:00,  7.66s/it]


In [16]:
type(json.loads("""{
  "Relevance": "RELEVANT",
  "Explanation": "The generated answer directly corresponds to a specific detail in the original story about Robb's previous conviction, showing a good understanding of the context and context switching between the different scams. The only aspect where it falls short is by not providing the additional details mentioned in the original answer."
}"""))

dict

In [35]:
json_evaluations = []

for i, str_eval in enumerate(evaluations):
    try:
        json_eval = json.loads(str_eval)
        json_evaluations.append(json_eval)
    except:
        print(str_eval)

Here is my evaluation as an expert evaluator for the Retrieval-Augmented Generation (RAG) system:

{
  "Relevance": "RELEVANT",
  "Explanation": "The generated answer directly corresponds to a specific detail in the original story about Robb's previous conviction, showing a good understanding of the context and context switching between the different scams. The only aspect where it falls short is by not providing the additional details mentioned in the original answer."
}
Here is my analysis in the requested format:

```
{
  "Relevance": "PARTLY_RELEVANT",
  "Explanation": "The generated answer partially captures the sequence of events, but it deviates from the original answer in specifying how Oulahyane used the victim's credentials. The mention of quick access to crypto assets and sale of NFTs is accurate, but the detail about transferring his own assets before selling victims' is not supported by the original text."
}
```
Here is the evaluation result in JSON format:

{
  "Relevance

In [36]:
df_evaluations_gemma = pd.DataFrame(json_evaluations)

In [130]:
df_evaluations_gemma.Relevance.value_counts(normalize=True)

Relevance
RELEVANT              0.419913
PARTLY_RELEVANT       0.393939
NON_RELEVANT          0.112554
PARTIALLY_RELEVANT    0.043290
PARTY_RELEVANT        0.025974
Partly_Relevant       0.004329
Name: proportion, dtype: float64

### Phi3

In [115]:
df_phi3_sample = df_phi3.sample(n=250, random_state=1)

In [116]:
samples_phi3 = df_phi3_sample.to_dict(orient='records')

In [117]:
evaluations = []

for record in tqdm(samples_phi3):
    prompt = judge_prompt_template.format(**record)
    evaluation = llm(prompt, model='mistral')
    evaluations.append(evaluation)

100%|█████████████████████████████████████████████████████████████████████████████████| 250/250 [10:18<00:00,  2.47s/it]


In [118]:
json_evaluations = []

for i, str_eval in enumerate(evaluations):
    try:
        json_eval = json.loads(str_eval)
        json_evaluations.append(json_eval)
    except:
        print(str_eval)

 {
      "Relevance": "PARTLY\_RELEVANT",
      "Explanation": "The generated answer does not provide specific details about the serious flaws discovered in the Pond0x contract. However, it can be deduced from the original answer that the issue revolves around the ability of anyone to transfer coins belonging to other people. The generated answer indirectly refers to this problem by mentioning 'flaws' found in the contract."
   }
 {
      "Relevance": "NON\_RELEVANT",
      "Explanation": "The generated answer does not directly provide the action taken by BNB Chain within 10 minutes to address the Discord URL hijacking issue."
   }
 {
    "Relevance": "NON\_RELEVANT",
    "Explanation": "The generated answer does not provide an answer to the provided question 'Who was responsible for the flash loan attack on Normie memecoin and what was the monetary value they were able to cash out?' The text contains unrelated information about a different topic (Dawn of Food Safety Concerns) without 

In [119]:
df_evaluations_phi3 = pd.DataFrame(json_evaluations)

In [120]:
len(df_evaluations_phi3)

231

In [121]:
df_evaluations_phi3.Relevance.value_counts(normalize=True)

Relevance
PARTLY_RELEVANT       0.493506
NON_RELEVANT          0.372294
RELEVANT              0.077922
PARTIALLY_RELEVANT    0.021645
NONE                  0.008658
PARTY_RELEVANT        0.008658
PARTILY_RELEVANT      0.008658
 PARTLY_RELEVANT      0.004329
NONE_RELEVANT         0.004329
Name: proportion, dtype: float64

### Mistral

In [96]:
df_mistral_sample = df_mistral.sample(n=250, random_state=1)

In [97]:
samples_mistral = df_mistral_sample.to_dict(orient='records')

In [98]:
evaluations = []

for record in tqdm(samples_mistral):
    prompt = judge_prompt_template.format(**record)
    evaluation = llm(prompt, model='mistral')
    evaluations.append(evaluation)

100%|█████████████████████████████████████████████████████████████████████████████████| 250/250 [11:00<00:00,  2.64s/it]


In [100]:
json_evaluations = []

for i, str_eval in enumerate(evaluations):
    try:
        json_eval = json.loads(str_eval)
        json_evaluations.append(json_eval)
    except:
        print(str_eval)

 {
      "Relevance": "PARTLY\_RELEVANT",
      "Explanation": "The generated answer mentions that the vulnerability found in Conic Finance is not exclusive to Conic Finance, which is factually correct and part of the original response. However, the generated answer does not accurately address or directly connect the vulnerability described in the original response with any specific example outside of Conic Finance (such as Sonne Finance), nor does it explain how the vulnerability had led to an actual hack in the mentioned case."
   }
 {
      "Relevance": "PARTLY\_RELEVANT",
      "Explanation": "The generated answer correctly states the reason behind LoveMake's dissatisfaction towards Twitter, which is related to the phishing scam. However, it omits the specific part where LoveMake mentioned Twitter's verification process not helping them notice the fraudulent account because of similarities with the original and verified account. Therefore, while the general idea is correct, the gen

In [103]:
df_evaluations_mistral = pd.DataFrame(json_evaluations)

In [104]:
len(df_evaluations_mistral)

235

In [105]:
df_evaluations_mistral.Relevance.value_counts(normalize=True)

Relevance
PARTLY_RELEVANT       0.442553
RELEVANT              0.425532
NON_RELEVANT          0.093617
PARTY_RELEVANT        0.021277
PARTIALLY_RELEVANT    0.008511
PARTILY_RELEVANT      0.008511
Name: proportion, dtype: float64

In [146]:
0.442553+0.425532

0.868085

In [138]:
df_llama_sample = df_llama.sample(n=250, random_state=1)

In [139]:
samples_llama = df_llama_sample.to_dict(orient='records')

In [140]:
evaluations = []

for record in tqdm(samples_llama):
    prompt = judge_prompt_template.format(**record)
    evaluation = llm(prompt, model='mistral')
    evaluations.append(evaluation)

100%|█████████████████████████████████████████████████████████████████████████████████| 250/250 [10:29<00:00,  2.52s/it]


In [141]:
json_evaluations = []

for i, str_eval in enumerate(evaluations):
    try:
        json_eval = json.loads(str_eval)
        json_evaluations.append(json_eval)
    except:
        print(str_eval)

 {
      "Relevance": "PARTLY\_RELEVANT",
      "Explanation": "Although the generated answer only directly covers part of the original answer (the attempt to contact the exploiter), it indirectly discusses the main topic of 'attempts by Conic Finance to contact the exploiter'. However, it misstates the date of the event, which makes it only partly relevant."
   }
 {
     "Relevance": "PARTIALLY\_RELEVANT",
     "Explanation": "The generated answer correctly points out that a change was made to the multisig wallet security requirements, specifically reducing the necessary signatures for transactions. However, it misses mentioning the context of how this change occurred so closely before a large transfer of tokens, which raised concerns about the project possibly rug pulling or being hacked."
   }
 {
      "Relevance": "PARTLY\_RELEVANT",
      "Explanation": "While the generated answer correctly identifies that an apparent whitehat hacker intervened to recover funds in regards to the A

In [142]:
df_evaluations_llama = pd.DataFrame(json_evaluations)

In [143]:
len(df_evaluations_llama)

240

In [144]:
df_evaluations_llama.Relevance.value_counts(normalize=True)

Relevance
RELEVANT              0.495833
PARTLY_RELEVANT       0.395833
NON_RELEVANT          0.079167
PARTY_RELEVANT        0.012500
PARTIALLY_RELEVANT    0.008333
PARTILY_RELEVANT      0.008333
Name: proportion, dtype: float64

In [145]:
 0.495833+0.395833

0.8916660000000001

In [17]:
import numpy as np

x = np.float32(3.14)
y = x.item() 

print(type(x))
print(type(y))

<class 'numpy.float32'>
<class 'float'>
