In [87]:
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
import minsearch
import json

## Import Dataset

In [24]:
# Load the raw export and derive the working frame in a single chained
# expression. Every run starts again from the CSV, so the cell is idempotent
# and can be re-executed safely (no step depends on a previous state of
# `df_rekt`).
df_rekt = (
    pd.read_csv('../datasets/web3isgoinggreat_dataset.csv', index_col=0)
    # Discard rows that have a missing value in any column.
    .dropna()
    # Keep only entries whose tags contain the literal, case-sensitive
    # substring 'Hack'.
    .loc[lambda frame: frame['tags'].str.contains('Hack', regex=False)]
    # Renumber rows 0..n-1 without keeping the old index as a column.
    .reset_index(drop=True)
    # Expose the new row position as the document id used by the evaluation.
    .assign(id=lambda frame: frame.index)
)

In [31]:
# Inspect the prepared frame, including the `id` column added from the row index.
df_rekt

Unnamed: 0,title,date,summary,tags,id
0,"""Peripheral"" Aave smart contract hacked for $5...","August 28, 2024","The popular defi lending platform, Aave, suffe...",Hack or scam,0
1,"Brothers charged by SEC for $60 million ""crypt...","August 26, 2024",Brothers Jonathan and Tanner Adam were charged...,"Hack or scam, Law",1
2,Users suffer losses after Polygon Discord hack,"August 24, 2024","Some fans of the Polygon blockchain, or those ...",Hack or scam,2
3,"McDonald's Instagram hacked, hackers claim $70...","August 21, 2024","McDonald's Instagram account, as well as the T...",Hack or scam,3
4,Crypto holder loses over $55 million to appare...,"August 20, 2024",Someone holding almost $55.5 million in the DA...,Hack or scam,4
...,...,...,...,...,...
256,Sentiment protocol hacked for almost $1 million,"April 4, 2023",The Sentiment liquidity protocol on the Arbitr...,Hack or scam,256
257,Over $25 million taken from an MEV bot by mali...,"April 3, 2023",It's a dog-eat dog-world in the crypto univers...,Hack or scam,257
258,Allbridge cross-chain bridge exploited for aro...,"April 1, 2023",The Allbridge cross-chain bridge project was e...,"Bug, Hack or scam",258
259,"Arbitrum airdrop plagued by downtime, bugs, an...","March 31, 2023",A token airdrop from the popular Arbitrum Ethe...,"Hack or scam, Hmm",259


In [32]:
# One dict per row (column name -> value); these records feed the search indexes below.
documents = df_rekt.to_dict(orient='records')

## Define Evaluation Functions

In [33]:
def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one sequence of booleans per query; an entry is True
            when the result at that rank is the expected document.

    Returns:
        The share of queries with at least one True entry, as a float.
        An empty `relevance_total` yields 0.0 rather than raising
        ZeroDivisionError.
    """
    # Nothing to score: report 0.0 instead of dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    queries_with_hit = sum(1 for line in relevance_total if True in line)
    return queries_with_hit / len(relevance_total)

In [34]:
def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the FIRST relevant result counts: it contributes
    1 / rank (rank starting at 1). Queries without a relevant result
    contribute 0.

    Args:
        relevance_total: one sequence of booleans per query; an entry is True
            when the result at that rank is the expected document.

    Returns:
        The average reciprocal rank as a float. An empty `relevance_total`
        yields 0.0 rather than raising ZeroDivisionError.
    """
    # Nothing to score: report 0.0 instead of dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / position
                # Stop at the first hit; counting later hits as well would
                # let a single query contribute more than 1.
                break

    return total_score / len(relevance_total)

## Evaluate Minsearch

In [140]:
# Build the minsearch index. title/date/summary are registered as text fields
# (the ones matched against the query text); tags/id are registered as keyword
# fields. No keyword filter is passed to index.search in this notebook.
index = minsearch.Index(
    text_fields=["title", "date", "summary"],
    keyword_fields=["tags", "id"]
)

# Fit the index on the records produced from df_rekt.
index.fit(documents)

<minsearch.Index at 0x7f73195aaa10>

In [141]:
def minsearch_search(query):
    """Search the minsearch index for `query` and return its top 5 results.

    The 'date' field is boosted by 3.0 and 'summary' by 0.5; no explicit
    boost is given for 'title'.
    """
    field_weights = {'date': 3.0, 'summary': 0.5}
    return index.search(query=query, boost_dict=field_weights, num_results=5)

In [142]:
# Ground-truth queries; the evaluation below reads the 'id' and 'question' columns.
df_ground_truth = pd.read_csv('../datasets/ground-truth-retrieval_web3.csv')

In [143]:
# One dict per ground-truth row, the shape evaluate() iterates over.
ground_truth = df_ground_truth.to_dict(orient='records')

In [144]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    Args:
        ground_truth: iterable of records; each must have an 'id' key holding
            the id of the document the query should retrieve.
        search_function: callable that receives the whole ground-truth record
            and returns a list of result dicts, each with an 'id' key.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all queries.
    """
    def relevance_for(record):
        # Read the expected id first, then flag each result that matches it.
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    per_query_relevance = [relevance_for(record) for record in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(per_query_relevance),
        mrr=mrr(per_query_relevance),
    )

In [71]:
# Score minsearch: the lambda adapts the ground-truth record to the plain question string.
evaluate(ground_truth,  lambda q: minsearch_search(q['question']))

100%|██████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:02<00:00, 604.20it/s]


{'hit_rate': 0.7318007662835249, 'mrr': 0.6523882503192853}

## Evaluate Elasticsearch (Text Search)

In [74]:
es_client = Elasticsearch('http://localhost:9200')

index_name = "rekt-knowledgebase"

# Single-shard, no-replica index; title/date/summary are analysed text,
# tags/id are exact keywords.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{name: {"type": "text"} for name in ("title", "date", "summary")},
            **{name: {"type": "keyword"} for name in ("tags", "id")},
        }
    },
}

# Start from a clean index: drop any existing one (no error if absent), then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'rekt-knowledgebase'})

In [75]:
# Index every record one by one.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

# Force a refresh so all documents are searchable right away. Without it, a
# search issued immediately after this loop (e.g. under "Run All") can miss
# documents that Elasticsearch has not refreshed yet, which skews the metrics.
es_client.indices.refresh(index=index_name)

100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:02<00:00, 105.72it/s]


In [76]:
def elastic_search(query):
    """Run a keyword (multi_match) search and return the top 5 source documents.

    The query text is matched against title, date (boosted x2) and summary
    using the "best_fields" strategy.
    """
    multi_match = {
        "query": query,
        "fields": ["title", "date^2", "summary"],
        "type": "best_fields",
    }
    request_body = {
        "size": 5,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    hits = es_client.search(index=index_name, body=request_body)['hits']['hits']

    # Callers only need the stored document, not the hit metadata.
    return [hit['_source'] for hit in hits]

In [77]:
# Score Elasticsearch keyword search on the same ground-truth questions.
evaluate(ground_truth,  lambda q: elastic_search(q['question']))

100%|██████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:04<00:00, 324.85it/s]


{'hit_rate': 0.9134099616858238, 'mrr': 0.8574457215836528}

## Evaluate Elasticsearch (Vector Search)

In [78]:
es_client = Elasticsearch('http://localhost:9200')

index_name = "rekt-knowledgebase"

# Same text/keyword fields as the text-search index, plus one indexed
# 384-dimensional cosine dense_vector per embedded field.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{name: {"type": "text"} for name in ("title", "date", "summary")},
            **{name: {"type": "keyword"} for name in ("tags", "id")},
            **{
                name: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for name in ("title_vec", "date_vec", "tags_vec", "summary_vec", "all_vec")
            },
        }
    },
}

# Reuses the same index name, so the previous (text-only) index is dropped
# here (no error if absent) and recreated with the vector mapping.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'rekt-knowledgebase'})

In [81]:
# Sentence-embedding model used for both documents and queries.
# NOTE(review): the index mapping declares dims=384 for every *_vec field;
# presumably that matches this model's output size, confirm if the model changes.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [82]:
def generate_embeddings(row):
    """Attach embedding columns to one record and return it.

    Encodes title, date, tags (empty string when missing), summary, and the
    title and summary joined by a space, storing the results under
    'title_vec', 'date_vec', 'tags_vec', 'summary_vec' and 'all_vec'.
    The passed-in `row` is modified and returned.
    """
    texts_by_column = {
        'title_vec': row['title'],
        'date_vec': row['date'],
        # A missing tag value is embedded as the empty string.
        'tags_vec': '' if pd.isna(row['tags']) else row['tags'],
        'summary_vec': row['summary'],
        'all_vec': row['title'] + ' ' + row['summary'],
    }

    for column, text in texts_by_column.items():
        row[column] = model.encode(text)

    return row

In [83]:
# Add the five *_vec columns to every row (model.encode runs five times per row).
df_rekt = df_rekt.apply(generate_embeddings, axis=1)

In [84]:
# Serialize the frame, embeddings included, to a JSON string with one object per row.
rekt_docs = df_rekt.to_json(orient="records")

In [88]:
# Parse the JSON string back into Python objects for indexing.
# Presumably the round-trip exists to turn the embedding arrays into plain lists; confirm.
parsed = json.loads(rekt_docs)

In [90]:
# Index every record (text fields plus embeddings) one by one.
for doc in tqdm(parsed):
    es_client.index(index=index_name, document=doc)

# Force a refresh so all documents are searchable right away. Without it, a
# search issued immediately after this loop (e.g. under "Run All") can miss
# documents that Elasticsearch has not refreshed yet, which skews the metrics.
es_client.indices.refresh(index=index_name)

100%|█████████████████████████████████████████████████████████████████████████████████| 261/261 [00:03<00:00, 78.57it/s]


In [94]:
def elastic_search_knn(field, vector):
    """Run a kNN search on one vector field and return the 5 nearest documents.

    Args:
        field: name of the dense_vector field to search (e.g. 'all_vec').
        vector: query embedding to compare against that field.

    Returns:
        List of source documents restricted to title, date, summary, tags
        and id (the stored vectors are not returned).
    """
    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
        },
        "_source": ["title", "date", "summary", "tags", "id"],
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]

In [95]:
def question_vector_knn(q):
    """Embed the record's 'question' and run a kNN search against 'all_vec'."""
    question_embedding = model.encode(q['question'])
    return elastic_search_knn('all_vec', question_embedding)

In [96]:
# Score pure vector (kNN) search on the combined title + summary embedding.
evaluate(ground_truth, question_vector_knn)

100%|███████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:18<00:00, 70.44it/s]


{'hit_rate': 0.8, 'mrr': 0.742158365261814}

## Evaluate Elasticsearch (Hybrid Search)

In [137]:
def elastic_search_hybrid(query, knn_boost=0.8, keyword_boost=0.2):
    """Run a hybrid (kNN + keyword) search and return the top 5 source documents.

    The query text is embedded and matched against 'all_vec' with kNN, and in
    the same request matched as text against title, date, summary and tags.
    The two parts are weighted by their boosts.

    Args:
        query: the question text.
        knn_boost: weight of the vector (kNN) part; defaults to 0.8.
        keyword_boost: weight of the keyword (multi_match) part; defaults to 0.2.

    Returns:
        List of the hits' source documents.
    """
    v_q = model.encode(query)

    knn_query = {
        "field": "all_vec",
        "query_vector": v_q,
        "k": 5,
        "num_candidates": 10000,
        "boost": knn_boost,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["title", "date", "summary", "tags"],
                    "type": "best_fields",
                    "boost": keyword_boost,
                }
            }
        }
    }

    # NOTE(review): no _source filter is set here (unlike elastic_search_knn),
    # so each hit also carries the stored *_vec fields.
    response = es_client.search(
        index=index_name,
        query=keyword_query,
        knn=knn_query,
        size=5
    )

    return [hit['_source'] for hit in response['hits']['hits']]

In [138]:
def question_hybrid(q):
    """Adapt a ground-truth record to elastic_search_hybrid by passing its 'question' text."""
    return elastic_search_hybrid(q['question'])

In [139]:
# Score hybrid search (kNN on 'all_vec' combined with keyword matching).
evaluate(ground_truth, question_hybrid)

100%|███████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:19<00:00, 66.55it/s]


{'hit_rate': 0.9302681992337165, 'mrr': 0.8803575989782887}