In [14]:
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from openai import OpenAI

In [15]:
df_rekt = pd.read_csv('../datasets/web3isgoinggreat_dataset.csv', index_col=0)

In [16]:
df_rekt = df_rekt.dropna()

In [17]:
df_rekt = df_rekt[['Hack' in i for i  in  df_rekt.tags]]

In [18]:
df_rekt

Unnamed: 0,title,date,summary,tags
0,"""Peripheral"" Aave smart contract hacked for $5...","August 28, 2024","The popular defi lending platform, Aave, suffe...",Hack or scam
3,"Brothers charged by SEC for $60 million ""crypt...","August 26, 2024",Brothers Jonathan and Tanner Adam were charged...,"Hack or scam, Law"
5,Users suffer losses after Polygon Discord hack,"August 24, 2024","Some fans of the Polygon blockchain, or those ...",Hack or scam
6,"McDonald's Instagram hacked, hackers claim $70...","August 21, 2024","McDonald's Instagram account, as well as the T...",Hack or scam
7,Crypto holder loses over $55 million to appare...,"August 20, 2024",Someone holding almost $55.5 million in the DA...,Hack or scam
...,...,...,...,...
559,Sentiment protocol hacked for almost $1 million,"April 4, 2023",The Sentiment liquidity protocol on the Arbitr...,Hack or scam
562,Over $25 million taken from an MEV bot by mali...,"April 3, 2023",It's a dog-eat dog-world in the crypto univers...,Hack or scam
565,Allbridge cross-chain bridge exploited for aro...,"April 1, 2023",The Allbridge cross-chain bridge project was e...,"Bug, Hack or scam"
567,"Arbitrum airdrop plagued by downtime, bugs, an...","March 31, 2023",A token airdrop from the popular Arbitrum Ethe...,"Hack or scam, Hmm"


In [19]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [20]:
def generate_embeddings(row):
    title = row['title']
    date = row['date']
    tags = row['tags']
    if pd.isna(tags):
        tags = ''
    summary = row['summary']
    title_summary = title + ' ' + summary

    row['title_vec'] = model.encode(title)
    row['date_vec'] = model.encode(date)
    row['tags_vec'] = model.encode(tags)
    row['summary_vec'] = model.encode(summary)
    row['all_vec'] = model.encode(title_summary)

    return row

In [21]:
df_rekt = df_rekt.apply(generate_embeddings, axis=1)

In [22]:
df_rekt

Unnamed: 0,title,date,summary,tags,title_vec,date_vec,tags_vec,summary_vec,all_vec
0,"""Peripheral"" Aave smart contract hacked for $5...","August 28, 2024","The popular defi lending platform, Aave, suffe...",Hack or scam,"[-0.03558176, -0.00775452, 0.05457384, -0.0614...","[0.0045564217, 0.021765362, 0.03966032, 0.0135...","[-0.10653037, -0.014528477, -0.056755316, -0.0...","[-0.11574944, -0.015938979, -0.0093452595, -0....","[-0.08428086, -0.016661586, 0.025461903, -0.07..."
3,"Brothers charged by SEC for $60 million ""crypt...","August 26, 2024",Brothers Jonathan and Tanner Adam were charged...,"Hack or scam, Law","[-0.03900629, -0.0058030444, -0.105479434, -0....","[0.012031225, 0.032236822, 0.03686401, 0.02744...","[-0.079515815, 0.001883131, -0.06285686, -0.05...","[-0.035585508, 0.077503175, -0.09508826, -0.03...","[-0.03972884, 0.0371599, -0.1153651, -0.030651..."
5,Users suffer losses after Polygon Discord hack,"August 24, 2024","Some fans of the Polygon blockchain, or those ...",Hack or scam,"[0.09447819, 0.00029435614, 0.10371893, 0.0383...","[0.009936126, 0.024253512, 0.04532223, 0.01871...","[-0.10653037, -0.014528477, -0.056755316, -0.0...","[0.062519275, -0.044542164, 0.06750499, 0.0273...","[0.083045095, -0.034339312, 0.08493627, 0.0367..."
6,"McDonald's Instagram hacked, hackers claim $70...","August 21, 2024","McDonald's Instagram account, as well as the T...",Hack or scam,"[-0.051038153, -0.057920873, 0.047193713, -0.0...","[0.018289678, 0.036256082, 0.048997402, 0.0191...","[-0.10653037, -0.014528477, -0.056755316, -0.0...","[-0.058439437, -0.029506354, 0.08196194, 0.030...","[-0.064780444, -0.03958622, 0.07336637, 0.0182..."
7,Crypto holder loses over $55 million to appare...,"August 20, 2024",Someone holding almost $55.5 million in the DA...,Hack or scam,"[0.061816115, 0.072248265, 0.021658989, -0.026...","[0.009036036, 0.03263282, 0.06939557, -0.01048...","[-0.10653037, -0.014528477, -0.056755316, -0.0...","[0.011191156, 0.10501867, -0.034607336, 0.0277...","[0.046877768, 0.11102803, -0.018116431, 0.0134..."
...,...,...,...,...,...,...,...,...,...
559,Sentiment protocol hacked for almost $1 million,"April 4, 2023",The Sentiment liquidity protocol on the Arbitr...,Hack or scam,"[0.03627624, 0.01114466, 0.05209314, -0.036827...","[-0.06238184, -0.025215795, 0.041302435, 0.000...","[-0.10653037, -0.014528477, -0.056755316, -0.0...","[0.029724026, 0.088646136, 0.04146622, 0.01375...","[0.03253247, 0.081768066, 0.042567883, 0.00066..."
562,Over $25 million taken from an MEV bot by mali...,"April 3, 2023",It's a dog-eat dog-world in the crypto univers...,Hack or scam,"[-0.02806156, 0.015294933, -0.028831037, -0.05...","[-0.08900714, -0.023725184, 0.036874328, 0.001...","[-0.10653037, -0.014528477, -0.056755316, -0.0...","[-0.07917026, -0.014964834, -0.02996774, -0.02...","[-0.053193823, 0.009125637, -0.028843762, -0.0..."
565,Allbridge cross-chain bridge exploited for aro...,"April 1, 2023",The Allbridge cross-chain bridge project was e...,"Bug, Hack or scam","[-0.027797084, 0.035158765, 0.04340918, 0.0070...","[-0.07205015, -0.011304075, 0.044962775, 0.005...","[-0.13383146, -0.043943524, -0.022239104, -0.0...","[-0.058315024, 0.03664161, 0.054380104, -0.024...","[-0.058807872, 0.03847384, 0.057238556, -0.012..."
567,"Arbitrum airdrop plagued by downtime, bugs, an...","March 31, 2023",A token airdrop from the popular Arbitrum Ethe...,"Hack or scam, Hmm","[0.00616036, 0.0041198567, 0.007767766, 0.0045...","[-0.06676465, -0.00070751936, 0.04269805, -0.0...","[-0.10288795, 0.01972593, -0.041685242, -0.037...","[-0.0036260646, 0.0074736984, -0.005211988, 0....","[-0.001927368, 0.01820763, 5.0267354e-05, 0.03..."


## Indexing

In [23]:
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "date": {"type": "text"},
            "summary": {"type": "text"},
            "tags": {"type": "text"},
            "title_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "date_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "tags_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "summary_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "all_vec": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "rekt-knowledgebase"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'rekt-knowledgebase'})

In [24]:
rekt_docs = df_rekt.to_json(orient="records")

In [25]:
from json import loads, dumps
parsed = loads(rekt_docs)

In [26]:
for doc in tqdm(parsed):
    es_client.index(index=index_name, document=doc)
    #print(doc)

100%|█████████████████████████████████████████████████████████████████████████████████| 261/261 [00:03<00:00, 76.56it/s]


## Search

In [27]:
query = 'What hacks occured on August 15, 2023?'

In [28]:
v_q = model.encode(query)

In [29]:
knn_query = {
    "field": "all_vec",
    "query_vector": v_q,
    "k": 5,
    "num_candidates": 10000,
    "boost": 0.5
}

In [30]:
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["title", "date^2", "summary", "tags"],
                "type": "best_fields",
                "boost": 0.5,
            }
        }
    }
}

In [31]:
response = es_client.search(
    index=index_name,
    query=keyword_query,
    knn=knn_query,
    size=5
)

In [51]:
def elastic_search(query):
    v_q = model.encode(query)
    
    knn_query = {
        "field": "all_vec",
        "query_vector": v_q,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["title", "date^2", "summary", "tags"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            }
        }
    }

    response = es_client.search(
        index=index_name,
        query=keyword_query,
        knn=knn_query,
        size=5
    )

    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    print(type(result_docs[0]))

    return result_docs

In [52]:
results = elastic_search(query)

<class 'dict'>


## LLM

In [34]:
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [35]:
def llm(prompt, model='phi3', temperature=0.0):
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [36]:
def build_prompt(query, search_results):

    prompt_template = """
    You're a assistant that informs the user on the latest cryptocurrency hacks and exploits. Answer the QUESTION based on the CONTEXT from our crytocurrency hacks and exploits database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    
    entry_template = """
    title: {title}
    date: {date}
    summary: {summary}
    tags: {tags}
    """.strip()
    
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [46]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    print(type(prompt))
    answer = llm(prompt)
    return answer

In [47]:
query_1 = 'What hacks occured on August 1, 2024?'

In [48]:
rag(query)

<class 'str'>


APIConnectionError: Connection error.