In [172]:
import pandas as pd
from openai import OpenAI
import json
from tqdm.auto import tqdm
from jsoncomment import JsonComment

In [173]:
df_rekt = pd.read_csv('../datasets/web3isgoinggreat_dataset.csv', index_col=0)

In [174]:
df_rekt = df_rekt.dropna()

In [175]:
df_rekt = df_rekt[['Hack' in i for i in df_rekt.tags]]

In [176]:
df_rekt = df_rekt.reset_index()

In [177]:
df_rekt = df_rekt.drop(columns=['index'])

In [178]:
df_rekt['id'] = df_rekt.index

In [179]:
df_rekt

Unnamed: 0,title,date,summary,tags,id
0,"""Peripheral"" Aave smart contract hacked for $5...","August 28, 2024","The popular defi lending platform, Aave, suffe...",Hack or scam,0
1,"Brothers charged by SEC for $60 million ""crypt...","August 26, 2024",Brothers Jonathan and Tanner Adam were charged...,"Hack or scam, Law",1
2,Users suffer losses after Polygon Discord hack,"August 24, 2024","Some fans of the Polygon blockchain, or those ...",Hack or scam,2
3,"McDonald's Instagram hacked, hackers claim $70...","August 21, 2024","McDonald's Instagram account, as well as the T...",Hack or scam,3
4,Crypto holder loses over $55 million to appare...,"August 20, 2024",Someone holding almost $55.5 million in the DA...,Hack or scam,4
...,...,...,...,...,...
256,Sentiment protocol hacked for almost $1 million,"April 4, 2023",The Sentiment liquidity protocol on the Arbitr...,Hack or scam,256
257,Over $25 million taken from an MEV bot by mali...,"April 3, 2023",It's a dog-eat dog-world in the crypto univers...,Hack or scam,257
258,Allbridge cross-chain bridge exploited for aro...,"April 1, 2023",The Allbridge cross-chain bridge project was e...,"Bug, Hack or scam",258
259,"Arbitrum airdrop plagued by downtime, bugs, an...","March 31, 2023",A token airdrop from the popular Arbitrum Ethe...,"Hack or scam, Hmm",259


In [180]:
documents = df_rekt.to_dict(orient='records')

In [181]:
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [182]:
def generate_questions(doc):
    prompt_template = """
    You emulate a user of our cryptocurrency hacks and exploits assistant application.
    Formulate 5 questions this user might ask based on a provided hack or exploit.
    Make the questions specific to this hack or exploit.
    The question need to be able to be answered using the record, and the questions should
    be complete and not too short. Use as fewer words as possible from the record. DO NOT include the answer in the questions.
    
    The record:
    
    title: {title}
    date: {date}
    summary: {summary}
    tags: {tags}
    
    Provide the output in parsable JSON without using code blocks:

    {{
        "questions": [
            "Question 1",
            "Question 2",
            "Question 3",
            "Question 4",
            "Question 5"
        ]
    }}

    """.strip()
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='mistral',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [183]:
results = {}

In [184]:
parser = JsonComment()

In [185]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    #if doc_id in results:
        #continue

    questions_raw = generate_questions(doc)

    try:
        questions = parser.loads(questions_raw)
        results[doc_id] = questions['questions']
    except:
        print(doc)
        print(questions_raw)

 54%|███████████████████████████████████████████▊                                     | 141/261 [08:35<07:54,  3.96s/it]

{'title': 'Samudai treasury drained', 'date': 'November 10, 2023', 'summary': 'The treasury of the Samudai DAO was apparently drained as an attacker compromised the project\'s multisignature wallets and the wallet belonging to the project\'s founder, Kushagra Agarwal. Altogether, around $1.25\xa0million in ETH was stolen.Agarwal sent a message to the thief shortly afterwards, offering a 10% "bounty" in exchange for the return of the rest of the funds. The attacker didn\'t seem to be interested, and in mid-January began tumbling the assets through the Tornado Cash cryptocurrency mixer.Samudai didn\'t seem to publicly acknowledge the theft, even though they\'ve posted on Twitter a few times since then. The organization had raised $2.5\xa0million in pre-seed capital in June 2022.', 'tags': 'Hack or scam', 'id': 140}
 {
        'questions': [
            "'Who compromised the multisignature wallets and the wallet belonging to Kushagra Agarwal in the Samudai treasury draining incident on No

 63%|███████████████████████████████████████████████████▏                             | 165/261 [10:02<06:08,  3.84s/it]

{'title': 'Remitano hacked for $2.7\xa0million', 'date': 'September 15, 2023', 'summary': 'Crypto exchange Remitano suffered a hack in which $2.7\xa0million in Tether (USDT), USDC, and Ankr was drained from the exchange\'s hot wallets across three blockchains. Luckily for them, Tether was able to freeze $1.9\xa0million of the stolen funds, substantially reducing the attacker\'s profits.Remitano acknowledged the hack, writing that they had suffered a "data breach from a third-party source". They have claimed that users\' assets will not be affected by the theft.Remitano is a peer-to-peer crypto exchange focused on emerging markets, including Nigeria, Pakistan, Venezuela, and Malaysia.', 'tags': 'Hack or scam', 'id': 164}
 {
       "questions": [
           "On which date did Remitano suffer a $2.7 million hack? (Answer: September 15, 2023)",
           "Which cryptocurrencies were stolen in the Remitano hack? (Answer: Tether (USDT), USDC, and Ankr)",
           "How much of the stolen f

100%|█████████████████████████████████████████████████████████████████████████████████| 261/261 [15:54<00:00,  3.66s/it]


In [204]:
json_str = """

{
       "questions": [
           "On which date did Remitano suffer a $2.7 million hack?",
           "Which cryptocurrencies were stolen in the Remitano hack?",
           "How much of the stolen funds from the Remitano hack were frozen by Tether?",
           "What is the focus market of Remitano as indicated in the record?",
           "How did Remitano describe the source of the hack mentioned in the record?"
       ]
   }
      

"""

In [205]:
questions = parser.loads(json_str)
results[164] = questions['questions']

In [206]:
results[164]

['On which date did Remitano suffer a $2.7 million hack?',
 'Which cryptocurrencies were stolen in the Remitano hack?',
 'How much of the stolen funds from the Remitano hack were frozen by Tether?',
 'What is the focus market of Remitano as indicated in the record?',
 'How did Remitano describe the source of the hack mentioned in the record?']

In [207]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [208]:
final_results

[(0,
  'Who was the attacker that exploited the smart contract in Aave on August 28, 2024?'),
 (0,
  "How did the attacker utilize the arbitrary call error to steal funds from multiple contracts within Aave's ecosystem?"),
 (0,
  'What was the total amount of tokens leftover from slippage that were accumulated across several blockchain networks?'),
 (0,
  'What is the estimated amount stolen by the exploiter during this hack on Aave?'),
 (0,
  "In what sense did Aave representatives describe the hack as a 'raid of the tip jar'?"),
 (1,
  'Who were the individuals charged by SEC for a $60 million Ponzi scheme involving GCZ Global and Triten Financial Group?'),
 (1,
  'What did the accused brothers claim to offer to investors in their crypto bot business, and what percentage returns did they promise?'),
 (1,
  "How were the funds of investors utilized according to the SEC's allegations, and how much of it was actually used to pay other investors?"),
 (1,
  'In what ways did the brothers 

In [209]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [210]:
df_results = df_results.sort_values(by='id')

In [211]:
df_results.to_csv('../datasets/ground-truth-retrieval_web3.csv', index=False)

In [213]:
df_results['question']

0       Who was the attacker that exploited the smart ...
1       How did the attacker utilize the arbitrary cal...
3       What is the estimated amount stolen by the exp...
2       What was the total amount of tokens leftover f...
4       In what sense did Aave representatives describ...
                              ...                        
1291    Which code upgrade introduced the bug that fac...
1290    How and when was the SafeMoon liquidity pool c...
1293    How was the price of the SafeMoon token artifi...
1292    What tactic did the attacker use to take advan...
1294    What method was used by the attacker to sell t...
Name: question, Length: 1305, dtype: object