In [1]:
import minsearch
import json

In [2]:
with open('faq_crypto.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [3]:
documents = []

for doc in docs_raw:
    documents.append(doc)

In [4]:
documents[0]

{'question': 'What is a blockchain?',
 'answer': 'A blockchain is a distributed, cryptographically-secure database structure that allows network participants to establish a trusted and immutable record of transactional data without the need for intermediaries. A blockchain can execute a variety of functions beyond transaction settlement, such as smart contracts. Smart contracts are digital agreements that are embedded in code and that can have limitless formats and conditions. Blockchains have proven themselves as superior solutions for securely coordinating data, but they are capable of much more, including tokenization, incentive design, attack-resistance, and reducing counterparty risk. The very first blockchain was the Bitcoin blockchain, which itself was a culmination of over a century of advancements in cryptography and database technology.'}

In [5]:
import hashlib

def generate_document_id(doc):
    # combined = f"{doc['course']}-{doc['question']}"
    combined = f"{doc['question']}-{doc['answer'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [6]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [7]:
documents[3]

{'question': 'What is a blockchain system?',
 'answer': 'A blockchain system refers to all the aspects and features that go into a particular blockchain, everything from the consensus algorithm to the state machine to cryptographic functions. As Andreas Antonopoulus and Gavin Wood note in Mastering Ethereum, there are “a huge variety of blockchains with different properties”––qualifiers “help us understand the characteristics of the blockchain in question, such as open, public, decentralized, neutral, and censorship-resistant.”',
 'id': 'e3753a49'}

Let's check how many hash_ids which we successfull generated

In [8]:
from collections import defaultdict

In [9]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [10]:
hashes

defaultdict(list,
            {'ccb39dc7': [{'question': 'What is a blockchain?',
               'answer': 'A blockchain is a distributed, cryptographically-secure database structure that allows network participants to establish a trusted and immutable record of transactional data without the need for intermediaries. A blockchain can execute a variety of functions beyond transaction settlement, such as smart contracts. Smart contracts are digital agreements that are embedded in code and that can have limitless formats and conditions. Blockchains have proven themselves as superior solutions for securely coordinating data, but they are capable of much more, including tokenization, incentive design, attack-resistance, and reducing counterparty risk. The very first blockchain was the Bitcoin blockchain, which itself was a culmination of over a century of advancements in cryptography and database technology.',
               'id': 'ccb39dc7'}],
             '667d9237': [{'question': 'What i

Idealnya, seharusnya sama jumlah hash_id dgn total documentnya

In [11]:
len(hashes), len(documents)

(46, 46)

Check document mana yg duplicate

In [12]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

In [13]:
import json

In [14]:
with open('documents-crypto-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [17]:
prompt_template = """
You emulate a people who's use our product to asking about cryptocurrency.
Formulate 5 questions this people might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

question: {question}
answer: {answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [18]:
from openai import OpenAI
client = OpenAI()

In [19]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [20]:
questions = generate_questions(documents[0])

In [21]:
questions

'["What is a distributed, cryptographically-secure database?", "How can network participants establish a trusted and immutable record?", "Do blockchains require intermediaries for validating transactions?", "What additional functions can a blockchain execute beyond transaction settlement?", "What was the first blockchain and what advancements did it incorporate?"]'

In [22]:
documents[0]

{'question': 'What is a blockchain?',
 'answer': 'A blockchain is a distributed, cryptographically-secure database structure that allows network participants to establish a trusted and immutable record of transactional data without the need for intermediaries. A blockchain can execute a variety of functions beyond transaction settlement, such as smart contracts. Smart contracts are digital agreements that are embedded in code and that can have limitless formats and conditions. Blockchains have proven themselves as superior solutions for securely coordinating data, but they are capable of much more, including tokenization, incentive design, attack-resistance, and reducing counterparty risk. The very first blockchain was the Bitcoin blockchain, which itself was a culmination of over a century of advancements in cryptography and database technology.',
 'id': 'ccb39dc7'}

Now, we will generate alternatives questions to all questions we have

In [24]:
from tqdm.auto import tqdm
results = {}

In [25]:
# for doc in tqdm(documents): 
#     doc_id = doc['id']
#     if doc_id in results:
#         continue

#     questions = generate_questions(doc)
#     results[doc_id] = questions

  0%|          | 0/46 [00:00<?, ?it/s]

In [29]:
results['ccb39dc7']

'[\n    "What is a blockchain?",\n    "How is a blockchain different from traditional databases?",\n    "Can you explain what smart contracts are in the context of a blockchain?",\n    "What are some of the additional functions that blockchains can execute besides transaction settlements?",\n    "How did the Bitcoin blockchain contribute to blockchain technology?"\n]'

In [30]:
parsed_resulst = {}

for doc_id, json_questions in results.items():
    parsed_resulst[doc_id] = json.loads(json_questions)

In [31]:
doc_index = {d['id']: d for d in documents}

In [32]:
doc_index

{'ccb39dc7': {'question': 'What is a blockchain?',
  'answer': 'A blockchain is a distributed, cryptographically-secure database structure that allows network participants to establish a trusted and immutable record of transactional data without the need for intermediaries. A blockchain can execute a variety of functions beyond transaction settlement, such as smart contracts. Smart contracts are digital agreements that are embedded in code and that can have limitless formats and conditions. Blockchains have proven themselves as superior solutions for securely coordinating data, but they are capable of much more, including tokenization, incentive design, attack-resistance, and reducing counterparty risk. The very first blockchain was the Bitcoin blockchain, which itself was a culmination of over a century of advancements in cryptography and database technology.',
  'id': 'ccb39dc7'},
 '667d9237': {'question': 'What is blockchain software?',
  'answer': 'Blockchain software is like any o

In [33]:
final_results = []

for doc_id, questions in parsed_resulst.items():
    for q in questions:
        final_results.append((q, doc_id))

In [34]:
final_results

[('What is a blockchain?', 'ccb39dc7'),
 ('How is a blockchain different from traditional databases?', 'ccb39dc7'),
 ('Can you explain what smart contracts are in the context of a blockchain?',
  'ccb39dc7'),
 ('What are some of the additional functions that blockchains can execute besides transaction settlements?',
  'ccb39dc7'),
 ('How did the Bitcoin blockchain contribute to blockchain technology?',
  'ccb39dc7'),
 ('What is blockchain software?', '667d9237'),
 ('What was the first blockchain software?', '667d9237'),
 ("Is Bitcoin's blockchain software open source?", '667d9237'),
 ('Does Ethereum have its own blockchain software?', '667d9237'),
 ('Are all blockchain software efforts publicly available?', '667d9237'),
 ('questions', '84efe905'),
 ('answers', '84efe905'),
 ('What does a blockchain system encompass?', 'e3753a49'),
 ('Who are some experts that mention the variety of blockchains?', 'e3753a49'),
 ('What can help us understand the characteristics of a blockchain?',
  'e375

Export result into csv

In [35]:
import pandas as pd

In [36]:
df = pd.DataFrame(final_results, columns=['question', 'document'])

In [37]:
# df.to_csv('ground-truth-data-crypto.csv', index=False)

In [38]:
df['total_count'] = df['question'].apply(lambda x: len(x.split()))

In [40]:
new_df = df[df['total_count'] > 2]

In [41]:
new_df.to_csv('ground-truth-data-crypto.csv', index=False)