In [13]:
# This script reads tx hashes, queries the related input addresses and computes the
# clusters based on the multiple-input clustering heuristic and saves them into a JSON file

from argparse import ArgumentParser
import json
import pandas as pd
import requests
import os
import numpy as np

In [2]:
# Root of the Blockstream HTTP API; endpoint paths are appended to it.
BASE_URL = 'https://blockstream.info/api/'

In [3]:
def get_in_addr(tx_hash, timeout=30):
    """Return the distinct input addresses of a transaction.

    The transaction is fetched from ``BASE_URL + 'tx/' + str(tx_hash)``.

    Parameters
    ----------
    tx_hash : str
        Hash of the transaction to look up (converted with ``str``).
    timeout : float, optional
        Seconds to wait for the HTTP request before ``requests`` raises a
        timeout error, so a stalled connection cannot block forever.

    Returns
    -------
    list of str
        Addresses found in the ``scriptpubkey_address`` field of each
        input's ``prevout``, in input order and without duplicates.
        The list is empty when the response status is not 200.
    """
    url = BASE_URL + 'tx/' + str(tx_hash)

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Best effort: an unavailable transaction contributes no addresses.
        return []

    l_addr_in = []  # list of input addresses, in order of first appearance
    for tx_input in response.json()['vin']:
        # `prevout` is null for coinbase inputs, which spend no previous output.
        prevout = tx_input.get('prevout') or {}
        # Some previous outputs carry no address field at all; skip those.
        address = prevout.get('scriptpubkey_address')
        if address is not None and address not in l_addr_in:
            l_addr_in.append(address)

    return l_addr_in

In [4]:
def _merge_overlapping(address_lists):
    """Merge address lists that share an address, transitively (union-find)."""
    parent = {}  # address -> parent address; a root is its own parent

    def find(address):
        # Walk up to the root, then point every visited node straight at it.
        root = address
        while parent[root] != root:
            root = parent[root]
        while parent[address] != root:
            parent[address], address = root, parent[address]
        return root

    for addresses in address_lists:
        for address in addresses:
            parent.setdefault(address, address)
        # All inputs of one transaction belong together: link them to the first.
        for address in addresses[1:]:
            root_first, root_other = find(addresses[0]), find(address)
            if root_first != root_other:
                parent[root_other] = root_first

    # `parent` preserves first-seen order, so clusters and their members come
    # out in a deterministic order.
    merged = {}
    for address in parent:
        merged.setdefault(find(address), []).append(address)
    return list(merged.values())


def clusters(transactions_file):
    """Cluster addresses with the multiple-input heuristic and save the result.

    Transaction hashes are read from ``transactions_file`` (a JSON list), the
    input addresses of each transaction are obtained with ``get_in_addr``, and
    address lists that share at least one address are merged. Merging is
    transitive: if A overlaps B and B overlaps C, all three end up in the same
    cluster, so the returned clusters are pairwise disjoint.

    The result is also written as JSON to ``data/clusters.json`` under the
    parent of the current working directory; the directory is created if it
    does not exist.

    Parameters
    ----------
    transactions_file : str
        Path to a JSON file containing a list of transaction hashes.

    Returns
    -------
    list of list of str
        One list of unique addresses per cluster. Transactions for which no
        input address was found contribute no cluster.
    """
    output_file = os.path.join(os.path.dirname(os.getcwd()), "data/clusters.json")

    with open(transactions_file, 'r') as ft:
        tx_hashes = json.load(ft)

    inputs = [get_in_addr(tx_hash) for tx_hash in tx_hashes]

    cluster_addresses = _merge_overlapping(inputs)

    # Make sure the expensive result is not lost to a missing directory.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w') as fp:
        json.dump(cluster_addresses, fp)

    return cluster_addresses

In [5]:
# Build the clusters from ../data/transactions.json (relative to the working directory).
# NOTE(review): the assignment rebinds the name `clusters`, so the function of
# the same name is no longer callable afterwards; re-running this cell requires
# re-running the cell that defines the function first.
transactions_path = os.path.join(os.path.dirname(os.getcwd()), "data/transactions.json")
clusters = clusters(transactions_path)

KeyboardInterrupt: 

In [30]:
# Number of entries in `clusters`, shown as the cell result.
# NOTE(review): this cell's execution count is out of sequence with the cells
# around it, so the displayed value may not match a fresh top-to-bottom run.
len(clusters)

19

In [6]:
# Reload the clusters stored in data/clusters.json under the parent of the
# working directory; this rebinds `clusters` to the parsed JSON content.
clusters_path = os.path.join(os.path.dirname(os.getcwd()), "data/clusters.json")
with open(clusters_path, 'r') as ft:
    clusters = json.load(ft)

In [8]:
# Number of entries in `clusters`, shown as the cell result.
len(clusters)

577

In [11]:
# Read data/input_addresses.json (under the parent of the working directory)
# and keep only its first five entries.
input_addresses_path = os.path.join(os.path.dirname(os.getcwd()), "data/input_addresses.json")
with open(input_addresses_path, 'r') as ft:
    all_input_addresses = json.load(ft)
illegal_adresses = all_input_addresses[:5]

In [12]:
# Display the selected addresses as the cell result.
illegal_adresses

['1BCWMwpR4M1nYUuuYe2bmzrNuwGoF9ZAbA',
 '1MQBDeRWsiJBf7K1VGjJ7PWEL6GJXMfmLg',
 'bc1qj6j6p0jdefl6pvdzx3kx8245yy5mz6q4luhzes',
 '1D1ej7zQzywWBDNXKNYpmH7Hso2U9koDG4',
 '1A3iYY4c3dkgNYGewzYzr7EsqfBuWXibGo']

In [24]:
# Find the clusters that contain at least one of the flagged addresses and
# count how many addresses those clusters hold in total.
index_clusters = np.arange(len(clusters))  # exact integer positions 0..len-1
illegal_clusters = []  # positions in `clusters`, in order of first match
nb_tot_adresses = 0    # sum of the sizes of the matched clusters
for address in illegal_adresses:
    for ind in index_clusters:
        # A cluster holding several flagged addresses must be recorded (and
        # its size counted) only once.
        if address in clusters[ind] and ind not in illegal_clusters:
            # Store a plain int so the printed list does not depend on how
            # numpy renders its scalar types.
            illegal_clusters.append(int(ind))
            nb_tot_adresses += len(clusters[ind])
print(illegal_clusters)
print(nb_tot_adresses)

In [28]:
# Show the matched cluster positions, then the total address count, one per line.
print(illegal_clusters, nb_tot_adresses, sep="\n")

[71, 485, 352, 412, 80]
201
