In [1]:
import pandas as pd
from graphframes import *
from pyspark import *
from pyspark.sql import *
from pyspark.sql.functions import col
# Create (or reuse) the Spark session; the GraphFrames package is requested through
# 'spark.jars.packages' so the `graphframes` Python module has its JVM counterpart.
spark = (
    SparkSession.builder
    .appName('fun')
    .config('spark.jars.packages', 'graphframes:graphframes:0.8.1-spark3.0-s_2.12')
    .getOrCreate()
)

# Register a checkpoint directory (relative to the working directory). The GraphFrames
# user guide states that the default connected-components algorithm needs one.
spark.sparkContext.setCheckpointDir('checkpoints')

In [None]:
# Load data downloaded via BigQuery (Dataset: crypto_bitcoin)
# NOTE(security): unpickling can execute arbitrary code embedded in the file, so only
# load pickle files that come from a trusted source.
SATOSHI_PER_BTC = 100000000  # divisor used to convert Satoshi to BTC


def _print_entries(header, entries):
    """Print `header`, then one '<first address> - <value in BTC>' line per entry.

    Each entry is indexed with the keys 'addresses' (only element 0 is shown) and
    'value' (divided by SATOSHI_PER_BTC and rounded to 3 decimals).
    """
    print(header)
    for entry in entries:
        btc_value = round(entry['value']/SATOSHI_PER_BTC,3)
        print(entry['addresses'][0] + ' - ' + str(btc_value))


df = pd.read_pickle('BTC_Data/Download_Small.pickle')
# The last two columns are treated as (inputs, outputs) of each transaction.
inputs_outputs = df[df.columns[-2:]].values
print('Amount of Transactions loaded: ' + str(len(inputs_outputs)))

# Print the last two transactions in readable format
for tx_inputs, tx_outputs in inputs_outputs[-2:]:
    print('\n')
    _print_entries('-----Inputs------------------------------', tx_inputs)
    _print_entries('-----Outputs-----------------------------', tx_outputs)
    print('_________________________________________')

In [None]:
# Parse data into GraphFrames-readable format
# Assumption: ALL input addresses are parsed as fully connected to ALL output addresses
# Only the first address of every input/output entry is used.

# Nodes
node_schema = ['id','address']
node_set = set()
for tx in inputs_outputs:
    node_set.update(entry['addresses'][0] for entry in tx[0])
    node_set.update(entry['addresses'][0] for entry in tx[1])

# Number the addresses in sorted order. Iterating the raw set would hand out ids in an
# order that changes between interpreter runs (string hash randomisation), making the
# ids - and everything derived from them - irreproducible after a kernel restart.
# Assumes the addresses are mutually comparable (i.e. all strings).
node_data = list(enumerate(sorted(node_set)))  # [(id, address), ...]
node_dict = {address: node_id for node_id, address in node_data}
print('Amount of distinct addresses: ' + str(len(node_data)))

# Edges
edge_schema = ['src','dst','type']
edge_set = set()
for tx in inputs_outputs:
    # Resolve every address to its id once per transaction instead of once per pair.
    src_ids = {node_dict[entry['addresses'][0]] for entry in tx[0]}
    dst_ids = {node_dict[entry['addresses'][0]] for entry in tx[1]}
    edge_set.update((src, dst, 'sent_btc_to') for src in src_ids for dst in dst_ids)

# Sorted list: deterministic row order, and a list is the documented input type of
# SparkSession.createDataFrame.
edge_data = sorted(edge_set)
print('Amount of distinct mutual interactions: ' + str(len(edge_data)))

In [4]:
# Load data into Spark GraphFrames
# Vertex rows follow node_schema and edge rows follow edge_schema; both schemas are
# plain lists of column names, so Spark infers the column types from the data.
vertices = spark.createDataFrame(data=node_data, schema=node_schema)
edges = spark.createDataFrame(data=edge_data, schema=edge_schema)

# Combine both DataFrames into the graph object used by the analysis cells.
gf = GraphFrame(vertices, edges)

In [None]:
# DRAFT Connected Components

# Run connected components on the graph; the returned DataFrame is read below through
# its "id" and "component" columns.
result = gf.connectedComponents()

# Show the vertex ids grouped together by component.
(
    result
    .select("id", "component")
    .orderBy("component")
    .show()
)

In [None]:
# DRAFT PageRank

# Run PageRank with reset probability 0.15 and tolerance 0.01; the result exposes
# `vertices` (with a "pagerank" column) and `edges` (with a "weight" column).
results = gf.pageRank(resetProbability=0.15, tol=0.01)

# Vertices, highest PageRank first.
(
    results.vertices
    .select("id", "pagerank")
    .orderBy(col("pagerank").desc())
    .show()
)

# Edges together with their weight.
(
    results.edges
    .select("src", "dst", "weight")
    .show()
)