In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only Kaggle input directory
# and print its full path, one per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px
from google.cloud import bigquery
import pandas as pd
import igraph

# Plain BigQuery client; the cells that call `client.query(...)` run their SQL through it.
client = bigquery.Client()

from bq_helper import BigQueryHelper
# Helper bound to the `crypto_bitcoin` public dataset. Later cells use it for
# `estimate_query_size` and `query_to_pandas`.
bq_assist = BigQueryHelper('bigquery-public-data', 'crypto_bitcoin')
# Helper bound to the `bitcoin_blockchain` public dataset.
# NOTE(review): not referenced by any other cell visible in this notebook.
bq_assist_old = BigQueryHelper('bigquery-public-data', 'bitcoin_blockchain')

# Number of recipients

In [None]:
# Query by Allen Day, Google Cloud Developer Advocate (https://medium.com/@allenday)
# Counts the distinct output keys (recipients) seen on each day; `timestamp` is
# truncated to a whole day (86400000 ms) before grouping.
query = """
#standardSQL
SELECT
  o.day,
  COUNT(DISTINCT(o.output_key)) AS recipients
FROM (
  SELECT
    TIMESTAMP_MILLIS((timestamp - MOD(timestamp,
          86400000))) AS day,
    output.output_pubkey_base58 AS output_key
  FROM
    `bigquery-public-data.bitcoin_blockchain.transactions`,
    UNNEST(outputs) AS output ) AS o
GROUP BY
  day
ORDER BY
  day
"""

query_job = client.query(query)

# Wait at most 30 seconds for the job, then let the BigQuery client build the
# DataFrame. This matches how the other cells load query results and avoids
# the IndexError that indexing `rows[0]` by hand raised on an empty result.
transactions = query_job.result(timeout=30).to_dataframe()
transactions['Date'] = transactions['day'].dt.date
transactions = transactions.rename(columns={'recipients': 'Recipients'})[['Date', 'Recipients']]

# Preview the first 3 rows
transactions.head(3)

In [None]:
# Report the first and last day present in the recipients table, then chart the series.
first_day = transactions['Date'].min()
last_day = transactions['Date'].max()
print("date range: ", first_day, "-", last_day)
px.line(transactions, x='Date', y='Recipients', title="Daily number of recipients ")

# tables & columns structure

In [None]:
# Fetch the column catalogue of the crypto_bitcoin dataset and list its tables.
query = """SELECT *
FROM `bigquery-public-data`.crypto_bitcoin.INFORMATION_SCHEMA.COLUMNS"""

schema_job = client.query(query)
information_schema = schema_job.to_dataframe()
table_names = information_schema['table_name'].unique()
print('all tables:', table_names)

In [None]:
# Show the name and type of every column belonging to one table of the schema.
TABLE_NAME = 'outputs'
print(f'{TABLE_NAME} columns:')
is_selected_table = information_schema['table_name'] == TABLE_NAME
information_schema.loc[is_selected_table, ['column_name', 'data_type']]

In [None]:
# transactions_head = client.query(query).to_dataframe().head(TABLE_NAME, num_rows=4)
# transactions_head

In [None]:
# transactions_head['outputs'].values

# BTC - USD rates

In [None]:
# Load the Bitstamp BTC/USD file from the attached Kaggle dataset and display it.
EXCHANGE_RATES_CSV = "../input/bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2020-09-14.csv"
exchange_rates = pd.read_csv(EXCHANGE_RATES_CSV)
exchange_rates

In [None]:
# Derive calendar columns from the epoch-second `Timestamp` and drop rows without a price.
exchange_rates = (
    exchange_rates
    .assign(
        Datetime=lambda rates: pd.to_datetime(rates['Timestamp'], unit='s'),
        Date=lambda rates: rates['Datetime'].dt.date,
    )
    .dropna(subset=['Weighted_Price'])
)


def _currency_weighted_price(day_rows):
    """Average of `Weighted_Price` over one day's rows, weighted by `Volume_(Currency)`."""
    return np.average(day_rows['Weighted_Price'], weights=day_rows['Volume_(Currency)'])


# One row per day: Date, WeightedUSDPrice.
daily_price = exchange_rates.groupby('Date').apply(_currency_weighted_price)
exchange_rates_daily = daily_price.rename('WeightedUSDPrice').reset_index()

print(exchange_rates_daily.head(3))

px.line(exchange_rates_daily, x='Date', y='WeightedUSDPrice', title='BTC USD exchange prices')

# average transaction value

In [None]:
# Average output value (in satoshis) per day; rows come back sorted by that
# average, largest first.
q = """
    SELECT  
        TIMESTAMP_MILLIS((timestamp - MOD(timestamp, 86400000))) AS day, avg(o.output_satoshis) as output_avg
    from `bigquery-public-data.bitcoin_blockchain.transactions`
    JOIN UNNEST(outputs) as o
    group by day 
    order by output_avg desc
"""
daily_outputs_values = client.query(q).to_dataframe()
print('orig df\n', daily_outputs_values.head(3))

# Add the BTC-denominated average and a plain date column, then drop the raw timestamp.
daily_outputs_values = (
    daily_outputs_values
    .assign(
        OutputAvgBTC=lambda outputs: outputs["output_avg"] / 10**8,  # from satoshis to bitcoins
        Date=lambda outputs: outputs['day'].dt.date,
    )
    .drop(columns=['day'])
)
daily_outputs_values.head()

In [None]:
# Outer-join the daily average output (BTC) with the daily USD price on Date and
# convert the average to USD; days missing on either side keep NaN in the other columns.
daily_outputs_btc_usd = (
    pd.merge(daily_outputs_values, exchange_rates_daily, on='Date', how='outer')
    .assign(OutputAvgUSD=lambda merged: merged['OutputAvgBTC'] * merged['WeightedUSDPrice'])
)
daily_outputs_btc_usd.sample(10)

# total number of transactions

In [None]:
# Number of transactions per day.
# The output alias `Timestamp` has the same (case-insensitive) name as the source
# column `timestamp`, so the grouping key is given by position (GROUP BY 1)
# instead of by name: this does not depend on how a name that is both a column
# and a SELECT alias gets resolved, and the output columns stay the same.
q = """
SELECT
    TIMESTAMP_MILLIS((timestamp - MOD(timestamp, 86400000))) AS Timestamp,
    COUNT(timestamp) AS TransactionsCount
FROM `bigquery-public-data.bitcoin_blockchain.transactions`
GROUP BY 1
"""
print(f"{round(bq_assist.estimate_query_size(q), 2)} GB")

# Sort chronologically and add a plain date column for joining/plotting.
transaction_count = (
    bq_assist.query_to_pandas(q)
    .sort_values(by="Timestamp")
    .assign(Date=lambda counts: counts['Timestamp'].dt.date)
)
transaction_count.head()

# Plot all data

In [None]:
# Columns shared by every long-format frame that feeds the faceted plot.
COMMON_COLS = ['Date', 'value_type', 'value']


def prepare_before_plotting(df, index_col, value_type_col, filter_cols):
    """
    Reshape one metric of `df` into a long-format frame.

    The result is sorted by `index_col`, gains a constant `value_type` column
    holding the metric's name (`value_type_col`), has the metric column itself
    renamed to `value`, and is restricted to `filter_cols`. `df` is not modified.

    Raises:
        ValueError: if `value_type_col` or `index_col` is not a column of `df`
            (`value_type_col` is checked first).
    """
    available = df.columns
    if value_type_col not in available:
        raise ValueError(f'invalid value_type_col {value_type_col}, available cols: {df.columns}')
    if index_col not in available:
        raise ValueError(f'invalid index_col {index_col}, available cols: {df.columns}')

    ordered = df.sort_values(by=index_col)
    labelled = ordered.assign(value_type=value_type_col)
    long_form = labelled.rename(columns={value_type_col: 'value'})
    return long_form[filter_cols]


# One long-format frame per metric, stacked so each metric gets its own facet row.
subplot_dfs = [
    prepare_before_plotting(exchange_rates_daily, 'Date', 'WeightedUSDPrice', COMMON_COLS),
    prepare_before_plotting(transaction_count, 'Date', 'TransactionsCount', COMMON_COLS),
    prepare_before_plotting(daily_outputs_btc_usd, 'Date', 'OutputAvgBTC', COMMON_COLS),
    prepare_before_plotting(daily_outputs_btc_usd, 'Date', 'OutputAvgUSD', COMMON_COLS),
]
# The 'Date' columns hold datetime.date objects (they are built with `.dt.date`),
# so the cut-off must be a date too: ordering comparisons between a pandas
# Timestamp and a datetime.date raise TypeError on pandas >= 2.0.
max_dt = pd.to_datetime('2018-09-10').date()
plot_df = pd.concat(subplot_dfs, axis=0).query("Date <= @max_dt")
print(plot_df.sample(3))

fig = px.line(plot_df, x='Date', y='value', color='value_type', facet_row='value_type', height=800)
# Each metric has its own scale, so the facet y-axes are not linked.
fig.update_yaxes(matches=None, title_text='')
fig.update_layout(legend_orientation="h")
fig.show()

# Graph for 10k BTC pizza Transaction Network

In [None]:
# SQL template that `trace_transactions` fills in with str.format:
#   {0} - the list of seed keys; only transactions with an input key in it match
#   {1} - the minimum output value, in satoshis, for a row to be returned
# Inputs and outputs are both unnested, so each (input key, output key) pair of a
# matching transaction yields a row; the GROUP BY over all selected columns
# removes exact duplicates. Rows with a NULL input or output key are excluded.
QUERY_TEMPLATE = """
SELECT
    timestamp,
    inputs.input_pubkey_base58 AS input_key,
    outputs.output_pubkey_base58 AS output_key,
    outputs.output_satoshis as satoshis
FROM `bigquery-public-data.bitcoin_blockchain.transactions`
    JOIN UNNEST (inputs) AS inputs
    JOIN UNNEST (outputs) AS outputs
WHERE inputs.input_pubkey_base58 IN UNNEST({0})
    AND outputs.output_satoshis  >= {1}
    AND inputs.input_pubkey_base58 IS NOT NULL
    AND outputs.output_pubkey_base58 IS NOT NULL
GROUP BY timestamp, input_key, output_key, satoshis
"""

In [None]:
def trace_transactions(target_depth, seeds, min_satoshi_per_transaction, bq_assist):
    """
    Trace transactions associated with the given bitcoin keys.

    Starting from `seeds`, one query built from QUERY_TEMPLATE is run per hop;
    the output keys of each hop that have not been scanned yet become the seeds
    of the next hop.

    To limit the number of BigQuery calls, this function ignores time.
    If you care about the order of transactions, you'll need to do post-processing.

    May return a deeper graph than the `target_depth` if there are repeated transactions
    from wallet a to b or self transactions (a -> a).

    Parameters
    ----------
    target_depth : int
        Maximum number of hops (queries) to run.
    seeds : list of str
        Keys to start from.
    min_satoshi_per_transaction : int
        Minimum output value, in satoshis, for a row to be returned.
    bq_assist : object
        Helper providing `estimate_query_size(query)` and `query_to_pandas(query)`.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with columns timestamp, input_key, output_key and
        satoshis. Empty when `target_depth` is not positive or `seeds` is empty.

    Notes
    -----
    At most MAX_SEEDS_PER_QUERY seeds are sent per hop; any seeds beyond that
    cap are dropped, not queued for a later hop.
    """
    MAX_SEEDS_PER_QUERY = 500
    RESULT_COLUMNS = ['timestamp', 'input_key', 'output_key', 'satoshis']

    seeds = list(seeds)
    if target_depth <= 0 or not seeds:
        # Nothing to query; avoids sending `IN UNNEST([])` and concatenating an empty list.
        return pd.DataFrame(columns=RESULT_COLUMNS)

    query = QUERY_TEMPLATE.format(seeds[:MAX_SEEDS_PER_QUERY], min_satoshi_per_transaction)
    # Scale by this call's own depth (not a notebook-level global) and multiply
    # before rounding, so sub-unit estimates are not truncated to zero.
    # The " GB" unit follows how this notebook labels estimate_query_size elsewhere.
    per_query_size = bq_assist.estimate_query_size(query)
    print(f'Estimated total query size: {per_query_size * target_depth:.2f} GB')

    results = []
    seeds_scanned = set()
    for _ in range(target_depth):
        seeds = seeds[:MAX_SEEDS_PER_QUERY]
        print(f"Now scanning {len(seeds)} seeds")
        query = QUERY_TEMPLATE.format(seeds, min_satoshi_per_transaction)
        transactions = bq_assist.query_to_pandas(query)
        results.append(transactions)
        if transactions.empty:
            # No rows means no new keys to follow; this also guards against a
            # result frame that lacks the `output_key` column.
            break
        # limit query kb by dropping any duplicated seeds
        seeds_scanned.update(seeds)
        # sorted() makes the choice of seeds reproducible when the cap truncates them
        seeds = sorted(set(transactions.output_key.unique()).difference(seeds_scanned))
        if not seeds:
            # Frontier exhausted before reaching target_depth.
            break
    return pd.concat(results).drop_duplicates()

In [None]:
# Number of hops passed to trace_transactions as its target depth.
MAX_DEPTH = 4
# Key(s) the trace starts from.
# NOTE(review): presumably the address tied to the 10k BTC pizza purchase named
# in this section's heading — confirm.
BASE_SEEDS = ['1XPTgDRhN8RFnzniWCddobD9iKZatrvH4']

In [None]:
# Trace up to MAX_DEPTH hops outward from BASE_SEEDS, with a minimum output
# value of 0 satoshis, using the crypto_bitcoin-bound helper to run the queries.
df = trace_transactions(MAX_DEPTH, BASE_SEEDS, 0, bq_assist)

In [None]:
# Persist the traced rows (without the index) to the Kaggle working directory.
df.to_csv("/kaggle/working/transactions.csv", index=False)

In [None]:
# `timestamp` is in epoch milliseconds (the queries in this notebook feed it to
# TIMESTAMP_MILLIS); convert it to a pandas datetime column.
df['date_time'] = pd.to_datetime(df['timestamp'], unit='ms')
df.head()

In [None]:
# (number of distinct sending keys, number of distinct receiving keys)
df['input_key'].nunique(), df['output_key'].nunique()

In [None]:
# Keep rows dated May 22, 2010 or later, ordered by ascending timestamp.
start_date = pd.to_datetime("May 22, 2010")
data_copy = (
    df.loc[df['date_time'] >= start_date]
    .copy()
    .sort_values(by=['timestamp'], ascending=True)
)
# Accumulators for the propagation pass: kept rows and the keys reached so far.
transactions = []
active_seeds = set(BASE_SEEDS)

In [None]:
# Walk the rows in their current order (data_copy is sorted by timestamp where it
# is built) and keep only those whose sender is already in `active_seeds`. Each
# kept row's recipient joins the set, so it can act as a sender for later rows.
for ind, row in data_copy.iterrows():
    # NOTE(review): `satoshis >= 0` looks always true for output amounts —
    # confirm whether a positive threshold was intended.
    if row['satoshis']>=0 and row['input_key'] in active_seeds:
        active_seeds.add(row['output_key'])
        transactions.append(row)
# One DataFrame row per kept row; the original index labels are carried over
# from the row Series.
future_transactions = pd.DataFrame(transactions)    

In [None]:
# (number of keys reached, number of rows kept) after the propagation pass.
len(active_seeds), len(transactions)

In [None]:
# Preview the first rows kept by the propagation pass.
future_transactions.head()

In [None]:
# Collapse repeated transfers between the same pair of keys into one edge:
# earliest time it was seen and total satoshis moved, ordered by that first time.
# The aggregations are named by string: passing the builtin `min` is deprecated
# in pandas >= 2.1 (it is currently mapped to the same groupby `min`).
total_flows = (
    future_transactions
    .groupby(by=['input_key', 'output_key'])
    .agg({'date_time': 'min', 'satoshis': 'sum'})
    .reset_index()
    .sort_values(by=['date_time'], ascending=True)
)
total_flows.head(3)

In [None]:
# Edge widths for the plot: natural log of each edge's total satoshis, scaled down by 10.
log_satoshis = np.log(total_flows['satoshis'])
weights = (log_satoshis / 10).tolist()

In [None]:
# Flatten both key columns into one array and count the distinct keys (graph nodes).
column_values = total_flows[['input_key', 'output_key']].to_numpy().ravel()
nodes = len(pd.unique(column_values))

In [None]:
# Map every distinct key to a consecutive integer id, in order of first appearance.
unique_cols = pd.unique(column_values)
numeric_values = list(range(nodes))
numeric_dict = dict(zip(unique_cols, numeric_values))

In [None]:
# One [source id, target id] pair per aggregated flow, in total_flows row order.
edges = [
    [numeric_dict[source_key], numeric_dict[target_key]]
    for source_key, target_key in zip(total_flows['input_key'], total_flows['output_key'])
]

In [None]:
# (number of nodes, number of edges, number of edge weights)
nodes, len(edges), len(weights)

In [None]:
# Upper bound on the number of edges drawn; also the length of both colour lists.
n_edge = 100000
# Colour list passed as `vertex_color` in the plot call: first entry red, the rest translucent blue.
colors = ["red"] + ['rgba(0, 0, 255, 0.3)'] * (n_edge - 1)
# Colour list passed as `edge_color` in the plot call: first entry red, the rest translucent black.
edge_color = ["red"] + ['rgba(0, 0, 0, 0.3)'] * (n_edge - 1)

In [None]:
# Directed graph built from (at most) the first n_edge id pairs.
g = igraph.Graph(edges=edges[:n_edge], directed=True)

In [None]:
# Draw the graph with a Fruchterman-Reingold layout; edge widths come from
# `weights`, vertex and edge colours from the lists prepared earlier.
layout = g.layout_fruchterman_reingold()
igraph.plot(
    g,
    layout=layout,
    vertex_size=6,
    edge_arrow_size=0.8,
    edge_width=weights[:n_edge],
    vertex_color=colors,
    edge_color=edge_color,
)