In [4]:
import pandas as pd
from sklearn_evaluation import plot

# Import jupysql Jupyter extension to create SQL cells
%load_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.displaylimit = None

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [2]:
# Open a connection to ClickHouse over the native protocol (port 9000),
# logging in as user `default` with an empty password.
# NOTE(review): host and credentials are hard-coded; consider reading the
# connection URL from an environment variable before sharing this notebook.
%sql clickhouse+native://default:@clickhouse.clickhouse:9000

## Create the Bitcoin Blocks Database

In [6]:
%%sql
-- IF NOT EXISTS keeps this cell safe to re-run after the database has been created
CREATE DATABASE IF NOT EXISTS btc

In [7]:
%sql use btc

## Create the blockchain_btc_blocks Table

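Use the [ReplacingMergeTree](https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replacingmergetree) table engine, which removes rows that share the same sorting key (`ORDER BY`) value during background merges, so importing the same block more than once does not leave permanent duplicates.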

In [8]:
%%sql
-- IF NOT EXISTS makes this cell re-runnable once the table is in place.
-- ReplacingMergeTree collapses rows that share the ORDER BY key (number, hash).
CREATE TABLE IF NOT EXISTS btc.blockchain_btc_blocks
(
    `hash` String,
    `version` Int64,
    `mediantime` DateTime64(9),
    `nonce` Int64,
    `bits` String,
    `difficulty` Float64,
    `chainwork` String,
    `previousblockhash` String,
    `size` Int64,
    `weight` Int64,
    `coinbase_param` String,
    `number` Int64,
    `transaction_count` Int64,
    `merkle_root` String,
    `stripped_size` Int64,
    `timestamp` DateTime64(9),
    `date` String,
    `last_modified` DateTime64(9)
)
ENGINE = ReplacingMergeTree
ORDER BY (number, hash)
SETTINGS index_granularity = 8192


In [9]:
result = %sql DESCRIBE table btc.blockchain_btc_blocks;
result

name,type,default_type,default_expression,comment,codec_expression,ttl_expression
mediantime,DateTime64(9),,,,,
nonce,Int64,,,,,
bits,String,,,,,
difficulty,Float64,,,,,
chainwork,String,,,,,
previousblockhash,String,,,,,
size,Int64,,,,,
weight,Int64,,,,,
coinbase_param,String,,,,,
number,Int64,,,,,


## Import Data from S3 Parquet Files into the Blocks Table

In [None]:
# 20*
%sql INSERT into btc.blockchain_btc_blocks select * FROM s3('https://aws-public-blockchain.s3.us-east-2.amazonaws.com/v1.0/btc/blocks/date=20*/*', 'Parquet') SETTINGS input_format_parquet_allow_missing_columns = 1

In [36]:
# Print number of blocks in the database, use the FINAL statement to make sure the duplicates are removed
result = %sql SELECT count(number) FROM btc.blockchain_btc_blocks FINAL;
df=result.DataFrame()
df

Unnamed: 0,count(number)
0,150


In [8]:
# Get average number of transactions over all blocks
result = %sql SELECT avg(transaction_count) FROM btc.blockchain_btc_blocks;
print(result)

+------------------------+
| avg(transaction_count) |
+------------------------+
|   1086.851562900285    |
+------------------------+
