In [2]:
### Exploratory data analysis notebook

In [3]:
!pip install graphframes
!pip install nbimporter
!pip install hvac

Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl.metadata (934 bytes)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: nose, graphframes
Successfully installed graphframes-0.6 nose-1.3.7
Collecting nbimporter
  Downloading nbimporter-0.3.4-py3-none-any.whl.metadata (252 bytes)
Downloading nbimporter-0.3.4-py3-none-any.whl (4.9 kB)
Installing collected packages: nbimporter
Successfully installed nbimporter-0.3.4
Collecting hvac
  Downloading hvac-2.3.0-py3-none-any.whl.metadata (3.3 kB)
Downloading hvac-2.3.0-py3-none-any.whl (155 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.9/155.9 kB[0m [31m2.2 MB/s[0m eta [36m0

In [7]:
import sys
sys.path.append('/home/jovyan/work') # here add notebooks if fails

In [8]:
from pyspark.sql import SparkSession
import nbimporter
from utils.vault_scripts import read_root_token, get_secret_from_vault
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, when, lit, expr
from graphframes import GraphFrame

In [9]:
# S3 and Spark configuration

In [10]:
spark = SparkSession.builder.appName("ExpDataAnalysis").getOrCreate()

In [11]:
hadoopConf = spark._jsc.hadoopConfiguration()

In [12]:
AWS_KEY_ID = get_secret_from_vault("aws1", "keyid")
AWS_ACCESS_KEY = get_secret_from_vault("aws2", "accesskey")
AWS_S3_BUCKET = get_secret_from_vault("aws3", "s3bucket")

In [13]:
hadoopConf.set("fs.s3a.access.key", AWS_KEY_ID)
hadoopConf.set("fs.s3a.secret.key", AWS_ACCESS_KEY)
hadoopConf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

In [8]:
# Reading all data samples for one NFT collection

In [22]:
# s3_path_big = f"s3a://{AWS_S3_BUCKET}/raw/opensea_data/opensea_nft_data/cryptopunks/*.json"
s3_path_small = f"s3a://{AWS_S3_BUCKET}/raw/opensea_nft_data/courtyard-nft/*.json"

In [12]:
df_big = spark.read.json(s3_path_big)

In [23]:
df_small = spark.read.json(s3_path_small)

In [14]:
df_asset_events_big = df_big.select(explode(col("asset_events")).alias("event"))

In [24]:
df_asset_events_flat_small = df_small.select(
    col("transaction"),
    col("event_type"),
    col("buyer"),
    col("seller"),
    col("from_address"),
    col("to_address"),
    col("quantity"),
    col("event_timestamp"),
    col("order_hash"),
    col("nft.identifier"),
    col("nft.collection"),
    col("nft.contract"),
    col("payment.decimals"),
    col("payment.quantity").alias("payment_quantity"),
    col("payment.symbol"),
    col("payment.token_address")
)

In [16]:
df_asset_events_flat_big = df_asset_events_big.select(
    col("event.transaction"),
    col("event.event_type"),
    col("event.buyer"),
    col("event.seller"),
    col("event.from_address"),
    col("event.to_address"),
    col("event.quantity"),
    col("event.event_timestamp"),
    col("event.order_hash"),
    col("event.nft.identifier"),
    col("event.nft.collection"),
    col("event.nft.contract"),
    col("event.payment.decimals"),
    col("event.payment.quantity").alias("payment_quantity"),
    col("event.payment.symbol"),
    col("event.payment.token_address")
)

In [17]:
df_asset_events_flat_big.count()

157470

In [18]:
df_asset_events_flat_big.printSchema()

root
 |-- transaction: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- buyer: string (nullable = true)
 |-- seller: string (nullable = true)
 |-- from_address: string (nullable = true)
 |-- to_address: string (nullable = true)
 |-- quantity: long (nullable = true)
 |-- event_timestamp: long (nullable = true)
 |-- order_hash: string (nullable = true)
 |-- identifier: string (nullable = true)
 |-- collection: string (nullable = true)
 |-- contract: string (nullable = true)
 |-- decimals: long (nullable = true)
 |-- payment_quantity: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- token_address: string (nullable = true)



In [25]:
df_asset_events_flat_small.count()
# 157470 - cryptopunks
# 97916 - pudgypenguins / referencedata2 > 12177
# 14903 - boredapeyachtclub / reference

170593

In [20]:
grouped_df_big = df_asset_events_flat_big.groupBy("transaction", "identifier", "collection", "event_type").count()
duplicates_df_big = grouped_df_big.filter(F.col("count") > 1)
duplicates_df_big.show()

+--------------------+----------+-----------+----------+-----+
|         transaction|identifier| collection|event_type|count|
+--------------------+----------+-----------+----------+-----+
|0x317fc2467d9139e...|      2474|cryptopunks|  transfer|  122|
|0x38b583c3d19bf01...|      1411|cryptopunks|  transfer|  122|
|0xd1f05c106d3f08a...|      1860|cryptopunks|  transfer|  122|
|0x4956c57cf093949...|      4347|cryptopunks|  transfer|  122|
|0x76708336a7f0abd...|      5728|cryptopunks|  transfer|  122|
|0xbb3bb661d0778f8...|      2153|cryptopunks|      sale|  122|
|0x9a760980e284588...|      9362|cryptopunks|      sale|  122|
|0x31d08bb7eb37b64...|      1400|cryptopunks|      sale|  122|
|0xec0e55fec12de0a...|      8771|cryptopunks|      sale|  122|
|0xb2470e68bcd1fd3...|      7113|cryptopunks|      sale|  122|
|0x068a942a5d40fd0...|      3960|cryptopunks|  transfer|  122|
|0xc9bf3c34e8591d2...|      7532|cryptopunks|      sale|  122|
|0xa9c769a5a65ffb2...|      6724|cryptopunks|  transfer

In [26]:
grouped_df_small = df_asset_events_flat_small.groupBy("transaction", "identifier", "collection", "event_type").count()
duplicates_df_small = grouped_df_small.filter(F.col("count") > 1)
duplicates_df_small.show()

+-----------+----------+----------+----------+-----+
|transaction|identifier|collection|event_type|count|
+-----------+----------+----------+----------+-----+
+-----------+----------+----------+----------+-----+



In [22]:
duplicates_df_big.take(5)

[Row(transaction='0x317fc2467d9139ed72a063c54f75fe41570407cd2e7dcf94c3ccf59b88c3648a', identifier='2474', collection='cryptopunks', event_type='transfer', count=122),
 Row(transaction='0x38b583c3d19bf01881d39f9044b36ea91fba8e16da3a9591f7a399fe8004d346', identifier='1411', collection='cryptopunks', event_type='transfer', count=122),
 Row(transaction='0xd1f05c106d3f08a04d464366df8953982ab5fddf3b63cb192b3732268b84d964', identifier='1860', collection='cryptopunks', event_type='transfer', count=122),
 Row(transaction='0x4956c57cf093949277ed318972eb5c34a664ce28796fe23cff0d14e9d3031a20', identifier='4347', collection='cryptopunks', event_type='transfer', count=122),
 Row(transaction='0x76708336a7f0abdd7c19dcbb97a857d06d97ff46a845231d10310cbb39b97a3e', identifier='5728', collection='cryptopunks', event_type='transfer', count=122)]

In [28]:
filtered_df_big = df_asset_events_flat_big.filter(
    (df_asset_events_flat_big['transaction'] == '0x317fc2467d9139ed72a063c54f75fe41570407cd2e7dcf94c3ccf59b88c3648a') & 
    (df_asset_events_flat_big['identifier'] == '2474') & 
    (df_asset_events_flat_big['collection'] == 'cryptopunks') & 
    (df_asset_events_flat_big['event_type'] == 'transfer')
)

big_rows = filtered_df_big.collect()

# for row in big_rows:
#     print(row)

# Read into separate df
new_df_big = filtered_df_big
new_df_big.show()

+--------------------+----------+-----+------+--------------------+--------------------+--------+---------------+----------+----------+-----------+--------------------+--------+----------------+------+-------------+
|         transaction|event_type|buyer|seller|        from_address|          to_address|quantity|event_timestamp|order_hash|identifier| collection|            contract|decimals|payment_quantity|symbol|token_address|
+--------------------+----------+-----+------+--------------------+--------------------+--------+---------------+----------+----------+-----------+--------------------+--------+----------------+------+-------------+
|0x317fc2467d9139e...|  transfer| NULL|  NULL|0x2be665ee2709634...|0x5ca12f79e4d33b0...|       1|     1689008423|      NULL|      2474|cryptopunks|0xb47e3cd837ddf8e...|    NULL|            NULL|  NULL|         NULL|
|0x317fc2467d9139e...|  transfer| NULL|  NULL|0x2be665ee2709634...|0x5ca12f79e4d33b0...|       1|     1689008423|      NULL|      2474|c

In [27]:
filtered_df_small = df_asset_events_flat_small.filter(
    (df_asset_events_flat_small['transaction'] == '0x317fc2467d9139ed72a063c54f75fe41570407cd2e7dcf94c3ccf59b88c3648a') & 
    (df_asset_events_flat_small['identifier'] == '2474') & 
    (df_asset_events_flat_small['collection'] == 'cryptopunks') & 
    (df_asset_events_flat_small['event_type'] == 'transfer')
)

small_rows = filtered_df_small.collect()

for row in small_rows:
    print(row)

Row(transaction='0x317fc2467d9139ed72a063c54f75fe41570407cd2e7dcf94c3ccf59b88c3648a', event_type='transfer', buyer=None, seller=None, from_address='0x2be665ee27096344b8f015b1952d3dfdb4db4691', to_address='0x5ca12f79e4d33b0bd153b40df59f6db9ee03482e', quantity=1, event_timestamp=1689008423, order_hash=None, identifier='2474', collection='cryptopunks', contract='0xb47e3cd837ddf8e4c57f05d70ab865de6e193bbb', decimals=None, payment_quantity=None, symbol=None, token_address=None)


In [25]:
distinct_transactions_count_big = df_asset_events_flat_big.groupBy("identifier") \
    .agg(F.countDistinct("transaction").alias("distinct_transaction_count")).orderBy(col("distinct_transaction_count").desc(), col("identifier").asc())

distinct_transactions_count_big.show(truncate=False)

+----------+--------------------------+
|identifier|distinct_transaction_count|
+----------+--------------------------+
|1458      |11                        |
|4347      |11                        |
|1866      |10                        |
|3042      |10                        |
|2791      |9                         |
|8542      |9                         |
|2927      |8                         |
|486       |8                         |
|8096      |8                         |
|1395      |7                         |
|2356      |7                         |
|3309      |7                         |
|3777      |7                         |
|3925      |7                         |
|3947      |7                         |
|2474      |6                         |
|2837      |6                         |
|3616      |6                         |
|4118      |6                         |
|5064      |6                         |
+----------+--------------------------+
only showing top 20 rows



In [132]:
distinct_transactions_count_small = df_asset_events_flat_small.groupBy("identifier") \
    .agg(F.countDistinct("transaction").alias("distinct_transaction_count")).orderBy(col("distinct_transaction_count").desc(), col("identifier").asc())

distinct_transactions_count_small.show(truncate=False)

+----------+--------------------------+
|identifier|distinct_transaction_count|
+----------+--------------------------+
|4869      |104                       |
|4372      |62                        |
|2626      |58                        |
|7326      |58                        |
|5712      |53                        |
|7258      |53                        |
|3628      |49                        |
|5104      |48                        |
|180       |41                        |
|6628      |41                        |
|3694      |39                        |
|6030      |39                        |
|4481      |38                        |
|533       |37                        |
|2480      |36                        |
|2804      |34                        |
|4542      |34                        |
|6093      |34                        |
|6763      |34                        |
|4019      |32                        |
+----------+--------------------------+
only showing top 20 rows



In [None]:
# Avg number of events per token

In [94]:
# Select the relevant columns
distinct_rows = df_asset_events_flat_small.select("transaction", "identifier", "collection", "event_type").distinct()

# Count the number of distinct rows
distinct_count = distinct_rows.count()

# Show the count
print(f"Number of distinct rows: {distinct_count}")

Number of distinct rows: 12177


In [20]:
# First check time frame that is included by single API call

In [29]:
min_max_timestamps_big = df_asset_events_flat_big.agg(
    F.from_unixtime(F.min("event_timestamp")).alias("min_event_timestamp"),
    F.from_unixtime(F.max("event_timestamp")).alias("max_event_timestamp")
)
min_max_timestamps_big.show()

+-------------------+-------------------+
|min_event_timestamp|max_event_timestamp|
+-------------------+-------------------+
|2023-07-01 03:10:11|2023-09-30 14:54:47|
+-------------------+-------------------+



In [30]:
min_max_timestamps_small = df_asset_events_flat_small.agg(
    F.from_unixtime(F.min("event_timestamp")).alias("min_event_timestamp"),
    F.from_unixtime(F.max("event_timestamp")).alias("max_event_timestamp")
)
min_max_timestamps_small.show()

+-------------------+-------------------+
|min_event_timestamp|max_event_timestamp|
+-------------------+-------------------+
|2023-07-01 03:10:11|2023-09-30 14:54:47|
+-------------------+-------------------+



In [20]:
# Difference between event types
# Sale is the ultimate action that reflects a completed transaction.
# Offer and Order can initiate the process leading to a Sale
# Transfer may occur both before and after a Sale, 
# potentially as part of a wash trading scheme where assets are moved around to give a false impression of market activity.

In [21]:
df_grouped_by_event = df_asset_events_flat.groupBy("event_type").agg(
    F.count("*").alias("count")
)
df_grouped_by_event.show()

+----------+-----+
|event_type|count|
+----------+-----+
|      sale| 4970|
|  transfer|19533|
+----------+-----+



In [22]:
# Payment decomposition
# payment_quantity = 23630000000000000000
# decimals = 18
# Amount in WETH = 23630000000000000000 /10^18 = 23.63

In [37]:
df_count_by_symbol = df_asset_events_flat.groupBy("symbol").agg(
    F.count("*").alias("record_count")
).orderBy(col("record_count").desc())

df_count_by_symbol.show()

+------+------------+
|symbol|record_count|
+------+------------+
|  NULL|       19533|
|  WETH|        3669|
|   ETH|        1287|
|  USDC|          14|
+------+------------+



In [None]:
# Read all used symbols

In [14]:
df_all_path = f"s3a://{AWS_S3_BUCKET}/raw/opensea_nft_data/*/*.json"
df_all = spark.read.json(df_all_path)

In [16]:
df_all.printSchema()

root
 |-- buyer: string (nullable = true)
 |-- chain: string (nullable = true)
 |-- closing_date: long (nullable = true)
 |-- event_timestamp: long (nullable = true)
 |-- event_type: string (nullable = true)
 |-- from_address: string (nullable = true)
 |-- nft: struct (nullable = true)
 |    |-- collection: string (nullable = true)
 |    |-- contract: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- display_animation_url: string (nullable = true)
 |    |-- display_image_url: string (nullable = true)
 |    |-- identifier: string (nullable = true)
 |    |-- image_url: string (nullable = true)
 |    |-- is_disabled: boolean (nullable = true)
 |    |-- is_nsfw: boolean (nullable = true)
 |    |-- metadata_url: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- opensea_url: string (nullable = true)
 |    |-- token_standard: string (nullable = true)
 |    |-- updated_at: string (nullable = true)
 |-- order_hash: string (nullable = tr

In [17]:
df_all_symbols = df_all.groupBy("payment.symbol").agg(
    F.count("*").alias("record_count")
).orderBy(col("record_count").desc()) 

df_all_symbols.show()

+------+------------+
|symbol|record_count|
+------+------------+
|  NULL|      126295|
|  USDC|       51540|
|   POL|        2601|
|  WETH|        1660|
|   ETH|        1224|
|USDC.e|          24|
+------+------------+



In [23]:
# Wash trading analysis sample

In [24]:
# 1. Rapid transfers between a small group of accounts (often involving back-and-forth transfers of the same asset for a set of assets).
# 2. Minimal time differences between buys and sells (or transfers).
# 3. No real change in ownership, as the seller and buyer may be the same person, or colluding accounts.
# 4. Repeated behavior over a period, showing a cycle of transfers or sales without any actual long-term holding.

In [25]:
# Build a Transaction Graph

# Nodes: Each node represents a wallet or address.
# Edges: Each edge represents an event (sale or transfer).
# In sale events, the edge connects the seller to the buyer.
# In transfer events, the edge connects the sender to the receiver.