In [1]:
# Add some transformation notebooks and read data from this location

In [2]:
# Read df

In [3]:
import sys
sys.path.append('/home/jovyan/work') # here add notebooks if fails

In [4]:
from pyspark.sql import SparkSession
import nbimporter
from utils.vault_scripts import read_root_token, get_secret_from_vault
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, when, lit, expr
from graphframes import GraphFrame
from pyspark.sql.functions import col, to_timestamp, lag, unix_timestamp, expr
from pyspark.sql.window import Window
import networkx as nx
import matplotlib.pyplot as plt

In [5]:
spark = SparkSession.builder.appName("DetectCyclesApp").getOrCreate()

In [6]:
hadoopConf = spark._jsc.hadoopConfiguration()

In [7]:
AWS_KEY_ID = get_secret_from_vault("aws1", "keyid")
AWS_ACCESS_KEY = get_secret_from_vault("aws2", "accesskey")
AWS_S3_BUCKET = get_secret_from_vault("aws3", "s3bucket")

In [8]:
hadoopConf.set("fs.s3a.access.key", AWS_KEY_ID)
hadoopConf.set("fs.s3a.secret.key", AWS_ACCESS_KEY)
hadoopConf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

In [9]:
s3_path = f"s3a://{AWS_S3_BUCKET}/raw/opensea_data/*.json"

In [10]:
df = spark.read.json(s3_path)

In [11]:
df_asset_events = df.select(explode(col("asset_events")).alias("event"))

In [12]:
df_asset_events_flat = df_asset_events.select(
    col("event.transaction"),
    col("event.event_type"),
    col("event.buyer"),
    col("event.seller"),
    col("event.from_address"),
    col("event.to_address"),
    col("event.quantity"),
    col("event.event_timestamp"),
    col("event.order_hash"),
    col("event.nft.identifier"),
    col("event.nft.collection"),
    col("event.nft.contract"),
    col("event.payment.decimals"),
    col("event.payment.quantity").alias("payment_quantity"),
    col("event.payment.symbol"),
    col("event.payment.token_address")
)

In [13]:
# Convert the timestamp to a proper timestamp format
df_asset_events_flat = df_asset_events_flat.withColumn("event_timestamp", to_timestamp("event_timestamp"))

# Create a window partitioned by token identifier and ordered by time
windowSpec = Window.partitionBy("identifier").orderBy("event_timestamp")

# Add previous buyer and seller addresses for each transaction within the token's sales history
df_asset_events_flat = df_asset_events_flat.withColumn("prev_buyer", lag("to_address").over(windowSpec)) \
                                           .withColumn("prev_seller", lag("from_address").over(windowSpec)) \
                                           .withColumn("prev_event_timestamp", lag("event_timestamp").over(windowSpec))

In [14]:
df_asset_events_flat.take(5)

[Row(transaction='0xb2e890c6fc844a7eaa1067d00299263f7aaf0d1d9d416c0891b5986d64c9208e', event_type='transfer', buyer=None, seller=None, from_address='0x6e353e91cb721fda5b3131ac40d945f7775c95e7', to_address='0x96d618adc74b18e26968a2aa8ad1ec3f95940d64', quantity=1, event_timestamp=datetime.datetime(2023, 8, 24, 16, 53, 59), order_hash=None, identifier='100', collection='boredapeyachtclub', contract='0xbc4ca0eda7647a8ab7c2061c2e118a18a936f13d', decimals=None, payment_quantity=None, symbol=None, token_address=None, prev_buyer=None, prev_seller=None, prev_event_timestamp=None),
 Row(transaction='0xb2e890c6fc844a7eaa1067d00299263f7aaf0d1d9d416c0891b5986d64c9208e', event_type='sale', buyer='0x96d618adc74b18e26968a2aa8ad1ec3f95940d64', seller='0x6e353e91cb721fda5b3131ac40d945f7775c95e7', from_address=None, to_address=None, quantity=1, event_timestamp=datetime.datetime(2023, 8, 24, 16, 53, 59), order_hash='0xee59198a69ffee09f699b6788c4c9217a8bec046ab3e352df236c3f9713de87e', identifier='100', col

In [15]:
# Filter only transfer events
df_asset_transfers = df_asset_events_flat.filter(col("event_type") == "transfer")

# Create a window partitioned by token identifier and ordered by event timestamp
windowSpec = Window.partitionBy("identifier").orderBy("event_timestamp")

# Add previous buyer (to_address) and previous event timestamp for each token transfer
df_asset_transfers = df_asset_transfers.withColumn("prev_buyer", lag("to_address").over(windowSpec)) \
                                       .withColumn("prev_event_timestamp", lag("event_timestamp").over(windowSpec))

In [16]:
# Calculate the time difference in seconds between the current and previous transfer events
df_asset_transfers = df_asset_transfers.withColumn("time_diff", unix_timestamp(col("event_timestamp")) - unix_timestamp(col("prev_event_timestamp")))

In [17]:
# Filter where the previous buyer is the current seller and the time difference is less than 30 days (2592000 seconds)
df_wash_transfers = df_asset_transfers.filter((col("from_address") == col("prev_buyer")) & (col("time_diff") <= 2592000))

In [18]:
# Create wallet nodes (distinct wallet addresses)
wallets = df_asset_transfers.select("from_address").union(df_asset_transfers.select("to_address")).distinct().withColumnRenamed("from_address", "id")

# Create edges (transactions between wallets for transfers)
edges_transfers = df_asset_transfers.select(col("from_address").alias("src"), col("to_address").alias("dst"), "event_timestamp")

# Create the transfer graph 
graph_transfers = GraphFrame(wallets, edges_transfers)

# Display the wallet nodes and transfer edges
graph_transfers.vertices.show()
graph_transfers.edges.show()



+--------------------+
|                  id|
+--------------------+
|0x532948a2bf980a0...|
|0x7d462af7fb6aa7b...|
|0xa66515af0dfd9aa...|
|0x99d73d76f0058f4...|
|0x174603889ff6086...|
|0x8d3ea02a80b19c8...|
|0x69986d5af481c14...|
|0x484a8e9d2d135c5...|
|0xafc786f195f4a1c...|
|0x65249532663d15a...|
|0x7051499622ab354...|
|0xf15c93562bc3944...|
|0xc67db0df9222389...|
|0xb5bc4849b531b29...|
|0x17b70f6b0dd3bf1...|
|0x021cba2d12aa986...|
|0x3e6527de2cfec82...|
|0x142668dc89e7e69...|
|0x945191ab90d5b51...|
|0x096913c0c00a9f9...|
+--------------------+
only showing top 20 rows

+--------------------+--------------------+-------------------+
|                 src|                 dst|    event_timestamp|
+--------------------+--------------------+-------------------+
|0xdbfd76af2157dc1...|0xdbfd76af2157dc1...|2023-08-20 08:33:11|
|0x2e9a18d66f2fc53...|0xe0b6e70261db8ef...|2023-08-20 08:28:47|
|0xc67db0df9222389...|0xea5b1f2f29d89dd...|2023-08-20 08:24:11|
|0xa8d87df83755179...|0xdb5485c85bd95f

In [19]:
# Convert edges to Pandas for easy graph manipulation in NetworkX
edges_transfers_pd = edges_transfers.select(col("src"), col("dst")).toPandas()

# Ensure there are no duplicate edges to avoid unnecessary noise
edges_transfers_pd = edges_transfers_pd.drop_duplicates()

# Create a directed graph from the transfers data
G_transfers = nx.from_pandas_edgelist(edges_transfers_pd, source='src', target='dst', create_using=nx.DiGraph())

# Find all simple cycles in the graph (this may take time depending on the size of the graph)
cycles = list(nx.simple_cycles(G_transfers))

# Check if we found any cycles
if cycles:
    # Get a random cycle (we are using the first cycle found here for simplicity)
    cycle = cycles[0]
    
    # Generate edges for this cycle
    cycle_edges = [(cycle[i], cycle[i+1]) for i in range(len(cycle) - 1)]
    cycle_edges.append((cycle[-1], cycle[0]))  # Close the cycle by connecting the last to the first node
    
    # Create a new subgraph containing only the cycle
    G_cycle = nx.DiGraph()
    G_cycle.add_edges_from(cycle_edges)
    
    # Draw the graph with just this cycle
    plt.figure(figsize=(8, 6))
    nx.draw(G_cycle, with_labels=True, node_size=700, node_color='lightblue', font_size=10, font_weight='bold', edge_color='gray')
    plt.title(f"Cycle: {' -> '.join(cycle)}")
    plt.show()
else:
    print("No cycles found in the graph.")

KeyboardInterrupt: 

In [20]:
# Step 1: Filter only transfer events
df_asset_transfers = df_asset_events_flat.filter(col("event_type") == "transfer")

# Step 2: Create a window partitioned by token identifier and ordered by event timestamp
windowSpec = Window.partitionBy("identifier").orderBy("event_timestamp")

# Step 3: Add previous buyer (to_address) and previous event timestamp for each token transfer
df_asset_transfers = df_asset_transfers.withColumn("prev_buyer", lag("to_address").over(windowSpec)) \
                                       .withColumn("prev_event_timestamp", lag("event_timestamp").over(windowSpec))

# Step 4: Calculate the time difference in seconds between the current and previous transfer events
df_asset_transfers = df_asset_transfers.withColumn("time_diff", unix_timestamp(col("event_timestamp")) - unix_timestamp(col("prev_event_timestamp")))

# Step 5: Filter where the previous buyer is the current seller and the time difference is less than 30 days (2592000 seconds)
df_wash_transfers = df_asset_transfers.filter((col("from_address") == col("prev_buyer")) & (col("time_diff") <= 2592000))

# Step 6: Count total transfer transactions
total_transfers = df_asset_transfers.count()

# Step 7: Count wash trade (cyclical) transactions
wash_trades_count = df_wash_transfers.count()

# Step 8: Calculate the percentage of wash trades
percentage_wash_trades = (wash_trades_count / total_transfers) * 100

# Step 9: Output the result
print(f"Percentage of wash trades: {percentage_wash_trades:.2f}%")

Percentage of wash trades: 50.43%
