# 0.Setup

In [1]:
import sys
import yaml
import struct
import hashlib
import csv
from pathlib import Path
from datetime import datetime, timezone
from contextlib import redirect_stdout
from io import StringIO

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType, StructField, StringType, LongType, IntegerType, TimestampType
)


def find_project_root():
    """
    Locate the project root directory, recognised by its ``conf/`` folder.

    Lookup order, starting from the resolved current working directory:

    1. If the working directory is named ``notebooks`` and its parent
       contains ``conf/``, the parent is returned.
    2. Otherwise the working directory and each of its ancestors (the
       filesystem root itself excluded) are checked for
       ``conf/bda_project_config.yml``; the first match is returned.
    3. As a last resort the working directory is returned if it contains
       ``conf/``.

    Returns:
        Path: the detected project root.

    Raises:
        FileNotFoundError: when none of the lookups succeeds.
    """
    start_dir = Path.cwd().resolve()

    # 1. Shortcut for the common "launched from notebooks/" layout.
    if start_dir.name == "notebooks" and (start_dir.parent / "conf").exists():
        return start_dir.parent

    # 2. Climb from the working directory towards the filesystem root.
    for folder in (start_dir, *start_dir.parents):
        if folder == folder.parent:
            # Reached the filesystem root, which is never inspected.
            break
        conf_dir = folder / "conf"
        if conf_dir.exists() and (conf_dir / "bda_project_config.yml").exists():
            return folder

    # 3. Accept a bare conf/ folder next to the working directory.
    if (start_dir / "conf").exists():
        return start_dir

    message = (
        "Cannot find project root (looked for 'conf/' folder)\n"
        f"Started from: {Path.cwd()}\n"
        "Tip: Run this notebook from the Project/ or Project/notebooks/ directory"
    )
    raise FileNotFoundError(message)

# Resolve the project layout and load the YAML configuration once; every
# later cell derives its paths and Spark settings from these values.
PROJECT_ROOT = find_project_root()
CONFIG_PATH = PROJECT_ROOT / "conf" / "bda_project_config.yml"

with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

PATHS = config['paths']
SPARK_CFG = config['spark']

print(f" Project Root: {PROJECT_ROOT}")
print(f" Config loaded: {CONFIG_PATH}")
print(f" Project Name: {config['project']['name']}")

# Build (or reuse) the Spark session. The builder chain must contain only
# builder calls: the log level can only be set once the session exists, so it
# is applied right after getOrCreate() rather than inside the chain.
spark = (
    SparkSession.builder
    .appName(SPARK_CFG['app_name'])
    .master(SPARK_CFG['master'])
    .config("spark.driver.memory", SPARK_CFG['driver_memory'])
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

# Silence INFO/WARN chatter from Spark for the rest of the notebook.
spark.sparkContext.setLogLevel("ERROR")

print(f"Spark version: {spark.version}")
print(f"App name: {spark.sparkContext.appName}")
print(f"Master: {spark.sparkContext.master}")

 Project Root: /home/img/BigData/Project
 Config loaded: /home/img/BigData/Project/conf/bda_project_config.yml
 Project Name: Bitcoin Price Predictor


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/07 11:48:07 WARN Utils: Your hostname, a03-341a, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/07 11:48:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 11:48:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/07 11:48:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/12/07 11:48:09 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/12/07 11:48:09 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


Spark version: 4.0.1
App name: BTC_ETL_Custom
Master: local[*]


# 1.File Management

In [2]:
# Input/output locations, all resolved against PROJECT_ROOT using the
# relative paths declared in the YAML configuration.
BLOCKS_DIR = PROJECT_ROOT / PATHS['raw_blocks']
OUTPUT_PATH = PROJECT_ROOT / PATHS['output_transactions']
METRICS_FILE = PROJECT_ROOT / PATHS['metrics_file']
EVIDENCE_DIR = PROJECT_ROOT / "evidence"

# Only the directories this notebook writes into are created up front.
for _writable_dir in (OUTPUT_PATH, EVIDENCE_DIR):
    _writable_dir.mkdir(parents=True, exist_ok=True)

print(
    "=" * 60,
    "PATHS CONFIGURED (relative to PROJECT_ROOT)",
    "=" * 60,
    f"Blocks Dir     : {BLOCKS_DIR}",
    f"Output Parquet : {OUTPUT_PATH}",
    f"Metrics File   : {METRICS_FILE}",
    f"\nBlocks dir exists: {BLOCKS_DIR.exists()}",
    sep="\n",
)

PATHS CONFIGURED (relative to PROJECT_ROOT)
Blocks Dir     : /home/img/BigData/Project/data/blocks/blocks
Output Parquet : /home/img/BigData/Project/data/output/transactions_parquet
Metrics File   : /home/img/BigData/Project/project_metrics_log.csv

Blocks dir exists: True


# 2.Bitcoin Parser

In [3]:
# 4-byte marker (Bitcoin mainnet network magic) that parse_blk_file compares
# against to locate the start of each block record inside a blk*.dat file.
BITCOIN_MAGIC = bytes.fromhex('F9BEB4D9')

def double_sha256(data: bytes) -> bytes:
    """Return the SHA-256 digest of the SHA-256 digest of ``data`` (32 raw bytes)."""
    first_pass = hashlib.sha256(data).digest()
    second_pass = hashlib.sha256(first_pass)
    return second_pass.digest()

def read_varint(data: bytes, offset: int) -> tuple:
    """Decode the variable-length integer stored at ``data[offset]``.

    A first byte below 0xFD is the value itself. Otherwise the first byte
    selects the width of the little-endian unsigned integer that follows:
    0xFD -> 2 bytes, 0xFE -> 4 bytes, anything else (0xFF) -> 8 bytes.

    Returns:
        tuple: ``(value, new_offset)`` where ``new_offset`` points just past
        the encoded integer.

    Raises:
        IndexError: if ``offset`` is outside ``data``.
        struct.error: if the payload announced by the prefix is truncated.
    """
    prefix = data[offset]
    if prefix < 0xFD:
        return prefix, offset + 1

    # struct format and payload width for each multi-byte prefix.
    fmt, width = {0xFD: ('<H', 2), 0xFE: ('<I', 4)}.get(prefix, ('<Q', 8))
    end = offset + 1 + width
    (value,) = struct.unpack(fmt, data[offset + 1:end])
    return value, end

def parse_tx(data: bytes, offset: int) -> tuple:
    """Parse one serialized transaction starting at ``data[offset]``.

    Both the legacy and the SegWit (marker 0x00 + flag 0x01 after the
    version) serializations are walked. Scripts and witness items are skipped,
    only their lengths are read.

    The transaction id is the reversed double-SHA256 of the *legacy*
    serialization: version + inputs + outputs + locktime. For SegWit
    transactions the marker/flag bytes and the witness section are therefore
    excluded from the hash (hashing them would produce the wtxid instead).

    Args:
        data: buffer containing the serialized transaction.
        offset: index of the first byte (the version field) of the transaction.

    Returns:
        tuple: ``(tx, new_offset)`` where ``tx`` is a dict with keys
        ``tx_id`` (hex string), ``n_inputs``, ``n_outputs`` and
        ``amount_sats`` (sum of all output values), and ``new_offset`` is the
        index just past the locktime field.

    Raises:
        IndexError, struct.error: when the buffer ends before a field that
        has to be decoded.
    """
    start = offset
    offset += 4  # version

    # NOTE: a legacy transaction with zero inputs would also start with 0x00
    # here; this is the usual heuristic and matches the original behaviour.
    is_segwit = data[offset] == 0x00 and data[offset + 1] == 0x01
    if is_segwit:
        offset += 2  # marker + flag
    body_start = offset  # first byte of the input count

    n_in, offset = read_varint(data, offset)
    for _ in range(n_in):
        offset += 36  # previous txid (32) + output index (4)
        script_len, offset = read_varint(data, offset)
        offset += script_len + 4  # scriptSig + sequence

    n_out, offset = read_varint(data, offset)
    total_sats = 0
    for _ in range(n_out):
        total_sats += struct.unpack('<Q', data[offset:offset + 8])[0]
        offset += 8
        script_len, offset = read_varint(data, offset)
        offset += script_len  # scriptPubKey
    body_end = offset  # just past the last output

    if is_segwit:
        # One witness stack per input; items are skipped, not decoded.
        for _ in range(n_in):
            wit_count, offset = read_varint(data, offset)
            for _ in range(wit_count):
                wit_len, offset = read_varint(data, offset)
                offset += wit_len

    locktime_start = offset
    offset += 4  # locktime

    if is_segwit:
        # Rebuild the legacy serialization so the hash is the txid.
        preimage = (
            data[start:start + 4]
            + data[body_start:body_end]
            + data[locktime_start:offset]
        )
    else:
        preimage = data[start:offset]
    txid = double_sha256(preimage)[::-1].hex()

    return {'tx_id': txid, 'n_inputs': n_in, 'n_outputs': n_out, 'amount_sats': total_sats}, offset

def parse_blk_file(filepath: str) -> list:
    """Parse a whole blk*.dat file and return its transactions.

    The file is read fully into memory, then scanned for block records of the
    form ``magic (4 bytes) | block size (uint32 LE) | block``. For each block
    the 80-byte header is hashed to get the block hash, the timestamp is read
    from header bytes 68-72, and every transaction is parsed with
    ``parse_tx``.

    Parsing is best effort: if a block is malformed, the transactions already
    extracted from it are kept and scanning resumes right after that block.
    Scanning stops at the first record whose announced size is zero or runs
    past the end of the file.

    This function is used as the ``flatMap`` function over an RDD of file
    paths, so each call handles one file.

    Args:
        filepath: path of the blk*.dat file to read.

    Returns:
        list: one dict per transaction with the ``parse_tx`` keys plus
        ``block_hash``, ``timestamp`` and ``block_size``.
    """
    transactions = []

    with open(filepath, 'rb') as f:
        data = f.read()

    offset = 0
    size = len(data)

    # 88 = magic (4) + size (4) + header (80): smallest record worth reading.
    while offset < size - 88:
        # Jump straight to the next magic marker instead of testing every
        # byte position in Python.
        offset = data.find(BITCOIN_MAGIC, offset)
        if offset < 0 or offset >= size - 88:
            break

        offset += 4
        block_size = struct.unpack('<I', data[offset:offset + 4])[0]
        offset += 4

        if block_size == 0 or offset + block_size > size:
            break

        block_start = offset

        try:
            header = data[offset:offset + 80]
            block_hash = double_sha256(header)[::-1].hex()
            timestamp = struct.unpack('<I', header[68:72])[0]
            offset += 80

            tx_count, offset = read_varint(data, offset)

            for _ in range(tx_count):
                if offset >= block_start + block_size:
                    break
                tx, offset = parse_tx(data, offset)
                tx['block_hash'] = block_hash
                tx['timestamp'] = timestamp
                tx['block_size'] = block_size
                transactions.append(tx)
        except Exception:
            # Best effort: a corrupt block must not abort the whole file.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

        # Always resynchronise on the announced block boundary.
        offset = block_start + block_size

    return transactions

# Visible confirmation that this cell ran and the parser helpers are defined.
print("Parser functions defined")

Parser functions defined


# 3.Extract

In [4]:
# List the blk*.dat files (empty list when the blocks directory is missing).
blk_files = []
if BLOCKS_DIR.exists():
    blk_files = sorted(BLOCKS_DIR.glob("blk*.dat"))
print(f"Block files found: {len(blk_files)}")

if not blk_files:
    # Nothing to ingest: define the downstream variables with neutral values.
    print(f"No block files in: {BLOCKS_DIR}")
    transactions_rdd = None
    tx_count = 0
    total_input_size = 0
else:
    total_input_size = sum(blk_path.stat().st_size for blk_path in blk_files)
    print(f"Total size: {total_input_size / (1024**3):.2f} GB")

    # Parallel RDD with one partition per file, so each task parses one file.
    file_paths = list(map(str, blk_files))
    files_rdd = spark.sparkContext.parallelize(file_paths, numSlices=len(file_paths))

    # flatMap because parse_blk_file returns a list of transactions per file;
    # the result is cached because it is counted here and reused afterwards.
    transactions_rdd = files_rdd.flatMap(parse_blk_file).cache()

    tx_count = transactions_rdd.count()
    print(f"\nTransactions parsed: {tx_count:,}")

Block files found: 9
Total size: 1.09 GB


[Stage 0:>                                                          (0 + 9) / 9]


Transactions parsed: 2,236,800


                                                                                

# 4.Transform

In [5]:
if not (transactions_rdd and tx_count > 0):
    # Nothing was extracted: expose neutral values to the following cells.
    transformed_df = None
    row_count = 0
else:
    # Spark schema of the parsed transactions (all columns non-nullable).
    tx_schema = (
        StructType()
        .add("tx_id", StringType(), False)
        .add("block_hash", StringType(), False)
        .add("timestamp", LongType(), False)
        .add("n_inputs", IntegerType(), False)
        .add("n_outputs", IntegerType(), False)
        .add("amount_sats", LongType(), False)
        .add("block_size", IntegerType(), False)
    )

    # RDD[dict] -> RDD[tuple] -> DataFrame; the tuple follows the schema order.
    rows_rdd = transactions_rdd.map(
        lambda tx, cols=tuple(tx_schema.fieldNames()): tuple(tx[c] for c in cols)
    )

    raw_df = spark.createDataFrame(rows_rdd, schema=tx_schema)

    # Time columns (same logic as market_etl so the datasets can be joined).
    block_ts = F.col("timestamp")
    datetime_expr = F.from_unixtime(block_ts).cast(TimestampType())
    hour_bucket_expr = (F.floor(block_ts / 3600) * 3600).cast(LongType())

    transformed_df = (
        raw_df
        .withColumn("datetime", datetime_expr)
        .withColumn("timestamp_hour", hour_bucket_expr)
        .withColumn("date", F.to_date(F.col("datetime")))
        .withColumn("hour", F.hour(F.col("datetime")))
    )

    # Cached because the frame is counted, displayed, aggregated and written.
    transformed_df.cache()
    row_count = transformed_df.count()

    print(f"DataFrame: {row_count:,} rows")
    transformed_df.printSchema()

    preview_columns = ["tx_id", "datetime", "timestamp_hour", "n_inputs", "amount_sats"]
    transformed_df.select(*preview_columns).show(10, truncate=20)

    # Single-row aggregate describing the time span covered by the blocks.
    coverage_df = transformed_df.agg(
        F.min("datetime").alias("first_block"),
        F.max("datetime").alias("last_block"),
        F.countDistinct("block_hash").alias("n_blocks")
    )
    date_range = coverage_df.collect()[0]

    separator = "=" * 60
    print(separator)
    print("COUVERTURE TEMPORELLE DES BLOCS")
    print(separator)
    print(f"Premier bloc : {date_range['first_block']}")
    print(f"Dernier bloc : {date_range['last_block']}")
    print(f"Nombre de blocs uniques : {date_range['n_blocks']:,}")

    # Compute the covered duration when both bounds are available.
    if date_range['first_block'] and date_range['last_block']:
        duration = date_range['last_block'] - date_range['first_block']
        print(f"Duree couverte : {duration.days} jours, {duration.seconds // 3600} heures")

                                                                                

DataFrame: 2,236,800 rows
root
 |-- tx_id: string (nullable = false)
 |-- block_hash: string (nullable = false)
 |-- timestamp: long (nullable = false)
 |-- n_inputs: integer (nullable = false)
 |-- n_outputs: integer (nullable = false)
 |-- amount_sats: long (nullable = false)
 |-- block_size: integer (nullable = false)
 |-- datetime: timestamp (nullable = true)
 |-- timestamp_hour: long (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)

+--------------------+-------------------+--------------+--------+-----------+
|               tx_id|           datetime|timestamp_hour|n_inputs|amount_sats|
+--------------------+-------------------+--------------+--------+-----------+
|d1827b6a6c1b67830...|2013-12-07 14:22:49|    1386424800|       1| 2513325458|
|3fb28e73f5a8cb701...|2013-12-07 14:22:49|    1386424800|       2|  110442816|
|6711e8803b04b774f...|2013-12-07 14:22:49|    1386424800|       3| 1998950000|
|bef8b2a4273a11650...|2013-12-07 14:22:49|  

[Stage 6:>                                                          (0 + 9) / 9]

COUVERTURE TEMPORELLE DES BLOCS
Premier bloc : 2013-12-05 19:19:51
Dernier bloc : 2014-01-15 01:19:13
Nombre de blocs uniques : 6,947
Duree couverte : 40 jours, 5 heures


                                                                                

# 5.Spark Execution Plan

In [6]:
if transformed_df is not None:
    # Ask Spark for the formatted plan once: explain() prints to stdout, so the
    # text is captured and then reused for both the display and the file.
    plan_buffer = StringIO()
    with redirect_stdout(plan_buffer):
        transformed_df.explain("formatted")
    plan_text = plan_buffer.getvalue()

    print("=" * 60)
    print("SPARK EXECUTION PLAN")
    print("=" * 60)
    print(plan_text, end="")

    # Explicit UTF-8 so the evidence file does not depend on the platform's
    # default encoding (consistent with the other files this notebook writes).
    plan_file = EVIDENCE_DIR / "block_ingestion_explain.txt"
    plan_file.write_text(
        f"# Block Ingestion Execution Plan\n# Date: {datetime.now()}\n\n{plan_text}",
        encoding="utf-8",
    )

    print(f"\nExecution plan saved to: {plan_file}")

SPARK EXECUTION PLAN
== Physical Plan ==
InMemoryTableScan (1)
   +- InMemoryRelation (2)
         +- * Project (5)
            +- * Project (4)
               +- * Scan ExistingRDD (3)


(1) InMemoryTableScan
Output [11]: [tx_id#0, block_hash#1, timestamp#2L, n_inputs#3, n_outputs#4, amount_sats#5L, block_size#6, datetime#7, timestamp_hour#8L, date#9, hour#10]
Arguments: [tx_id#0, block_hash#1, timestamp#2L, n_inputs#3, n_outputs#4, amount_sats#5L, block_size#6, datetime#7, timestamp_hour#8L, date#9, hour#10]

(2) InMemoryRelation
Arguments: [tx_id#0, block_hash#1, timestamp#2L, n_inputs#3, n_outputs#4, amount_sats#5L, block_size#6, datetime#7, timestamp_hour#8L, date#9, hour#10], StorageLevel(disk, memory, deserialized, 1 replicas)

(3) Scan ExistingRDD [codegen id : 1]
Output [7]: [tx_id#0, block_hash#1, timestamp#2L, n_inputs#3, n_outputs#4, amount_sats#5L, block_size#6]
Arguments: [tx_id#0, block_hash#1, timestamp#2L, n_inputs#3, n_outputs#4, amount_sats#5L, block_size#6], MapPart

# 6.Load

In [7]:
if transformed_df:
    # Column order of the persisted dataset.
    output_columns = [
        "tx_id", "block_hash", "timestamp", "datetime", "timestamp_hour",
        "date", "hour", "n_inputs", "n_outputs", "amount_sats", "block_size",
    ]
    final_df = transformed_df.select(*output_columns)

    # Replace any previous export with snappy-compressed Parquet.
    parquet_writer = final_df.write.mode("overwrite").option("compression", "snappy")
    parquet_writer.parquet(str(OUTPUT_PATH))

    print(f"  Parquet saved to: {OUTPUT_PATH}")

    # Report what landed on disk (part files directly under OUTPUT_PATH).
    parquet_files = list(OUTPUT_PATH.glob("*.parquet"))
    total_size = sum(part.stat().st_size for part in parquet_files)

    print(f"Parquet files created: {len(parquet_files)}")
    print(f"Total size: {total_size / (1024*1024):.2f} MB")

    # Read the export back and compare its row count with the in-memory frame.
    verify_df = spark.read.parquet(str(OUTPUT_PATH))
    verify_count = verify_df.count()
    integrity_status = 'OK' if verify_count == row_count else 'MISMATCH'

    print(f"Rows in Parquet: {verify_count:,}")
    print(f"Integrity check: {integrity_status}")

                                                                                

  Parquet saved to: /home/img/BigData/Project/data/output/transactions_parquet
Parquet files created: 9
Total size: 155.18 MB
Rows in Parquet: 2,236,800
Integrity check: OK


# 7.Spark Evidence

In [8]:
# Identifier of this run, built from the local wall-clock time.
run_id = "block_ingestion_" + datetime.now().strftime('%Y%m%d_%H%M%S')

metrics_header = [
    'run_id', 'task', 'note', 'files_read', 'input_size_bytes',
    'shuffle_read_bytes', 'shuffle_write_bytes', 'timestamp',
]

# The header is only needed when the metrics file is being created.
needs_header = not METRICS_FILE.exists()

# Size of the Parquet part files written by the load step (0 if absent).
if OUTPUT_PATH.exists():
    output_size = sum(part.stat().st_size for part in OUTPUT_PATH.glob("*.parquet"))
else:
    output_size = 0

timestamp = datetime.now(timezone.utc).isoformat()

metrics_row = [
    run_id,
    "block_ingestion",
    f"txs={row_count}",
    len(blk_files),
    total_input_size,
    0,  # shuffle_read_bytes: recorded as a constant 0 by this notebook
    0,  # shuffle_write_bytes: recorded as a constant 0 by this notebook
    timestamp,
]

# Append mode creates the file when missing, so one open covers both cases.
with open(METRICS_FILE, 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    if needs_header:
        writer.writerow(metrics_header)
    writer.writerow(metrics_row)

print(f"   Metrics logged to: {METRICS_FILE}")
print(f"   Run ID: {run_id}")
print(f"   Input size: {total_input_size / (1024*1024):.2f} MB")
print(f"   Output size: {output_size / (1024*1024):.2f} MB")

# Release the cached RDD / DataFrame now that the pipeline is finished.
for cached in (transactions_rdd, transformed_df):
    if cached:
        cached.unpersist()
print("Cache released.")

   Metrics logged to: /home/img/BigData/Project/project_metrics_log.csv
   Run ID: block_ingestion_20251207_114845
   Input size: 1118.59 MB
   Output size: 155.18 MB
Cache released.
