# 0.Setup

In [1]:
import sys
import yaml
import csv
from pathlib import Path
from datetime import datetime, timezone
from contextlib import redirect_stdout
from io import StringIO

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType, LongType, IntegerType


def find_project_root():
    """
    Trouve la racine du projet en cherchant le dossier conf/.
    Fonctionne depuis n'importe quel sous-dossier du projet.
    """
    current = Path.cwd().resolve()
    
    if current.name == "notebooks":
        candidate = current.parent
        if (candidate / "conf").exists():
            return candidate
    
    search = current
    while search != search.parent:
        if (search / "conf").exists() and (search / "conf" / "bda_project_config.yml").exists():
            return search
        search = search.parent
    
    if (current / "conf").exists():
        return current
    
    raise FileNotFoundError(
        f"Cannot find project root (looked for 'conf/' folder)\n"
        f"Started from: {Path.cwd()}\n"
        f"Tip: Run this notebook from the Project/ or Project/notebooks/ directory"
    )

PROJECT_ROOT = find_project_root()
CONFIG_PATH = PROJECT_ROOT / "conf" / "bda_project_config.yml"

with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

PATHS = config['paths']
SPARK_CFG = config['spark']

print(f" Project Root: {PROJECT_ROOT}")
print(f" Config loaded: {CONFIG_PATH}")
print(f" Project Name: {config['project']['name']}")

spark = (
    SparkSession.builder
    .appName(SPARK_CFG['app_name'])
    .master(SPARK_CFG['master'])
    .config("spark.driver.memory", SPARK_CFG['driver_memory'])
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

print(f"Spark version: {spark.version}")
print(f"App name: {spark.sparkContext.appName}")
print(f"Master: {spark.sparkContext.master}")

 Project Root: /home/img/BigData/Project
 Config loaded: /home/img/BigData/Project/conf/bda_project_config.yml
 Project Name: Bitcoin Price Predictor


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/07 12:02:13 WARN Utils: Your hostname, a03-341a, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/07 12:02:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 12:02:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/07 12:02:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/12/07 12:02:21 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


Spark version: 4.0.1
App name: BTC_ETL_Custom
Master: local[*]


# 1.File Management

In [2]:
# Inputs (Parquets des ETL precedents)
TRANSACTIONS_PATH = PROJECT_ROOT / PATHS['output_transactions']
PRICES_PATH = PROJECT_ROOT / PATHS['output_market']

# Output
FEATURES_PATH = PROJECT_ROOT / PATHS.get('output_features', 'data/output_features')

METRICS_FILE = PROJECT_ROOT / PATHS['metrics_file']
EVIDENCE_DIR = PROJECT_ROOT / "evidence"

FEATURES_PATH.mkdir(parents=True, exist_ok=True)
EVIDENCE_DIR.mkdir(parents=True, exist_ok=True)

print("=" * 60)
print("PATHS CONFIGURED")
print("=" * 60)
print(f"Transactions : {TRANSACTIONS_PATH}")
print(f"Prices       : {PRICES_PATH}")
print(f"Features Out : {FEATURES_PATH}")
print(f"\nTransactions exists: {TRANSACTIONS_PATH.exists()}")
print(f"Prices exists: {PRICES_PATH.exists()}")

PATHS CONFIGURED
Transactions : /home/img/BigData/Project/data/output/transactions_parquet
Prices       : /home/img/BigData/Project/data/output/market_parquet
Features Out : /home/img/BigData/Project/data/output/features_parquet

Transactions exists: True
Prices exists: True


# 2.Load

In [3]:
# Charger les donnees des ETL precedents
transactions_df = spark.read.parquet(str(TRANSACTIONS_PATH))
prices_df = spark.read.parquet(str(PRICES_PATH))

# Cache pour reutilisation
transactions_df.cache()
prices_df.cache()

tx_count = transactions_df.count()
price_count = prices_df.count()

print(f"Transactions loaded: {tx_count:,}")
print(f"Prices loaded: {price_count:,}")

# Verifier la plage temporelle commune
tx_range = transactions_df.agg(
    F.min("timestamp_hour").alias("min_h"),
    F.max("timestamp_hour").alias("max_h")
).collect()[0]

price_range = prices_df.agg(
    F.min("timestamp_hour").alias("min_h"),
    F.max("timestamp_hour").alias("max_h")
).collect()[0]

print(f"\nTransactions: {datetime.utcfromtimestamp(tx_range['min_h'])} -> {datetime.utcfromtimestamp(tx_range['max_h'])}")
print(f"Prices: {datetime.utcfromtimestamp(price_range['min_h'])} -> {datetime.utcfromtimestamp(price_range['max_h'])}")

# Plage commune
common_start = max(tx_range['min_h'], price_range['min_h'])
common_end = min(tx_range['max_h'], price_range['max_h'])
print(f"\nPlage commune: {datetime.utcfromtimestamp(common_start)} -> {datetime.utcfromtimestamp(common_end)}")

                                                                                

Transactions loaded: 2,236,800
Prices loaded: 7,317,759

Transactions: 2013-12-05 18:00:00 -> 2014-01-15 00:00:00
Prices: 2012-01-01 10:00:00 -> 2025-11-30 23:00:00

Plage commune: 2013-12-05 18:00:00 -> 2014-01-15 00:00:00


# 3. Aggregate on chain by hour

In [4]:
# ==========================================================================
# AGREGATION HORAIRE DES DONNEES ON-CHAIN
# ==========================================================================
# CORRECTION: Seuil whale en percentile (top 1% des transactions)
# plutot qu'un seuil fixe en BTC qui n'est pas coherent dans le temps

# D'abord, calculer le percentile 99 des montants pour definir "whale"
whale_threshold = transactions_df.approxQuantile("amount_sats", [0.99], 0.01)[0]
print(f"Whale threshold (P99): {whale_threshold / 100_000_000:.2f} BTC")

# Agregation par heure avec Spark (pas de pandas !)
chain_hourly = (
    transactions_df
    .groupBy("timestamp_hour")
    .agg(
        # Activite brute
        F.count("tx_id").alias("tx_count"),
        F.countDistinct("block_hash").alias("n_blocks"),
        
        # Volume
        F.sum("amount_sats").alias("volume_sats"),
        F.avg("amount_sats").alias("avg_tx_sats"),
        
        # Taille des blocs
        F.avg("block_size").alias("avg_block_size"),
        F.sum("block_size").alias("total_block_size"),
        
        # Whales = top 1% des transactions (seuil dynamique)
        F.sum(F.when(F.col("amount_sats") > whale_threshold, 1).otherwise(0)).alias("whale_tx_count"),
        F.sum(F.when(F.col("amount_sats") > whale_threshold, F.col("amount_sats")).otherwise(0)).alias("whale_volume_sats"),
        
        # Coinbase = miner issuance (n_inputs = 0)
        # CORRECTION: Renomme en "issuance" car c'est la creation monetaire, pas les ventes
        F.sum(F.when(F.col("n_inputs") == 0, 1).otherwise(0)).alias("coinbase_count"),
        F.sum(F.when(F.col("n_inputs") == 0, F.col("amount_sats")).otherwise(0)).alias("miner_issuance_sats")
    )
    # Convertir satoshis en BTC
    .withColumn("volume_btc", F.col("volume_sats") / 100_000_000)
    .withColumn("whale_volume_btc", F.col("whale_volume_sats") / 100_000_000)
    .withColumn("miner_issuance_btc", F.col("miner_issuance_sats") / 100_000_000)
)

chain_hourly.cache()
chain_hours = chain_hourly.count()

print(f"Chain hourly aggregated: {chain_hours:,} hours")
chain_hourly.show(5, truncate=False)

                                                                                

Whale threshold (P99): 90993.00 BTC


                                                                                

Chain hourly aggregated: 958 hours
+--------------+--------+--------+--------------+--------------------+------------------+----------------+--------------+-----------------+--------------+-------------------+---------------+----------------+------------------+
|timestamp_hour|tx_count|n_blocks|volume_sats   |avg_tx_sats         |avg_block_size    |total_block_size|whale_tx_count|whale_volume_sats|coinbase_count|miner_issuance_sats|volume_btc     |whale_volume_btc|miner_issuance_btc|
+--------------+--------+--------+--------------+--------------------+------------------+----------------+--------------+-----------------+--------------+-------------------+---------------+----------------+------------------+
|1386972000    |2862    |12      |3234072742750 |1.1300044523934312E9|191591.97099930118|548336221       |0             |0                |0             |0                  |32340.7274275  |0.0             |0.0               |
|1386730800    |3540    |9       |12466129382816|3.521505

# Aggregate price by hour

In [5]:
# ==========================================================================
# AGREGATION HORAIRE DES PRIX
# Les donnees sont deja a la minute, on agregue en OHLCV horaire
# ==========================================================================

# Window pour trouver le premier et dernier prix de l'heure
hour_window = Window.partitionBy("timestamp_hour").orderBy("timestamp_unix")
hour_window_desc = Window.partitionBy("timestamp_hour").orderBy(F.desc("timestamp_unix"))

price_hourly = (
    prices_df
    .withColumn("row_asc", F.row_number().over(hour_window))
    .withColumn("row_desc", F.row_number().over(hour_window_desc))
    .groupBy("timestamp_hour")
    .agg(
        # OHLCV
        F.first(F.when(F.col("row_asc") == 1, F.col("open"))).alias("open"),
        F.max("high").alias("high"),
        F.min("low").alias("low"),
        F.first(F.when(F.col("row_desc") == 1, F.col("close"))).alias("close"),
        F.sum("volume").alias("volume_exchange")
    )
)

price_hourly.cache()
price_hours = price_hourly.count()

print(f"Price hourly aggregated: {price_hours:,} hours")
price_hourly.show(5, truncate=False)



Price hourly aggregated: 121,964 hours
+--------------+----+----+----+-----+---------------+
|timestamp_hour|open|high|low |close|volume_exchange|
+--------------+----+----+----+-----+---------------+
|1325534400    |NULL|5.0 |5.0 |5.0  |0.0            |
|1328572800    |NULL|5.9 |5.9 |5.9  |0.0            |
|1328965200    |NULL|5.88|5.71|5.71 |0.1794878      |
|1329264000    |NULL|4.56|4.56|4.56 |0.0            |
|1330434000    |NULL|5.04|5.04|5.04 |0.0            |
+--------------+----+----+----+-----+---------------+
only showing top 5 rows


                                                                                

# Join on chain and prices

In [6]:
# ==========================================================================
# JOINTURE SUR timestamp_hour
# Inner join: on garde uniquement les heures ou on a les deux
# ==========================================================================

merged_df = chain_hourly.join(price_hourly, "timestamp_hour", "inner")

merged_df.cache()
merged_count = merged_df.count()

print(f"Merged dataset: {merged_count:,} hours")
print(f"(Chain: {chain_hours}, Prices: {price_hours})")

merged_df.select("timestamp_hour", "tx_count", "volume_btc", "close", "volume_exchange").show(10)

Merged dataset: 958 hours
(Chain: 958, Prices: 121964)
+--------------+--------+---------------+------+------------------+
|timestamp_hour|tx_count|     volume_btc| close|   volume_exchange|
+--------------+--------+---------------+------+------------------+
|    1386468000|    1721| 49586.56135508|741.93|421.52729140000014|
|    1386730800|    3540|124661.29382816| 977.0|1258.2459673399997|
|    1386972000|    2862|  32340.7274275| 872.0| 424.7653859099999|
|    1387472400|    3512| 63377.66603127| 640.0|3958.0841365100005|
|    1388739600|    2854| 24535.30260495|778.59|      244.48256643|
|    1386475200|     898| 28549.51505454| 693.3|     1265.63354919|
|    1386576000|    2103|  40269.0119268| 840.0|1342.6005612000004|
|    1386943200|    2984| 45556.04955307| 914.0|338.56955772999993|
|    1387130400|    2374| 20502.92298697|827.88|      355.18842108|
|    1387213200|    2658| 107642.9049248|791.63|     2684.29106636|
+--------------+--------+---------------+------+-------------

# Feature Engineering 

In [7]:
# ==========================================================================
# FEATURES AVEC WINDOW FUNCTIONS (100% Spark, pas de pandas)
# ==========================================================================

# Windows temporelles
w1 = Window.orderBy("timestamp_hour")  # Pour les lags
w24 = Window.orderBy("timestamp_hour").rowsBetween(-23, 0)  # 24h glissantes

features_df = (
    merged_df
    
    # --- A. FEATURES PRIX ---
    # Retours passes (momentum)
    .withColumn("close_lag1", F.lag("close", 1).over(w1))
    .withColumn("close_lag24", F.lag("close", 24).over(w1))
    .withColumn("return_1h", (F.col("close") - F.col("close_lag1")) / F.col("close_lag1"))
    .withColumn("return_24h", (F.col("close") - F.col("close_lag24")) / F.col("close_lag24"))
    
    # Volatilite 24h (std des returns)
    .withColumn("volatility_24h", F.stddev("return_1h").over(w24))
    
    # --- B. FEATURES ON-CHAIN ENRICHIES ---
    # CORRECTION: Supprime velocity (division par constante = inutile)
    # Le modele utilisera volume_btc directement
    
    # NVT-like ratio = price / volume_btc
    # CORRECTION: Renomme en nvt_like pour etre honnete (ce n'est pas le vrai NVT)
    .withColumn("nvt_like", F.col("close") / (F.col("volume_btc") + 0.001))
    
    # CORRECTION: Supprime price_per_tx (trop correle avec close et nvt_like)
    
    # Ratio on-chain vs exchange (volume BTC on-chain / volume BTC exchange)
    # Note: Suppose que volume_exchange est en BTC (verifier dans les donnees Kaggle)
    .withColumn("onchain_vs_exchange", F.col("volume_btc") / (F.col("volume_exchange") + 0.001))
    
    # --- C. MOYENNES MOBILES 24H ---
    .withColumn("tx_count_ma24", F.avg("tx_count").over(w24))
    .withColumn("volume_btc_ma24", F.avg("volume_btc").over(w24))
    .withColumn("whale_tx_ma24", F.avg("whale_tx_count").over(w24))
    .withColumn("miner_issuance_ma24", F.avg("miner_issuance_btc").over(w24))
    
    # --- D. Z-SCORES (ecart a la normale) ---
    .withColumn("tx_count_std24", F.stddev("tx_count").over(w24))
    .withColumn("tx_count_zscore", 
        (F.col("tx_count") - F.col("tx_count_ma24")) / (F.col("tx_count_std24") + 0.001))
    
    .withColumn("whale_std24", F.stddev("whale_tx_count").over(w24))
    .withColumn("whale_zscore",
        (F.col("whale_tx_count") - F.col("whale_tx_ma24")) / (F.col("whale_std24") + 0.001))
    
    # CORRECTION: Renomme miner_zscore en issuance_zscore
    # C'est un z-score sur l'emission monetaire, pas sur les ventes
    .withColumn("issuance_std24", F.stddev("miner_issuance_btc").over(w24))
    .withColumn("issuance_zscore",
        (F.col("miner_issuance_btc") - F.col("miner_issuance_ma24")) / (F.col("issuance_std24") + 0.001))
    
    # --- E. FEATURES CALENDRIER ---
    .withColumn("hour_of_day", F.hour(F.from_unixtime(F.col("timestamp_hour"))))
    .withColumn("day_of_week", F.dayofweek(F.from_unixtime(F.col("timestamp_hour"))))
)

print("Features created with Spark Window functions")
features_df.printSchema()

Features created with Spark Window functions
root
 |-- timestamp_hour: long (nullable = true)
 |-- tx_count: long (nullable = false)
 |-- n_blocks: long (nullable = false)
 |-- volume_sats: long (nullable = true)
 |-- avg_tx_sats: double (nullable = true)
 |-- avg_block_size: double (nullable = true)
 |-- total_block_size: long (nullable = true)
 |-- whale_tx_count: long (nullable = true)
 |-- whale_volume_sats: long (nullable = true)
 |-- coinbase_count: long (nullable = true)
 |-- miner_issuance_sats: long (nullable = true)
 |-- volume_btc: double (nullable = true)
 |-- whale_volume_btc: double (nullable = true)
 |-- miner_issuance_btc: double (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume_exchange: double (nullable = true)
 |-- close_lag1: double (nullable = true)
 |-- close_lag24: double (nullable = true)
 |-- return_1h: double (nullable = true)
 |-- return

# Target Label

In [8]:
# ==========================================================================
# LABEL : Le prix monte-t-il dans l'heure suivante ?
# y = 1 si close_{t+1} > close_t, sinon 0
# ==========================================================================

final_df = (
    features_df
    .withColumn("close_next", F.lead("close", 1).over(w1))
    .withColumn("label", F.when(F.col("close_next") > F.col("close"), 1).otherwise(0))
    
    # Supprimer les lignes sans label (derniere heure) ou sans features (premieres 24h)
    .filter(F.col("close_next").isNotNull())
    .filter(F.col("close_lag24").isNotNull())
    .filter(F.col("volatility_24h").isNotNull())
)

final_df.cache()
final_count = final_df.count()

print(f"Final dataset: {final_count:,} samples")

# Distribution du label
label_dist = final_df.groupBy("label").count().collect()
print(f"\nLabel distribution:")
for row in label_dist:
    pct = row['count'] / final_count * 100
    print(f"  {row['label']}: {row['count']:,} ({pct:.1f}%)")

# Apercu des features finales
final_df.select(
    "timestamp_hour", "close", "return_1h", "tx_count", "tx_count_zscore",
    "whale_zscore", "nvt_like", "issuance_zscore", "label"
).show(10, truncate=False)

25/12/07 12:02:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/07 12:02:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/07 12:02:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/07 12:02:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/07 12:02:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/07 12:02:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/07 1

Final dataset: 933 samples

Label distribution:
  1: 477 (51.1%)
  0: 456 (48.9%)
+--------------+------+---------------------+--------+---------------------+------------+--------------------+---------------+-----+
|timestamp_hour|close |return_1h            |tx_count|tx_count_zscore      |whale_zscore|nvt_like            |issuance_zscore|label|
+--------------+------+---------------------+--------+---------------------+------------+--------------------+---------------+-----+
|1386370800    |800.01|-0.125672131147541   |3731    |3.790152052490987    |0.0         |0.01346228003228441 |0.0            |1    |
|1386378000    |869.9 |0.0873614079824002   |852     |-0.055512584559817016|0.0         |0.042435140822588535|0.0            |1    |
|1386381600    |910.99|0.047235314403954516 |687     |-0.27086493916199555 |0.0         |0.030929359079177463|0.0            |0    |
|1386385200    |862.0 |-0.05377666055609832 |4938    |3.4955645774860575   |0.0         |0.004656438509346882|0.0       

# Spark Execution Plan

In [9]:
print("=" * 60)
print("SPARK EXECUTION PLAN")
print("=" * 60)
final_df.explain("formatted")

plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    final_df.explain("formatted")

plan_file = EVIDENCE_DIR / "feature_engineering_explain.txt"
plan_file.write_text(f"# Feature Engineering Execution Plan\n# Date: {datetime.now()}\n\n{plan_buffer.getvalue()}")

print(f"\nExecution plan saved to: {plan_file}")

SPARK EXECUTION PLAN
== Physical Plan ==
AdaptiveSparkPlan (98)
+- InMemoryTableScan (1) (columnarIn=false, columnarOut=true)
      +- InMemoryRelation (2)
            +- AdaptiveSparkPlan (97)
               +- == Final Plan ==
                  ResultQueryStage (85)
                  +- * Project (84)
                     +- * Filter (83)
                        +- Window (82)
                           +- * Project (81)
                              +- Window (80)
                                 +- * Project (79)
                                    +- Window (78)
                                       +- * Project (77)
                                          +- Window (76)
                                             +- * Sort (75)
                                                +- ShuffleQueryStage (74), Statistics(sizeInBytes=149.7 KiB, rowCount=958)
                                                   +- Exchange (73)
                                                      +- Tabl

# Final Dataset

In [10]:
# Selection des colonnes pour le modele
# CORRECTION: Liste nettoyee apres audit
# NOTE: whale = P99 percentile (top 1% des transactions par montant)
# NOTE: miner_issuance_btc a faible variance en 2013-14 (~50 BTC/bloc)
#       -> garde pour analyse d'importance, candidat a retirer si non-informatif
model_features = [
    "timestamp_hour",
    # Prix (5)
    "close", "return_1h", "return_24h", "volatility_24h", "volume_exchange",
    # On-chain brut (5)
    "tx_count", "volume_btc", "avg_block_size",
    "whale_tx_count", "whale_volume_btc",
    # Miner issuance (1) - faible variance attendue
    "miner_issuance_btc",
    # On-chain enrichi (2)
    "nvt_like", "onchain_vs_exchange",
    # Z-scores (3)
    "tx_count_zscore", "whale_zscore", "issuance_zscore",
    # Calendrier (2)
    "hour_of_day", "day_of_week",
    # Label
    "label"
]

# Total: 18 features + timestamp_hour + label = 20 colonnes
print(f"Features selectionnees: {len(model_features) - 2} (hors timestamp et label)")

output_df = final_df.select(model_features)

(
    output_df.write
    .mode("overwrite")
    .option("compression", "snappy")
    .parquet(str(FEATURES_PATH))
)

print(f"  Features saved to: {FEATURES_PATH}")

parquet_files = list(FEATURES_PATH.glob("*.parquet"))
total_size = sum(f.stat().st_size for f in parquet_files)

print(f"Parquet files created: {len(parquet_files)}")
print(f"Total size: {total_size / (1024*1024):.2f} MB")

verify_df = spark.read.parquet(str(FEATURES_PATH))
verify_count = verify_df.count()

print(f"Rows in Parquet: {verify_count:,}")
print(f"Integrity check: {'OK' if verify_count == final_count else 'MISMATCH'}")

Features selectionnees: 18 (hors timestamp et label)
  Features saved to: /home/img/BigData/Project/data/output/features_parquet
Parquet files created: 1
Total size: 0.08 MB
Rows in Parquet: 933
Integrity check: OK


# Spark Evidence

In [11]:
run_id = f"feature_eng_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

if not METRICS_FILE.exists():
    with open(METRICS_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['run_id', 'task', 'note', 'files_read', 'input_size_bytes', 
                        'shuffle_read_bytes', 'shuffle_write_bytes', 'timestamp'])

# Taille des inputs
tx_size = sum(f.stat().st_size for f in TRANSACTIONS_PATH.glob("*.parquet")) if TRANSACTIONS_PATH.exists() else 0
price_size = sum(f.stat().st_size for f in PRICES_PATH.glob("*.parquet")) if PRICES_PATH.exists() else 0
output_size = sum(f.stat().st_size for f in FEATURES_PATH.glob("*.parquet")) if FEATURES_PATH.exists() else 0

timestamp = datetime.now(timezone.utc).isoformat()

with open(METRICS_FILE, 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([
        run_id,
        "feature_engineering",
        f"samples={final_count},features={len(model_features)-2}",  # -2 pour timestamp et label
        2,  # 2 parquets en input
        tx_size + price_size,
        0,  # A remplir depuis Spark UI si besoin
        0,
        timestamp
    ])

print(f"   Metrics logged to: {METRICS_FILE}")
print(f"   Run ID: {run_id}")
print(f"   Input size: {(tx_size + price_size) / (1024*1024):.2f} MB")
print(f"   Output size: {output_size / (1024*1024):.2f} MB")

# Cleanup
transactions_df.unpersist()
prices_df.unpersist()
chain_hourly.unpersist()
price_hourly.unpersist()
merged_df.unpersist()
final_df.unpersist()
print("Cache released.")

   Metrics logged to: /home/img/BigData/Project/project_metrics_log.csv
   Run ID: feature_eng_20251207_120257
   Input size: 353.67 MB
   Output size: 0.08 MB
Cache released.
