# 0. Setup

In [1]:
import sys
import yaml
from pathlib import Path
from datetime import datetime, timezone

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, LongType, TimestampType



def find_project_root():
    current = Path.cwd().resolve()
    if current.name == "notebooks":
        candidate = current.parent
        if (candidate / "conf").exists():
            return candidate
    
    search = current
    while search != search.parent:  # Jusqu'Ã  la racine du filesystem
        if (search / "conf").exists() and (search / "conf" / "bda_project_config.yml").exists():
            return search
        search = search.parent
    
    if (current / "conf").exists():
        return current
    
    raise FileNotFoundError(
        f"Cannot find project root (looked for 'conf/' folder)\n"
        f"Started from: {Path.cwd()}\n"
        f"Tip: Run this notebook from the Project/ or Project/notebooks/ directory"
    )

PROJECT_ROOT = find_project_root()
CONFIG_PATH = PROJECT_ROOT / "conf" / "bda_project_config.yml"

with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

PATHS = config['paths']
SPARK_CFG = config['spark']

print(f" Project Root: {PROJECT_ROOT}")
print(f" Config loaded: {CONFIG_PATH}")
print(f" Project Name: {config['project']['name']}")

spark = (
    SparkSession.builder
    .appName(SPARK_CFG['app_name'])
    .master(SPARK_CFG['master'])
    .config("spark.driver.memory", SPARK_CFG['driver_memory'])
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)


spark.sparkContext.setLogLevel("ERROR")

print(f"Spark version: {spark.version}")
print(f"App name: {spark.sparkContext.appName}")
print(f"Master: {spark.sparkContext.master}")

 Project Root: /home/img/BigData/Project
 Config loaded: /home/img/BigData/Project/conf/bda_project_config.yml
 Project Name: Bitcoin Price Predictor


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/07 11:37:56 WARN Utils: Your hostname, a03-341a, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/07 11:37:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 11:37:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/07 11:37:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/12/07 11:37:57 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


Spark version: 4.0.1
App name: BTC_ETL_Custom
Master: local[*]


# 1. File Management

In [2]:
MARKET_DATA_PATH = PROJECT_ROOT / PATHS['market_data']

OUTPUT_MARKET_PATH = PROJECT_ROOT / PATHS['output_market']

METRICS_FILE = PROJECT_ROOT / PATHS['metrics_file']

EVIDENCE_DIR = PROJECT_ROOT / "evidence"

OUTPUT_MARKET_PATH.mkdir(parents=True, exist_ok=True)
EVIDENCE_DIR.mkdir(parents=True, exist_ok=True)

print("=" * 60)
print("PATHS CONFIGURED (relative to PROJECT_ROOT)")
print("=" * 60)
print(f"Input CSV      : {MARKET_DATA_PATH}")
print(f"Output Parquet : {OUTPUT_MARKET_PATH}")
print(f"Metrics File   : {METRICS_FILE}")
print(f"\nCSV exists: {MARKET_DATA_PATH.exists()}")

PATHS CONFIGURED (relative to PROJECT_ROOT)
Input CSV      : /home/img/BigData/Project/data/prices/btcusd_1-min_data.csv
Output Parquet : /home/img/BigData/Project/data/output/market_parquet
Metrics File   : /home/img/BigData/Project/project_metrics_log.csv

CSV exists: True


# 2. Load Raw Data

In [3]:
price_schema = StructType([
    StructField("Timestamp", DoubleType(), True),
    StructField("Open", DoubleType(), True),
    StructField("High", DoubleType(), True),
    StructField("Low", DoubleType(), True),
    StructField("Close", DoubleType(), True),
    StructField("Volume", DoubleType(), True)
])

raw_df = (
    spark.read
    .option("header", "true")
    .schema(price_schema)
    .csv(str(MARKET_DATA_PATH))
)

raw_df.cache()
raw_count = raw_df.count()

print(f"Rows loaded: {raw_count:,}")
print(f"Columns: {raw_df.columns}")

raw_df.printSchema()
raw_df.show(5, truncate=False)

ts_stats = raw_df.agg(
    F.min("Timestamp").alias("min_ts"),
    F.max("Timestamp").alias("max_ts")
).collect()[0]

min_date = datetime.utcfromtimestamp(ts_stats["min_ts"])
max_date = datetime.utcfromtimestamp(ts_stats["max_ts"])

print(f"Date range: {min_date} -> {max_date}")
print(f"Duration: {(max_date - min_date).days} days")

                                                                                

Rows loaded: 7,317,759
Columns: ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
root
 |-- Timestamp: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)

+------------+----+----+----+-----+------+
|Timestamp   |Open|High|Low |Close|Volume|
+------------+----+----+----+-----+------+
|1.32541206E9|4.58|4.58|4.58|4.58 |0.0   |
|1.32541212E9|4.58|4.58|4.58|4.58 |0.0   |
|1.32541218E9|4.58|4.58|4.58|4.58 |0.0   |
|1.32541224E9|4.58|4.58|4.58|4.58 |0.0   |
|1.3254123E9 |4.58|4.58|4.58|4.58 |0.0   |
+------------+----+----+----+-----+------+
only showing top 5 rows
Date range: 2012-01-01 10:01:00 -> 2025-11-30 23:59:00
Duration: 5082 days


# 3. Transform

In [4]:
transformed_df = (
    raw_df
    .withColumn("datetime", F.from_unixtime(F.col("Timestamp")).cast(TimestampType()))
    .withColumn("timestamp_hour", (F.floor(F.col("Timestamp") / 3600) * 3600).cast(LongType()))
    .withColumn("date", F.to_date(F.col("datetime")))
    .withColumn("hour", F.hour(F.col("datetime")))
)

print("Schema after transformation:")
transformed_df.printSchema()


transformed_df.select(
    "Timestamp", "datetime", "timestamp_hour", "date", "hour", "Close"
).show(10, truncate=False)


Schema after transformation:
root
 |-- Timestamp: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- timestamp_hour: long (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)

+------------+-------------------+--------------+----------+----+-----+
|Timestamp   |datetime           |timestamp_hour|date      |hour|Close|
+------------+-------------------+--------------+----------+----+-----+
|1.32541206E9|2012-01-01 10:01:00|1325412000    |2012-01-01|10  |4.58 |
|1.32541212E9|2012-01-01 10:02:00|1325412000    |2012-01-01|10  |4.58 |
|1.32541218E9|2012-01-01 10:03:00|1325412000    |2012-01-01|10  |4.58 |
|1.32541224E9|2012-01-01 10:04:00|1325412000    |2012-01-01|10  |4.58 |
|1.3254123E9 |2012-01-01 10:05:00|1325412000    |2012-01-01|10  |4.58 |
|1.32541

# 4. Spark Execution Plan

In [5]:
print("=" * 60)
print("SPARK EXECUTION PLAN")
print("=" * 60)
transformed_df.explain("formatted")

SPARK EXECUTION PLAN
== Physical Plan ==
* Project (5)
+- * Project (4)
   +- InMemoryTableScan (1) (columnarIn=false, columnarOut=true)
         +- InMemoryRelation (2)
               +- Scan csv  (3)


(1) InMemoryTableScan
Output [6]: [Close#4, High#2, Low#3, Open#1, Timestamp#0, Volume#5]
Arguments: [Close#4, High#2, Low#3, Open#1, Timestamp#0, Volume#5]

(2) InMemoryRelation
Arguments: [Timestamp#0, Open#1, High#2, Low#3, Close#4, Volume#5], StorageLevel(disk, memory, deserialized, 1 replicas)

(3) Scan csv 
Output [6]: [Timestamp#0, Open#1, High#2, Low#3, Close#4, Volume#5]
Batched: false
Location: InMemoryFileIndex [file:/home/img/BigData/Project/data/prices/btcusd_1-min_data.csv]
ReadSchema: struct<Timestamp:double,Open:double,High:double,Low:double,Close:double,Volume:double>

(4) Project [codegen id : 1]
Output [8]: [Timestamp#0, Open#1, High#2, Low#3, Close#4, Volume#5, cast(from_unixtime(cast(Timestamp#0 as bigint), yyyy-MM-dd HH:mm:ss, Some(UTC)) as timestamp) AS datetime#

In [6]:
from contextlib import redirect_stdout
from io import StringIO

plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    transformed_df.explain("formatted")

plan_file = EVIDENCE_DIR / "market_etl_explain.txt"
plan_file.write_text(f"# Market ETL Execution Plan\n# Date: {datetime.now()}\n\n{plan_buffer.getvalue()}")

print(f"Execution plan saved to: {plan_file}")

Execution plan saved to: /home/img/BigData/Project/evidence/market_etl_explain.txt


# 6. Load 

In [7]:
final_df = transformed_df.select(
    F.col("Timestamp").alias("timestamp_unix"),
    "datetime",
    "timestamp_hour",
    "date",
    "hour",
    F.col("Open").alias("open"),
    F.col("High").alias("high"),
    F.col("Low").alias("low"),
    F.col("Close").alias("close"),
    F.col("Volume").alias("volume")
)

print("Final Schema:")
final_df.printSchema()

Final Schema:
root
 |-- timestamp_unix: double (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- timestamp_hour: long (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: double (nullable = true)



In [8]:
(
    final_df.write
    .mode("overwrite")
    .option("compression", "snappy")
    .parquet(str(OUTPUT_MARKET_PATH))
)

print(f"  Parquet saved to: {OUTPUT_MARKET_PATH}")



  Parquet saved to: /home/img/BigData/Project/data/output/market_parquet


                                                                                

In [9]:
parquet_files = list(OUTPUT_MARKET_PATH.glob("*.parquet"))
total_size = sum(f.stat().st_size for f in parquet_files)

print(f"Parquet files created: {len(parquet_files)}")
print(f"Total size: {total_size / (1024*1024):.2f} MB")

verify_df = spark.read.parquet(str(OUTPUT_MARKET_PATH))
verify_count = verify_df.count()

print(f"Rows in Parquet: {verify_count:,}")
print(f"Integrity check: {'OK' if verify_count == raw_count else 'MISMATCH'}")

Parquet files created: 12
Total size: 198.50 MB
Rows in Parquet: 7,317,759
Integrity check: OK


# 7. Spark Evidence

In [10]:
import csv

run_id = f"market_etl_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

if not METRICS_FILE.exists():
    with open(METRICS_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['run_id', 'task', 'note', 'files_read', 'input_size_bytes', 
                        'shuffle_read_bytes', 'shuffle_write_bytes', 'timestamp'])


input_size = MARKET_DATA_PATH.stat().st_size if MARKET_DATA_PATH.exists() else 0
output_size = sum(f.stat().st_size for f in OUTPUT_MARKET_PATH.glob("*.parquet"))

timestamp = datetime.now(timezone.utc).isoformat()

with open(METRICS_FILE, 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([
        run_id,                    # run_id
        "market_etl",              # task
        f"rows={raw_count}",       # note
        1,                         # files_read (1 CSV)
        input_size,                # input_size_bytes
        0,                         # shuffle_read_bytes (pas de shuffle)
        0,                         # shuffle_write_bytes
        timestamp                  # timestamp
    ])

print(f"   Metrics logged to: {METRICS_FILE}")
print(f"   Run ID: {run_id}")
print(f"   Input size: {input_size / (1024*1024):.2f} MB")
print(f"   Output size: {output_size / (1024*1024):.2f} MB")


raw_df.unpersist()
print("Cache released.")



   Metrics logged to: /home/img/BigData/Project/project_metrics_log.csv
   Run ID: market_etl_20251207_113821
   Input size: 365.96 MB
   Output size: 198.50 MB
Cache released.
