# 0. Setup

In [1]:
import csv
import sys
import time
from contextlib import redirect_stdout
from datetime import datetime, timezone
from io import StringIO
from pathlib import Path

import yaml
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window


def find_project_root():
    """Locate the project root directory, starting from the current working directory.

    Resolution order:
      1. If the working directory is named ``notebooks`` and its parent holds a
         ``conf`` directory, the parent is returned.
      2. Otherwise the working directory and each of its ancestors (the
         filesystem root excluded) are scanned for
         ``conf/bda_project_config.yml``; the first match is returned.
      3. As a last resort the working directory itself is returned when it
         contains a ``conf`` directory.

    Returns:
        Path: the resolved project root.

    Raises:
        FileNotFoundError: if none of the strategies above finds a root.
    """
    start_dir = Path.cwd().resolve()

    # Shortcut for notebooks launched from <root>/notebooks.
    if start_dir.name == "notebooks" and (start_dir.parent / "conf").exists():
        return start_dir.parent

    # Walk upwards. A directory that is its own parent is the filesystem root,
    # which is deliberately not inspected.
    for folder in (start_dir, *start_dir.parents):
        if folder == folder.parent:
            break
        conf_dir = folder / "conf"
        if conf_dir.exists() and (conf_dir / "bda_project_config.yml").exists():
            return folder

    # Fallback: accept a bare conf/ directory next to the working directory.
    if (start_dir / "conf").exists():
        return start_dir

    raise FileNotFoundError(f"Cannot find project root from {Path.cwd()}")

# Resolve the project layout and read the YAML configuration file.
PROJECT_ROOT = find_project_root()
CONFIG_PATH = PROJECT_ROOT.joinpath("conf", "bda_project_config.yml")

config = yaml.safe_load(CONFIG_PATH.read_text(encoding='utf-8'))

# Sections of the configuration used throughout the notebook.
PATHS = config['paths']
SPARK_CFG = config['spark']

print(f"Project Root: {PROJECT_ROOT}")
print(f"Config loaded: {CONFIG_PATH}")

# Build (or reuse) the Spark session: application name, master and driver
# memory come from the configuration; time zone and AQE are fixed here.
spark_builder = SparkSession.builder.appName(f"{SPARK_CFG['app_name']}_ModelTraining")
spark_builder = spark_builder.master(SPARK_CFG['master'])
for option_key, option_value in (
    ("spark.driver.memory", SPARK_CFG['driver_memory']),
    ("spark.sql.session.timeZone", "UTC"),
    ("spark.sql.adaptive.enabled", "true"),
):
    spark_builder = spark_builder.config(option_key, option_value)
spark = spark_builder.getOrCreate()

# Only surface errors from Spark's own logging.
spark.sparkContext.setLogLevel("ERROR")

Project Root: /home/img/BigData/Project
Config loaded: /home/img/BigData/Project/conf/bda_project_config.yml


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/07 12:21:27 WARN Utils: Your hostname, a03-341a, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/07 12:21:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 12:21:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/07 12:21:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/12/07 12:21:30 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/12/07 12:21:30 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/12/07 12:21:30 WARN Utils: Servi

Spark version: 4.0.1


# 1. Load Features

In [2]:
# Input and output locations, all relative to the project root.
FEATURES_PATH = PROJECT_ROOT.joinpath(PATHS.get('output_features', 'data/output_features'))
METRICS_FILE = PROJECT_ROOT.joinpath(PATHS['metrics_file'])
EVIDENCE_DIR = PROJECT_ROOT.joinpath("evidence")
MODELS_DIR = PROJECT_ROOT.joinpath("models")

# Make sure the output directories exist before anything is written to them.
for output_dir in (MODELS_DIR, EVIDENCE_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print(f"Features Path: {FEATURES_PATH}")
print(f"Features exists: {FEATURES_PATH.exists()}")

# Read the feature table and mark it for caching; the count below is the
# first action executed on it.
df = spark.read.parquet(str(FEATURES_PATH)).cache()

total_count = df.count()
n_columns = len(df.columns)
print(f"\nTotal samples: {total_count:,}")
print(f"Columns: {n_columns}")
df.printSchema()

Features Path: /home/img/BigData/Project/data/output/features_parquet
Features exists: True


[Stage 1:>                                                          (0 + 1) / 1]


Total samples: 933
Columns: 20
root
 |-- timestamp_hour: long (nullable = true)
 |-- close: double (nullable = true)
 |-- return_1h: double (nullable = true)
 |-- return_24h: double (nullable = true)
 |-- volatility_24h: double (nullable = true)
 |-- volume_exchange: double (nullable = true)
 |-- tx_count: long (nullable = true)
 |-- volume_btc: double (nullable = true)
 |-- avg_block_size: double (nullable = true)
 |-- whale_tx_count: long (nullable = true)
 |-- whale_volume_btc: double (nullable = true)
 |-- miner_issuance_btc: double (nullable = true)
 |-- nvt_like: double (nullable = true)
 |-- onchain_vs_exchange: double (nullable = true)
 |-- tx_count_zscore: double (nullable = true)
 |-- whale_zscore: double (nullable = true)
 |-- issuance_zscore: double (nullable = true)
 |-- hour_of_day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- label: integer (nullable = true)



                                                                                

# 2. Temporal Train/Test Split

In [3]:
# Chronological train/test split: the first 80% of the covered time span is
# used for training and the remaining 20% for testing, so no future
# observation is used to fit the models.
time_range = df.agg(
    F.min("timestamp_hour").alias("min_ts"),
    F.max("timestamp_hour").alias("max_ts")
).collect()[0]

min_ts = time_range['min_ts']
max_ts = time_range['max_ts']

# min/max are None when the column holds no non-null value; fail with a clear
# message instead of a TypeError in the arithmetic below.
if min_ts is None or max_ts is None:
    raise ValueError("Cannot split: 'timestamp_hour' has no non-null values")

split_ts = min_ts + int(0.8 * (max_ts - min_ts))

# datetime.utcfromtimestamp() is deprecated since Python 3.12; build
# timezone-aware UTC datetimes and format them explicitly so the printed
# text keeps the 'YYYY-MM-DD HH:MM:SS' form.
min_dt, max_dt, split_dt = (
    datetime.fromtimestamp(ts, tz=timezone.utc) for ts in (min_ts, max_ts, split_ts)
)

print(f"Zone temporelle couverte: {min_dt:%Y-%m-%d %H:%M:%S} -> {max_dt:%Y-%m-%d %H:%M:%S}")
print(f"Split à partir de : {split_dt:%Y-%m-%d %H:%M:%S}")

# Rows strictly before the split point train the model; the rest test it.
train_df = df.filter(F.col("timestamp_hour") < split_ts)
test_df = df.filter(F.col("timestamp_hour") >= split_ts)

train_count = train_df.count()
test_count = test_df.count()

print(f"\nTrain set: {train_count:,} lignes ({train_count/total_count*100:.1f}%)")
print(f"Test set:  {test_count:,} lignes ({test_count/total_count*100:.1f}%)")

# Class balance check on both sides of the split.
print("\n Vérification de la distribution des classes du label:")
print("Train:")
train_df.groupBy("label").count().show()
print("Test:")
test_df.groupBy("label").count().show()

Time range: 2013-12-06 23:00:00 -> 2014-01-14 23:00:00
Split point (80%): 2014-01-07 03:48:00

Train set: 748 samples (80.2%)
Test set:  185 samples (19.8%)

Label distribution:
Train:
+-----+-----+
|label|count|
+-----+-----+
|    1|  380|
|    0|  368|
+-----+-----+

Test:
+-----+-----+
|label|count|
+-----+-----+
|    1|   97|
|    0|   88|
+-----+-----+



# 3. Feature Preparation

In [4]:
# Every column except the time key and the target is a candidate feature.
NON_FEATURE_COLUMNS = ('timestamp_hour', 'label')
all_feature_columns = [name for name in df.columns if name not in NON_FEATURE_COLUMNS]

# Raw-level features: their scale depends on the period covered by the data.
RAW_VALUE_FEATURES = [
    'close',            # raw price, depends on the era
    'close_lag1', 'close_lag2', 'close_lag3',
    'close_lag6', 'close_lag12', 'close_lag24',  # lags of the raw price
    'volume_btc',       # raw volume, depends on adoption
    'volume_exchange',  # raw exchange volume
    'tx_count',         # raw transaction count, depends on adoption
    'whale_tx_count', 'whale_volume_btc',  # raw whale values
    'avg_block_size',   # raw size
]

# Features that remain once the raw-level ones are removed.
generalisable_features = [
    name for name in all_feature_columns if name not in RAW_VALUE_FEATURES
]

print(f"\nListe de toutes les features ({len(all_feature_columns)}):")
for rank, name in enumerate(all_feature_columns, start=1):
    suffix = ""
    if name in RAW_VALUE_FEATURES:
        suffix = "  [Brutes]"
    print(f"  {rank:2d}. {name}{suffix}")

print(f"\n Features générales ({len(generalisable_features)}):")
for rank, name in enumerate(generalisable_features, start=1):
    print(f"  {rank:2d}. {name}")

# The main models are trained on the full feature set.
feature_columns = all_feature_columns

# Replace missing numeric values with 0 on both splits.
train_clean = train_df.fillna(0)
test_clean = test_df.fillna(0)

# Assemble the feature columns into one vector; rows that are still invalid
# are skipped by the assembler.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_raw", handleInvalid="skip")

# Standardise the assembled vector (centering and unit-variance scaling).
scaler = StandardScaler(inputCol="features_raw", outputCol="features", withStd=True, withMean=True)

=== FEATURE SETS ===

ALL features (18):
   1. close  [RAW]
   2. return_1h
   3. return_24h
   4. volatility_24h
   5. volume_exchange  [RAW]
   6. tx_count  [RAW]
   7. volume_btc  [RAW]
   8. avg_block_size  [RAW]
   9. whale_tx_count  [RAW]
  10. whale_volume_btc  [RAW]
  11. miner_issuance_btc
  12. nvt_like
  13. onchain_vs_exchange
  14. tx_count_zscore
  15. whale_zscore
  16. issuance_zscore
  17. hour_of_day
  18. day_of_week

GENERALISABLE features (11) - sans valeurs brutes:
   1. return_1h
   2. return_24h
   3. volatility_24h
   4. miner_issuance_btc
   5. nvt_like
   6. onchain_vs_exchange
   7. tx_count_zscore
   8. whale_zscore
   9. issuance_zscore
  10. hour_of_day
  11. day_of_week
Data cleaned (NaN -> 0)
 VectorAssembler and StandardScaler ready


# 4. Baseline Model - Majority-Class Classifier

In [5]:
# Majority-class baseline: find the most frequent label in the training set
# and measure how often that label occurs in the test set.
label_counts = train_clean.groupBy("label").count()
majority_class = label_counts.orderBy(F.col("count").desc()).first()["label"]
print(f"Classe majoritaire (train): {majority_class}")

is_majority = F.col("label") == majority_class
test_majority_count = test_clean.filter(is_majority).count()
baseline_accuracy = test_majority_count / test_count

print("\n=== BASELINE ===")
print(f"Strategie : Toujours prédire la classe {majority_class}")
print(f"Baseline Accuracy: {baseline_accuracy:.4f} ({baseline_accuracy*100:.2f}%)")
print(f"\nNotre modèle doit battre: {baseline_accuracy*100:.2f}%")

Classe majoritaire (train): 1

=== BASELINE ===
Strategy: Always predict class 1
Baseline Accuracy: 0.5243 (52.43%)

Notre modele doit battre: 52.43%


# 5. Model Training - Random Forest

In [6]:
# Random Forest classifier trained on the standardised feature vector.
# The seed is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    minInstancesPerNode=5,
    seed=42
)

# Full pipeline: assemble features -> standardise -> classify.
rf_pipeline = Pipeline(stages=[assembler, scaler, rf])

print("Training Random Forest...")
# `time` is imported in the notebook's top import cell rather than here, so
# every timing cell works regardless of whether this cell has been run.
start_time = time.time()

rf_model = rf_pipeline.fit(train_clean)

rf_train_time = time.time() - start_time
print(f"Temps d'entrainement: {rf_train_time:.2f}s")

# Score the held-out test split and preview the first predictions.
rf_predictions = rf_model.transform(test_clean)
rf_predictions.select("timestamp_hour", "label", "prediction", "probability").show(10)

Training Random Forest...


25/12/07 12:21:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/12/07 12:21:46 WARN DAGScheduler: Broadcasting large task binary with size 1187.6 KiB
25/12/07 12:21:47 WARN DAGScheduler: Broadcasting large task binary with size 1446.4 KiB
25/12/07 12:21:47 WARN DAGScheduler: Broadcasting large task binary with size 1698.1 KiB


Training time: 6.84s
+--------------+-----+----------+--------------------+
|timestamp_hour|label|prediction|         probability|
+--------------+-----+----------+--------------------+
|    1389067200|    1|       0.0|[0.57960122099219...|
|    1389070800|    1|       1.0|[0.49917989895980...|
|    1389074400|    0|       0.0|[0.62012156436929...|
|    1389078000|    0|       0.0|[0.60592155538844...|
|    1389081600|    0|       0.0|[0.60661454415436...|
|    1389085200|    1|       1.0|[0.46074951594725...|
|    1389088800|    0|       0.0|[0.69595878784893...|
|    1389092400|    1|       1.0|[0.43563204007016...|
|    1389096000|    0|       1.0|[0.48906253750196...|
|    1389099600|    1|       1.0|[0.40581418526870...|
+--------------+-----+----------+--------------------+
only showing top 10 rows


25/12/07 12:21:49 WARN DAGScheduler: Broadcasting large task binary with size 1047.8 KiB


# 6. Model Training - Gradient Boosted Trees

In [7]:
# Gradient Boosted Trees classifier; hyper-parameters are gathered in one
# mapping so they can be read at a glance.
gbt_params = {
    "labelCol": "label",
    "featuresCol": "features",
    "maxIter": 50,
    "maxDepth": 5,
    "stepSize": 0.1,
    "seed": 42,
}
gbt = GBTClassifier(**gbt_params)

# Same preprocessing stages as the Random Forest pipeline.
gbt_pipeline = Pipeline(stages=[assembler, scaler, gbt])

print("Training Gradient Boosted Trees...")
start_time = time.time()
gbt_model = gbt_pipeline.fit(train_clean)
gbt_train_time = time.time() - start_time
print(f"Temps d'entrainement: {gbt_train_time:.2f}s")

# Score the test split and preview the first predictions.
gbt_predictions = gbt_model.transform(test_clean)
preview_columns = ["timestamp_hour", "label", "prediction"]
gbt_predictions.select(*preview_columns).show(10)

Training Gradient Boosted Trees...
Training time: 16.53s
+--------------+-----+----------+
|timestamp_hour|label|prediction|
+--------------+-----+----------+
|    1389067200|    1|       0.0|
|    1389070800|    1|       0.0|
|    1389074400|    0|       0.0|
|    1389078000|    0|       0.0|
|    1389081600|    0|       0.0|
|    1389085200|    1|       1.0|
|    1389088800|    0|       0.0|
|    1389092400|    1|       0.0|
|    1389096000|    0|       1.0|
|    1389099600|    1|       1.0|
+--------------+-----+----------+
only showing top 10 rows


25/12/07 12:22:06 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


# 7. Model Evaluation

In [8]:
# Evaluators: area under the ROC curve from the raw scores, plus accuracy and
# F1 from the hard predictions.
binary_evaluator = BinaryClassificationEvaluator(
    labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)
multi_evaluator_acc, multi_evaluator_f1 = (
    MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName=metric
    )
    for metric in ("accuracy", "f1")
)

# Metrics are computed in the order accuracy, F1, AUC for each model.
metric_evaluators = (multi_evaluator_acc, multi_evaluator_f1, binary_evaluator)
rf_accuracy, rf_f1, rf_auc = (ev.evaluate(rf_predictions) for ev in metric_evaluators)
gbt_accuracy, gbt_f1, gbt_auc = (ev.evaluate(gbt_predictions) for ev in metric_evaluators)

# Results table.
heavy_rule = "=" * 60
light_rule = "-" * 60
print(heavy_rule)
print("Evaluation des résultats des modèles")
print(heavy_rule)
print(f"{'Model':<20} {'Accuracy':>12} {'F1-Score':>12} {'AUC-ROC':>12}")
print(light_rule)
print(f"{'Baseline':<20} {baseline_accuracy:>12.4f} {'N/A':>12} {'0.5000':>12}")
for model_label, acc_value, f1_value, auc_value in (
    ("Random Forest", rf_accuracy, rf_f1, rf_auc),
    ("GBT", gbt_accuracy, gbt_f1, gbt_auc),
):
    print(f"{model_label:<20} {acc_value:>12.4f} {f1_value:>12.4f} {auc_value:>12.4f}")
print(heavy_rule)

# The model with the strictly higher AUC wins; GBT is kept otherwise.
if rf_auc > gbt_auc:
    best_model_name, best_accuracy = "Random Forest", rf_accuracy
else:
    best_model_name, best_accuracy = "GBT", gbt_accuracy
best_auc = max(rf_auc, gbt_auc)
print(f"\nBest model: {best_model_name} (AUC: {best_auc:.4f})")

# Relative accuracy gain of the best model over the baseline, in percent.
improvement = (best_accuracy - baseline_accuracy) / baseline_accuracy * 100
print(f"Improvement over baseline: {improvement:+.2f}%")

25/12/07 12:22:06 WARN DAGScheduler: Broadcasting large task binary with size 1069.1 KiB
25/12/07 12:22:06 WARN DAGScheduler: Broadcasting large task binary with size 1069.1 KiB
25/12/07 12:22:07 WARN DAGScheduler: Broadcasting large task binary with size 1056.9 KiB


MODEL EVALUATION RESULTS
Model                    Accuracy     F1-Score      AUC-ROC
------------------------------------------------------------
Baseline (majority)        0.5243          N/A       0.5000
Random Forest              0.5730       0.5730       0.6105
GBT                        0.5459       0.5458       0.5975

Best model: Random Forest (AUC: 0.6105)
Improvement over baseline: +9.28%


# 8. Feature Importance

In [9]:
# The fitted classifier is the last stage of the Random Forest pipeline.
rf_classifier = rf_model.stages[-1]
importances = rf_classifier.featureImportances.toArray()

# Pair each feature name with its importance, highest importance first.
feature_importance = sorted(
    zip(feature_columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

heavy_rule = "=" * 60
light_rule = "-" * 60
print(heavy_rule)
print("Features les plus importantes pour le Random Forest")
print(heavy_rule)
print(f"{'Rank':<6} {'Feature':<25} {'Importance':>12}")
print(light_rule)

# Top 15 features, each with a text bar proportional to its importance.
for rank, (name, weight) in enumerate(feature_importance[:15], start=1):
    gauge = "█" * int(weight * 50)
    print(f"{rank:<6} {name:<25} {weight:>12.4f} {gauge}")

print(light_rule)
print("\nTop 5 features:")
for name, weight in feature_importance[:5]:
    print(f"  - {name}: {weight:.4f}")

FEATURE IMPORTANCE (Random Forest)
Rank   Feature                     Importance
------------------------------------------------------------
1      return_1h                       0.0962 ████
2      tx_count_zscore                 0.0875 ████
3      volume_btc                      0.0869 ████
4      volume_exchange                 0.0852 ████
5      hour_of_day                     0.0840 ████
6      tx_count                        0.0796 ███
7      return_24h                      0.0774 ███
8      nvt_like                        0.0763 ███
9      avg_block_size                  0.0757 ███
10     onchain_vs_exchange             0.0723 ███
11     volatility_24h                  0.0698 ███
12     close                           0.0649 ███
13     day_of_week                     0.0443 ██
14     whale_tx_count                  0.0000 
15     whale_volume_btc                0.0000 
------------------------------------------------------------

Top 5 features:
  - return_1h: 0.0962
  - tx_cou

# 9. Ablation Study (Generalisable Features) and Confusion Matrix

In [10]:
# Ablation study: retrain the Random Forest with the same hyper-parameters
# but only on the features that are not raw-level values, then compare it to
# the model trained on all features.
assembler_gen = VectorAssembler(
    inputCols=generalisable_features,
    outputCol="features_raw",
    handleInvalid="skip"
)

scaler_gen = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withStd=True,
    withMean=True
)

rf_gen = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    minInstancesPerNode=5,
    seed=42
)

pipeline_gen = Pipeline(stages=[assembler_gen, scaler_gen, rf_gen])

print(f"\n Entrainement du Random Forest qu'avec des features générales({len(generalisable_features)} features)...")
start_time = time.time()
model_gen = pipeline_gen.fit(train_clean)
gen_train_time = time.time() - start_time
print(f"Temps d'entrainement: {gen_train_time:.2f}s")

predictions_gen = model_gen.transform(test_clean)

# Same evaluators as for the full-feature models.
gen_accuracy = multi_evaluator_acc.evaluate(predictions_gen)
gen_f1 = multi_evaluator_f1.evaluate(predictions_gen)
gen_auc = binary_evaluator.evaluate(predictions_gen)

print("\n" + "=" * 70)
print("RESULTATS")
print("=" * 70)
print(f"{'Feature Set':<30} {'N_Features':>12} {'Accuracy':>12} {'AUC-ROC':>12}")
print("-" * 70)
print(f"{'Baseline (majority)':<30} {'-':>12} {baseline_accuracy:>12.4f} {'0.5000':>12}")
print(f"{'ALL features':<30} {len(all_feature_columns):>12} {rf_accuracy:>12.4f} {rf_auc:>12.4f}")
print(f"{'GENERALISABLE only':<30} {len(generalisable_features):>12} {gen_accuracy:>12.4f} {gen_auc:>12.4f}")
print("=" * 70)

# Relative drop in percent when raw values are removed. A positive value means
# the reduced model is worse; a negative value means it is better.
acc_drop = (rf_accuracy - gen_accuracy) / rf_accuracy * 100
auc_drop = (rf_auc - gen_auc) / rf_auc * 100

print(f"\n Analysis:")
print(f"   Accuracy drop without raw values: {acc_drop:+.2f}%")
print(f"   AUC drop without raw values: {auc_drop:+.2f}%")

# Compare the signed drop, not its absolute value: a large *gain* without raw
# values (strongly negative drop) must not be reported as a dependence on them.
if acc_drop < 5:
    print(f"\n GOOD: Le modele reste performant sans valeurs brutes!")
    print(f"   -> Le modele est probablement generalisable a d'autres periodes.")
else:
    print(f"\n WARNING: Le modele depend beaucoup des valeurs brutes.")
    print(f"   -> Risque d'overfitting temporel, moins generalisable.")

# Feature importances of the reduced model, highest first.
rf_gen_classifier = model_gen.stages[-1]
importances_gen = rf_gen_classifier.featureImportances.toArray()
feature_importance_gen = list(zip(generalisable_features, importances_gen))
feature_importance_gen.sort(key=lambda x: x[1], reverse=True)

print(f"\n Top 5 features (GENERALISABLE model):")
for i, (feat, imp) in enumerate(feature_importance_gen[:5], 1):
    print(f"   {i}. {feat}: {imp:.4f}")


Training RF with GENERALISABLE features only (11 features)...


25/12/07 12:22:09 WARN DAGScheduler: Broadcasting large task binary with size 1132.1 KiB
25/12/07 12:22:09 WARN DAGScheduler: Broadcasting large task binary with size 1400.7 KiB
25/12/07 12:22:10 WARN DAGScheduler: Broadcasting large task binary with size 1660.7 KiB


Training time: 3.58s


25/12/07 12:22:11 WARN DAGScheduler: Broadcasting large task binary with size 1030.6 KiB
25/12/07 12:22:11 WARN DAGScheduler: Broadcasting large task binary with size 1030.6 KiB



RESULTATS
Feature Set                      N_Features     Accuracy      AUC-ROC
----------------------------------------------------------------------
Baseline (majority)                       -       0.5243       0.5000
ALL features                             18       0.5730       0.6105
GENERALISABLE only                       11       0.5405       0.5907

 Analysis:
   Accuracy drop without raw values: +5.66%
   AUC drop without raw values: +3.24%

   -> Risque d'overfitting temporel, moins generalisable.

 Top 5 features (GENERALISABLE model):
   1. nvt_like: 0.1538
   2. return_1h: 0.1388
   3. onchain_vs_exchange: 0.1377
   4. return_24h: 0.1336
   5. tx_count_zscore: 0.1290


25/12/07 12:22:12 WARN DAGScheduler: Broadcasting large task binary with size 1018.3 KiB


In [11]:
# Predictions of whichever model has the strictly higher AUC (GBT otherwise).
if rf_auc > gbt_auc:
    best_predictions = rf_predictions
else:
    best_predictions = gbt_predictions

# Count every (label, prediction) combination present in the test predictions.
confusion = best_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction")
confusion_data = confusion.collect()

# Index the counts by (label, prediction); combinations absent from the
# predictions default to 0.
cell_counts = {(row['label'], row['prediction']): row['count'] for row in confusion_data}
tn = cell_counts.get((0, 0), 0)
fp = cell_counts.get((0, 1), 0)
fn = cell_counts.get((1, 0), 0)
tp = cell_counts.get((1, 1), 0)

frame_rule = "=" * 40
print(frame_rule)
print(f"CONFUSION MATRIX ({best_model_name})")
print(frame_rule)
print(f"                  Predicted")
print(f"                  0       1")
print(f"Actual  0      {tn:4d}    {fp:4d}")
print(f"        1      {fn:4d}    {tp:4d}")
print(frame_rule)


def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    if denominator > 0:
        return numerator / denominator
    return 0


# Derived metrics for the positive class (label 1) and the negative class.
precision = _ratio_or_zero(tp, tp + fp)
recall = _ratio_or_zero(tp, tp + fn)
specificity = _ratio_or_zero(tn, tn + fp)

print(f"\nPrecision : {precision:.4f}")
print(f"Recall :    {recall:.4f}")
print(f"Specificity:         {specificity:.4f}")

25/12/07 12:22:12 WARN DAGScheduler: Broadcasting large task binary with size 1065.5 KiB
25/12/07 12:22:12 WARN DAGScheduler: Broadcasting large task binary with size 1009.3 KiB
25/12/07 12:22:12 WARN DAGScheduler: Broadcasting large task binary with size 1009.1 KiB
25/12/07 12:22:12 WARN DAGScheduler: Broadcasting large task binary with size 1009.5 KiB


CONFUSION MATRIX (Random Forest)
                  Predicted
                  0       1
Actual  0        53      35
        1        44      53

Precision (class 1): 0.6023
Recall (class 1):    0.5464
Specificity:         0.6023


# 10. Save Results & Evidence

In [12]:
# One UTC clock reading shared by the run id, the CSV rows and the summary,
# so every artifact of this run carries the same timestamp. This matches the
# Spark session, which is also configured in UTC.
now_utc = datetime.now(timezone.utc)
run_id = f"model_train_{now_utc.strftime('%Y%m%d_%H%M%S')}"
timestamp = now_utc.isoformat()

METRICS_HEADER = ['run_id', 'task', 'note', 'files_read', 'input_size_bytes',
                  'shuffle_read_bytes', 'shuffle_write_bytes', 'timestamp']

# The metrics file location comes from the configuration; make sure its
# directory exists before appending.
METRICS_FILE.parent.mkdir(parents=True, exist_ok=True)

# Write the header when the log is new OR exists but is still empty.
write_header = (not METRICS_FILE.exists()) or METRICS_FILE.stat().st_size == 0

with open(METRICS_FILE, 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    if write_header:
        writer.writerow(METRICS_HEADER)
    # Log RF results
    writer.writerow([
        run_id, "model_rf",
        f"acc={rf_accuracy:.4f},f1={rf_f1:.4f},auc={rf_auc:.4f}",
        1, 0, 0, 0, timestamp
    ])
    # Log GBT results
    writer.writerow([
        run_id, "model_gbt",
        f"acc={gbt_accuracy:.4f},f1={gbt_f1:.4f},auc={gbt_auc:.4f}",
        1, 0, 0, 0, timestamp
    ])

print(f"Metrics logged to: {METRICS_FILE}")

# Human-readable summary of the run (overwritten on every run).
summary_file = EVIDENCE_DIR / "model_training_summary.txt"
with open(summary_file, 'w', encoding='utf-8') as f:
    f.write(f"# Model Training Summary\n")
    f.write(f"# Date: {timestamp}\n")
    f.write(f"# Run ID: {run_id}\n\n")

    f.write(f"## Dataset\n")
    f.write(f"Total samples: {total_count}\n")
    f.write(f"Train samples: {train_count} ({train_count/total_count*100:.1f}%)\n")
    f.write(f"Test samples: {test_count} ({test_count/total_count*100:.1f}%)\n")
    f.write(f"Features: {len(feature_columns)}\n\n")

    f.write(f"## Results\n")
    f.write(f"Baseline Accuracy: {baseline_accuracy:.4f}\n\n")
    f.write(f"Random Forest:\n")
    f.write(f"  Accuracy: {rf_accuracy:.4f}\n")
    f.write(f"  F1-Score: {rf_f1:.4f}\n")
    f.write(f"  AUC-ROC: {rf_auc:.4f}\n\n")
    f.write(f"GBT:\n")
    f.write(f"  Accuracy: {gbt_accuracy:.4f}\n")
    f.write(f"  F1-Score: {gbt_f1:.4f}\n")
    f.write(f"  AUC-ROC: {gbt_auc:.4f}\n\n")

    f.write(f"## Feature Importance (Top 10)\n")
    for i, (feat, imp) in enumerate(feature_importance[:10], 1):
        f.write(f"{i}. {feat}: {imp:.4f}\n")

print(f"Summary saved to: {summary_file}")

# Persist the fitted pipeline of the model with the strictly higher AUC
# (GBT otherwise), replacing any model previously saved at the same path.
best_model = rf_model if rf_auc > gbt_auc else gbt_model
model_path = MODELS_DIR / f"best_model_{best_model_name.lower().replace(' ', '_')}"
best_model.write().overwrite().save(str(model_path))
print(f"Best model saved to: {model_path}")

Metrics logged to: /home/img/BigData/Project/project_metrics_log.csv
Summary saved to: /home/img/BigData/Project/evidence/model_training_summary.txt
Best model saved to: /home/img/BigData/Project/models/best_model_random_forest


# 11. Spark Execution Plan

In [13]:
# Render the formatted plan once: explain() prints to stdout, so capture that
# output and reuse the text for both the notebook display and the evidence
# file instead of computing the plan twice.
plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    best_predictions.explain("formatted")
plan_text = plan_buffer.getvalue()

print("=" * 60)
print("SPARK EXECUTION PLAN (Best Model Predictions)")
print("=" * 60)
# The captured text already ends with the newline explain() printed.
print(plan_text, end="")

# Save plan. The encoding is explicit so the file does not depend on the
# platform's locale, like the other files written by this notebook.
plan_file = EVIDENCE_DIR / "model_training_explain.txt"
plan_file.write_text(
    f"# Model Training Execution Plan\n# Date: {datetime.now()}\n\n{plan_text}",
    encoding='utf-8'
)
print(f"\nExecution plan saved to: {plan_file}")

SPARK EXECUTION PLAN (Best Model Predictions)
== Physical Plan ==
* Project (10)
+- * Project (9)
   +- * Project (8)
      +- * Project (7)
         +- * Project (6)
            +- * Filter (5)
               +- InMemoryTableScan (1) (columnarIn=false, columnarOut=true)
                     +- InMemoryRelation (2)
                           +- * ColumnarToRow (4)
                              +- Scan parquet  (3)


(1) InMemoryTableScan
Output [20]: [avg_block_size#8, close#1, day_of_week#18, hour_of_day#17, issuance_zscore#16, label#19, miner_issuance_btc#11, nvt_like#12, onchain_vs_exchange#13, return_1h#2, return_24h#3, timestamp_hour#0L, tx_count#6L, tx_count_zscore#14, volatility_24h#4, volume_btc#7, volume_exchange#5, whale_tx_count#9L, whale_volume_btc#10, whale_zscore#15]
Arguments: [avg_block_size#8, close#1, day_of_week#18, hour_of_day#17, issuance_zscore#16, label#19, miner_issuance_btc#11, nvt_like#12, onchain_vs_exchange#13, return_1h#2, return_24h#3, timestamp_hour#0L, t

# 12. Final Summary

In [14]:
# Final recap of the dataset, model performance, ablation study and artifacts.
print("\n" + "=" * 70)
print("            BITCOIN PRICE DIRECTION PREDICTION")
print("                     FINAL RESULTS")
print("=" * 70)

# datetime.utcfromtimestamp() is deprecated since Python 3.12; use
# timezone-aware UTC datetimes (the printed dates are unchanged).
period_start = datetime.fromtimestamp(min_ts, tz=timezone.utc).date()
period_end = datetime.fromtimestamp(max_ts, tz=timezone.utc).date()

print(f"\n Dataset:")
print(f"   - {total_count:,} samples")
print(f"   - Temporal split: {train_count} train / {test_count} test")
print(f"   - Period: {period_start} to {period_end}")

print(f"\n Performance (ALL features = {len(all_feature_columns)}):")
print(f"   - Baseline (majority): {baseline_accuracy*100:.2f}%")
print(f"   - Best Model ({best_model_name}):")
print(f"       Accuracy: {best_accuracy*100:.2f}%")
print(f"       AUC-ROC:  {best_auc:.4f}")
print(f"       Improvement: {improvement:+.2f}% over baseline")

print(f"\n Ablation Study (GENERALISABLE features = {len(generalisable_features)}):")
print(f"   - Accuracy: {gen_accuracy*100:.2f}%")
print(f"   - AUC-ROC:  {gen_auc:.4f}")
print(f"   - Drop vs ALL: {acc_drop:+.2f}% accuracy, {auc_drop:+.2f}% AUC")
# Compare the signed drop: a negative value means the model got *better*
# without raw values, which must not be reported as a dependence on them.
if acc_drop < 5:
    print(f" Modèle generalisable (faible dependance aux valeurs brutes)")
else:
    print(f" Dépendance aux valeurs brutes (risque d'overfitting temporel)")

print(f"\n Top 5 Features (ALL):")
for i, (feat, imp) in enumerate(feature_importance[:5], 1):
    print(f"   {i}. {feat} ({imp:.4f})")

print(f"\n Top 5 Features (GENERALISABLE):")
for i, (feat, imp) in enumerate(feature_importance_gen[:5], 1):
    print(f"   {i}. {feat} ({imp:.4f})")

print(f"\n Artifacts saved:")
print(f"   - Metrics: {METRICS_FILE}")
print(f"   - Summary: {summary_file}")
print(f"   - Model: {model_path}")

print("\n" + "=" * 70)


# Release the cached feature table now that all results have been produced.
df.unpersist()
print("\nCache released.")


            BITCOIN PRICE DIRECTION PREDICTION
                     FINAL RESULTS

 Dataset:
   - 933 samples
   - Temporal split: 748 train / 185 test
   - Period: 2013-12-06 to 2014-01-14

 Performance (ALL features = 18):
   - Baseline (majority): 52.43%
   - Best Model (Random Forest):
       Accuracy: 57.30%
       AUC-ROC:  0.6105
       Improvement: +9.28% over baseline

 Ablation Study (GENERALISABLE features = 11):
   - Accuracy: 54.05%
   - AUC-ROC:  0.5907
   - Drop vs ALL: +5.66% accuracy, +3.24% AUC
 Dépendance aux valeurs brutes (risque d'overfitting temporel)

 Top 5 Features (ALL):
   1. return_1h (0.0962)
   2. tx_count_zscore (0.0875)
   3. volume_btc (0.0869)
   4. volume_exchange (0.0852)
   5. hour_of_day (0.0840)

 Top 5 Features (GENERALISABLE):
   1. nvt_like (0.1538)
   2. return_1h (0.1388)
   3. onchain_vs_exchange (0.1377)
   4. return_24h (0.1336)
   5. tx_count_zscore (0.1290)

 Artifacts saved:
   - Metrics: /home/img/BigData/Project/project_metrics_log.