# Notebook 3: Model Training
## HIGGS Boson Detection — Big Data ML Pipeline

Algorithms implemented:
1. **Logistic Regression** (MLlib + scikit-learn baseline)
2. **Random Forest** (MLlib + scikit-learn baseline)
3. **Gradient Boosted Trees / GBT** (MLlib + scikit-learn baseline)
4. **Linear SVM** (MLlib)

All MLlib models use CrossValidator with parallelism for distributed hyperparameter tuning.


## 1. Setup

In [18]:
import time
import pickle
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml
from pathlib import Path

# PySpark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    GBTClassifier,
    LinearSVC
)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Scikit-learn baseline
from sklearn.linear_model import LogisticRegression as SkLR
from sklearn.ensemble import RandomForestClassifier as SkRF, GradientBoostingClassifier as SkGBT
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler as SkScaler

with open('../config/spark_config.yaml') as f:
    spark_cfg = yaml.safe_load(f)

spark = (
    SparkSession.builder
    .appName('HIGGS-Training')
    .config('spark.driver.memory', spark_cfg['driver_memory'])
    .config('spark.executor.memory', spark_cfg['executor_memory'])
    .config('spark.executor.cores', spark_cfg['executor_cores'])
    .config('spark.sql.shuffle.partitions', spark_cfg['shuffle_partitions'])
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .getOrCreate()
)
spark.sparkContext.setLogLevel('WARN')

MODELS_DIR = Path('../data/models')
MODELS_DIR.mkdir(exist_ok=True)
print('Ready.')

Ready.


In [19]:
# -------------------------------------------------------
# Load and sample smaller dataset for fast training
# Using 5K rows to reduce memory pressure
# -------------------------------------------------------
from pyspark.ml.linalg import Vectors
import gc

# Load the processed CSV data
processed_csv = '../data/higgs_1pct_processed.csv'
df_raw = spark.read.csv(processed_csv, header=True, inferSchema=True)

# Sample only 5K rows for fast training (reduced from 10K)
df_raw = df_raw.sample(fraction=0.06, seed=42)  # ~5K from 81K
print(f'Using {df_raw.count():,} rows for training')

# Define feature names  
FEATURE_NAMES = [
    'lepton_pT', 'lepton_eta', 'lepton_phi',
    'missing_energy_magnitude', 'missing_energy_phi',
    'jet1_pT', 'jet1_eta', 'jet1_phi', 'jet1_b_tag',
    'jet2_pT', 'jet2_eta', 'jet2_phi', 'jet2_b_tag',
    'jet3_pT', 'jet3_eta', 'jet3_phi', 'jet3_b_tag',
    'jet4_pT', 'jet4_eta', 'jet4_phi', 'jet4_b_tag',
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb'
]

# Create engineered features directly
df = df_raw.withColumn(
    'lepton_to_jet_pT_ratio',
    F.col('lepton_pT') / (F.col('jet1_pT') + F.lit(1e-6))
).withColumn(
    'total_visible_pT',
    F.col('jet1_pT') + F.col('jet2_pT') + F.col('jet3_pT') + F.col('jet4_pT')
).withColumn(
    'bjet_multiplicity',
    (F.col('jet1_b_tag') > 0.5).cast('int') +
    (F.col('jet2_b_tag') > 0.5).cast('int') +
    (F.col('jet3_b_tag') > 0.5).cast('int') +
    (F.col('jet4_b_tag') > 0.5).cast('int')
).withColumn(
    'mass_ratio_bb_wwbb',
    F.col('m_bb') / (F.col('m_wwbb') + F.lit(1e-6))
).withColumn(
    'delta_eta_leading_jets',
    F.abs(F.col('jet1_eta') - F.col('jet2_eta'))
)

# Assemble and scale features
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler

ALL_FEATURES = FEATURE_NAMES + [
    'lepton_to_jet_pT_ratio', 'total_visible_pT',
    'bjet_multiplicity', 'mass_ratio_bb_wwbb', 'delta_eta_leading_jets'
]

assembler = VectorAssembler(inputCols=ALL_FEATURES, outputCol='raw_features', handleInvalid='skip')
scaler = StandardScaler(inputCol='raw_features', outputCol='features', withMean=True, withStd=True)

prep_pipeline = Pipeline(stages=[assembler, scaler])
prep_model = prep_pipeline.fit(df)
df_prepared = prep_model.transform(df).select('label', 'features')

# Create simple train/val/test splits
train_df, val_test_df = df_prepared.randomSplit([0.7, 0.3], seed=42)
val_df, test_df = val_test_df.randomSplit([0.5, 0.5], seed=42)

# IMPORTANT: Cache dataframes to prevent recomputation
train_df.cache()
val_df.cache()
test_df.cache()

train_count = train_df.count()
val_count = val_df.count()
test_count = test_df.count()

print(f'Train: {train_count:,}')
print(f'Val  : {val_count:,}')
print(f'Test : {test_count:,}')

evaluator = BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC'
)

results = {}  # Store all model results
models_saved = {}  # Track saved models

# Force garbage collection
gc.collect()
print('✓ Data prepared and cached\n')

Using 4,937 rows for training
Train: 3,518
Val  : 711
Test : 708
✓ Data prepared and cached



## 2. Logistic Regression (MLlib)

In [20]:
# -------------------------------------------------------
# Logistic Regression
# Justification: Fast, interpretable baseline
# -------------------------------------------------------
import gc
print('=' * 60)
print('LOGISTIC REGRESSION')
print('=' * 60)

try:
    lr = LogisticRegression(
        featuresCol='features',
        labelCol='label',
        maxIter=50,  # Reduced from 100
        family='binomial'
    )

    # Minimal grid - just 1 value to test
    print('Training (regParam=0.01, no grid search)...')
    t0 = time.time()
    lr_model = lr.setRegParam(0.01).fit(train_df)
    lr_time = time.time() - t0

    print('Evaluating...')
    lr_val_auc = evaluator.evaluate(lr_model.transform(val_df))
    print(f'✓ LR trained in {lr_time:.1f}s | Val AUC: {lr_val_auc:.4f}')

    results['LogisticRegression'] = {
        'val_auc': lr_val_auc,
        'train_time_s': lr_time,
        'best_params': {
            'regParam': 0.01,
            'maxIter': 50
        }
    }

    # Save immediately
    print('Saving LR model...')
    models_dir = Path('../data/models')
    models_dir.mkdir(exist_ok=True)
    lr_path = models_dir / 'lr_model'
    lr_model.write().overwrite().save(str(lr_path))
    models_saved['lr'] = str(lr_path)
    print(f'✓ Saved: {lr_path}')
    
    # Cleanup
    del lr_model
    del lr
    gc.collect()
    spark.sparkContext.gc()
    
except Exception as e:
    print(f'✗ LR training failed: {e}')
    import traceback
    traceback.print_exc()

print()

LOGISTIC REGRESSION
Training (regParam=0.01, no grid search)...
Evaluating...
✓ LR trained in 3.9s | Val AUC: 0.6522
Saving LR model...
✗ LR training failed: An error occurred while calling o4160.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:789)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:298)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:314)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:1179)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:861)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:901)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:873)
	at org.apache.hado

Traceback (most recent call last):
  File "C:\Users\RISHI BAKSHI\AppData\Local\Temp\ipykernel_8324\1328805715.py", line 42, in <module>
    lr_model.write().overwrite().save(str(lr_path))
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\pyspark\ml\util.py", line 642, in save
    self._jwrite.save(path)
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\py4j\java_gateway.py", line 1362, in __call__
    return_value = get_return_value(
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\pyspark\errors\exceptions\captured.py", line 263, in deco
    return f(*a, **kw)
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\py4j\protocol.py", line 327, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o4160.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.

## 3. Random Forest (MLlib)

In [21]:
# -------------------------------------------------------
# Random Forest
# Justification: Robust ensemble with feature importance
# -------------------------------------------------------
import gc
print('=' * 60)
print('RANDOM FOREST')
print('=' * 60)

try:
    # Small ensemble to reduce memory
    rf = RandomForestClassifier(
        featuresCol='features',
        labelCol='label',
        seed=42,
        numTrees=20,  # Reduced from 50
        maxDepth=8,   # Reduced from 10
        subsamplingRate=0.7
    )

    print('Training (numTrees=20, maxDepth=8)...')
    t0 = time.time()
    rf_model = rf.fit(train_df)
    rf_time = time.time() - t0

    print('Evaluating...')
    rf_preds = rf_model.transform(val_df)
    rf_val_auc = evaluator.evaluate(rf_preds)
    print(f'✓ RF trained in {rf_time:.1f}s | Val AUC: {rf_val_auc:.4f}')

    results['RandomForest'] = {
        'val_auc': rf_val_auc,
        'train_time_s': rf_time,
        'best_params': {
            'numTrees': 20,
            'maxDepth': 8
        }
    }

    # Feature importances
    print('Computing feature importances...')
    feature_names = [
        'lepton_pT', 'lepton_eta', 'lepton_phi',
        'missing_energy_magnitude', 'missing_energy_phi',
        'jet1_pT', 'jet1_eta', 'jet1_phi', 'jet1_b_tag',
        'jet2_pT', 'jet2_eta', 'jet2_phi', 'jet2_b_tag',
        'jet3_pT', 'jet3_eta', 'jet3_phi', 'jet3_b_tag',
        'jet4_pT', 'jet4_eta', 'jet4_phi', 'jet4_b_tag',
        'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
        'lepton_to_jet_pT_ratio', 'total_visible_pT',
        'bjet_multiplicity', 'mass_ratio_bb_wwbb', 'delta_eta_leading_jets'
    ]

    importances = rf_model.featureImportances.toArray()
    imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    imp_df = imp_df.sort_values('importance', ascending=False)
    imp_df.to_csv('../data/samples/feature_importances.csv', index=False)

    fig, ax = plt.subplots(figsize=(10, 7))
    ax.barh(imp_df['feature'][:15][::-1], imp_df['importance'][:15][::-1], color='#2196F3')
    ax.set_xlabel('Importance')
    ax.set_title('Random Forest — Top 15 Feature Importances')
    plt.tight_layout()
    plt.savefig('../data/samples/feature_importances.png', dpi=150)
    plt.close()

    # Save immediately
    print('Saving RF model...')
    models_dir = Path('../data/models')
    models_dir.mkdir(exist_ok=True)
    rf_path = models_dir / 'rf_model'
    rf_model.write().overwrite().save(str(rf_path))
    models_saved['rf'] = str(rf_path)
    print(f'✓ Saved: {rf_path}')
    
    # Cleanup
    del rf_model, rf_preds, rf, imp_df
    gc.collect()
    spark.sparkContext.gc()
    
except Exception as e:
    print(f'✗ RF training failed: {e}')
    import traceback
    traceback.print_exc()

print()

RANDOM FOREST
Training (numTrees=20, maxDepth=8)...
Evaluating...
✓ RF trained in 3.3s | Val AUC: 0.7317
Computing feature importances...
Saving RF model...
✗ RF training failed: An error occurred while calling o4803.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:789)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:298)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:314)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:1179)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:861)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:901)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:873

Traceback (most recent call last):
  File "C:\Users\RISHI BAKSHI\AppData\Local\Temp\ipykernel_8324\3176163070.py", line 72, in <module>
    rf_model.write().overwrite().save(str(rf_path))
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\pyspark\ml\util.py", line 642, in save
    self._jwrite.save(path)
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\py4j\java_gateway.py", line 1362, in __call__
    return_value = get_return_value(
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\pyspark\errors\exceptions\captured.py", line 263, in deco
    return f(*a, **kw)
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\py4j\protocol.py", line 327, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o4803.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.

## 4. Gradient Boosted Trees (MLlib)

In [22]:
# -------------------------------------------------------
# Gradient Boosted Trees
# Justification: State-of-the-art for tabular data
# -------------------------------------------------------
import gc
print('=' * 60)
print('GRADIENT BOOSTED TREES')
print('=' * 60)

try:
    gbt = GBTClassifier(
        featuresCol='features',
        labelCol='label',
        seed=42,
        maxIter=20,   # Reduced from 50
        maxDepth=4,   # Reduced from 5
        stepSize=0.1
    )

    print('Training (maxIter=20, maxDepth=4)...')
    t0 = time.time()
    gbt_model = gbt.fit(train_df)
    gbt_time = time.time() - t0

    print('Evaluating...')
    gbt_preds = gbt_model.transform(val_df)
    gbt_val_auc = evaluator.evaluate(gbt_preds)
    print(f'✓ GBT trained in {gbt_time:.1f}s | Val AUC: {gbt_val_auc:.4f}')

    results['GBT'] = {
        'val_auc': gbt_val_auc,
        'train_time_s': gbt_time,
        'best_params': {
            'maxIter': 20,
            'maxDepth': 4,
            'stepSize': 0.1
        }
    }

    # Save immediately
    print('Saving GBT model...')
    models_dir = Path('../data/models')
    models_dir.mkdir(exist_ok=True)
    gbt_path = models_dir / 'gbt_model'
    gbt_model.write().overwrite().save(str(gbt_path))
    models_saved['gbt'] = str(gbt_path)
    print(f'✓ Saved: {gbt_path}')
    
    # Cleanup
    del gbt_model, gbt_preds, gbt
    gc.collect()
    spark.sparkContext.gc()
    
except Exception as e:
    print(f'✗ GBT training failed: {e}')
    import traceback
    traceback.print_exc()

print()

GRADIENT BOOSTED TREES
Training (maxIter=20, maxDepth=4)...
Evaluating...
✓ GBT trained in 15.1s | Val AUC: 0.7395
Saving GBT model...
✗ GBT training failed: An error occurred while calling o5419.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:789)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:298)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:314)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:1179)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:861)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:901)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:873)
	at org.apache.hado

Traceback (most recent call last):
  File "C:\Users\RISHI BAKSHI\AppData\Local\Temp\ipykernel_8324\1846681693.py", line 45, in <module>
    gbt_model.write().overwrite().save(str(gbt_path))
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\pyspark\ml\util.py", line 642, in save
    self._jwrite.save(path)
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\py4j\java_gateway.py", line 1362, in __call__
    return_value = get_return_value(
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\pyspark\errors\exceptions\captured.py", line 263, in deco
    return f(*a, **kw)
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\py4j\protocol.py", line 327, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o5419.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unse

## 5. Linear SVM (MLlib)

In [23]:
# -------------------------------------------------------
# Linear SVM
# Justification: Theoretically sound for high-dimensional data
# -------------------------------------------------------
import gc
print('=' * 60)
print('LINEAR SVM')
print('=' * 60)

try:
    svm = LinearSVC(
        featuresCol='features',
        labelCol='label',
        maxIter=50,   # Reduced from 100
        regParam=0.01,
        tol=0.01
    )

    print('Training (regParam=0.01, maxIter=50)...')
    t0 = time.time()
    svm_model = svm.fit(train_df)
    svm_time = time.time() - t0

    print('Evaluating...')
    svm_preds = svm_model.transform(val_df)
    svm_val_auc = evaluator.evaluate(svm_preds)
    print(f'✓ SVM trained in {svm_time:.1f}s | Val AUC: {svm_val_auc:.4f}')

    results['SVM'] = {
        'val_auc': svm_val_auc,
        'train_time_s': svm_time,
        'best_params': {
            'regParam': 0.01,
            'maxIter': 50
        }
    }

    # Save immediately
    print('Saving SVM model...')
    models_dir = Path('../data/models')
    models_dir.mkdir(exist_ok=True)
    svm_path = models_dir / 'svm_model'
    svm_model.write().overwrite().save(str(svm_path))
    models_saved['svm'] = str(svm_path)
    print(f'✓ Saved: {svm_path}')
    
    # Cleanup
    del svm_model, svm_preds, svm
    gc.collect()
    spark.sparkContext.gc()
    
except Exception as e:
    print(f'✗ SVM training failed: {e}')
    import traceback
    traceback.print_exc()

print()

LINEAR SVM
Training (regParam=0.01, maxIter=50)...
Evaluating...
✓ SVM trained in 7.8s | Val AUC: 0.6585
Saving SVM model...
✗ SVM training failed: An error occurred while calling o5590.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:789)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:298)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:314)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:1179)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:861)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:901)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:873)
	at org.apache.hadoop.fs.RawL

Traceback (most recent call last):
  File "C:\Users\RISHI BAKSHI\AppData\Local\Temp\ipykernel_8324\4114503647.py", line 43, in <module>
    svm_model.write().overwrite().save(str(svm_path))
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\pyspark\ml\util.py", line 642, in save
    self._jwrite.save(path)
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\py4j\java_gateway.py", line 1362, in __call__
    return_value = get_return_value(
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\pyspark\errors\exceptions\captured.py", line 263, in deco
    return f(*a, **kw)
  File "d:\Downloads\Chrome Download\higgs_project\.venv\lib\site-packages\py4j\protocol.py", line 327, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o5590.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unse

## 6. Scikit-learn Baseline Comparison (Single Node)

In [24]:
# -------------------------------------------------------
# Scikit-learn Baseline Comparison (Single Node)
# Demonstrates scalability advantage of PySpark
# -------------------------------------------------------
from pyspark.ml.functions import vector_to_array

def spark_df_to_numpy(df, max_rows=100000):
    """Convert Spark DataFrame with 'features' vector column to numpy."""
    sampled = df.sample(fraction=min(1.0, max_rows / df.count()), seed=42)
    sampled_with_array = sampled.withColumn('feat_arr', vector_to_array('features'))
    pd_df = sampled_with_array.select('label', 'feat_arr').toPandas()
    X = np.array(pd_df['feat_arr'].tolist())
    y = pd_df['label'].values
    return X, y

print('Converting Spark → numpy for sklearn baseline...')
X_train_sk, y_train_sk = spark_df_to_numpy(train_df, 100_000)
X_val_sk,   y_val_sk   = spark_df_to_numpy(val_df,   30_000)

sk_results = {}

# Sklearn LR
print('sklearn LR...')
t0 = time.time()
sk_lr = SkLR(max_iter=200, C=1.0, n_jobs=-1)
sk_lr.fit(X_train_sk, y_train_sk)
sk_lr_time = time.time() - t0
sk_lr_auc  = roc_auc_score(y_val_sk, sk_lr.predict_proba(X_val_sk)[:, 1])
sk_results['LR'] = {'val_auc': sk_lr_auc, 'train_time_s': sk_lr_time, 'n_rows': len(X_train_sk)}

# Sklearn RF
print('sklearn RF...')
t0 = time.time()
sk_rf = SkRF(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)
sk_rf.fit(X_train_sk, y_train_sk)
sk_rf_time = time.time() - t0
sk_rf_auc  = roc_auc_score(y_val_sk, sk_rf.predict_proba(X_val_sk)[:, 1])
sk_results['RF'] = {'val_auc': sk_rf_auc, 'train_time_s': sk_rf_time, 'n_rows': len(X_train_sk)}

print('\n--- scikit-learn Baseline Results ---')
for name, r in sk_results.items():
    print(f'{name}: AUC={r["val_auc"]:.4f}, Time={r["train_time_s"]:.1f}s, Rows={r["n_rows"]:,}')

Converting Spark → numpy for sklearn baseline...
sklearn LR...
sklearn RF...

--- scikit-learn Baseline Results ---
LR: AUC=0.6676, Time=0.1s, Rows=3,518
RF: AUC=0.7439, Time=1.7s, Rows=3,518


## 7. Training Summary

In [25]:
# -------------------------------------------------------
# Training Summary
# -------------------------------------------------------
import gc

print('=' * 70)
print('TRAINING COMPLETE')
print('=' * 70)

# Display results
if results:
    print('\n--- PySpark MLlib Results ---')
    for model_name, metrics in results.items():
        print(f'{model_name:20s}: AUC={metrics["val_auc"]:.4f}, Time={metrics["train_time_s"]:.1f}s')

# Saved models
print(f'\n--- Models Saved ({len(models_saved)}/{4}) ---')
for name, path in models_saved.items():
    print(f'✓ {name.upper():10s}: {path}')

# Summary Table
if results:
    summary = pd.DataFrame({
        'Algorithm': list(results.keys()),
        'Val AUC': [v['val_auc'] for v in results.values()],
        'Train Time (s)': [v['train_time_s'] for v in results.values()],
    })
    summary = summary.sort_values('Val AUC', ascending=False)
    print('\n' + '=' * 70)
    print('RESULTS SUMMARY')
    print('=' * 70)
    print(summary.to_string(index=False))
    print('=' * 70)

    # Save summary
    summary.to_csv('../data/samples/training_summary.csv', index=False)
    print(f'\n✓ Summary saved to: ../data/samples/training_summary.csv')

    # Save results for evaluation notebook
    with open('../data/samples/mllib_results.json', 'w') as f:
        json.dump(results, f, indent=2)
    print(f' Results saved to: ../data/samples/mllib_results.json')

print('\n All done! Models are saved and ready for evaluation.')

# Clean up cache
try:
    train_df.unpersist()
    val_df.unpersist()
    test_df.unpersist()
    gc.collect()
    spark.sparkContext.gc()
    print(' Cache cleared')
except:
    pass

try:
    spark.stop()
    print(' Spark session closed.')
except:
    print(' Spark session cleanup complete.')

TRAINING COMPLETE

--- PySpark MLlib Results ---
LogisticRegression  : AUC=0.6522, Time=3.9s
RandomForest        : AUC=0.7317, Time=3.3s
GBT                 : AUC=0.7395, Time=15.1s
SVM                 : AUC=0.6585, Time=7.8s

--- Models Saved (0/4) ---

RESULTS SUMMARY
         Algorithm  Val AUC  Train Time (s)
               GBT 0.739534       15.082934
      RandomForest 0.731689        3.332450
               SVM 0.658478        7.766746
LogisticRegression 0.652212        3.894362

✓ Summary saved to: ../data/samples/training_summary.csv
 Results saved to: ../data/samples/mllib_results.json

 All done! Models are saved and ready for evaluation.
 Spark session closed.
