## Vanilla PySpark ML pipeline for classification (SUSY)

In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import DoubleType
import pyspark.sql.functions as F
import time

# ============================================================
# CONFIGURATION
# ============================================================
# Relative path of the SUSY CSV file that the benchmark reads.
DATA_PATH = "../datasets/SUSY.csv"
# Iteration cap passed to LogisticRegression(maxIter=...).
MAX_LR_ITERATIONS = 10

# ============================================================
# START TOTAL BENCHMARK TIMING
# ============================================================
# Wall-clock origin for the end-to-end figure reported at the end of the run.
TOTAL_BENCHMARK_START = time.time()

# Three-line banner: rule, title, rule.
print("=" * 70, "PYSPARK VANILLA BENCHMARK - SUSY DATASET", "=" * 70, sep="\n")

# ------------------------------------------------------------
# INITIALIZE SPARK SESSION
# ------------------------------------------------------------
print("\nInitializing Spark Session...")
init_start = time.time()

# Local-mode session ("local[*]") with the driver memory set to 8g.
spark = (
    SparkSession.builder
    .appName("SUSYClassificationVanilla")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Only ERROR-level Spark log messages are kept, so the report stays readable.
spark.sparkContext.setLogLevel("ERROR")

init_time = time.time() - init_start
print(f"   Initialization time: {init_time:.2f}s")

# ------------------------------------------------------------
# LOAD DATA FROM DISK
# ------------------------------------------------------------
print(f"\nLoading data from {DATA_PATH}...")
load_start = time.time()

try:
    # The CSV is read without a header row and with schema inference enabled.
    df = spark.read.csv(DATA_PATH, header=False, inferSchema=True)
    # count() is an action, so the read really happens inside the timed span.
    total_records = df.count()
except Exception as err:
    print(f"ERROR: Could not load data from {DATA_PATH}")
    print(err)
    spark.stop()
    # Abort with a non-zero status. The bare exit() helper is injected by the
    # `site` module for interactive use and would report success (status 0)
    # even though loading failed.
    raise SystemExit(1) from None
else:
    # Success path: record how long the load took and report it.
    load_time = time.time() - load_start
    print(f"   Loaded {total_records} records in {load_time:.2f}s")

# ------------------------------------------------------------
# PREPROCESS DATA
# ------------------------------------------------------------
print("\nPreprocessing data...")
prep_start = time.time()

# The first CSV column (_c0) becomes the "label" column, stored as a double.
df = df.withColumnRenamed("_c0", "label")
df = df.withColumn("label", F.col("label").cast(DoubleType()))

# Every remaining column is treated as a feature.
feature_columns = [column for column in df.columns if column != "label"]
print(f"   Feature Columns detected: {len(feature_columns)}")

# Pack the feature columns into a single vector column and keep only what
# the models consume.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(df)
data = assembled.select("features", "label")

# 70/30 split with a fixed seed; both halves are marked for caching first,
# then the counts below force them to be computed.
train_data, test_data = (
    part.cache() for part in data.randomSplit([0.7, 0.3], seed=42)
)
train_count = train_data.count()
test_count = test_data.count()

prep_time = time.time() - prep_start

print(f"   Training size: {train_count}, Test size: {test_count}")
print(f"   Preprocessing time: {prep_time:.2f}s")

# ------------------------------------------------------------
# TRAIN AND EVALUATE MODELS
# ------------------------------------------------------------
print("\nTraining and evaluating models...")

# Both evaluators read the same label/prediction columns and differ only in
# the metric they compute.
_evaluator_columns = {"labelCol": "label", "predictionCol": "prediction"}

accuracy_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", **_evaluator_columns
)
f1_evaluator = MulticlassClassificationEvaluator(
    metricName="f1", **_evaluator_columns
)

def train_and_evaluate(model, name, total_train_records):
    """Fit ``model`` on the training split, score it on the test split, and print a report.

    The function reads the module-level ``train_data``, ``test_data``,
    ``accuracy_evaluator`` and ``f1_evaluator`` objects rather than taking
    them as arguments.

    Args:
        model: Unfitted estimator; ``model.fit(train_data)`` must return an
            object with a ``transform`` method.
        name: Label used in the printed report. The exact value
            ``"Logistic Regression"`` additionally prints the iteration
            count when the fitted model has a training summary.
        total_train_records: Number of training rows, used as the numerator
            of the throughput figure.

    Returns:
        A tuple ``(accuracy, f1_score, time_taken, throughput)``.
        ``time_taken`` is the wall-clock time in seconds for fitting,
        predicting and evaluating together. ``throughput`` is
        ``total_train_records / time_taken``, or 0 if no time elapsed.
    """
    print(f"\n   Training: {name}...")
    start = time.time()

    model_fit = model.fit(train_data)
    predictions = model_fit.transform(test_data)

    # Cache the predictions before the count and the two evaluator passes
    # below, which all read this same DataFrame.
    predictions.cache()
    try:
        # Force execution; the count itself is not needed.
        predictions.count()

        accuracy = accuracy_evaluator.evaluate(predictions)
        f1_score = f1_evaluator.evaluate(predictions)

        time_taken = time.time() - start
        throughput = total_train_records / time_taken if time_taken > 0 else 0

        print(f"      [{name}]")
        print(f"      Accuracy:   {accuracy:.4f}")
        print(f"      F1 Score:   {f1_score:.4f}")
        print(f"      Time:       {time_taken:.2f}s")
        print(f"      Throughput: {throughput:,.0f} records/s")

        # Use the public hasSummary property instead of the private
        # _call_java bridge to ask whether a training summary exists.
        if name == "Logistic Regression" and model_fit.hasSummary:
            print(f"      Iterations: {model_fit.summary.totalIterations}")
    finally:
        # Release the cached predictions even when evaluation fails.
        predictions.unpersist()

    return accuracy, f1_score, time_taken, throughput

# One dict per trained model; the summary table is printed from this list.
results = []
training_start = time.time()

# --- Logistic Regression (iteration cap taken from MAX_LR_ITERATIONS) ---
lr_model = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=MAX_LR_ITERATIONS,
)
lr_acc, lr_f1, lr_time, lr_tput = train_and_evaluate(
    lr_model, "Logistic Regression", train_count
)
results.append(
    {
        "Model": "Logistic Regression",
        "Accuracy": lr_acc,
        "F1 Score": lr_f1,
        "Time": lr_time,
        "Throughput": lr_tput,
    }
)

# --- Decision Tree (default hyperparameters) ---
dt_model = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)
dt_acc, dt_f1, dt_time, dt_tput = train_and_evaluate(
    dt_model, "Decision Tree", train_count
)
results.append(
    {
        "Model": "Decision Tree",
        "Accuracy": dt_acc,
        "F1 Score": dt_f1,
        "Time": dt_time,
        "Throughput": dt_tput,
    }
)

# Wall-clock time spent on both models combined.
training_time = time.time() - training_start

# ------------------------------------------------------------
# CLEANUP
# ------------------------------------------------------------
print("\nCleaning up...")
cleanup_start = time.time()

# Release the two cached splits, training first and then test.
for cached_split in (train_data, test_data):
    cached_split.unpersist()

cleanup_time = time.time() - cleanup_start
print(f"   Cleanup time: {cleanup_time:.2f}s")

# ============================================================
# END TOTAL BENCHMARK TIMING
# ============================================================
# End-to-end duration, measured from TOTAL_BENCHMARK_START to this point.
TOTAL_BENCHMARK_END = time.time()
TOTAL_TIME = TOTAL_BENCHMARK_END - TOTAL_BENCHMARK_START

# ============================================================
# RESULTS SUMMARY
# ============================================================
# Per-phase wall-clock times, right-aligned in an 8-character field.
print("\n" + "=" * 70)
print("TIMING BREAKDOWN")
print("=" * 70)
print(f"Spark Initialization:   {init_time:>8.2f}s")
print(f"Data Loading:           {load_time:>8.2f}s")
print(f"Preprocessing:          {prep_time:>8.2f}s")
print(f"Training (both models): {training_time:>8.2f}s")
print(f"Cleanup:                {cleanup_time:>8.2f}s")
print("-" * 70)
print(f"TOTAL END-TO-END TIME:  {TOTAL_TIME:>8.2f}s")
print("=" * 70)

# Per-model metrics table, one row per entry in `results`.
print("\n" + "=" * 70)
print("--- Summary of Vanilla Benchmark (SUSY) ---")
print("=" * 70)
print(f"{'Model':<20} | {'Acc':<6} | {'F1':<6} | {'Time (s)':<9} | {'Throughput (rec/s)':<20}")
print("-" * 70)
for res in results:
    # Render the time with its unit first, then pad the cell to the
    # 9-character "Time (s)" header width so the Throughput column lines up
    # under its header.
    time_cell = f"{res['Time']:.2f}s"
    print(f"{res['Model']:<20} | {res['Accuracy']:.4f} | {res['F1 Score']:.4f} | {time_cell:<9} | {res['Throughput']:,.0f}")

# Headline numbers for comparing this run against other benchmark variants.
print("\n" + "=" * 70)
print("COMPARISON METRICS")
print("=" * 70)
print(f"Total Job Time:         {TOTAL_TIME:.2f}s")
print(f"Training Records:       {train_count}")
print(f"Test Records:           {test_count}")
print("=" * 70)

# Shut the Spark session down once everything has been reported.
spark.stop()

PYSPARK VANILLA BENCHMARK - SUSY DATASET

Initializing Spark Session...
   Initialization time: 0.49s

Loading data from ../datasets/SUSY.csv...


                                                                                

   Loaded 5000000 records in 8.07s

Preprocessing data...
   Feature Columns detected: 18


                                                                                

   Training size: 3499998, Test size: 1500002
   Preprocessing time: 21.92s

Training and evaluating models...

   Training: Logistic Regression...


                                                                                

      [Logistic Regression]
      Accuracy:   0.7851
      F1 Score:   0.7822
      Time:       7.94s
      Throughput: 440,900 records/s
      Iterations: 10

   Training: Decision Tree...


                                                                                

      [Decision Tree]
      Accuracy:   0.7710
      F1 Score:   0.7665
      Time:       5.22s
      Throughput: 669,900 records/s

Cleaning up...
   Cleanup time: 0.00s

TIMING BREAKDOWN
Spark Initialization:       0.49s
Data Loading:               8.07s
Preprocessing:             21.92s
Training (both models):    13.18s
Cleanup:                    0.00s
----------------------------------------------------------------------
TOTAL END-TO-END TIME:     43.68s

--- Summary of Vanilla Benchmark (SUSY) ---
Model                | Acc    | F1     | Time (s)  | Throughput (rec/s)  
----------------------------------------------------------------------
Logistic Regression  | 0.7851 | 0.7822 | 7.94s | 440,900
Decision Tree        | 0.7710 | 0.7665 | 5.22s | 669,900

COMPARISON METRICS
Total Job Time:         43.68s
Training Records:       3499998
Test Records:           1500002
