# Vanilla PySpark ML pipeline for classification (IRIS)

In [15]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time

# ============================================================
# CONFIGURATION
# ============================================================
# Input CSV (relative to the working directory) and the iteration cap
# handed to the logistic-regression estimator.
DATA_PATH = "../datasets/iris.csv"
MAX_LR_ITERATIONS = 10

# ============================================================
# START TOTAL BENCHMARK TIMING (comparable to MapReduce)
# ============================================================
# Wall-clock origin for the end-to-end figure reported at the end.
TOTAL_BENCHMARK_START = time.time()

# Banner: rule, title, rule -- one line each.
print(
    "=" * 70,
    "PYSPARK VANILLA BENCHMARK - IRIS DATASET",
    "=" * 70,
    sep="\n",
)

# ------------------------------------------------------------
# INITIALIZE SPARK SESSION
# ------------------------------------------------------------
print("\nInitializing Spark Session...")
init_start = time.time()

# Local session using every available core ("local[*]") with a 4g driver.
spark = (
    SparkSession.builder
    .appName("IrisClassificationVanilla")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Only ERROR-level log lines, so the benchmark output stays readable.
spark.sparkContext.setLogLevel("ERROR")

init_time = time.time() - init_start
print(f"   Initialization time: {init_time:.2f}s")

# ------------------------------------------------------------
# LOAD DATA FROM DISK
# ------------------------------------------------------------
print(f"\nLoading data from {DATA_PATH}...")
load_start = time.time()

try:
    # Header row supplies the column names; column types are inferred.
    df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)
    # count() is an action: it forces the read so load_time covers it.
    total_records = df.count()
except Exception as e:
    # Top-level boundary of the script: report, release Spark, and abort.
    print(f"ERROR: Could not load data from {DATA_PATH}")
    print(e)
    spark.stop()
    # SystemExit is a builtin (unlike the site-provided exit() helper) and
    # the non-zero status tells callers that the benchmark failed.
    raise SystemExit(1)
else:
    # Success path kept out of the try body so only the load is guarded.
    load_time = time.time() - load_start
    print(f"   Loaded {total_records} records in {load_time:.2f}s")

print("\n   Data Schema:")
df.printSchema()
df.show(5, truncate=False)

# ------------------------------------------------------------
# PREPROCESS DATA
# ------------------------------------------------------------
print("\nPreprocessing data...")
prep_start = time.time()

# The last CSV column is treated as the label; all others are features.
label_column = df.columns[-1]
feature_columns = df.columns[:-1]

# Map the raw label column onto a numeric "label" column.
indexer = StringIndexer(inputCol=label_column, outputCol="label")
indexer_model = indexer.fit(df)
df_indexed = indexer_model.transform(df)

# Pack the feature columns into one "features" vector column and keep
# only the two columns the classifiers consume.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(df_indexed)
data = assembled.select("features", "label")

# 70/30 train/test split with a fixed seed.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Mark both splits as cached, then count each one to materialize it
# (important for accurate timing of the later training step).
for split in (train_data, test_data):
    split.cache()
train_count, test_count = train_data.count(), test_data.count()

prep_time = time.time() - prep_start

print(f"   Training size: {train_count}, Test size: {test_count}")
print(f"   Preprocessing time: {prep_time:.2f}s")

# ------------------------------------------------------------
# TRAIN AND EVALUATE MODELS
# ------------------------------------------------------------
print("\nTraining and evaluating models...")

# Both evaluators read the same label/prediction columns and differ only
# in the metric they compute; they are built once and reused per model.
_EVALUATOR_COLUMNS = {"labelCol": "label", "predictionCol": "prediction"}

accuracy_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", **_EVALUATOR_COLUMNS
)
f1_evaluator = MulticlassClassificationEvaluator(
    metricName="f1", **_EVALUATOR_COLUMNS
)

def train_and_evaluate(model, name, total_train_records):
    """Fit ``model`` on the training split, score the test split, and report.

    Uses the module-level ``train_data`` / ``test_data`` DataFrames and the
    module-level ``accuracy_evaluator`` / ``f1_evaluator``.

    Args:
        model: Unfitted estimator; ``model.fit(train_data)`` must return an
            object exposing ``transform``.
        name: Label used in the printed report. The exact value
            "Logistic Regression" additionally enables the convergence
            report (iteration count and max-iteration warning).
        total_train_records: Training row count, used as the throughput
            numerator.

    Returns:
        Tuple ``(accuracy, f1_score, time_taken, throughput)``.
        ``time_taken`` spans fitting, prediction and both metric
        evaluations; ``throughput`` is ``total_train_records / time_taken``
        (0 when no time elapsed).
    """
    print(f"\n   Training: {name}...")
    start = time.time()

    # Train the model, then predict on the held-out split.
    model_fit = model.fit(train_data)
    predictions = model_fit.transform(test_data)

    # Cache, then force execution with count() so the elapsed time includes
    # the prediction pass.
    predictions.cache()
    try:
        predictions.count()

        accuracy = accuracy_evaluator.evaluate(predictions)
        f1_score = f1_evaluator.evaluate(predictions)

        time_taken = time.time() - start

        # Throughput: training records per second of total elapsed time.
        throughput = total_train_records / time_taken if time_taken > 0 else 0

        print(f"      [{name}]")
        print(f"      Accuracy:   {accuracy:.4f}")
        print(f"      F1 Score:   {f1_score:.4f}")
        print(f"      Time:       {time_taken:.2f}s")
        print(f"      Throughput: {throughput:,.0f} records/s")

        # Convergence info for logistic regression, read through the public
        # hasSummary property rather than the private _call_java hook.
        if name == "Logistic Regression" and model_fit.hasSummary:
            total_iters = model_fit.summary.totalIterations
            # Compare against the cap configured on this estimator instead
            # of a module global, so the warning matches the model passed in.
            max_iter = model.getMaxIter()
            print(f"      Iterations: {total_iters}")
            # ">=" so the warning still fires if the reported count exceeds
            # the cap.
            if total_iters >= max_iter:
                print(f" WARNING: Hit max iterations ({max_iter})")
    finally:
        # Release the cached predictions even when evaluation fails.
        predictions.unpersist()

    return accuracy, f1_score, time_taken, throughput

# Per-model result rows, in training order, plus the wall-clock origin for
# the combined training time.
results = []
training_start = time.time()

# Keys of every row appended to `results`, in column order.
_RESULT_FIELDS = ("Model", "Accuracy", "F1 Score", "Time", "Throughput")

# Logistic Regression (iteration count capped by MAX_LR_ITERATIONS)
lr_model = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=MAX_LR_ITERATIONS,
)
lr_metrics = train_and_evaluate(lr_model, "Logistic Regression", train_count)
lr_acc, lr_f1, lr_time, lr_tput = lr_metrics
results.append(
    dict(zip(_RESULT_FIELDS, ("Logistic Regression", lr_acc, lr_f1, lr_time, lr_tput)))
)

# Decision Tree (estimator defaults apart from the column names)
dt_model = DecisionTreeClassifier(featuresCol="features", labelCol="label")
dt_metrics = train_and_evaluate(dt_model, "Decision Tree", train_count)
dt_acc, dt_f1, dt_time, dt_tput = dt_metrics
results.append(
    dict(zip(_RESULT_FIELDS, ("Decision Tree", dt_acc, dt_f1, dt_time, dt_tput)))
)

# Elapsed time across both train/evaluate runs.
training_time = time.time() - training_start

# ------------------------------------------------------------
# CLEANUP
# ------------------------------------------------------------
print("\n Cleaning up...")
cleanup_start = time.time()

# Drop the cached train split first, then the test split.
for cached_split in (train_data, test_data):
    cached_split.unpersist()

cleanup_time = time.time() - cleanup_start
print(f"   Cleanup time: {cleanup_time:.2f}s")

# ============================================================
# END TOTAL BENCHMARK TIMING
# ============================================================
# End-to-end duration, measured from TOTAL_BENCHMARK_START.
TOTAL_BENCHMARK_END = time.time()
TOTAL_TIME = TOTAL_BENCHMARK_END - TOTAL_BENCHMARK_START

# ============================================================
# RESULTS SUMMARY
# ============================================================
# Per-phase wall-clock times, right-aligned in an 8-character column.
print("\n" + "=" * 70)
print("TIMING BREAKDOWN")
print("=" * 70)
print(f"Spark Initialization:   {init_time:>8.2f}s")
print(f"Data Loading:           {load_time:>8.2f}s")
print(f"Preprocessing:          {prep_time:>8.2f}s")
print(f"Training (both models): {training_time:>8.2f}s")
print(f"Cleanup:                {cleanup_time:>8.2f}s")
print("-" * 70)
print(f"TOTAL END-TO-END TIME:  {TOTAL_TIME:>8.2f}s")
print("=" * 70)

# One row per entry in `results`. Each cell is padded to the same width as
# its header (20 / 6 / 6 / 9) so the "|" separators line up.
print("\n" + "=" * 70)
print("MODEL PERFORMANCE SUMMARY")
print("=" * 70)
print(f"{'Model':<20} | {'Acc':<6} | {'F1':<6} | {'Time (s)':<9} | {'Throughput (rec/s)':<20}")
print("-" * 70)
for res in results:
    # Format the time with its unit first, then pad the whole cell; padding
    # the bare number would leave the trailing "s" outside the column.
    time_cell = f"{res['Time']:.2f}s"
    print(
        f"{res['Model']:<20} | {res['Accuracy']:<6.4f} | {res['F1 Score']:<6.4f} | "
        f"{time_cell:<9} | {res['Throughput']:,.0f}"
    )

# Headline numbers for comparison against other benchmark runs.
print("\n" + "=" * 70)
print("COMPARISON METRICS ")
print("=" * 70)
print(f"Total Job Time:         {TOTAL_TIME:.2f}s")
print(f"Training Records:       {train_count}")
print(f"Test Records:           {test_count}")
print(f"Total Records:          {total_records}")
print("=" * 70)

print("\nBenchmark complete!")

# Shut the Spark session down; everything below is plain Python data.
spark.stop()

# Description of the dataset and how its rows were split.
_dataset_info = {
    "name": "iris",
    "path": DATA_PATH,
    "total_records": total_records,
    "train_records": train_count,
    "test_records": test_count,
}

# Final results kept for later analysis: per-phase timings, the per-model
# rows collected in `results`, and the dataset description.
benchmark_summary = {
    "total_time": TOTAL_TIME,
    "init_time": init_time,
    "load_time": load_time,
    "prep_time": prep_time,
    "training_time": training_time,
    "cleanup_time": cleanup_time,
    "models": results,
    "dataset": _dataset_info,
}

PYSPARK VANILLA BENCHMARK - IRIS DATASET

Initializing Spark Session...
   Initialization time: 0.29s

Loading data from ../datasets/iris.csv...
   Loaded 150 records in 0.40s

   Data Schema:
root
 |-- sepal length (cm): double (nullable = true)
 |-- sepal width (cm): double (nullable = true)
 |-- petal length (cm): double (nullable = true)
 |-- petal width (cm): double (nullable = true)
 |-- species: integer (nullable = true)

+-----------------+----------------+-----------------+----------------+-------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|species|
+-----------------+----------------+-----------------+----------------+-------+
|5.1              |3.5             |1.4              |0.2             |0      |
|4.9              |3.0             |1.4              |0.2             |0      |
|4.7              |3.2             |1.3              |0.2             |0      |
|4.6              |3.1             |1.5              |0.2             |0      |
|5.0   