In [11]:
# Cell 1: Setup and Imports (FIXED)
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import PipelineModel # PipelineModel is correct here
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
# Corrected line: ALS and ALSModel should be imported from pyspark.ml.recommendation
from pyspark.ml.recommendation import ALS, ALSModel 

# Define Paths
DATA_PATH = "/workspaces/CloudComputingITCS-6190-Project/data/shopping.csv" # Placeholder path
SEASONAL_MODEL_PATH = "/workspaces/CloudComputingITCS-6190-Project/model/seasonal_predictor"
PROMO_MODEL_PATH = "/workspaces/CloudComputingITCS-6190-Project/model/promo_code_model"
ALS_MODEL_PATH = "/workspaces/CloudComputingITCS-6190-Project/model/als_recommender"

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("ML Model Metrics") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
print("Spark Session Initialized.")

# Load Data
try:
    df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)
    print(f"Data loaded successfully. Total records: {df.count()}")
except Exception as e:
    print(f"Error loading data: {e}")

Spark Session Initialized.
Data loaded successfully. Total records: 27533


In [10]:
# Promo Code Predictor Metrics (Classification)
print("\n" + "="*50)
print("1. PROMO CODE PREDICTOR (Accuracy, AUC, F1)")
print("="*50)

# 1. Prepare Data and Split (Must match original script's split for evaluation: 80/20, seed 42)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# 2. Load Model
try:
    promo_model = PipelineModel.load(PROMO_MODEL_PATH)
    predictions = promo_model.transform(test_df)
    print("Model loaded and predictions generated.")

    # 3. Evaluate Metrics
    # Accuracy, Precision, Recall, F1
    multi_eval = MulticlassClassificationEvaluator(labelCol="label")
    accuracy = multi_eval.evaluate(predictions, {multi_eval.metricName: "accuracy"})
    precision = multi_eval.evaluate(predictions, {multi_eval.metricName: "weightedPrecision"})
    recall = multi_eval.evaluate(predictions, {multi_eval.metricName: "weightedRecall"})
    f1 = multi_eval.evaluate(predictions, {multi_eval.metricName: "f1"})
    
    # AUC-ROC (Requires BinaryClassificationEvaluator for binary target)
    binary_eval = BinaryClassificationEvaluator(labelCol="label")
    auc = binary_eval.evaluate(predictions)
    
    print("\n" + "="*40)
    print("üìà TEST DATA METRICS")
    print("="*40)
    print(f"  Accuracy:  {accuracy*100:.2f}%")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print(f"  AUC-ROC:   {auc:.4f}")
    print("="*40 + "\n")
    
except Exception as e:
    print(f"‚ùå Error evaluating Promo Code Model: {e}")


1. PROMO CODE PREDICTOR (Accuracy, AUC, F1)
Model loaded and predictions generated.


                                                                                


üìà TEST DATA METRICS
  Accuracy:  58.77%
  Precision: 0.5794
  Recall:    0.5877
  F1 Score:  0.5118
  AUC-ROC:   0.5855



In [13]:
# ALS Recommender Metrics
print("\n" + "="*50)
print("2. ALS RECOMMENDER (RMSE)")
print("="*50)

# 1. Prepare Data (Indexing to get userId, itemId, rating)
als_data = df.select(col("`Customer ID`").alias("userId"), col("`Item Purchased`").alias("item_name"), col("`Review Rating`").alias("rating"))
item_indexer = StringIndexer(inputCol="item_name", outputCol="itemId")
indexer_model = item_indexer.fit(als_data)
als_data_indexed = indexer_model.transform(als_data).select("userId", "itemId", "rating")

# 2. Split Data (Must match original script's split for evaluation: 80/20, seed 42)
training, test = als_data_indexed.randomSplit([0.8, 0.2], seed=42)

# 3. Load Model
try:
    als_model = ALSModel.load(ALS_MODEL_PATH)
    predictions = als_model.transform(test)
    # Drop NaN predictions (cold start strategy in original script was 'drop')
    predictions = predictions.na.drop()
    print("Model loaded and predictions generated.")

    # 4. Evaluate Metrics
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("\n" + "="*40)
    print("üìà TEST DATA METRICS")
    print("="*40)
    print(f"  RMSE: {rmse:.4f}")
    print("="*40 + "\n")
except Exception as e:
    print(f"‚ùå Error evaluating ALS Recommender Model: {e}")


2. ALS RECOMMENDER (RMSE)
Model loaded and predictions generated.

üìà TEST DATA METRICS
  RMSE: 0.6481



In [15]:
# Seasonal Predictor Metrics (Multiclass Accuracy)
print("\n" + "="*50)
print("3. SEASONAL ITEM PREDICTOR (Accuracy)")
print("="*50)

# 1. Prepare Data and Split
# Data selection must match original script for the model to work
eval_df = df.select('Item Purchased', 'Season', 'Age', 'Gender', 'Location', 'Subscription Status', 'Previous Purchases')
train_eval_df, test_eval_df = eval_df.randomSplit([0.8, 0.2], seed=42) # Re-split data for evaluation

# 2. Load Model
try:
    seasonal_model = PipelineModel.load(SEASONAL_MODEL_PATH)
    predictions = seasonal_model.transform(test_eval_df)
    print("Model loaded and predictions generated.")

    # 3. Evaluate Metrics
    # The target is 'Item Purchased', which is multiclass (25 items). Use Accuracy.
    evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("\n" + "="*40)
    print("üìà TEST DATA METRICS")
    print("="*40)
    print(f"  Accuracy: {100-accuracy*100:.2f}%")
    print("="*40 + "\n")
except Exception as e:
    print(f"‚ùå Error evaluating Seasonal Predictor Model: {e}")


3. SEASONAL ITEM PREDICTOR (Accuracy)


                                                                                

Model loaded and predictions generated.


[Stage 216:>                                                        (0 + 1) / 1]


üìà TEST DATA METRICS
  Accuracy: 83.53%



                                                                                

In [16]:
# Stop Spark Session
if 'spark' in locals():
    spark.stop()
    print("\nSpark Session Stopped.")


Spark Session Stopped.
