# ML Prep (Feature Engineering)

In [0]:
from pyspark.sql import functions as F

# Load the gold-layer table that feeds feature engineering.
df = spark.table("workspace.ecommerce.mlPrepGold")

# Check for skewness in revenue and purchase rates
df.select("v2p_rate", "total_revenue", "price_mean").summary().show()

# Find high-value products that might be outliers (Z-Score logic).
# Only the upper tail is flagged: price_mean more than Z_THRESHOLD standard
# deviations above the mean.
Z_THRESHOLD = 3

# A global aggregate (no groupBy) yields exactly one row, so [0] is safe.
price_stats = df.select(
    F.mean("price_mean").alias("avg"),
    F.stddev("price_mean").alias("std"),
).collect()
avg_p, std_p = price_stats[0]["avg"], price_stats[0]["std"]

if avg_p is None or std_p is None:
    # The aggregates come back as NULL (None) when there are no / too few
    # non-null price_mean values; without this guard the arithmetic below
    # would raise TypeError. Keep `outliers` defined as an empty frame.
    outliers = df.limit(0)
    print("Number of price outliers: 0 (not enough price_mean data to compute a z-score)")
else:
    outliers = df.filter(F.col("price_mean") > (avg_p + Z_THRESHOLD * std_p))
    print(f"Number of price outliers: {outliers.count()}")


Top 10% revenue threshold: label products at or above the 90th percentile of total_revenue as top sellers

In [0]:
from pyspark.sql import functions as F

# Approximate 90th percentile of total_revenue (relative error 0.01).
quantiles = df.approxQuantile("total_revenue", [0.9], 0.01)
threshold_value = quantiles[0]
print(f"Top 10% Revenue Threshold: ${threshold_value:.2f}")

# Binary label: 1 when revenue reaches the threshold, 0 for everything else.
is_top = F.col("total_revenue") >= threshold_value
df_labeled = df.withColumn("is_top_seller", F.when(is_top, 1).otherwise(0))

# Class balance of the new label.
label_counts = df_labeled.groupBy("is_top_seller").count()
label_counts.show()

Filters: total_views > 100 and view-to-purchase rate (v2p_rate) > 0.01

In [0]:
# Keep only products with enough traffic and a non-trivial conversion rate.
enough_views = F.col("total_views") > 100
converts = F.col("v2p_rate") > 0.01
df_high_intent = df_labeled.where(enough_views & converts)

# Report the surviving row count and how the label is distributed among them.
print(f"Remaining products for training: {df_high_intent.count()}")
high_intent_label_counts = df_high_intent.groupBy("is_top_seller").count()
high_intent_label_counts.show()

In [0]:
from pyspark.ml.feature import Imputer

# Numeric feature columns to impute; each gets a sibling "<name>_imputed" column.
# NOTE(review): the upstream filter compares total_views and v2p_rate with `>`,
# which presumably already drops rows where those are null — confirm whether
# imputing them is still needed.
input_cols = ["total_views", "v2p_rate", "price_mean"]
output_cols = [name + "_imputed" for name in input_cols]

# Fit a median imputer on the filtered frame, then apply it to the same frame.
imputer = Imputer(strategy="median", inputCols=input_cols, outputCols=output_cols)
imputer_model = imputer.fit(df_high_intent)
df_final_prep = imputer_model.transform(df_high_intent)

print("Nulls handled. Ready for final model registration.")

In [0]:
# Sanity check: show any row where an imputed feature is still null or the
# corresponding raw feature is exactly zero.
# (Checks for total_carts / cart_to_purchase_rate were commented out in the
# original cell and are not applied.)
suspect_rows = (
    F.col("total_views_imputed").isNull()
    | (F.col("total_views") == 0)
    | F.col("v2p_rate_imputed").isNull()
    | (F.col("v2p_rate") == 0)
    | F.col("price_mean_imputed").isNull()
    | (F.col("price_mean") == 0)
)
display(df_final_prep.filter(suspect_rows))

In [0]:
# Persist the prepared features, replacing any existing contents of the table.
features_writer = df_final_prep.write.mode("overwrite")
features_writer.saveAsTable("workspace.ecommerce.mlPrepFeatures")

In [0]:
%sql

-- Make this cell safe to re-run: if the constraint is still present from a
-- previous run, adding it again would fail, so drop it first.
ALTER TABLE workspace.ecommerce.mlPrepFeatures
DROP CONSTRAINT IF EXISTS prepfeatures_pk;

-- The key column must be NOT NULL before it can be declared a primary key.
ALTER TABLE workspace.ecommerce.mlPrepFeatures
ALTER COLUMN product_id SET NOT NULL;

-- NOTE: Databricks primary-key constraints are informational (not enforced),
-- so this declaration does not itself verify that product_id is unique.
ALTER TABLE workspace.ecommerce.mlPrepFeatures ADD CONSTRAINT prepfeatures_pk PRIMARY KEY (product_id);
