In [0]:
%sql
-- Preview the first 20 rows of gold.products (all columns).
select * from gold.products limit 20;

In [0]:
from sklearn.model_selection import train_test_split

# Columns used for modelling: the model inputs and the regression target.
feature_cols = ["views", "cart_adds"]
label_col = "purchases"

# toPandas() collects everything it is handed onto the driver, so project down
# to the modelling columns in Spark first instead of pulling the whole table.
# `df` therefore holds only the three modelling columns.
df = spark.table("gold.products").select(*feature_cols, label_col).toPandas()

X = df[feature_cols]
y = df[label_col]

# 80/20 hold-out split; the fixed random_state makes the shuffle repeatable
# for a given input frame.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import mlflow
import mlflow.sklearn


In [0]:
# Candidate regressors keyed by the name used for their MLflow run.
# Seeds are fixed on the tree-based models.
models = dict(
    linear_regression=LinearRegression(),
    decision_tree=DecisionTreeRegressor(random_state=42, max_depth=5),
    random_forest=RandomForestRegressor(random_state=42, n_estimators=100),
)


In [0]:
# All candidate models are recorded under one experiment, one run per model.
mlflow.set_experiment("/Shared/day13_model_comparison")

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        mlflow.log_param("model_type", name)

        # Fit on the training split, then score on the held-out split.
        model.fit(X_train, y_train)
        r2 = model.score(X_test, y_test)
        mlflow.log_metric("r2_score", r2)

        # The first training rows are stored with the model as its input example.
        mlflow.sklearn.log_model(model, "model", input_example=X_train.head())

        print(f"{name} → R² Score: {r2:.4f}")


In [0]:
# Spark DataFrame over the same gold table, kept distributed for the Spark ML pipeline.
spark_df = spark.read.table("gold.products")


In [0]:
from pyspark.ml.feature import VectorAssembler

# Packs the two input columns into the single vector column "features".
feature_columns = ["views", "cart_adds"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)


In [0]:
from pyspark.ml.regression import LinearRegression as SparkLR

# Spark ML linear regression: reads the "features" vector and predicts "purchases".
lr = SparkLR(labelCol="purchases", featuresCol="features")


In [0]:
from pyspark.ml import Pipeline

# Two-stage pipeline: assemble the feature vector, then fit the regression.
pipeline = Pipeline().setStages([assembler, lr])


In [0]:
# Seeded 80/20 random split of the Spark DataFrame.
train_df, test_df = spark_df.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit every pipeline stage on the training portion only.
pipeline_model = pipeline.fit(train_df)


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Apply the fitted pipeline to the held-out split.
predictions = pipeline_model.transform(test_df)

# R² computed from the "purchases" label and the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="r2", labelCol="purchases", predictionCol="prediction"
)
r2_pipeline = evaluator.evaluate(predictions)

print(f"Spark Pipeline R²: {r2_pipeline:.4f}")
