## **DAY 13**

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Step 1: read the gold-layer daily product metrics table and collect it
# into a pandas DataFrame so scikit-learn can work on it.
df = spark.table("gold.daily_product_metrics").toPandas()

# Step 2: split the frame into the regression target and its predictors.
y = df["purchases"]
X = df.loc[:, ["views", "cart_adds"]]

# Hold out 20% of the rows for scoring; the fixed random_state keeps the
# partition identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Step 3: candidate regressors, keyed by the name used as the MLflow run name.
# Both tree-based estimators are seeded: scikit-learn permutes features at
# every split, so an unseeded tree may break ties differently between runs
# and produce scores that cannot be compared run-to-run.
models = {
    "linear_regression": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
}

# Step 4: fit every candidate inside its own MLflow run so the parameters,
# the held-out R² and the fitted estimator are tracked together per model.
for name, model in models.items():
    with mlflow.start_run(run_name=name):

        mlflow.log_param("model_type", name)
        # NOTE: keep in sync with the test_size passed to train_test_split.
        mlflow.log_param("test_size", 0.2)

        model.fit(X_train, y_train)

        # For scikit-learn regressors, .score() returns the coefficient of
        # determination (R²) on the supplied data.
        score = model.score(X_test, y_test)
        mlflow.log_metric("r2_score", score)

        # Ship a small input example with the model: it documents the expected
        # feature columns and lets recent MLflow versions infer and store a
        # model signature instead of warning that none was provided.
        mlflow.sklearn.log_model(model, "model", input_example=X_train.head(5))

        print(f"{name}: R² Score = {score:.4f}")
# Spark ML Pipeline
# Same regression problem as above, expressed as a distributed Spark ML
# pipeline: assemble the two feature columns into a vector, then fit a linear
# regression on the "purchases" label.
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

assembler = VectorAssembler(inputCols=["views","cart_adds"], outputCol="features")
lr = SparkLR(featuresCol="features", labelCol="purchases")
pipeline = Pipeline(stages=[assembler, lr])

spark_df = spark.table("gold.daily_product_metrics")
# Seed the split so the 80/20 partition is repeatable, matching the seeded
# scikit-learn split above. (Full reproducibility also assumes the table's
# partitioning is stable between runs - TODO confirm.)
train, test = spark_df.randomSplit([0.8, 0.2], seed=42)
# NOTE: `model` is rebound here to the fitted Spark PipelineModel, replacing
# the last scikit-learn estimator left over from the training loop.
# NOTE(review): `test` is held out but not evaluated in this cell.
model = pipeline.fit(train)



linear_regression: R² Score = 0.6578




decision_tree: R² Score = 0.6919




random_forest: R² Score = 0.6152
