In [0]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [0]:
# Candidate regressors, keyed by the short name that the training loop uses
# for the MLflow run name and the "model_type" param.
# The tree-based estimators are randomised, so they get a fixed seed to make
# their scores comparable between notebook runs.
models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
}

In [0]:
import mlflow
from sklearn.model_selection import train_test_split

In [0]:
for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_model"):
        mlflow.log_param("model_type", name)

In [0]:
df= spark.table("ecommerce.gold.events").toPandas()
X = df[["views"]]
y = df["purchases"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [0]:
model.fit(X_train, y_train)
score = model.score(X_test, y_test)

In [0]:
print(f"{name}: RÂ² = {score:.4f}")

In [0]:
# Spark ML Pipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

In [0]:
# Pack the single predictor column "views" into the vector column
# "features" that the regression stage reads.
assembler = VectorAssembler(
    inputCols=["views"],
    outputCol="features",
)

# Linear regression of "purchases" on the assembled "features" vector.
lr = SparkLR(
    labelCol="purchases",
    featuresCol="features",
)

# Run both stages in order: assemble the vector, then fit the regression.
pipeline = Pipeline(stages=[assembler, lr])

In [0]:
spark_df = spark.table("ecommerce.gold.events")
# Seed the 80/20 split so the train/test partition does not change on every
# run. NOTE(review): repeatability also assumes the table's contents and
# partitioning stay the same between runs.
train, test = spark_df.randomSplit([0.8, 0.2], seed=42)
# Fit the assembler + linear-regression pipeline on the training split.
# Note: this rebinds `model`, which the scikit-learn loop above also uses.
model = pipeline.fit(train)

In [0]:
# Apply the fitted pipeline to the held-out split.
predictions = model.transform(test)
# Render the result in the notebook.
# NOTE(review): .display() is presumably the Databricks DataFrame helper;
# it is not available on a plain PySpark DataFrame — confirm the runtime.
predictions.display()
