In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: an existing one is reused,
# otherwise a new one is created under the application name "Task13".
spark = (
    SparkSession.builder
    .appName("Task13")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and echo it as a liveness check.
sc = spark.sparkContext
print("Spark is running", sc)

Spark is running <SparkContext master=local[*] appName=Task13>


In [2]:
# =========================
# Imports
# =========================
import mlflow
import mlflow.sklearn

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# =========================
# SKLEARN + MLFLOW PART
# run_sklearn_models() below is only defined here, not called: call it once
# X_train, X_test, y_train and y_test exist (this cell does not create them).
# =========================
def run_sklearn_models(X_train, X_test, y_train, y_test):
    """Fit three scikit-learn regressors and track each one in its own MLflow run.

    A linear regression, a depth-5 decision tree and a 100-tree random forest
    are fitted on the training split. For every model one MLflow run named
    ``"<name>_model"`` is opened, in which the model type, the ``max_depth`` /
    ``n_estimators`` attributes (only when the estimator has them), the
    ``r2_score`` metric and the fitted model itself are logged. A one-line
    summary per model is also printed.

    Args:
        X_train: Training features, passed unchanged to ``model.fit``.
        X_test: Held-out features, passed unchanged to ``model.score``.
        y_train: Training targets, passed unchanged to ``model.fit``.
        y_test: Held-out targets, passed unchanged to ``model.score``.

    Returns:
        dict: Maps each model name (``"linear"``, ``"decision_tree"``,
        ``"random_forest"``) to the value returned by
        ``model.score(X_test, y_test)``, in training order, so callers can
        compare models without scraping stdout or querying MLflow.
    """
    models = {
        "linear": LinearRegression(),
        "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=42),
        "random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
    }

    scores = {}
    for name, model in models.items():
        with mlflow.start_run(run_name=f"{name}_model"):
            mlflow.log_param("model_type", name)

            # Only log hyperparameters the estimator actually exposes;
            # not every model in the dict has both attributes.
            for param in ("max_depth", "n_estimators"):
                if hasattr(model, param):
                    mlflow.log_param(param, getattr(model, param))

            model.fit(X_train, y_train)

            score = model.score(X_test, y_test)  # R² on the held-out split
            mlflow.log_metric("r2_score", score)

            mlflow.sklearn.log_model(model, artifact_path="model")

            print(f"[SKLEARN] {name}: R² = {score:.4f}")
            scores[name] = score

    return scores


# =========================
# SPARK PART (self-contained: runs on in-memory sample data)
# =========================
from pyspark.sql import SparkSession
from pyspark.sql import Row

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR
from pyspark.ml.evaluation import RegressionEvaluator

# Reuse the active Spark session if there is one, otherwise start one
# (guards against the "SparkContext is None" error).
spark = SparkSession.builder.appName("Task13_Spark_Pipeline").getOrCreate()

# In-memory sample rows stand in for the gold.products table, which may not
# exist in this environment. Each tuple is (views, cart_adds, purchases).
data = [
    Row(views=views, cart_adds=cart_adds, purchases=purchases)
    for views, cart_adds, purchases in [
        (100, 10, 2),
        (200, 30, 6),
        (150, 20, 4),
        (500, 80, 20),
        (300, 45, 11),
        (400, 60, 15),
        (120, 15, 3),
        (250, 35, 7),
        (600, 90, 22),
        (350, 55, 13),
    ]
]

spark_df = spark.createDataFrame(data)

print("Spark Data Preview")
spark_df.show()

# Column names and split settings shared by the assembler, the estimator, the
# preview and the evaluator. Naming them once keeps those stages in agreement
# instead of relying on matching string literals.
FEATURE_COLS = ["views", "cart_adds"]
FEATURES_COL = "features"
LABEL_COL = "purchases"
PREDICTION_COL = "prediction"
TRAIN_TEST_WEIGHTS = [0.8, 0.2]
SPLIT_SEED = 42

# Pipeline (features -> model): pack the raw columns into one vector column,
# then fit a linear regression that predicts the label from that vector.
assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol=FEATURES_COL)
lr = SparkLR(
    featuresCol=FEATURES_COL,
    labelCol=LABEL_COL,
    predictionCol=PREDICTION_COL,
)
pipeline = Pipeline(stages=[assembler, lr])

# Seeded train/test split so the split is repeatable between runs.
train, test = spark_df.randomSplit(TRAIN_TEST_WEIGHTS, seed=SPLIT_SEED)

# Fit on the training side only, then score the held-out side.
spark_model = pipeline.fit(train)
predictions = spark_model.transform(test)

print("Spark Predictions Preview")
predictions.select(*FEATURE_COLS, LABEL_COL, PREDICTION_COL).show()

# Evaluate R² on the held-out predictions.
# NOTE: the sample data has only 10 rows, so the held-out side is tiny
# (2 rows in the recorded output); treat this R² as illustrative only.
evaluator = RegressionEvaluator(
    labelCol=LABEL_COL,
    predictionCol=PREDICTION_COL,
    metricName="r2",
)

r2 = evaluator.evaluate(predictions)
print(f"[SPARK] Linear Regression R² = {r2:.4f}")

# Bring MLflow's Spark flavor into scope for logging the fitted pipeline.
import mlflow.spark

# Record the already-computed R² and the fitted Spark pipeline under a single
# tracked run (optional, but keeps the Spark result alongside the other runs).
with mlflow.start_run(run_name="spark_lr_pipeline"):
    mlflow.log_metric("r2_score", r2)
    mlflow.log_param("model_type", "spark_linear_regression_pipeline")
    mlflow.spark.log_model(spark_model, artifact_path="spark_model")

print("Task 13 Completed Successfully (No Errors)")

Spark Data Preview
+-----+---------+---------+
|views|cart_adds|purchases|
+-----+---------+---------+
|  100|       10|        2|
|  200|       30|        6|
|  150|       20|        4|
|  500|       80|       20|
|  300|       45|       11|
|  400|       60|       15|
|  120|       15|        3|
|  250|       35|        7|
|  600|       90|       22|
|  350|       55|       13|
+-----+---------+---------+

Spark Predictions Preview
+-----+---------+---------+------------------+
|views|cart_adds|purchases|        prediction|
+-----+---------+---------+------------------+
|  200|       30|        6|6.8416458852866855|
|  120|       15|        3| 2.851371571072306|
+-----+---------+---------+------------------+

[SPARK] Linear Regression R² = 0.8377


2026/01/21 16:47:43 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/21 16:47:43 INFO mlflow.store.db.utils: Updating database tables
2026/01/21 16:47:43 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/21 16:47:43 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/21 16:47:43 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/21 16:47:43 INFO alembic.runtime.migration: Will assume non-transactional DDL.


Task 13 Completed Successfully (No Errors)
