In [17]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark
from sklearn.datasets import fetch_openml
import pandas as pd

def train_linear_regression_model():
    # Create a Spark session
    spark = SparkSession.builder.appName("LinearRegressionExample").getOrCreate()

    # Load the Boston Housing Dataset from scikit-learn
    data = fetch_openml(data_id=42165, as_frame=True)
    boston_df = pd.DataFrame(data.data, columns=data.feature_names)  
    boston_df["target"] = data.target

    # Create a Spark DataFrame from the Boston Housing data
    boston_df_spark = spark.createDataFrame(boston_df)

    # Prepare the features using VectorAssembler
    feature_cols = boston_df.columns[:-1].tolist()  # Exclude the target column
    vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    boston_df_spark = vector_assembler.transform(boston_df_spark)

    # Split the dataset into training and testing sets
    train_data, test_data = boston_df_spark.randomSplit([0.8, 0.2], seed=123)

    # Create a LinearRegression model
    lr = LinearRegression(featuresCol="features", labelCol="target")

    # Fit the model to the training data
    lr_model = lr.fit(train_data)

    # Log model parameters
    mlflow.log_param("model_type", "LinearRegression")

    # Log the model itself
    mlflow.spark.log_model(lr_model, "model")

    return lr_model, test_data, spark

def score_linear_regression_model(model, test_data, spark):
    # Make predictions on the test data
    predictions = model.transform(test_data)
    return predictions

def evaluate_linear_regression_model(predictions):
    # Evaluate the model
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    # Log RMSE as a metric
    mlflow.log_metric("RMSE", rmse)

    return rmse

def main():
    # Train the Linear Regression model
    lr_model, test_data, spark = train_linear_regression_model()

    # Score the model on the test data
    predictions = score_linear_regression_model(lr_model, test_data, spark)

    # Evaluate the model
    rmse = evaluate_linear_regression_model(predictions)

    print("Root Mean Squared Error (RMSE):", rmse)

    # End the MLflow run
    mlflow.end_run()

    # Stop the Spark session
    spark.stop()

if __name__ == "__main__":
    main()


  warn(
  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


IllegalArgumentException: Data type string of column MSZoning is not supported.
Data type string of column Street is not supported.
Data type string of column Alley is not supported.
Data type string of column LotShape is not supported.
Data type string of column LandContour is not supported.
Data type string of column Utilities is not supported.
Data type string of column LotConfig is not supported.
Data type string of column LandSlope is not supported.
Data type string of column Neighborhood is not supported.
Data type string of column Condition1 is not supported.
Data type string of column Condition2 is not supported.
Data type string of column BldgType is not supported.
Data type string of column HouseStyle is not supported.
Data type string of column RoofStyle is not supported.
Data type string of column RoofMatl is not supported.
Data type string of column Exterior1st is not supported.
Data type string of column Exterior2nd is not supported.
Data type string of column MasVnrType is not supported.
Data type string of column ExterQual is not supported.
Data type string of column ExterCond is not supported.
Data type string of column Foundation is not supported.
Data type string of column BsmtQual is not supported.
Data type string of column BsmtCond is not supported.
Data type string of column BsmtExposure is not supported.
Data type string of column BsmtFinType1 is not supported.
Data type string of column BsmtFinType2 is not supported.
Data type string of column Heating is not supported.
Data type string of column HeatingQC is not supported.
Data type string of column CentralAir is not supported.
Data type string of column Electrical is not supported.
Data type string of column KitchenQual is not supported.
Data type string of column Functional is not supported.
Data type string of column FireplaceQu is not supported.
Data type string of column GarageType is not supported.
Data type string of column GarageFinish is not supported.
Data type string of column GarageQual is not supported.
Data type string of column GarageCond is not supported.
Data type string of column PavedDrive is not supported.
Data type string of column PoolQC is not supported.
Data type string of column Fence is not supported.
Data type string of column MiscFeature is not supported.
Data type string of column SaleType is not supported.
Data type string of column SaleCondition is not supported.

23/10/27 18:38:46 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 4492611 ms exceeds timeout 120000 ms
23/10/27 18:38:46 WARN SparkContext: Killing executors is not supported by current scheduler.
23/10/27 18:38:52 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

In [6]:
spark = SparkSession.builder.appName("LinearRegressionExample").getOrCreate()


In [7]:
data = fetch_openml(data_id=42165, as_frame=True)
boston_df = pd.DataFrame(data.data, columns=data.feature_names)  
boston_df["target"] = data.target



  warn(


In [8]:
# Create a Spark DataFrame from the Boston Housing data
boston_df_spark = spark.createDataFrame(boston_df)



  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


In [15]:
feature_cols = boston_df.columns[:-1].tolist()  # Exclude the target column


In [16]:
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
