In [5]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (getOrCreate reuses an existing one)
spark = (
    SparkSession.builder
    .appName("a4")
    .getOrCreate()
)

# Read the CSV using its first row as column names and letting Spark infer column types
df = spark.read.options(header=True, inferSchema=True).csv("2019-01-h1.csv")

# Preview the columns that the later cells use as model inputs and label
df.select(["passenger_count", "pulocationid", "dolocationid", "total_amount"]).show(n=10)

# Creating trainDF and testDF: random split with weights 0.8 / 0.2, seeded so it is repeatable
trainDF, testDF = df.randomSplit(weights=[0.8, 0.2], seed=42)

+---------------+------------+------------+------------+
|passenger_count|pulocationid|dolocationid|total_amount|
+---------------+------------+------------+------------+
|            1.0|       151.0|       239.0|        9.95|
|            1.0|       239.0|       246.0|        16.3|
|            3.0|       236.0|       236.0|         5.8|
|            5.0|       193.0|       193.0|        7.55|
|            5.0|       193.0|       193.0|       55.55|
|            5.0|       193.0|       193.0|       13.31|
|            5.0|       193.0|       193.0|       55.55|
|            1.0|       163.0|       229.0|        9.05|
|            1.0|       229.0|         7.0|        18.5|
|            2.0|       141.0|       234.0|        13.0|
+---------------+------------+------------+------------+
only showing top 10 rows



In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# Pack the three input columns into a single vector column named "features"
assembler = VectorAssembler(
    inputCols=["passenger_count", "pulocationid", "dolocationid"],
    outputCol="features",
)

# Decision tree regressor that predicts total_amount from the assembled vector.
# maxBins is set to 1000 directly in the constructor rather than via a setter.
# NOTE(review): the location-id columns are passed as plain numeric features;
# confirm whether they were meant to be treated as categorical.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="total_amount",
    maxBins=1000,
)


In [8]:
from pyspark.ml import Pipeline

# Chain feature assembly and the regressor so they are fitted and applied together
pipeline = Pipeline().setStages([assembler, dt])


In [9]:
# Fit every pipeline stage on the training split; the result is the fitted pipeline model
model = pipeline.fit(dataset=trainDF)

In [10]:
# Apply the fitted pipeline to the held-out split
predictions = model.transform(dataset=testDF)

# Display the inputs, the actual total amount, and the model's prediction side by side
predictions.select(
    ["passenger_count", "pulocationid", "dolocationid", "total_amount", "prediction"]
).show(n=10)


+---------------+------------+------------+------------+------------------+
|passenger_count|pulocationid|dolocationid|total_amount|        prediction|
+---------------+------------+------------+------------+------------------+
|            1.0|       223.0|       223.0|         4.3| 12.02645461205501|
|            1.0|       234.0|       186.0|         6.2| 12.02645461205501|
|            1.0|       158.0|       249.0|         5.8| 12.02645461205501|
|            1.0|       140.0|       237.0|        10.3| 12.02645461205501|
|            1.0|       148.0|        79.0|         8.8|15.722258274971887|
|            1.0|       233.0|       198.0|        27.3| 12.02645461205501|
|            1.0|       158.0|       164.0|        14.8| 12.02645461205501|
|            4.0|       161.0|       229.0|         6.8| 12.02645461205501|
|            1.0|       143.0|       262.0|        15.3| 12.02645461205501|
|            3.0|        37.0|        36.0|         7.3|  17.8758503968399|
+---------------+------------+------------+------------+------------------+

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that scores the "prediction" column against "total_amount" using RMSE
evaluator = RegressionEvaluator(metricName="rmse", predictionCol="prediction", labelCol="total_amount")

# Compute the metric over the test-set predictions and report it
rmse = evaluator.evaluate(dataset=predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")


Root Mean Squared Error (RMSE): 48.54838873706549
