In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkSession for this notebook: reuse an existing session if one
# is already active, otherwise start a new one registered under the name "MYAPP".
spark = (
    SparkSession.builder
    .appName("MYAPP")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/29 02:09:53 INFO SparkEnv: Registering MapOutputTracker
25/04/29 02:09:53 INFO SparkEnv: Registering BlockManagerMaster
25/04/29 02:09:53 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/04/29 02:09:53 INFO SparkEnv: Registering OutputCommitCoordinator


In [4]:
# Read the CSV from the staging bucket. The first row supplies the column
# names, and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('gs://dataproc-staging-us-central1-740540545323-gfe3jrkb/data/2019-01-h1.csv')
)

# Keep only the three feature columns and the label column used below.
df_taxi = df.select(
    "passenger_count",
    "pulocationid",
    "dolocationid",
    "total_amount",
)
df_taxi.show(10)

                                                                                

+---------------+------------+------------+------------+
|passenger_count|pulocationid|dolocationid|total_amount|
+---------------+------------+------------+------------+
|            1.0|       151.0|       239.0|        9.95|
|            1.0|       239.0|       246.0|        16.3|
|            3.0|       236.0|       236.0|         5.8|
|            5.0|       193.0|       193.0|        7.55|
|            5.0|       193.0|       193.0|       55.55|
|            5.0|       193.0|       193.0|       13.31|
|            5.0|       193.0|       193.0|       55.55|
|            1.0|       163.0|       229.0|        9.05|
|            1.0|       229.0|         7.0|        18.5|
|            2.0|       141.0|       234.0|        13.0|
+---------------+------------+------------+------------+
only showing top 10 rows



In [5]:
# Hold out roughly 20% of the rows for evaluation and train on the rest.
# The fixed seed makes the random split repeatable (for the same input
# partitioning), so the fitted model and the reported RMSE can be reproduced
# after a kernel restart instead of changing on every run.
trainDF, testDF = df_taxi.randomSplit([0.8, 0.2], seed=42)

In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
# Pack the three numeric input columns into a single vector column named
# "features", then preview the assembled training rows.
vecAss = VectorAssembler(
    inputCols=["passenger_count", "pulocationid", "dolocationid"],
    outputCol="features",
)
vecTrainDF = vecAss.transform(trainDF)
vecTrainDF.show(10)

[Stage 3:>                                                          (0 + 1) / 1]

+---------------+------------+------------+------------+--------------+
|passenger_count|pulocationid|dolocationid|total_amount|      features|
+---------------+------------+------------+------------+--------------+
|            0.0|         1.0|         1.0|        90.0| [0.0,1.0,1.0]|
|            0.0|         1.0|         1.0|      116.75| [0.0,1.0,1.0]|
|            0.0|         4.0|         4.0|         4.8| [0.0,4.0,4.0]|
|            0.0|         4.0|         4.0|        5.75| [0.0,4.0,4.0]|
|            0.0|         4.0|        17.0|        20.3|[0.0,4.0,17.0]|
|            0.0|         4.0|        68.0|        15.8|[0.0,4.0,68.0]|
|            0.0|         4.0|        79.0|         5.3|[0.0,4.0,79.0]|
|            0.0|         4.0|        79.0|         5.8|[0.0,4.0,79.0]|
|            0.0|         4.0|        79.0|        6.35|[0.0,4.0,79.0]|
|            0.0|         4.0|        79.0|         7.8|[0.0,4.0,79.0]|
+---------------+------------+------------+------------+--------

                                                                                

In [8]:
from pyspark.ml.regression import DecisionTreeRegressor

In [9]:
# Decision-tree regressor that predicts "total_amount" from the assembled
# "features" vector, with the maximum bin count set to 1000.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="total_amount",
    maxBins=1000,
)

In [10]:
from pyspark.ml import Pipeline

In [11]:
# Chain feature assembly and the regressor into one pipeline. Fitting on the
# raw training frame yields a fitted model whose transform() maps a DataFrame
# to the same DataFrame plus a prediction column.
pipeline = Pipeline(
    stages=[vecAss, dt],
)
pipelineModel = pipeline.fit(trainDF)

                                                                                

In [12]:
# Run the fitted pipeline on the held-out rows and preview the first ten
# predictions next to the actual total_amount values.
predDF = pipelineModel.transform(testDF)
predDF.show(n=10)

[Stage 18:>                                                         (0 + 1) / 1]

+---------------+------------+------------+------------+---------------+------------------+
|passenger_count|pulocationid|dolocationid|total_amount|       features|        prediction|
+---------------+------------+------------+------------+---------------+------------------+
|            0.0|         4.0|         4.0|         4.3|  [0.0,4.0,4.0]|17.841418823273425|
|            0.0|         4.0|        33.0|       17.75| [0.0,4.0,33.0]|17.841418823273425|
|            0.0|         4.0|        68.0|        12.8| [0.0,4.0,68.0]|17.841418823273425|
|            0.0|         4.0|        80.0|       15.95| [0.0,4.0,80.0]|17.841418823273425|
|            0.0|         4.0|       137.0|        9.35|[0.0,4.0,137.0]|17.841418823273425|
|            0.0|         4.0|       144.0|        9.45|[0.0,4.0,144.0]|17.841418823273425|
|            0.0|         7.0|         7.0|        0.31|  [0.0,7.0,7.0]|17.841418823273425|
|            0.0|         7.0|         7.0|         3.3|  [0.0,7.0,7.0]|17.84141

                                                                                

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator
# Compare the "prediction" column with the "total_amount" label on the test
# predictions, using root-mean-squared error as the metric.
evalr = RegressionEvaluator(
    labelCol="total_amount",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evalr.evaluate(predDF)
print(rmse)



33.26615172455253


                                                                                