In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import VectorUDT

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.pipeline import PipelineModel

# Start Spark Session

In [2]:
# Start spark session. 

spark = SparkSession\
            .builder\
            .master("spark://spark-master:7077")\
            .appName("2_car_data_predictions_MLlib_jupyter")\
            .config("spark.executor.memory", "3G")\
            .config("spark.driver.memory", "3G")\
            .getOrCreate()


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/31 16:16:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Load the Pre-processed Car Data Parquet File

In [3]:
# Read in Parquet file.

car_df = spark.read.parquet("/data/car_data.parquet")


                                                                                

In [4]:
# Print the dataframe schema.

car_df.printSchema()


root
 |-- Car_Name: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Selling_Price: double (nullable = true)
 |-- Present_Price: double (nullable = true)
 |-- Kms_Driven: integer (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Seller_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Owner: integer (nullable = true)
 |-- Car_Age: integer (nullable = true)



In [5]:
# Show a few sample records. 

car_df.show(5, False)


[Stage 1:>                                                          (0 + 1) / 1]

+----------------------+----+-------------+-------------+----------+---------+-----------+------------+-----+-------+
|Car_Name              |Year|Selling_Price|Present_Price|Kms_Driven|Fuel_Type|Seller_Type|Transmission|Owner|Car_Age|
+----------------------+----+-------------+-------------+----------+---------+-----------+------------+-----+-------+
|ertiga                |2014|6.0          |9.95         |45000     |Diesel   |Dealer     |Manual      |0    |8      |
|city                  |2015|8.4          |13.6         |34000     |Petrol   |Dealer     |Manual      |0    |7      |
|TVS Apache RTR 160    |2012|0.6          |0.81         |19000     |Petrol   |Individual |Manual      |0    |10     |
|etios liva            |2011|2.65         |5.71         |43000     |Petrol   |Dealer     |Manual      |0    |11     |
|Hero Honda Passion Pro|2012|0.3          |0.51         |60000     |Petrol   |Individual |Manual      |0    |10     |
+----------------------+----+-------------+-------------

                                                                                

In [6]:
# What's the partitioning situation in the Parquet file ?

car_df\
    .withColumn("partitionId", F.spark_partition_id())\
    .groupBy("partitionId")\
    .count()\
    .orderBy(F.asc("count"))\
    .show()



+-----------+-----+
|partitionId|count|
+-----------+-----+
|          2|   50|
|          4|   50|
|          0|   50|
|          3|   50|
|          5|   50|
|          1|   51|
+-----------+-----+



                                                                                

# Prepare Data for Algorithmic Input

In [7]:
# Map a string column of labels to an ML column of label indices 

# (Colums : 'Fuel_Type', 'Seller_Type', 'Transmission').


car_df = car_df.drop('fuel_Type_idx', 'seller_type_idx', 'transmission_idx')

indexer = StringIndexer(inputCols=['Fuel_Type', 'Seller_Type', 'Transmission'],
                        outputCols=['fuel_Type_idx', 'seller_type_idx', 'transmission_idx']
)

car_df = indexer.fit(car_df).transform(car_df)


                                                                                

In [8]:
# We created 3 new columns. Show aggregate counts.

car_df.select('Fuel_Type','fuel_Type_idx')\
    .groupBy('Fuel_Type','fuel_Type_idx')\
    .count()\
    .orderBy(F.col('fuel_Type_idx').asc())\
    .show()

car_df.select('Seller_Type','seller_type_idx')\
    .groupBy('Seller_Type','seller_type_idx')\
    .count()\
    .orderBy(F.col('seller_type_idx').asc())\
    .show()

car_df.select('Transmission','transmission_idx')\
    .groupBy('Transmission','transmission_idx')\
    .count()\
    .orderBy(F.col('transmission_idx').asc())\
    .show()

+---------+-------------+-----+
|Fuel_Type|fuel_Type_idx|count|
+---------+-------------+-----+
|   Petrol|          0.0|  239|
|   Diesel|          1.0|   60|
|      CNG|          2.0|    2|
+---------+-------------+-----+

+-----------+---------------+-----+
|Seller_Type|seller_type_idx|count|
+-----------+---------------+-----+
|     Dealer|            0.0|  195|
| Individual|            1.0|  106|
+-----------+---------------+-----+

+------------+----------------+-----+
|Transmission|transmission_idx|count|
+------------+----------------+-----+
|      Manual|             0.0|  261|
|   Automatic|             1.0|   40|
+------------+----------------+-----+



In [9]:
# The car_df dataframe will show the 3 extra columns : 'fuel_Type_idx', 'seller_type_idx', 'transmission_idx'

car_df.printSchema()


root
 |-- Car_Name: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Selling_Price: double (nullable = true)
 |-- Present_Price: double (nullable = true)
 |-- Kms_Driven: integer (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Seller_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Owner: integer (nullable = true)
 |-- Car_Age: integer (nullable = true)
 |-- fuel_Type_idx: double (nullable = false)
 |-- seller_type_idx: double (nullable = false)
 |-- transmission_idx: double (nullable = false)



In [10]:
# One-hot encoding. For string type input data, it is common to encode categorical features using StringIndexer first.

# Drop the columns I'm just about to create if they exist.
car_df = car_df.drop('fuel_Type_vec', 'seller_type_vec', 'transmission_vec')

encoder = OneHotEncoder(inputCols=['fuel_Type_idx', 'seller_type_idx', 'transmission_idx'],
                        outputCols=['fuel_Type_vec', 'seller_type_vec', 'transmission_vec'],
                        dropLast=True
)
model = encoder.fit(car_df)
car_df = model.transform(car_df)


In [11]:
# Something to be aware of ...

# PySpark automatically drops the last category is not included BY DEFAULT. This is to avoid a Dummy Variable Trap
# linear regression models....

# What is the Dummy Variable Trap (DVT)? The DVT occurs when two or more dummy variables 
# created by one-hot encoding are highly correlated (multi-collinear). This means that one variable can be 
# predicted from the others, making it difficult to interpret predicted coefficient variables in regression models.


# https://stackoverflow.com/questions/39500213/why-does-sparks-onehotencoder-drop-the-last-category-by-default

# But basically, ... dropping the last cat. value is done to avoid a DVT where one input variable can be predicted 
# from the others (eg. don't need a 1hot encoding of [isBoy, isGirl] when an encoding [isBoy] would give the same info). 
# The solution to the DVT is to drop one (not necessarily the last) of the cat. variables.


# There are 3 fuel types but the 'fuel_type_vec' column has only 2 elements because of this DVT last category drop thing.

car_df\
    .select('Fuel_type', 'fuel_type_idx', 'fuel_type_vec')\
    .distinct()\
    .show(10)
                                                                                                                                  

+---------+-------------+-------------+
|Fuel_type|fuel_type_idx|fuel_type_vec|
+---------+-------------+-------------+
|   Petrol|          0.0|(2,[0],[1.0])|
|   Diesel|          1.0|(2,[1],[1.0])|
|      CNG|          2.0|    (2,[],[])|
+---------+-------------+-------------+



In [12]:
# Print the car_df schema. 

# The car_df dataframe will show the 3 extra columns : 'fuel_Type_vec', 'seller_type_vec', 'transmission_vec'

car_df.printSchema()


root
 |-- Car_Name: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Selling_Price: double (nullable = true)
 |-- Present_Price: double (nullable = true)
 |-- Kms_Driven: integer (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Seller_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Owner: integer (nullable = true)
 |-- Car_Age: integer (nullable = true)
 |-- fuel_Type_idx: double (nullable = false)
 |-- seller_type_idx: double (nullable = false)
 |-- transmission_idx: double (nullable = false)
 |-- fuel_Type_vec: vector (nullable = true)
 |-- seller_type_vec: vector (nullable = true)
 |-- transmission_vec: vector (nullable = true)



In [13]:
# https://spark.apache.org/docs/latest/ml-features.html#vectorassembler

# Reminder... sparse vector (size, [non-zero indices], [non-zero values]) 
# [1.0,0.0] = (2,[0],[1.0])

# Drop the column if exists
car_df = car_df.drop('algorithmic_input')


# Assemble features I'm interested in using as a large vector column called 'algorithmic_input'.
assembler = VectorAssembler(
    inputCols=[
        'Present_Price', 
        'Kms_Driven', 
        'Owner', 
        'Car_Age', 
        'fuel_Type_vec', 
        'seller_type_vec',
        'transmission_vec'
    ],
    outputCol='algorithmic_input')


car_df = assembler.transform(car_df)


In [14]:
# Print the car_df schema. 

# The car_df dataframe will show the 1 extra columns : 'algorithmic_input'

car_df.printSchema()

root
 |-- Car_Name: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Selling_Price: double (nullable = true)
 |-- Present_Price: double (nullable = true)
 |-- Kms_Driven: integer (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Seller_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Owner: integer (nullable = true)
 |-- Car_Age: integer (nullable = true)
 |-- fuel_Type_idx: double (nullable = false)
 |-- seller_type_idx: double (nullable = false)
 |-- transmission_idx: double (nullable = false)
 |-- fuel_Type_vec: vector (nullable = true)
 |-- seller_type_vec: vector (nullable = true)
 |-- transmission_vec: vector (nullable = true)
 |-- algorithmic_input: vector (nullable = true)



In [15]:
# Show the some of the table columns

car_df.select('algorithmic_input', 'Selling_Price').show(10, False)


+---------------------------------------+-------------+
|algorithmic_input                      |Selling_Price|
+---------------------------------------+-------------+
|[9.95,45000.0,0.0,8.0,0.0,1.0,1.0,1.0] |6.0          |
|[13.6,34000.0,0.0,7.0,1.0,0.0,1.0,1.0] |8.4          |
|[0.81,19000.0,0.0,10.0,1.0,0.0,0.0,1.0]|0.6          |
|[5.71,43000.0,0.0,11.0,1.0,0.0,1.0,1.0]|2.65         |
|[0.51,60000.0,0.0,10.0,1.0,0.0,0.0,1.0]|0.3          |
|[6.8,33019.0,0.0,8.0,1.0,0.0,1.0,1.0]  |3.75         |
|[12.04,15000.0,0.0,8.0,1.0,0.0,1.0,0.0]|7.5          |
|[9.9,56701.0,0.0,9.0,1.0,0.0,1.0,1.0]  |5.0          |
|[23.15,11000.0,0.0,5.0,1.0,0.0,1.0,0.0]|19.75        |
|[0.81,42000.0,0.0,8.0,1.0,0.0,0.0,1.0] |0.42         |
+---------------------------------------+-------------+
only showing top 10 rows



# Model Training

In [16]:
# Create train/test split. Seed it for reproducibility.
seed = 111
train_df, test_df = car_df.randomSplit([0.7, 0.3], seed=seed)

print('Training Dataset Count : {}'.format(train_df.count()))
print('Test Dataset Count : {}'.format(test_df.count()))


                                                                                

Training Dataset Count : 212
Test Dataset Count : 89


In [17]:
# Train a simlpe RandomForestRegressor model. This very simple model is meant 
# to show model feature inputs and outputs. The model is kept simple on purpose 
# to keep training time short.


# For more information on model tuning and ParamGrid, 
# this is a good resource to start with : https://medium.com/rahasak/random-forest-classifier-with-apache-spark-c63b4a23a7cc

# Instantiate RandomForestRegressor class
rf = RandomForestRegressor(featuresCol='algorithmic_input', 
                           labelCol='Selling_Price'
)

# Set some parameters ...
#
# numTrees : Number of trees in the random forest.
# maxDepth : Maximum depth of a tree. Increasing the depth makes the model more powerful, but deep trees take longer to train.
# impurity : Criterion used for information gain calculation
# setFeatureSubsetStrategy : auto -> Automatically select the number of features to consider for splits at each tree node
# seed : Use a random seed number , allowing to repeat the results

rf.setNumTrees(200)
rf.setMaxDepth(20)
rf.setImpurity("variance")
rf.setFeatureSubsetStrategy("auto")
rf.setSeed(seed)


RandomForestRegressor_15b81f2ff96f

In [18]:
# Chain rf model in a Pipeline. Could have included previous steps above (indexer, encoder, assembler) ...
pipeline = Pipeline(stages=[rf])

# Train model
model = pipeline.fit(train_df)


22/03/31 16:17:28 WARN DAGScheduler: Broadcasting large task binary with size 1361.6 KiB
22/03/31 16:17:29 WARN DAGScheduler: Broadcasting large task binary with size 1952.2 KiB
22/03/31 16:17:29 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
22/03/31 16:17:30 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
22/03/31 16:17:31 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
22/03/31 16:17:32 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
22/03/31 16:17:32 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
22/03/31 16:17:33 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
22/03/31 16:17:33 WARN DAGScheduler: Broadcasting large task binary with size 1456.1 KiB


In [19]:
# Make predictions.

predictions = model.transform(test_df)

# Select example rows to display.
predictions.select("algorithmic_input", "prediction", "Selling_Price").show(5, False)


+--------------------------------------+-------------------+-------------+
|algorithmic_input                     |prediction         |Selling_Price|
+--------------------------------------+-------------------+-------------+
|[0.47,21000.0,0.0,9.0,1.0,0.0,0.0,1.0]|0.33882113444809425|0.27         |
|[0.74,5000.0,0.0,7.0,1.0,0.0,0.0,1.0] |0.5792188833732439 |0.65         |
|[0.99,45000.0,0.0,9.0,1.0,0.0,0.0,1.0]|0.518399004410532  |0.5          |
|[6.8,33019.0,0.0,8.0,1.0,0.0,1.0,1.0] |4.1583864987559815 |3.75         |
|[5.8,40023.0,0.0,7.0,1.0,0.0,1.0,1.0] |4.57722290710424   |4.0          |
+--------------------------------------+-------------------+-------------+
only showing top 5 rows



# Model Evaluation

In [20]:
# Evaluate Model.

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="Selling_Price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

evaluator = RegressionEvaluator(labelCol="Selling_Price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print("R Squared (R2) on test data = %g" % r2)

evaluator = RegressionEvaluator(labelCol="Selling_Price", predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(predictions)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

evaluator = RegressionEvaluator(labelCol="Selling_Price", predictionCol="prediction", metricName="mse")
mse = evaluator.evaluate(predictions)
print("Mean Square Error (MSE) on test data = %g" % mse)

                                                                                

Root Mean Squared Error (RMSE) on test data = 1.7682
R Squared (R2) on test data = 0.879918
Mean Absolute Error (MAE) on test data = 0.858415
Mean Square Error (MSE) on test data = 3.12654


# Persist Model

In [21]:

# Overwriting to avoid errors in running notebook multiple times..
model.write().overwrite().save('/data/rf.mdl')


                                                                                

# Load Saved Model and Score 

In [22]:
persisted_model = PipelineModel.load('/data/rf.mdl')

                                                                                

In [23]:
to_predict_schema = T.StructType([
    T.StructField('Selling_Price', T.DoubleType(), False),
    T.StructField('values_array', T.ArrayType(T.FloatType()), False)
])

to_predict_data = [
    (0.27, [0.47,21000.0,0.0,9.0,1.0,0.0,0.0,1.0]),
]

to_predict_df = spark.createDataFrame(data=to_predict_data, schema=to_predict_schema)
to_predict_df = to_predict_df.drop('algorithmic_input')

# UDF to convert values_array into a VectorUDT. 
# VectorUDT is what is required to call the model for prediction. 
list_to_vector_udf = F.udf(lambda l: Vectors.dense(l), VectorUDT())


to_predict_df = to_predict_df.select(
    to_predict_df["Selling_Price"], 
    list_to_vector_udf(to_predict_df["values_array"]).alias("algorithmic_input")
)

print('The input dataframe')
to_predict_df.show(10, False)

predictions = model.transform(to_predict_df)

print()
print('The predictions dataframe')
predictions.select("algorithmic_input", "prediction", "Selling_Price").show(5, False)

The input dataframe


                                                                                

+-------------+----------------------------------------------------+
|Selling_Price|algorithmic_input                                   |
+-------------+----------------------------------------------------+
|0.27         |[0.4699999988079071,21000.0,0.0,9.0,1.0,0.0,0.0,1.0]|
+-------------+----------------------------------------------------+


The predictions dataframe
+----------------------------------------------------+-------------------+-------------+
|algorithmic_input                                   |prediction         |Selling_Price|
+----------------------------------------------------+-------------------+-------------+
|[0.4699999988079071,21000.0,0.0,9.0,1.0,0.0,0.0,1.0]|0.33882113444809425|0.27         |
+----------------------------------------------------+-------------------+-------------+



In [24]:
spark.stop()