In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import VectorUDT

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.pipeline import PipelineModel

# Start Spark Session

In [None]:
# Build (or reuse) the Spark session used by every cell below.
# It connects to the standalone master, requests 3G for executors and driver,
# and sets the 'spark.sql.execution.arrow.pyspark.enabled' option to "true".

spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("2_car_data_predictions_MLlib_jupyter")
    .config("spark.executor.memory", "3G")
    .config("spark.driver.memory", "3G")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)


In [None]:
# Dump every (key, value) pair of the active Spark configuration,
# framed by two '===' ruler lines.

ruler = '==='

print("Spark Session configuration : ")

print(ruler)

for conf_entry in spark.sparkContext.getConf().getAll():
    print(conf_entry)

print(ruler)


# Load the Pre-processed Car Data Parquet File

In [None]:
# Load the pre-processed car data from its Parquet file into a DataFrame.

car_data_path = "/data/car_data.parquet"
car_df = spark.read.parquet(car_data_path)


In [None]:
# Parquet files store the schema together with the data, so no schema
# had to be supplied when reading the file.
# Print the schema of the freshly loaded DataFrame.

car_df.printSchema()


In [None]:
# Peek at the first 5 records without truncating long cell values.

car_df.show(n=5, truncate=False)


In [None]:
# How are the rows spread over the DataFrame's Spark partitions after reading the Parquet file?
# Tag every row with its partition id, count rows per partition and list the smallest partitions first.

rows_per_partition = (
    car_df
    .withColumn("partitionId", F.spark_partition_id())
    .groupBy("partitionId")
    .count()
)

rows_per_partition.orderBy(F.asc("count")).show()

# Prepare Data for Algorithmic Input

In [None]:
# Map each categorical string column to an ML column of label indices
# (columns : 'Fuel_Type', 'Seller_Type', 'Transmission').

categorical_cols = ['Fuel_Type', 'Seller_Type', 'Transmission']
index_cols = ['fuel_Type_idx', 'seller_type_idx', 'transmission_idx']

# Drop the index columns first in case they already exist on car_df.
car_df = car_df.drop(*index_cols)

indexer = StringIndexer(inputCols=categorical_cols, outputCols=index_cols)

# Fit the label -> index mapping on the data, then append the index columns.
indexer_model = indexer.fit(car_df)
car_df = indexer_model.transform(car_df)


In [None]:
# Three index columns were added. For each (label, index) column pair,
# show how many rows carry each label, ordered by the assigned index.

label_index_pairs = [
    ('Fuel_Type', 'fuel_Type_idx'),
    ('Seller_Type', 'seller_type_idx'),
    ('Transmission', 'transmission_idx'),
]

for label_col, index_col in label_index_pairs:
    (
        car_df
        .select(label_col, index_col)
        .groupBy(label_col, index_col)
        .count()
        .orderBy(F.col(index_col).asc())
        .show()
    )

In [None]:
# Print the schema again: car_df now also lists the 3 index columns
# 'fuel_Type_idx', 'seller_type_idx' and 'transmission_idx'.

car_df.printSchema()


In [None]:
# One-hot encoding. String-typed categorical features are commonly
# run through StringIndexer first; the encoder then works on the index columns.

encoder_input_cols = ['fuel_Type_idx', 'seller_type_idx', 'transmission_idx']
encoder_output_cols = ['fuel_Type_vec', 'seller_type_vec', 'transmission_vec']

# Drop the vector columns about to be created if they already exist.
car_df = car_df.drop(*encoder_output_cols)

encoder = OneHotEncoder(
    inputCols=encoder_input_cols,
    outputCols=encoder_output_cols,
    dropLast=True,
)

# Fit the encoder on the index columns, then append the one-hot vector columns.
model = encoder.fit(car_df)
car_df = model.transform(car_df)


# One-Hot Encoding, Visualized

<img src="media/one_hot.png" alt="one_hot" width="800"/>

Image Source : https://medium.com/@michaeldelsole/what-is-one-hot-encoding-and-how-to-do-it-f0ae272f1179

In [None]:
# Something to be aware of ...

# With dropLast=True (the default), PySpark's OneHotEncoder does NOT include the last category in the
# encoded vector. This is done to avoid the Dummy Variable Trap in linear regression models.

# What is the Dummy Variable Trap (DVT)? The DVT occurs when two or more dummy variables 
# created by one-hot encoding are highly correlated (multi-collinear). This means that one variable can be 
# predicted from the others, making it difficult to interpret the fitted coefficients in regression models.


# https://stackoverflow.com/questions/39500213/why-does-sparks-onehotencoder-drop-the-last-category-by-default

# Basically, dropping the last category value avoids a DVT where one input variable can be predicted 
# from the others (e.g. there is no need for a one-hot encoding of [isBoy, isGirl] when [isBoy] alone gives the same info). 
# The solution to the DVT is to drop one (not necessarily the last) of the category variables.


# There are 3 fuel types but the 'fuel_Type_vec' column has only 2 elements because of this last-category drop.

# https://spark.apache.org/docs/latest/ml-features.html#vectorassembler

# Reminder... sparse vector (size, [non-zero indices], [non-zero values]) 
# [1.0,0.0] = (2,[0],[1.0]) -> (vector of size 2, [index 0], [gets value 1.0])

# Column names are spelled exactly as they were created ('Fuel_Type', 'fuel_Type_idx', 'fuel_Type_vec'),
# so the lookup does not depend on Spark's case-insensitive column resolution.
(
    car_df
    .select('Fuel_Type', 'fuel_Type_idx', 'fuel_Type_vec')
    .distinct()
    .show(10)
)
                                                                                                                                  

In [None]:
# Print the car_df schema. 

# car_df now also lists the 3 one-hot vector columns :
# 'fuel_Type_vec', 'seller_type_vec' and 'transmission_vec'.

car_df.printSchema()


In [None]:
# Columns that are combined, in this order, into the single feature vector.
feature_cols = [
    'Present_Price',
    'Kms_Driven',
    'Owner',
    'Car_Age',
    'fuel_Type_vec',
    'seller_type_vec',
    'transmission_vec',
]

# Drop the output column first in case it already exists.
car_df = car_df.drop('algorithmic_input')

# Assemble the selected features into one large vector column called 'algorithmic_input'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='algorithmic_input')

car_df = assembler.transform(car_df)


In [None]:
# Print the car_df schema. 

# car_df now also lists the 1 extra column : 'algorithmic_input'.

car_df.printSchema()

In [None]:
# Show the assembled feature vector next to the label column for the first 10 rows (untruncated).

feature_and_label_cols = ['algorithmic_input', 'Selling_Price']
car_df.select(*feature_and_label_cols).show(n=10, truncate=False)


# Model Training

In [None]:
# Split the data 70% / 30% into training and test sets.
# The split is seeded so it can be reproduced.
seed = 111
split_weights = [0.7, 0.3]
train_df, test_df = car_df.randomSplit(split_weights, seed=seed)

# Report the number of rows that ended up in each split.
for split_name, split_df in (('Training', train_df), ('Test', test_df)):
    print('{} Dataset Count : {}'.format(split_name, split_df.count()))


In [None]:
# Configure a simple RandomForestRegressor model. This very simple model is meant 
# to show model feature inputs and outputs. The model is kept simple on purpose 
# to keep training time short.


# For more information on model tuning and ParamGrid, 
# this is a good resource to start with : https://medium.com/rahasak/random-forest-classifier-with-apache-spark-c63b4a23a7cc

# Parameters passed to the estimator ...
#
# featuresCol : Column holding the assembled feature vector.
# labelCol : Column holding the value to predict.
# numTrees : Number of trees in the random forest.
# maxDepth : Maximum depth of a tree. Increasing the depth makes the model more powerful, but deep trees take longer to train.
# impurity : Criterion used for information gain calculation.
# featureSubsetStrategy : auto -> Automatically select the number of features to consider for splits at each tree node.
# seed : Random seed, allowing the results to be repeated.

rf = RandomForestRegressor(
    featuresCol='algorithmic_input',
    labelCol='Selling_Price',
    numTrees=200,
    maxDepth=20,
    impurity="variance",
    featureSubsetStrategy="auto",
    seed=seed,
)

# Leave the configured estimator as the cell's last expression so it is displayed.
rf


In [None]:
# Wrap the rf estimator in a Pipeline. The earlier steps (indexer, encoder, assembler)
# could have been included as stages too ...
pipeline = Pipeline().setStages([rf])

# Fit the pipeline on the training split.
model = pipeline.fit(train_df)


In [None]:
# Score the held-out test split with the fitted pipeline model.

predictions = model.transform(test_df)

# Display the features, the predicted price and the actual price for a few example rows.
preview_cols = ["algorithmic_input", "prediction", "Selling_Price"]
predictions.select(*preview_cols).show(n=5, truncate=False)


# Model Evaluation

In [None]:
# Evaluate Model.

# Compare (prediction, true label) on the test predictions with four regression metrics.
# One evaluator is reused; only its metric name is switched between evaluations.
evaluator = RegressionEvaluator(labelCol="Selling_Price", predictionCol="prediction")

# (metric name, report line) in the order the results are printed.
metric_reports = [
    ("rmse", "Root Mean Squared Error (RMSE) on test data = %g"),
    ("r2", "R Squared (R2) on test data = %g"),
    ("mae", "Mean Absolute Error (MAE) on test data = %g"),
    ("mse", "Mean Square Error (MSE) on test data = %g"),
]

metric_values = {}
for metric_name, report_line in metric_reports:
    metric_values[metric_name] = evaluator.setMetricName(metric_name).evaluate(predictions)
    print(report_line % metric_values[metric_name])

# Keep the individual results available under their usual names.
rmse = metric_values["rmse"]
r2 = metric_values["r2"]
mae = metric_values["mae"]
mse = metric_values["mse"]

# Persist Model

In [None]:

# Persist the fitted pipeline model. An existing model at the target path is overwritten,
# which avoids errors when the notebook is run multiple times.
model_writer = model.write().overwrite()
model_writer.save('/data/rf.mdl')


# Load Saved Model and Score 

In [None]:
# Load the pipeline model back from the path it was saved to.
persisted_model = PipelineModel.load('/data/rf.mdl')

In [None]:
# Score a single hand-built record with the persisted model.

# 'values_array' carries the raw feature values. DoubleType (rather than FloatType) is used so that
# values such as 0.47 are not rounded to 32-bit precision before they are turned into the feature vector.
to_predict_schema = T.StructType([
    T.StructField('Selling_Price', T.DoubleType(), False),
    T.StructField('values_array', T.ArrayType(T.DoubleType()), False)
])

# One row : (actual selling price, feature values).
# The feature values follow the VectorAssembler input order used to build 'algorithmic_input' :
# Present_Price, Kms_Driven, Owner, Car_Age, then the one-hot slots of
# fuel_Type_vec, seller_type_vec and transmission_vec.
# NOTE(review): 8 values assumes 2 + 1 + 1 one-hot slots - confirm against the encoder output.
to_predict_data = [
    (0.27, [0.47,21000.0,0.0,9.0,1.0,0.0,0.0,1.0]),
]

to_predict_df = spark.createDataFrame(data=to_predict_data, schema=to_predict_schema)

# UDF to convert values_array into a VectorUDT. 
# VectorUDT is what is required to call the model for prediction. 
list_to_vector_udf = F.udf(lambda values: Vectors.dense(values), VectorUDT())


# Keep the label and replace the raw array by the vector column the model expects.
to_predict_df = to_predict_df.select(
    to_predict_df["Selling_Price"], 
    list_to_vector_udf(to_predict_df["values_array"]).alias("algorithmic_input")
)

print('The input dataframe')
to_predict_df.show(10, False)

# Appends the 'prediction' column produced by the loaded pipeline model.
predictions = persisted_model.transform(to_predict_df)

print()
print('The predictions dataframe')
predictions.select("algorithmic_input", "prediction", "Selling_Price").show(5, False)

In [None]:
# Stop the Spark session now that the notebook is done with it.
spark.stop()