In [None]:
pip install pyspark

In [None]:
import pandas as pd 
import numpy as np
import pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("CarPrediction")
    .getOrCreate()
)

In [None]:
# Load the Audi listings CSV: the first row holds the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../input/used-car-dataset-ford-and-mercedes/audi.csv")
)

In [None]:
# Report, column by column, how many rows contain a null value
for column_name in df.columns:
    null_rows = df.filter(df[column_name].isNull()).count()
    print(column_name, "\t", "with null values: ", null_rows)

In [None]:
# Report, column by column, how many rows use '?' as a placeholder value
for column_name in df.columns:
    placeholder_rows = df.filter(df[column_name] == "?").count()
    print(column_name, "\t", "with ? values: ", placeholder_rows)

In [None]:
# Print the schema; the column types were inferred by Spark when the CSV was read (inferSchema=True)
df.printSchema()

In [None]:
# Preview the first rows of the dataset
df.show()

In [None]:
# List the column names of the dataframe
df.columns

In [None]:
# Summary statistics (count, mean, stddev, min, max) for the columns of the dataframe.
# describe() only builds a new DataFrame, so .show() is needed to actually print the numbers;
# column types are reported by printSchema() instead.
df.describe().show()

In [None]:
# Manual-transmission cars priced below 15000
is_manual = df["transmission"] == "Manual"
is_under_15000 = df["price"] < 15000
df.filter(is_manual & is_under_15000).show()

In [None]:
# groupBy on its own only returns a GroupedData handle; an aggregation (count, mean, ...) must follow to get results
df.groupBy("model")

In [None]:
# Number of listings for each model
listings_per_model = df.groupBy("model").count()
listings_per_model.show()

In [None]:
# Mean price for each model (output columns: model, avg(price))
df.groupBy("model").avg("price").show()

In [None]:
# Mean tax for each registration year (output columns: year, avg(tax))
df.groupBy("year").avg("tax").show()

In [None]:
# Fetch the first 10 rows to the driver as a list of Row objects
df.take(10)

**Performing Data Visualization**
* Here we perform data visualization using matplotlib plotting functions

In [None]:
# Performing data visualization
import matplotlib.pyplot as plt


In [None]:
# Relationship between year and price
# Collect only the two plotted columns, and only once: every toPandas() call
# re-runs the Spark job and pulls the whole result to the driver.
year_price_pdf = df.select("year", "price").toPandas()
x1 = year_price_pdf["year"].values.tolist()
y1 = year_price_pdf["price"].values.tolist()
plt.scatter(x1, y1)
plt.xlabel("Year")
plt.ylabel("Price")
plt.title("Relationship between year and price")


In [None]:
# Relationship between model and price
# Collect only the two plotted columns, and only once: every toPandas() call
# re-runs the Spark job and pulls the whole result to the driver.
model_price_pdf = df.select("model", "price").toPandas()
x2 = model_price_pdf["model"].values.tolist()
y2 = model_price_pdf["price"].values.tolist()
ax = plt.subplot()
ax.barh(x2, y2)
ax.set_xlabel("Price")
ax.set_ylabel("Model")

In [None]:
# Relationship between model and mileage
# Both plotted columns are collected here, in one toPandas() call, so this cell
# does not rely on a variable left over from an earlier cell.
model_mileage_pdf = df.select("model", "mileage").toPandas()
x3 = model_mileage_pdf["model"].values.tolist()
y3 = model_mileage_pdf["mileage"]
plt.figure(figsize=(10,8))
plt.bar(x3, y3)
plt.xlabel("Model")
plt.ylabel("Mileage")
plt.title(label="Mileage Per Model")

In [None]:
# Relationship between mileage and price
# Collect only the two plotted columns, and only once: every toPandas() call
# re-runs the Spark job and pulls the whole result to the driver.
mileage_price_pdf = df.select("mileage", "price").toPandas()
x12 = mileage_price_pdf["mileage"].values.tolist()
y12 = mileage_price_pdf["price"].values.tolist()
plt.figure(figsize=(5,5))
plt.scatter(x12, y12)
plt.xlabel("Mileage of Car")
plt.ylabel("Price of Car")
plt.title("Relationship between price and mileage")


In [None]:
# Relationship between model and average tax per model
# Aggregate and collect once, so every label and its value come from the same row:
# Spark does not guarantee the row order of a grouped result across separate executions.
avg_tax_pdf = df.groupBy("model").avg("tax").toPandas()
x4 = avg_tax_pdf["model"].values.tolist()
y4 = avg_tax_pdf["avg(tax)"].values.tolist()
plt.figure(figsize=(10,8))
plt.bar(x4, y4)
plt.xlabel("model")
plt.ylabel("Average Tax")
plt.title("Average Tax Per Model")


In [None]:
# Number of cars for each transmission type
# Aggregate and collect once, so every label and its count come from the same row:
# Spark does not guarantee the row order of a grouped result across separate executions.
transmission_count_pdf = df.groupBy("transmission").count().toPandas()
x5 = transmission_count_pdf["transmission"].values.tolist()
y5 = transmission_count_pdf["count"].values.tolist()
plt.figure(figsize=(5,5))
plt.bar(x5, y5)
plt.xlabel("Transmission Type")
plt.ylabel("Count")
plt.title("Types of Transmission")


In [None]:
# Relationship between transmission and price
# Labels and averages are taken from ONE aggregated result. Reading the labels from a
# count() job and the values from a separate mean() job can pair a label with the wrong
# average, because Spark does not guarantee the row order of grouped results.
transmission_price_pdf = df.groupBy("transmission").avg("price").toPandas()
x6 = transmission_price_pdf["transmission"].values.tolist()
y6 = transmission_price_pdf["avg(price)"].values.tolist()
plt.figure(figsize=(5,5))
plt.bar(x6, y6)
plt.xlabel("Transmission Type")
plt.ylabel("Average Price")
plt.title("Average Price of each transmission")


In [None]:
# Number of cars for each fuel type
# Aggregate and collect once, so every label and its count come from the same row:
# Spark does not guarantee the row order of a grouped result across separate executions.
fuel_count_pdf = df.groupBy("fuelType").count().toPandas()
x7 = fuel_count_pdf["fuelType"].values.tolist()
y7 = fuel_count_pdf["count"].values.tolist()
plt.figure(figsize=(5,5))
plt.bar(x7, y7)
plt.xlabel("Fuel Type")
plt.ylabel("Number of Cars")
plt.title("Number of car available based on Fuel Type")


In [None]:
# Relationship between fuel type and price
# Labels and averages are taken from ONE aggregated result. Reading the labels from a
# count() job and the values from a separate mean() job can pair a label with the wrong
# average, because Spark does not guarantee the row order of grouped results.
fuel_price_pdf = df.groupBy("fuelType").avg("price").toPandas()
x8 = fuel_price_pdf["fuelType"].values.tolist()
y8 = fuel_price_pdf["avg(price)"].values.tolist()
plt.figure(figsize=(5,5))
plt.bar(x8, y8)
plt.xlabel("Fuel Type")
plt.ylabel("Average Price of Cars")
plt.title("Average Price of car based on Fuel Type")


In [None]:
# Relationship between price and engine size
# Aggregate and collect once, so every engine size and its average price come from the
# same row: Spark does not guarantee the row order of a grouped result across executions.
engine_price_pdf = df.groupBy("engineSize").avg("price").toPandas()
x9 = engine_price_pdf["engineSize"].values.tolist()
y9 = engine_price_pdf["avg(price)"].values.tolist()
plt.figure(figsize=(5,5))
plt.bar(x9, y9, width=0.05)
plt.xlabel("Engine Size")
plt.ylabel("Average Price")
plt.title("Average Price based on Size of Engine")


In [None]:
# Relationship between price and miles per gallon (mpg)
# Aggregate and collect once, so every mpg value and its average price come from the
# same row: Spark does not guarantee the row order of a grouped result across executions.
mpg_price_pdf = df.groupBy("mpg").avg("price").toPandas()
x10 = mpg_price_pdf["mpg"].values.tolist()
y10 = mpg_price_pdf["avg(price)"].values.tolist()
plt.figure(figsize=(10,10))
plt.bar(x10, y10, width=0.5)
plt.xlabel("Mpg(Miles-Per-Gallon)")
plt.ylabel("Average Price")
plt.title("Average Price based on MPG(Miles-Per-Gallon)")


**Feature Engineering (Data Transformation)**
* Performing one-hot encoding of the categorical columns

In [None]:
# converting categorical data in model column using the one hot encoder 
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,OneHotEncoder,StringIndexer)

In [None]:
# Performing one-hot-encoding on model column
# handleInvalid='keep' sends labels that were not seen while fitting to an extra index
# instead of raising an error. Some models may occur in only a handful of rows, so a random
# train/test split can leave such a model in the test set only.
# The encoder's default dropLast=True then drops that extra slot, so an unseen model is
# encoded as an all-zeros vector.
model_indexer = StringIndexer(inputCol='model', outputCol='modelIndex', handleInvalid='keep')
model_encoder = OneHotEncoder(inputCol='modelIndex', outputCol='modelVector')

In [None]:
# Index the transmission labels, then one-hot encode the resulting indices
transmission_indexer = StringIndexer(
    outputCol='transmissionIndex',
    inputCol='transmission',
)
transmission_encoder = OneHotEncoder(
    outputCol='transmissionVec',
    inputCol='transmissionIndex',
)

In [None]:
# Index the fuel-type labels, then one-hot encode the resulting indices
fuel_type_indexer = StringIndexer(
    outputCol='fuelTypeIndex',
    inputCol='fuelType',
)
fuel_type_encoder = OneHotEncoder(
    outputCol='fuelTypeVector',
    inputCol='fuelTypeIndex',
)

In [None]:
# Merge the encoded categorical vectors and the numeric columns into a single
# 'features' vector; the target column 'price' is deliberately left out.
feature_columns = [
    'modelVector', 'year', 'transmissionVec', 'mileage',
    'fuelTypeVector', 'tax', 'mpg', 'engineSize',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

**Building Regression Model**
* Using multiple regression algorithms

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Linear regression estimator: predicts 'price' from the assembled 'features' vector
linear_reg = LinearRegression(
    labelCol='price',
    featuresCol='features',
    maxIter=10,
)

In [None]:
from pyspark.ml.regression import LinearRegressionModel
# NOTE(review): this bare instance is not referenced anywhere else in the visible notebook;
# the model that is actually used comes from pipeline.fit(...). Confirm before removing.
linear_reg_model = LinearRegressionModel()

In [None]:
# Splitting the dataset into train (70%) and test (30%).
# randomSplit normalizes its weights so that they sum to 1, so a 70/30 split has to be
# written as 0.7 / 0.3. The fixed seed makes the split reproducible between runs.
trainData, testData = df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Linear-regression pipeline: index the categorical columns, one-hot encode the
# indices, assemble the feature vector, then run the regressor.
from pyspark.ml import Pipeline
linear_stages = [
    model_indexer, transmission_indexer, fuel_type_indexer,
    model_encoder, transmission_encoder, fuel_type_encoder,
    assembler, linear_reg,
]
pipeline = Pipeline(stages=linear_stages)


In [None]:
# Fit every stage of the linear-regression pipeline on the training split
linear_fit = pipeline.fit(trainData)

In [None]:
# Apply the fitted pipeline to the test split; the predictions land in the 'prediction' column
results = linear_fit.transform(testData)

In [None]:
# First five rows of the transformed test data
results.head(5)

In [None]:
# 'price' is a continuous target, so the predictions are scored with a regression
# evaluator; a binary-classification evaluator is meant for 0/1 labels and would not
# produce a meaningful score here.
from pyspark.ml.evaluation import RegressionEvaluator
my_eval = RegressionEvaluator(predictionCol='prediction',
                              labelCol='price')

In [None]:
# Actual price next to the predicted price
results.select('price','prediction').show()

In [None]:
# Actual vs. predicted price of the linear regression, plotted against the row index
graph_res = results.select('price', 'prediction').toPandas()
x13 = graph_res["price"]
x14 = graph_res["prediction"]
fig, ax = plt.subplots(figsize=(10,10))
row_index = np.arange(len(x13))
ax.scatter(row_index, x13, label='Original Price')
ax.scatter(row_index, x14, label="Predicted Price")
ax.legend(loc="upper left")

In [None]:
# Score the linear-regression predictions with the evaluator built earlier as my_eval
my_eval.evaluate(results)

In [None]:
# Evaluator that compares the 'prediction' column with the true 'price'
from pyspark.ml.evaluation import RegressionEvaluator
my_ref_eval = RegressionEvaluator(
    labelCol='price',
    predictionCol='prediction',
)

In [None]:
# Evaluate with the evaluator's default metric (no metricName was set when it was created)
my_ref_eval.evaluate(results)

In [None]:
# R2 score: the metric is overridden for this call only
r2_override = {my_ref_eval.metricName: "r2"}
my_ref_eval.evaluate(results, r2_override)

In [None]:
# RMSE score: the metric is overridden for this call only
rmse_override = {my_ref_eval.metricName: "rmse"}
my_ref_eval.evaluate(results, rmse_override)

In [None]:
# MSE value: the metric is overridden for this call only
mse_override = {my_ref_eval.metricName: "mse"}
my_ref_eval.evaluate(results, mse_override)

**RandomForest Regression Algorithm**

In [None]:
from pyspark.ml.regression import RandomForestRegressor

In [None]:
# Random forest estimator: predicts 'price' from the assembled 'features' vector
randomforest = RandomForestRegressor(
    labelCol='price',
    featuresCol='features',
)

In [None]:
# Random-forest pipeline: same preprocessing stages as before, with the
# random forest regressor as the final stage.
from pyspark.ml import Pipeline
random_forest_stages = [
    model_indexer, transmission_indexer, fuel_type_indexer,
    model_encoder, transmission_encoder, fuel_type_encoder,
    assembler, randomforest,
]
pipeline1 = Pipeline(stages=random_forest_stages)

In [None]:
# Fit the random-forest pipeline on the training split, then apply the fitted
# pipeline to the test split to obtain the predictions.
randomforestresult = pipeline1.fit(trainData)
randomFResult = randomforestresult.transform(testData)

In [None]:
# First five rows of the random-forest output
print(randomFResult.head(5))

In [None]:
# Actual vs. predicted price of the random forest regressor, plotted against the row index
graph_random = randomFResult.select('price', 'prediction').toPandas()
x15 = graph_random["price"]
x16 = graph_random["prediction"]
fig, ax = plt.subplots(figsize=(10,10))
row_index = np.arange(len(x15))
ax.scatter(row_index, x15, marker='x', label='Original Price')
ax.scatter(row_index, x16, marker='s', label="Predicted Price")
ax.set_xlabel("Index/Each entry in the dataset")
ax.set_ylabel("Price")
ax.legend(loc="upper left")

In [None]:
# Evaluator for the random-forest predictions: compares 'prediction' with the true 'price'
from pyspark.ml.evaluation import RegressionEvaluator
my_ref_eval1 = RegressionEvaluator(
    labelCol='price',
    predictionCol='prediction',
)

In [None]:
# Evaluate with the evaluator's default metric (no metricName was set when it was created)
my_ref_eval1.evaluate(randomFResult)

In [None]:
# R2 score of the random forest, computed with this section's own evaluator
# (my_ref_eval1) rather than the one created for the linear regression.
my_ref_eval1.evaluate(randomFResult, {my_ref_eval1.metricName:"r2"})

In [None]:
# RMSE of the random forest, computed with this section's own evaluator
# (my_ref_eval1) rather than the one created for the linear regression.
my_ref_eval1.evaluate(randomFResult, {my_ref_eval1.metricName:"rmse"})

In [None]:
# MSE of the random forest, computed with this section's own evaluator
# (my_ref_eval1) rather than the one created for the linear regression.
my_ref_eval1.evaluate(randomFResult, {my_ref_eval1.metricName:"mse"})

**Gradient Boosting Regression**

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


# Gradient-boosted trees regressor: predicts 'price' from the assembled 'features' vector.
gbt = GBTRegressor(featuresCol="features",labelCol="price", maxIter=10)

# Same preprocessing stages as the other pipelines, with the GBT regressor as the final stage.
pipeline_gbt = Pipeline(stages=[model_indexer,transmission_indexer,fuel_type_indexer,
                            model_encoder,transmission_encoder,fuel_type_encoder,
                            assembler, gbt])

# Fit the GBT pipeline. It must be pipeline_gbt here: `pipeline` is the linear-regression
# pipeline, and fitting that one would leave the GBT regressor untrained.
model = pipeline_gbt.fit(trainData)

# Make predictions on the test split.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "price", "features").show(5)


In [None]:
# Actual vs. predicted price from `predictions`, plotted against the row index
graph_gbt = predictions.select("prediction", "price").toPandas()
x17 = graph_gbt["price"]
x18 = graph_gbt["prediction"]
fig, ax = plt.subplots(figsize=(10,10))
row_index = np.arange(len(x17))
ax.scatter(row_index, x17, marker='x', label='Original Price')
ax.scatter(row_index, x18, marker='s', label="Predicted Price")
ax.set_xlabel("Index/Each entry in the dataset")
ax.set_ylabel("Price")
ax.legend(loc="upper left")

In [None]:

# RMSE between the predicted and the true price on the test data
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="price",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


In [None]:
# R2 score between the predicted and the true price on the test data
evaluatorR2 = RegressionEvaluator(
    labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluatorR2.evaluate(predictions)
print("R2 Score on test data= %g" % r2)
# The fitted regressor is the LAST stage of the pipeline model; index 1 is the fitted
# transmission StringIndexer, not the regressor.
gbtModel = model.stages[-1]
print(gbtModel)  # summary only
