In [19]:
!pip install pyspark



In [22]:
#Initialize SparkSession and SparkContext
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [23]:
#Create a Spark Session
SpSession = SparkSession.builder.master("local[*]").getOrCreate()

In [24]:
#Get the Spark Context from Spark Session    
SpContext = SpSession.sparkContext

In [25]:
#Load the CSV file into a RDD
autoData = SpContext.textFile("/content/drive/My Drive/auto-miles-per-gallon.csv")
autoData.cache()
autoData.take(5)
#Remove the first line (contains headers)
dataLines = autoData.filter(lambda x: "CYLINDERS" not in x)
dataLines.count()

398

In [27]:
#Cleanup Data

from pyspark.sql import Row

#Use default for average HP
avgHP =SpContext.broadcast(80.0)

#Function to cleanup Data
def CleanupData( inputStr) :
    global avgHP
    attList=inputStr.split(",")
    
    #Replace ? values with a normal value
    hpValue = attList[3]
    if hpValue == "?":
        hpValue=avgHP.value
       
    #Create a row with cleaned up and converted data
    values= Row(     MPG=float(attList[0]),\
                     CYLINDERS=float(attList[1]), \
                     DISPLACEMENT=float(attList[2]), 
                     HORSEPOWER=float(hpValue),\
                     WEIGHT=float(attList[4]), \
                     ACCELERATION=float(attList[5]), \
                     MODELYEAR=float(attList[6]),\
                     NAME=attList[7]  ) 
    return values

#Run map for cleanup
autoMap = dataLines.map(CleanupData)
autoMap.cache()
autoMap.take(5)

[Row(MPG=18.0, CYLINDERS=8.0, DISPLACEMENT=307.0, HORSEPOWER=130.0, WEIGHT=3504.0, ACCELERATION=12.0, MODELYEAR=70.0, NAME='chevrolet chevelle malibu'),
 Row(MPG=15.0, CYLINDERS=8.0, DISPLACEMENT=350.0, HORSEPOWER=165.0, WEIGHT=3693.0, ACCELERATION=11.5, MODELYEAR=70.0, NAME='buick skylark 320'),
 Row(MPG=18.0, CYLINDERS=8.0, DISPLACEMENT=318.0, HORSEPOWER=150.0, WEIGHT=3436.0, ACCELERATION=11.0, MODELYEAR=70.0, NAME='plymouth satellite'),
 Row(MPG=16.0, CYLINDERS=8.0, DISPLACEMENT=304.0, HORSEPOWER=150.0, WEIGHT=3433.0, ACCELERATION=12.0, MODELYEAR=70.0, NAME='amc rebel sst'),
 Row(MPG=17.0, CYLINDERS=8.0, DISPLACEMENT=302.0, HORSEPOWER=140.0, WEIGHT=3449.0, ACCELERATION=10.5, MODELYEAR=70.0, NAME='ford torino')]

In [28]:
#Create a Data Frame with the data. 
autoDf = SpSession.createDataFrame(autoMap)


In [34]:
#Perform Data Analytics
#See descriptive analytics.
autoDf.select("MPG","CYLINDERS").describe().show()


#Find correlation between predictors and target
from pyspark.mllib.stat import  Statistics



+-------+-----------------+------------------+
|summary|              MPG|         CYLINDERS|
+-------+-----------------+------------------+
|  count|              398|               398|
|   mean|23.51457286432161| 5.454773869346734|
| stddev|7.815984312565782|1.7010042445332125|
|    min|              9.0|               3.0|
|    max|             46.6|               8.0|
+-------+-----------------+------------------+



In [35]:
#Prepare data for ML


#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

from pyspark.ml.linalg import Vectors
def transformToLabeledPoint(row) :
    lp = ( row["MPG"], Vectors.dense([row["ACCELERATION"],\
                        row["DISPLACEMENT"], \
                        row["WEIGHT"]]))
    return lp
    
autoLp = autoMap.map(transformToLabeledPoint)
autoDF = SpSession.createDataFrame(autoLp,["label", "features"])
autoDF.select("label","features").show(10)

+-----+-------------------+
|label|           features|
+-----+-------------------+
| 18.0|[12.0,307.0,3504.0]|
| 15.0|[11.5,350.0,3693.0]|
| 18.0|[11.0,318.0,3436.0]|
| 16.0|[12.0,304.0,3433.0]|
| 17.0|[10.5,302.0,3449.0]|
| 15.0|[10.0,429.0,4341.0]|
| 14.0| [9.0,454.0,4354.0]|
| 14.0| [8.5,440.0,4312.0]|
| 14.0|[10.0,455.0,4425.0]|
| 15.0| [8.5,390.0,3850.0]|
+-----+-------------------+
only showing top 10 rows



In [38]:
#Perform Machine Learning


#Split into training and testing data
(trainingData, testData) = autoDF.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()

#Build the model on training data
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)

#Print the metrics
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(testData)
predictions.select("prediction","label","features").show()

#Find R2 for Linear Regression
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
evaluator.evaluate(predictions)

Coefficients: [0.15725544298723257,-0.01159976712307584,-0.006149152249937106]
Intercept: 41.57245247130714
+------------------+-----+-------------------+
|        prediction|label|           features|
+------------------+-----+-------------------+
|12.716503703465946| 13.0|[13.0,351.0,4363.0]|
|14.741119669260655| 14.0|[13.0,318.0,4096.0]|
|22.319969023487833| 15.0|[19.5,250.0,3158.0]|
| 12.12039900522457| 16.0| [9.5,400.0,4278.0]|
|23.520025902763713| 18.0|[14.5,171.0,2984.0]|
|20.849824651053083| 19.0|[15.0,250.0,3282.0]|
|23.478763930232535| 20.0|[16.0,232.0,2914.0]|
| 26.03149473563134| 23.0|[15.0,115.0,2694.0]|
| 28.39736756039587| 26.0|[12.5,121.0,2234.0]|
|29.352758310511547| 26.0| [17.7,98.0,2255.0]|
| 29.82897852614972| 26.0| [18.0,96.0,2189.0]|
| 32.38731726297246| 26.0| [20.5,97.0,1835.0]|
|29.629784691317617| 27.0| [14.5,97.0,2130.0]|
|28.464263746521016| 28.0|[15.5,140.0,2264.0]|
|  31.1435886086778| 29.5| [12.2,97.0,1825.0]|
| 30.13387749167121| 30.0| [14.5,88.0,2065.0]|

0.7049337311455797