In [1]:
#import the sparksession
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName("Linear_regression_model").getOrCreate()

In [3]:
#let's import the Linear Regression class from the ML package 
from pyspark.ml.regression import LinearRegression

In [4]:
#getting the dataset 
df=spark.read.csv('auto_imports.csv',header=True,inferSchema=True)

In [5]:
df.printSchema()

root
 |-- symboling: integer (nullable = true)
 |-- normalized-losses: string (nullable = true)
 |-- make: string (nullable = true)
 |-- fuel-type: string (nullable = true)
 |-- aspiration: string (nullable = true)
 |-- num-of-doors: string (nullable = true)
 |-- body-style: string (nullable = true)
 |-- drive-wheels: string (nullable = true)
 |-- engine-location: string (nullable = true)
 |-- wheel-base: double (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- height: double (nullable = true)
 |-- curb-weight: integer (nullable = true)
 |-- engine-type: string (nullable = true)
 |-- num-of-cylinders: string (nullable = true)
 |-- engine-size: integer (nullable = true)
 |-- fuel-system: string (nullable = true)
 |-- bore: string (nullable = true)
 |-- stroke: string (nullable = true)
 |-- compression-ratio: double (nullable = true)
 |-- horsepower: string (nullable = true)
 |-- peak-rpm: string (nullable = true)
 |-- city-mpg: integer (nu

In [6]:
#lets view some of the specific columns 
df.columns

['symboling',
 'normalized-losses',
 'make',
 'fuel-type',
 'aspiration',
 'num-of-doors',
 'body-style',
 'drive-wheels',
 'engine-location',
 'wheel-base',
 'length',
 'width',
 'height',
 'curb-weight',
 'engine-type',
 'num-of-cylinders',
 'engine-size',
 'fuel-system',
 'bore',
 'stroke',
 'compression-ratio',
 'horsepower',
 'peak-rpm',
 'city-mpg',
 'highway-mpg',
 'price']

In [7]:
df.select(['make','fuel-type','body-style']).show()

+-----------+---------+-----------+
|       make|fuel-type| body-style|
+-----------+---------+-----------+
|alfa-romero|      gas|convertible|
|alfa-romero|      gas|convertible|
|alfa-romero|      gas|  hatchback|
|       audi|      gas|      sedan|
|       audi|      gas|      sedan|
|       audi|      gas|      sedan|
|       audi|      gas|      sedan|
|       audi|      gas|      wagon|
|       audi|      gas|      sedan|
|        bmw|      gas|      sedan|
|        bmw|      gas|      sedan|
|        bmw|      gas|      sedan|
|        bmw|      gas|      sedan|
|        bmw|      gas|      sedan|
|        bmw|      gas|      sedan|
|        bmw|      gas|      sedan|
|        bmw|      gas|      sedan|
|  chevrolet|      gas|  hatchback|
|  chevrolet|      gas|  hatchback|
|  chevrolet|      gas|      sedan|
+-----------+---------+-----------+
only showing top 20 rows



In [8]:
#Since some of our columns are String type and we need to build the 
#linear regression model, Let's convert them to numeric using 
#String Indexer


In [9]:
from pyspark.ml.feature import StringIndexer

In [10]:
indexers=[StringIndexer(inputCol=column,outputCol=column+"_index"
                       ).fit(df)
         for column in list(set(df.columns))]

In [11]:
#use the pipeline class to transform the existing dataframe
from pyspark.ml import Pipeline

In [12]:
pipeline=Pipeline(stages=indexers)
df_r=pipeline.fit(df).transform(df)

In [13]:
df_r.columns

['symboling',
 'normalized-losses',
 'make',
 'fuel-type',
 'aspiration',
 'num-of-doors',
 'body-style',
 'drive-wheels',
 'engine-location',
 'wheel-base',
 'length',
 'width',
 'height',
 'curb-weight',
 'engine-type',
 'num-of-cylinders',
 'engine-size',
 'fuel-system',
 'bore',
 'stroke',
 'compression-ratio',
 'horsepower',
 'peak-rpm',
 'city-mpg',
 'highway-mpg',
 'price',
 'wheel-base_index',
 'num-of-doors_index',
 'peak-rpm_index',
 'city-mpg_index',
 'highway-mpg_index',
 'bore_index',
 'height_index',
 'aspiration_index',
 'num-of-cylinders_index',
 'fuel-type_index',
 'price_index',
 'engine-type_index',
 'width_index',
 'curb-weight_index',
 'horsepower_index',
 'fuel-system_index',
 'normalized-losses_index',
 'engine-size_index',
 'engine-location_index',
 'stroke_index',
 'symboling_index',
 'drive-wheels_index',
 'body-style_index',
 'length_index',
 'make_index',
 'compression-ratio_index']

In [14]:
#the _index columns are the transformed columns 
#Lets view the difference 
df_r.select(['make','make_index']).show()

+-----------+----------+
|       make|make_index|
+-----------+----------+
|alfa-romero|      17.0|
|alfa-romero|      17.0|
|alfa-romero|      17.0|
|       audi|      13.0|
|       audi|      13.0|
|       audi|      13.0|
|       audi|      13.0|
|       audi|      13.0|
|       audi|      13.0|
|        bmw|      11.0|
|        bmw|      11.0|
|        bmw|      11.0|
|        bmw|      11.0|
|        bmw|      11.0|
|        bmw|      11.0|
|        bmw|      11.0|
|        bmw|      11.0|
|  chevrolet|      16.0|
|  chevrolet|      16.0|
|  chevrolet|      16.0|
+-----------+----------+
only showing top 20 rows



In [15]:
#Now let's import the vector assembler to create our feature column 
#to predict the label
from pyspark.ml.feature import VectorAssembler

In [16]:
#Time to pick some indexed columns to make the feature set 
assembler=VectorAssembler(inputCols=['peak-rpm_index',
 'aspiration_index',
 'engine-size_index',
 'highway-mpg_index',
 'drive-wheels_index',
 'num-of-doors_index',
 'length_index',
 'stroke_index',
 'bore_index',
 'fuel-type_index',
 'make_index',
 'curb-weight_index',
 'horsepower_index',
 'wheel-base_index',
 'engine-type_index',
 'width_index',
 'fuel-system_index',
 'height_index',
 'num-of-cylinders_index'
    ],outputCol='features')

In [17]:
output=assembler.transform(df_r)

In [18]:
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[2.0,0.0,18.0,13....|
|[2.0,0.0,18.0,13....|
|[2.0,0.0,12.0,14....|
|(19,[0,2,3,6,8,10...|
|[1.0,0.0,16.0,12....|
|[1.0,0.0,16.0,0.0...|
|(19,[0,2,6,8,10,1...|
|(19,[0,2,6,8,10,1...|
|[1.0,1.0,40.0,24....|
|[6.0,0.0,4.0,8.0,...|
|[6.0,0.0,4.0,8.0,...|
|[15.0,0.0,19.0,7....|
|[15.0,0.0,19.0,7....|
|[15.0,0.0,19.0,0....|
|[4.0,0.0,20.0,12....|
|[4.0,0.0,20.0,12....|
|[4.0,0.0,20.0,24....|
|[13.0,0.0,29.0,29...|
|[4.0,0.0,6.0,20.0...|
|[4.0,0.0,6.0,20.0...|
+--------------------+
only showing top 20 rows



In [19]:
#Since our feature column is ready and price is our label column, 
#lets make the final set 
final=output.select(['features','price'])

In [20]:
final.show()

+--------------------+-----+
|            features|price|
+--------------------+-----+
|[2.0,0.0,18.0,13....|16500|
|[2.0,0.0,18.0,13....|16500|
|[2.0,0.0,12.0,14....|16500|
|(19,[0,2,3,6,8,10...|13950|
|[1.0,0.0,16.0,12....|17450|
|[1.0,0.0,16.0,0.0...|15250|
|(19,[0,2,6,8,10,1...|17710|
|(19,[0,2,6,8,10,1...|18920|
|[1.0,1.0,40.0,24....|23875|
|[6.0,0.0,4.0,8.0,...|16430|
|[6.0,0.0,4.0,8.0,...|16925|
|[15.0,0.0,19.0,7....|20970|
|[15.0,0.0,19.0,7....|21105|
|[15.0,0.0,19.0,0....|24565|
|[4.0,0.0,20.0,12....|30760|
|[4.0,0.0,20.0,12....|41315|
|[4.0,0.0,20.0,24....|36880|
|[13.0,0.0,29.0,29...| 5151|
|[4.0,0.0,6.0,20.0...| 6295|
|[4.0,0.0,6.0,20.0...| 6575|
+--------------------+-----+
only showing top 20 rows



In [21]:
#Now let's create the train and the test sets
train_set,test_set=final.randomSplit([0.7,0.3])

In [22]:
train_set.describe().show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|               145|
|   mean|13346.889655172414|
| stddev| 8063.371586926312|
|    min|              5118|
|    max|             45400|
+-------+------------------+



In [24]:
test_set.describe().show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|                56|
|   mean|12898.910714285714|
| stddev| 7712.085175587694|
|    min|              5151|
|    max|             40960|
+-------+------------------+



In [25]:
#Time to build the model 
lr=LinearRegression(labelCol='price')
lr_model=lr.fit(train_set)

In [26]:
#evaluation
test_res=lr_model.evaluate(test_set)
test_res.r2

0.4753957850905234

In [27]:
#We got only 47% of accuracy.But that's fine since 
#this is our first attempt
#We can add or remove columns and make other transformations to 
#improve the accuracy 


In [28]:
#Now lets create one more model and test the accuracy 
assembler1=VectorAssembler(inputCols=['engine-size_index',
 'num-of-doors_index',
 'length_index',
 'stroke_index',
 'bore_index',
 'drive-wheels_index',
 'body-style_index',
 'compression-ratio_index',
 'make_index',
 'curb-weight_index',
 'horsepower_index',
 'wheel-base_index',
 'engine-type_index',
 'width_index',
 'fuel-system_index',
 'height_index',
 'num-of-cylinders_index'
    ],outputCol='features1')
#this time I have removed some of the columns 

In [29]:
output1=assembler1.transform(df_r)

In [30]:
final_data=output1.select('features1','price')

In [31]:
final_data.show()

+--------------------+-----+
|           features1|price|
+--------------------+-----+
|[18.0,1.0,39.0,27...|16500|
|[18.0,1.0,39.0,27...|16500|
|[12.0,1.0,54.0,18...|16500|
|(17,[0,2,4,7,8,9,...|13950|
|[16.0,0.0,30.0,0....|17450|
|[16.0,1.0,52.0,0....|15250|
|[16.0,0.0,20.0,0....|17710|
|[16.0,0.0,20.0,0....|18920|
|[40.0,0.0,20.0,0....|23875|
|[4.0,1.0,10.0,24....|16430|
|[4.0,0.0,10.0,24....|16925|
|[19.0,1.0,10.0,13...|20970|
|[19.0,0.0,10.0,13...|21105|
|[19.0,0.0,26.0,13...|24565|
|[20.0,0.0,26.0,4....|30760|
|[20.0,1.0,57.0,4....|41315|
|[20.0,0.0,48.0,4....|36880|
|[29.0,1.0,67.0,3....| 5151|
|[6.0,1.0,58.0,22....| 6295|
|[6.0,0.0,50.0,22....| 6575|
+--------------------+-----+
only showing top 20 rows



In [32]:
#creating one more train and testset 
train_set1,test_set1=final_data.randomSplit([0.7,0.3])

In [33]:
lr1=LinearRegression(featuresCol='features1',labelCol='price')

In [34]:
lr_model1=lr1.fit(train_set1)

In [35]:
#lets evaluate the new model
test_res1=lr_model1.evaluate(test_set1)
test_res1.r2

0.7293865174738596

In [36]:
#This is a huge improvement. we got 72% this time. 
#time to predict
unlabled=test_set1.select('features1')

In [37]:
prediction=lr_model1.transform(unlabled)
prediction.show()

+--------------------+------------------+
|           features1|        prediction|
+--------------------+------------------+
|(17,[0,1,3,4,6,7,...| 6447.604210519698|
|(17,[0,1,3,4,6,7,...|11671.732283842859|
|(17,[0,2,3,4,7,8,...| 8135.455498816507|
|(17,[0,2,3,4,8,9,...|10192.703386790017|
|(17,[0,2,3,4,9,10...|  6001.27015953499|
|(17,[0,2,3,4,9,10...| 7004.987426870744|
|(17,[0,2,3,8,9,10...| 7484.009466438379|
|(17,[0,2,4,7,8,9,...| 7909.312229606625|
|(17,[0,3,4,6,7,8,...| 8998.932227432557|
|(17,[0,3,4,7,8,9,...| 7654.075675295014|
|(17,[2,3,4,6,7,9,...| 8485.378770503205|
|(17,[2,3,4,7,9,10...| 9959.823122298436|
|[0.0,0.0,8.0,4.0,...| 8466.610183768229|
|[0.0,0.0,8.0,4.0,...| 9196.893854181684|
|[0.0,0.0,13.0,8.0...| 9988.748641588476|
|[0.0,1.0,8.0,4.0,...| 7762.038375291028|
|[0.0,1.0,8.0,4.0,...| 7762.038375291028|
|[0.0,1.0,37.0,8.0...|8177.3301943853985|
|[0.0,2.0,8.0,4.0,...| 6508.635460660711|
|[1.0,0.0,21.0,3.0...|15314.030403037148|
+--------------------+------------

In [None]:
#Prediction column contains the predicted price. Thanks :) 