In [14]:
from pyspark.sql import SparkSession
# Start a Spark session for this analysis, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Car_Analysis_LR')
    .getOrCreate()
)

In [15]:
## Load the car dataset
# The first CSV row holds the column names; column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df_car = csv_reader.csv('car data.csv')

In [16]:
# Print the column names and the types Spark inferred while reading the CSV.
df_car.printSchema()

root
 |-- Car_Name: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Selling_Price: double (nullable = true)
 |-- Present_Price: double (nullable = true)
 |-- Kms_Driven: integer (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Seller_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Owner: integer (nullable = true)



### We will use the ML feature transformer "VectorAssembler" to combine the numeric input columns into a single feature-vector column.

In [None]:
#["Present_Price","Kms_Driven"]------>Independent feature

In [26]:
from pyspark.ml.feature import VectorAssembler
# Pack the two numeric predictor columns into one vector column,
# which is the input format the regression below reads via featuresCol.
featureassembler = VectorAssembler(
    inputCols=["Present_Price", "Kms_Driven"],
    outputCol="Independent_feature",
)

In [27]:
# Append the assembled "Independent_feature" vector column to the original frame.
df_car2 = featureassembler.transform(df_car)

# Preview the result (same as the default show(): first 20 rows, long cells truncated).
df_car2.show(n=20, truncate=True)

+-------------+----+-------------+-------------+----------+---------+-----------+------------+-----+-------------------+
|     Car_Name|Year|Selling_Price|Present_Price|Kms_Driven|Fuel_Type|Seller_Type|Transmission|Owner|Independent_feature|
+-------------+----+-------------+-------------+----------+---------+-----------+------------+-----+-------------------+
|         ritz|2014|         3.35|         5.59|     27000|   Petrol|     Dealer|      Manual|    0|     [5.59,27000.0]|
|          sx4|2013|         4.75|         9.54|     43000|   Diesel|     Dealer|      Manual|    0|     [9.54,43000.0]|
|         ciaz|2017|         7.25|         9.85|      6900|   Petrol|     Dealer|      Manual|    0|      [9.85,6900.0]|
|      wagon r|2011|         2.85|         4.15|      5200|   Petrol|     Dealer|      Manual|    0|      [4.15,5200.0]|
|        swift|2014|          4.6|         6.87|     42450|   Diesel|     Dealer|      Manual|    0|     [6.87,42450.0]|
|vitara brezza|2018|         9.2

In [28]:
# List the column names to confirm "Independent_feature" was appended after the original columns.
df_car2.columns

['Car_Name',
 'Year',
 'Selling_Price',
 'Present_Price',
 'Kms_Driven',
 'Fuel_Type',
 'Seller_Type',
 'Transmission',
 'Owner',
 'Independent_feature']

In [30]:
# Keep only what the model needs: the feature vector and the label column.
df_final = df_car2.select(["Independent_feature", "Selling_Price"])

# Quick look at the first five rows of the modelling frame.
df_final.show(n=5)

+-------------------+-------------+
|Independent_feature|Selling_Price|
+-------------------+-------------+
|     [5.59,27000.0]|         3.35|
|     [9.54,43000.0]|         4.75|
|      [9.85,6900.0]|         7.25|
|      [4.15,5200.0]|         2.85|
|     [6.87,42450.0]|          4.6|
+-------------------+-------------+
only showing top 5 rows



In [32]:
from pyspark.ml.regression import LinearRegression
# Train/test split: roughly 75% of the rows for training, 25% held out for evaluation.
# A fixed seed keeps the split (and therefore the fitted coefficients and the
# predictions shown below) reproducible across Restart & Run All.
train_data, test_data = df_final.randomSplit([0.75, 0.25], seed=42)

# Fit a linear regression of Selling_Price on the assembled feature vector.
# `regressor` holds the *fitted model*; later cells read its .coefficients,
# .intercept and call .evaluate() on it.
regressor = LinearRegression(
    featuresCol='Independent_feature',
    labelCol="Selling_Price",
).fit(train_data)

# Coefficients

In [33]:
# Fitted weights, in the same order as the assembler's inputCols: [Present_Price, Kms_Driven].
# NOTE(review): a weight may print as -0.0 because the vector repr rounds small values;
# Kms_Driven is on a much larger scale, so its weight is expected to be tiny — confirm by indexing it.
regressor.coefficients

DenseVector([0.5196, -0.0])

## Intercepts

In [35]:
# Constant (bias) term of the fitted linear model.
regressor.intercept

1.4239946266477388

# Predictions

In [37]:
# Score the fitted model on the held-out test split; the returned summary object
# exposes the per-row predictions displayed in the next cell.
pred_results=regressor.evaluate(test_data)

In [38]:
# Show the test rows with the model's "prediction" column next to the actual Selling_Price.
pred_results.predictions.show()

+-------------------+-------------+------------------+
|Independent_feature|Selling_Price|        prediction|
+-------------------+-------------+------------------+
|     [0.32,35000.0]|         0.18|0.9202553060486405|
|     [0.47,21000.0]|         0.27|1.2662051999814032|
|      [0.51,4000.0]|         0.45|1.6124270419641684|
|     [0.51,60000.0]|          0.3|0.5403990468657576|
|     [0.52,19000.0]|         0.35|1.3304728791447809|
|     [0.57,55000.0]|          0.2|0.6672929902056655|
|      [0.58,1900.0]|         0.25|1.6890014428541669|
|     [0.73,12000.0]|         0.42|1.5735964317535065|
|      [0.74,5000.0]|         0.65|1.7127961241513519|
|    [0.787,16000.0]|         0.65|1.5266413036922923|
|    [0.787,75000.0]|         0.38|0.3971832374278954|
|     [0.81,11800.0]|         0.65|1.6189946472489245|
|     [0.826,6000.0]|          0.5|1.7383400269867018|
|      [0.83,5500.0]|          0.4|1.7499901827185838|
|      [0.84,5000.0]|         0.78| 1.764758054256792|
|     [0.8