In [2]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('test 5').getOrCreate()

In [3]:
df = spark.read.csv('dataset/test3.csv',header=True,inferSchema=True)
df.show(3)

+-----+------+--------+-----------+
| Item|Weight|     MRP|MRP_Imputed|
+-----+------+--------+-----------+
|FDA15|   9.3|249.8092|   249.8092|
|DRC01|  5.92| 48.2692|    48.2692|
|FDN15|  17.5| 141.618|    141.618|
+-----+------+--------+-----------+
only showing top 3 rows



In [4]:
df.printSchema()

root
 |-- Item: string (nullable = true)
 |-- Weight: double (nullable = true)
 |-- MRP: double (nullable = true)
 |-- MRP_Imputed: double (nullable = true)



In [5]:
df.columns

['Item', 'Weight', 'MRP', 'MRP_Imputed']

In [7]:
training = df.drop('MRP_Imputed')
training.show(3)

+-----+------+--------+
| Item|Weight|     MRP|
+-----+------+--------+
|FDA15|   9.3|249.8092|
|DRC01|  5.92| 48.2692|
|FDN15|  17.5| 141.618|
+-----+------+--------+
only showing top 3 rows



In [8]:
training.columns

['Item', 'Weight', 'MRP']

In [10]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=["Weight","MRP"],outputCol="Independent Features")

In [12]:
output=featureassembler.transform(training)
output.show()

+-----+------+--------+--------------------+
| Item|Weight|     MRP|Independent Features|
+-----+------+--------+--------------------+
|FDA15|   9.3|249.8092|      [9.3,249.8092]|
|DRC01|  5.92| 48.2692|      [5.92,48.2692]|
|FDN15|  17.5| 141.618|      [17.5,141.618]|
|FDX07|  19.2| 182.095|      [19.2,182.095]|
|NCD19|  8.93| 53.8614|      [8.93,53.8614]|
|FDP36|10.395| 51.4008|    [10.395,51.4008]|
|FDO10| 13.65| 57.6588|     [13.65,57.6588]|
|FDP10|   0.0|107.7622|      [0.0,107.7622]|
|FDH17|  16.2| 96.9726|      [16.2,96.9726]|
|FDU28|  19.2|187.8214|     [19.2,187.8214]|
|FDY07|  11.8| 45.5402|      [11.8,45.5402]|
|FDA03|  18.5|144.1102|     [18.5,144.1102]|
|FDX32|  15.1|145.4786|     [15.1,145.4786]|
|FDS46|  17.6|119.6782|     [17.6,119.6782]|
|FDF32| 16.35|196.4426|    [16.35,196.4426]|
|FDP49|   9.0| 56.3614|       [9.0,56.3614]|
|NCB42|  11.8|115.3492|     [11.8,115.3492]|
|FDP49|   9.0| 54.3614|       [9.0,54.3614]|
|DRI11|   0.0|113.2834|      [0.0,113.2834]|
|FDU02| 13

In [13]:
output.columns

['Item', 'Weight', 'MRP', 'Independent Features']

In [14]:
finalized_data=output.select("Independent Features","MRP")

In [15]:
finalized_data.show()

+--------------------+--------+
|Independent Features|     MRP|
+--------------------+--------+
|      [9.3,249.8092]|249.8092|
|      [5.92,48.2692]| 48.2692|
|      [17.5,141.618]| 141.618|
|      [19.2,182.095]| 182.095|
|      [8.93,53.8614]| 53.8614|
|    [10.395,51.4008]| 51.4008|
|     [13.65,57.6588]| 57.6588|
|      [0.0,107.7622]|107.7622|
|      [16.2,96.9726]| 96.9726|
|     [19.2,187.8214]|187.8214|
|      [11.8,45.5402]| 45.5402|
|     [18.5,144.1102]|144.1102|
|     [15.1,145.4786]|145.4786|
|     [17.6,119.6782]|119.6782|
|    [16.35,196.4426]|196.4426|
|       [9.0,56.3614]| 56.3614|
|     [11.8,115.3492]|115.3492|
|       [9.0,54.3614]| 54.3614|
|      [0.0,113.2834]|113.2834|
|    [13.35,230.5352]|230.5352|
+--------------------+--------+
only showing top 20 rows



In [17]:
from pyspark.ml.regression import LinearRegression
##train test split
train_data,test_data=finalized_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='Independent Features', labelCol='MRP')
regressor=regressor.fit(train_data)

In [18]:
### Coefficients
regressor.coefficients

DenseVector([0.0, 1.0])

In [19]:
### Intercepts
regressor.intercept

0.0

In [20]:
### Prediction
pred_results=regressor.evaluate(test_data)

In [21]:
pred_results.predictions.show()

+--------------------+-------+----------+
|Independent Features|    MRP|prediction|
+--------------------+-------+----------+
|       [0.0,34.2216]|34.2216|   34.2216|
|       [0.0,35.0558]|35.0558|   35.0558|
|       [0.0,35.2532]|35.2532|   35.2532|
|       [0.0,35.4558]|35.4558|   35.4558|
|       [0.0,36.9874]|36.9874|   36.9874|
|       [0.0,37.8506]|37.8506|   37.8506|
|       [0.0,37.8822]|37.8822|   37.8822|
|        [0.0,39.548]| 39.548|    39.548|
|       [0.0,39.9506]|39.9506|   39.9506|
|       [0.0,40.2822]|40.2822|   40.2822|
|       [0.0,40.9822]|40.9822|   40.9822|
|       [0.0,41.2454]|41.2454|   41.2454|
|       [0.0,43.2796]|43.2796|   43.2796|
|       [0.0,43.4086]|43.4086|   43.4086|
|       [0.0,43.4112]|43.4112|   43.4112|
|        [0.0,44.377]| 44.377|    44.377|
|       [0.0,44.9428]|44.9428|   44.9428|
|       [0.0,45.4744]|45.4744|   45.4744|
|       [0.0,46.1402]|46.1402|   46.1402|
|       [0.0,46.2086]|46.2086|   46.2086|
+--------------------+-------+----

In [22]:
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(0.0, 0.0)