In [8]:
from pyspark.sql import SparkSession

In [9]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [10]:
# Load the measurements CSV: the first row supplies column names and Spark
# infers each column's type from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
training = csv_reader.csv('female.csv')

# Preview only the three columns this notebook works with.
training.select(
    'chestcircumference',
    'waistcircumference',
    'weightkg',
).show()

+------------------+------------------+--------+
|chestcircumference|waistcircumference|weightkg|
+------------------+------------------+--------+
|               922|               850|     657|
|               839|               708|     534|
|               874|               727|     663|
|              1008|               923|     782|
|              1089|              1163|     886|
|              1031|               968|     837|
|               988|               964|     732|
|               968|               914|     760|
|               831|               755|     545|
|              1010|               944|     684|
|              1065|               935|     990|
|              1034|               942|     742|
|              1063|               995|     781|
|               937|               848|     718|
|               989|               929|     709|
|               940|               835|     672|
|               835|               772|     620|
|               897|

In [15]:
# Print the first 20 rows of the full frame (every column, values truncated).
# NOTE(review): the table is extremely wide; consider selecting columns first.
training.show(n=20, truncate=True)

+---------+------------------------------+--------------+---------------------+------------------+------------+-----------------------+----------------+-----------------+-------------------------+----------------+----------------+------------------+----------------+-------------------------+------------------+--------------------+------------+-------------+-----------------+----------------------+-----------------+---------------+------------+------------------+----------+-----------+------------+---------------------+------------------------------+----------+---------+-------------+---------------+----------------+---------------------+----------+-------------------------+--------------------------+---------------------+-----------------+-------------------+-----------+-----------------+----------+-----------+-----------------+----------+----------------------+-----------+----------+-----------------+------------------+---------------------+----------+-----------+--------------------+

In [11]:
# Confirm that the loaded object is a Spark DataFrame.
type(training)

pyspark.sql.dataframe.DataFrame

In [12]:
from pyspark.ml.feature import VectorAssembler

# Pack the two predictor columns into one vector column. The vector keeps the
# order given here: waist first, then chest.
featureassembler = VectorAssembler(
    inputCols=["waistcircumference", "chestcircumference"],
    outputCol="Independent Feature",
)

In [16]:
# Append the assembled feature vector to the three columns of interest.
output = featureassembler.transform(
    training.select('chestcircumference', 'waistcircumference', 'weightkg')
)

In [17]:
# Inspect the first 20 rows, including the new vector column.
output.show(n=20, truncate=True)

+------------------+------------------+--------+-------------------+
|chestcircumference|waistcircumference|weightkg|Independent Feature|
+------------------+------------------+--------+-------------------+
|               922|               850|     657|      [850.0,922.0]|
|               839|               708|     534|      [708.0,839.0]|
|               874|               727|     663|      [727.0,874.0]|
|              1008|               923|     782|     [923.0,1008.0]|
|              1089|              1163|     886|    [1163.0,1089.0]|
|              1031|               968|     837|     [968.0,1031.0]|
|               988|               964|     732|      [964.0,988.0]|
|               968|               914|     760|      [914.0,968.0]|
|               831|               755|     545|      [755.0,831.0]|
|              1010|               944|     684|     [944.0,1010.0]|
|              1065|               935|     990|     [935.0,1065.0]|
|              1034|              

In [19]:
# Keep only the feature vector and the label column for modelling.
finalized_data = output.select(["Independent Feature", "weightkg"])

In [20]:
# Preview the first 20 rows of the modelling frame (features + label).
finalized_data.show(n=20, truncate=True)

+-------------------+--------+
|Independent Feature|weightkg|
+-------------------+--------+
|      [850.0,922.0]|     657|
|      [708.0,839.0]|     534|
|      [727.0,874.0]|     663|
|     [923.0,1008.0]|     782|
|    [1163.0,1089.0]|     886|
|     [968.0,1031.0]|     837|
|      [964.0,988.0]|     732|
|      [914.0,968.0]|     760|
|      [755.0,831.0]|     545|
|     [944.0,1010.0]|     684|
|     [935.0,1065.0]|     990|
|     [942.0,1034.0]|     742|
|     [995.0,1063.0]|     781|
|      [848.0,937.0]|     718|
|      [929.0,989.0]|     709|
|      [835.0,940.0]|     672|
|      [772.0,835.0]|     620|
|      [805.0,897.0]|     595|
|      [868.0,979.0]|     741|
|      [792.0,853.0]|     605|
+-------------------+--------+
only showing top 20 rows



In [25]:
# Train/test split and model fit
from pyspark.ml.regression import LinearRegression

# Seed the 75/25 split so that the partition, the fitted coefficients and the
# error metrics below are repeatable across kernel restarts (for the same
# input data). Without a seed, randomSplit draws a new partition every run.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator under its own name so that `regressor` only ever
# refers to the fitted model that the following cells query.
lr_estimator = LinearRegression(featuresCol='Independent Feature', labelCol='weightkg')
regressor = lr_estimator.fit(train_data)

In [26]:
# Fitted coefficients, one per entry of the assembled feature vector
# (the assembler's inputCols order: waistcircumference, chestcircumference).
regressor.coefficients

DenseVector([0.5173, 0.5818])

In [28]:
# Intercept term of the fitted linear model.
regressor.intercept

-319.59515378564225

In [29]:
# Evaluate the fitted model on the held-out split. The returned summary holds
# the per-row predictions and the aggregate error metrics read below.
pred_result = regressor.evaluate(test_data)

In [30]:
# Compare actual weightkg with the model's prediction for the first 20 test rows.
pred_result.predictions.show(n=20, truncate=True)

+-------------------+--------+------------------+
|Independent Feature|weightkg|        prediction|
+-------------------+--------+------------------+
|      [615.0,695.0]|     358| 402.8474138995171|
|      [628.0,798.0]|     470|469.49276791868414|
|      [650.0,825.0]|     485|496.58015804301266|
|      [665.0,845.0]|     470| 515.9743531868419|
|      [669.0,776.0]|     504|477.90231560855506|
|      [672.0,780.0]|     510|481.78115463732087|
|      [672.0,878.0]|     520| 538.7931902754042|
|      [679.0,862.0]|     487| 533.1060095449247|
|      [682.0,802.0]|     516|  499.752498769228|
|      [690.0,867.0]|     546| 541.7047831486839|
|      [693.0,869.0]|     480| 544.4201112460603|
|      [697.0,864.0]|     529| 543.5804234722359|
|      [698.0,790.0]|     478| 501.0477913994884|
|      [698.0,814.0]|     499| 515.0099225761619|
|      [698.0,890.0]|     526|  559.223337968961|
|      [699.0,892.0]|     532| 560.9041212890129|
|      [700.0,784.0]|     536| 498.5918033826446|


In [33]:
# Mean absolute error on the held-out split, in the same units as weightkg.
pred_result.meanAbsoluteError

42.92489811684537

In [34]:
# Mean squared error on the held-out split (squared weightkg units).
pred_result.meanSquaredError

3015.892239887554