### Linear Regression in PySpark

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('pyml')
    .getOrCreate()
)

In [6]:
# Load the CSV, using its first line as column names and letting Spark infer
# each column's type from the data.
training = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Print the loaded rows as a text table (show() displays at most 20 rows by default).
training.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|   Srijit| 29|         6| 10000|
|    Parna| 25|         4|  8000|
|Soumyajit| 26|         4|  8400|
|   Rounak| 29|         7| 18000|
|   Varsha| 28|         6| 12000|
|   Anisha| 27|         5| 14000|
|   Aritra| 28|         5| 11000|
+---------+---+----------+------+



In [8]:
# Print each column's name, type and nullability to confirm the types Spark
# inferred while reading the CSV.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [9]:
# List the column names available for use as features and label.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [12]:
from pyspark.ml.feature import VectorAssembler

# Combine the numeric predictor columns into a single vector column; the
# regression estimator below reads its features from that one column.
featureassembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)

In [13]:
# Append the assembled "Independent Features" vector column to every row.
output = featureassembler.transform(training)

In [14]:
# Print the rows again to check the new feature-vector column.
output.show()

+---------+---+----------+------+--------------------+
|     Name|Age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|   Srijit| 29|         6| 10000|          [29.0,6.0]|
|    Parna| 25|         4|  8000|          [25.0,4.0]|
|Soumyajit| 26|         4|  8400|          [26.0,4.0]|
|   Rounak| 29|         7| 18000|          [29.0,7.0]|
|   Varsha| 28|         6| 12000|          [28.0,6.0]|
|   Anisha| 27|         5| 14000|          [27.0,5.0]|
|   Aritra| 28|         5| 11000|          [28.0,5.0]|
+---------+---+----------+------+--------------------+



In [15]:
# List the column names after assembling; the feature-vector column is now included.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [16]:
# Keep only the two columns the model needs: the feature vector and the label.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)

In [17]:
# Print the modelling table (feature vector and Salary label).
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [29.0,6.0]| 10000|
|          [25.0,4.0]|  8000|
|          [26.0,4.0]|  8400|
|          [29.0,7.0]| 18000|
|          [28.0,6.0]| 12000|
|          [27.0,5.0]| 14000|
|          [28.0,5.0]| 11000|
+--------------------+------+



In [18]:
from pyspark.ml.regression import LinearRegression

# Train/test split. A fixed seed makes the split -- and therefore the fitted
# coefficients and the error metrics -- repeatable across "Restart & Run All";
# without it every run produces different numbers.
# NOTE: the dataset is tiny, so a 25% test split may contain only a row or two
# (or none). If the test split comes out empty, choose a different seed.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator and the fitted model under separate names.
# `regressor` is the fitted model, which is what the later cells read.
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')
regressor = lr_estimator.fit(train_data)

In [19]:
# Fitted weights, one per input feature in assembler order (Age, Experience).
regressor.coefficients

DenseVector([63.1579, 2810.5263])

In [20]:
# Intercept (constant term) of the fitted linear model -- a single scalar.
regressor.intercept

-4336.842105257627

In [21]:
# Score the held-out rows; the returned summary holds the predictions and the
# error metrics read in the following cells.
pred_results = regressor.evaluate(test_data)

In [22]:
# Print the test rows with the model's `prediction` column next to the true Salary.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [29.0,6.0]| 10000|14357.894736841878|
+--------------------+------+------------------+



In [23]:
# Test-set error as a (mean absolute error, mean squared error) pair.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(4357.894736841878, 18991246.537394136)