In [1]:
# Locate the local Spark installation before `pyspark` is imported in the
# next cell (per the findspark documentation, init() adds Spark's Python
# libraries to sys.path).
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession


In [4]:
# Reuse the active Spark session if one exists; otherwise start a new one
# with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [5]:
# Display the session object to confirm that Spark started.
spark

In [6]:
# Load the sample CSV, treating its first row as column names. No schema is
# supplied or inferred here, so every column is read as a string.
# NOTE(review): this absolute Windows path only resolves on the author's machine.
sample_csv_path = r'C:\Users\poorn\OneDrive\Documents\Prajapati_Sir_Naresh_IT\sample1.csv'
training = spark.read.csv(sample_csv_path, header=True)
training.show()

+--------+---+----------+------+
|    Name|age|Experience|Salary|
+--------+---+----------+------+
|    jack| 31|        10| 30000|
|    alex| 30|         8| 25000|
|caroline| 29|         4| 20000|
|    paul| 24|         3| 20000|
|  sandra| 21|         1| 15000|
|casandra| 23|         2| 18000|
+--------+---+----------+------+



In [7]:
# Inspect the column types Spark assigned when reading the CSV.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



In [8]:
# Cast the numeric columns (age, Experience, Salary) from string to integer.
from pyspark.sql.types import IntegerType

for numeric_column in ("age", "Experience", "Salary"):
    training = training.withColumn(
        numeric_column, training[numeric_column].cast(IntegerType())
    )

In [9]:
from pyspark.ml.feature import VectorAssembler

# Pack the predictor columns into one vector column, which the regression
# estimator later reads through its featuresCol.
featureassembler = VectorAssembler(
    inputCols=["age", "Experience"],
    outputCol="Independent Features",
)


In [10]:
# Append the assembled "Independent Features" vector column to the data.
output = featureassembler.transform(training)

In [11]:
# Show the data with the new vector column appended.
output.show()

+--------+---+----------+------+--------------------+
|    Name|age|Experience|Salary|Independent Features|
+--------+---+----------+------+--------------------+
|    jack| 31|        10| 30000|         [31.0,10.0]|
|    alex| 30|         8| 25000|          [30.0,8.0]|
|caroline| 29|         4| 20000|          [29.0,4.0]|
|    paul| 24|         3| 20000|          [24.0,3.0]|
|  sandra| 21|         1| 15000|          [21.0,1.0]|
|casandra| 23|         2| 18000|          [23.0,2.0]|
+--------+---+----------+------+--------------------+



In [12]:
# List the column names available after assembling the features.
output.columns

['Name', 'age', 'Experience', 'Salary', 'Independent Features']

In [13]:
# Keep only what the model needs: the label (Salary) and the feature vector.
finalized_data = output.select("Salary", "Independent Features")

In [14]:
# Show the label/feature pairs that will be split into train and test sets.
finalized_data.show()

+------+--------------------+
|Salary|Independent Features|
+------+--------------------+
| 30000|         [31.0,10.0]|
| 25000|          [30.0,8.0]|
| 20000|          [29.0,4.0]|
| 20000|          [24.0,3.0]|
| 15000|          [21.0,1.0]|
| 18000|          [23.0,2.0]|
+------+--------------------+



In [15]:
# Hold out roughly 25% of the rows for evaluation. The weights act as
# sampling probabilities rather than exact proportions, so the resulting
# sizes need not match 75/25 exactly, especially with only six rows.
# A fixed seed makes the split repeatable across runs; without it, every
# run trains and evaluates on different rows.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

In [16]:
# Confirm that age, Experience and Salary are now integer columns.
training.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [17]:
from pyspark.ml.regression import LinearRegression

In [18]:
# Configure the estimator, then fit it on the training split. `regressor`
# ends up bound to the fitted model, which the following cells inspect.
linear_regression = LinearRegression(
    featuresCol="Independent Features",
    labelCol="Salary",
)
regressor = linear_regression.fit(train_data)

In [20]:
# Fitted coefficients, in the order of the assembler's inputCols (age, Experience).
regressor.coefficients

DenseVector([-263.7076, 1767.624])

In [21]:
# Fitted intercept term of the linear model.
regressor.intercept

19919.060052212404

In [22]:
# Evaluate the fitted model on the held-out split; the returned summary
# carries the per-row predictions displayed in the next cell.
pred_results = regressor.evaluate(test_data)

In [23]:
# Show each held-out row alongside the model's predicted Salary.
pred_results.predictions.show()



+------+--------------------+-----------------+
|Salary|Independent Features|       prediction|
+------+--------------------+-----------------+
| 20000|          [29.0,4.0]|19342.03655352618|
+------+--------------------+-----------------+

