## PySpark ML

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one
# under the application name 'Machine Learning'.
spark = (
    SparkSession.builder
    .appName('Machine Learning')
    .getOrCreate()
)

In [3]:
# Bare expression: echoes the session object's repr as the cell output.
spark

### Read the Dataset

In [4]:
# Load the CSV using its first row as the header and letting Spark infer
# the column types (instead of reading every column as a string).
training = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Print the loaded rows as a text table.
training.show()

+-------+---+----------+------+
|   NAME|AGE|EXPERIENCE|SALARY|
+-------+---+----------+------+
|    ABI| 31|        10| 30000|
|BANERJI| 15|         8| 25000|
| CHARLI| 30|         8| 20000|
|   DOMC| 25|         6| 20100|
| HARSHA| 29|         3| 15000|
|   PAUL| 32|         5| 18250|
+-------+---+----------+------+



In [6]:
# Print each column's name, inferred type and nullability.
training.printSchema()

root
 |-- NAME: string (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- EXPERIENCE: integer (nullable = true)
 |-- SALARY: integer (nullable = true)



In [7]:
# List the column names of the loaded DataFrame.
training.columns

['NAME', 'AGE', 'EXPERIENCE', 'SALARY']

Combine the `AGE` and `EXPERIENCE` columns into a single new vector column, `Independent features`, which holds the model's independent (input) features.

In [9]:
from pyspark.ml.feature import VectorAssembler
# Assembler that packs the two numeric predictors into one vector column,
# which is the column later passed to the regressor as its features.
featureassembler = VectorAssembler(
    inputCols=["AGE", "EXPERIENCE"],
    outputCol="Independent features",
)

In [10]:
# Apply the assembler: the result keeps the original columns and adds the
# 'Independent features' vector column.
output = featureassembler.transform(training)

In [11]:
# Print the rows including the newly assembled feature vector.
output.show()

+-------+---+----------+------+--------------------+
|   NAME|AGE|EXPERIENCE|SALARY|Independent features|
+-------+---+----------+------+--------------------+
|    ABI| 31|        10| 30000|         [31.0,10.0]|
|BANERJI| 15|         8| 25000|          [15.0,8.0]|
| CHARLI| 30|         8| 20000|          [30.0,8.0]|
|   DOMC| 25|         6| 20100|          [25.0,6.0]|
| HARSHA| 29|         3| 15000|          [29.0,3.0]|
|   PAUL| 32|         5| 18250|          [32.0,5.0]|
+-------+---+----------+------+--------------------+



In [12]:
# List the column names after assembling the feature vector.
output.columns

['NAME', 'AGE', 'EXPERIENCE', 'SALARY', 'Independent features']

In [13]:
# Keep only what the model needs: the feature vector and the SALARY label.
finalized_data = output.select(
    "Independent features",
    "SALARY",
)
finalized_data.show()

+--------------------+------+
|Independent features|SALARY|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [15.0,8.0]| 25000|
|          [30.0,8.0]| 20000|
|          [25.0,6.0]| 20100|
|          [29.0,3.0]| 15000|
|          [32.0,5.0]| 18250|
+--------------------+------+



In [14]:
from pyspark.ml.regression import LinearRegression

In [15]:
## Train/test split and model fit
# A fixed seed makes the random 75/25 split -- and therefore the fitted
# coefficients and the metrics computed in later cells -- repeatable when the
# notebook is re-run on the same input.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only ever
# refers to the fitted model; later cells read `regressor.coefficients`,
# `regressor.intercept` and call `regressor.evaluate(...)`, and re-running this
# cell never calls `.fit` on an already-fitted model.
linear_regression = LinearRegression(
    featuresCol='Independent features',
    labelCol='SALARY',
)
regressor = linear_regression.fit(train_data)

In [None]:
## Coefficients
# Fitted weights, one per entry of the feature vector (AGE, EXPERIENCE).
regressor.coefficients

DenseVector([-272.0314, 1047.8382])

In [17]:
## Intercept
# Constant term of the fitted linear model.
regressor.intercept

20484.30388911884

In [18]:
### Predictions
# Score the held-out test split with the fitted model; the returned summary
# exposes the predictions DataFrame and the error metrics used in the
# following cells.
pred_results = regressor.evaluate(test_data)

In [19]:
# Show the test rows with the model's output appended in a `prediction` column.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent features|SALARY|        prediction|
+--------------------+------+------------------+
|          [25.0,6.0]| 20100|19970.547165908156|
|         [31.0,10.0]| 30000|22529.711419114574|
+--------------------+------+------------------+



In [20]:
# (mean absolute error, mean squared error) of the predictions on the test split.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(3799.870707488635, 27910984.758980803)