# Machine learning with the PySpark ML library
## Linear regression

In [2]:
import os
import pandas as pd
from pyspark.sql import SparkSession

In [3]:
# Locate the employee dataset relative to the notebook's working directory:
# one level up, inside the "docs" folder.
CURR_DIR = os.path.abspath(os.curdir)
DOC_NAME = os.path.join(
    CURR_DIR,
    os.pardir,
    "docs",
    "employee_dataset.csv",
)

In [4]:
# Load the raw CSV with pandas for a quick look at the summary statistics
# of its numeric columns before handing the file to Spark.
df = pd.read_csv(DOC_NAME)
df_describe = df.describe()
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump produced by print().
df_describe

             Age  Experience        Salary
count  11.000000   11.000000     10.000000
mean   42.454545    5.181818  35190.000000
std    19.054467    2.638870  15749.317446
min    23.000000    2.000000   4500.000000
25%    24.500000    3.500000  27500.000000
50%    36.000000    5.000000  35600.000000
75%    56.000000    6.500000  42800.000000
max    78.000000   10.000000  65000.000000


In [9]:
# Start (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName('Practise').getOrCreate()

# Read the CSV with a header row and inferred column types, then discard
# every row that has a null in any column.
training = (
    spark.read
    .csv(DOC_NAME, header=True, inferSchema=True)
    .dropna(how='any')
)
training.show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
| aaa| 25|         2| 45000|
| bbb| 56|         4| 65000|
| ccc| 24|         5| 35000|
| ddd| 36|         7| 25000|
| iii| 65|         5|  4500|
| jjj| 45|         3| 25000|
+----+---+----------+------+



In [10]:
# Print each column's name, type and nullability for the Spark DataFrame
# (types come from inferSchema=True on the CSV read).
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [12]:
# Display the column names of the Spark DataFrame.
training.columns

['Name', 'Age', 'Experience', 'Salary']

### Group features
Combine the input columns `['Age', 'Experience']` into a single new vector column, `Independent feature`, which holds the independent features for the model.

In [19]:
from pyspark.ml.feature import VectorAssembler

In [23]:
# Assembler that packs the Age and Experience columns into one vector
# column named "Independent feature".
feature_assmebler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol="Independent feature",
)

In [27]:
# Apply the assembler: every row gains the 'Independent feature' vector column.
output = feature_assmebler.transform(training)
output.show()
# Last expression of the cell: display the column names, including the new one.
output.columns

+----+---+----------+------+-------------------+
|Name|Age|Experience|Salary|Independent feature|
+----+---+----------+------+-------------------+
| aaa| 25|         2| 45000|         [25.0,2.0]|
| bbb| 56|         4| 65000|         [56.0,4.0]|
| ccc| 24|         5| 35000|         [24.0,5.0]|
| ddd| 36|         7| 25000|         [36.0,7.0]|
| iii| 65|         5|  4500|         [65.0,5.0]|
| jjj| 45|         3| 25000|         [45.0,3.0]|
+----+---+----------+------+-------------------+



['Name', 'Age', 'Experience', 'Salary', 'Independent feature']

In [30]:
# Keep only the two columns the regression uses: the assembled feature vector
# and the Salary column.
finalized_data = output.select('Independent feature','Salary')
finalized_data.show()

+-------------------+------+
|Independent feature|Salary|
+-------------------+------+
|         [25.0,2.0]| 45000|
|         [56.0,4.0]| 65000|
|         [24.0,5.0]| 35000|
|         [36.0,7.0]| 25000|
|         [65.0,5.0]|  4500|
|         [45.0,3.0]| 25000|
+-------------------+------+



In [63]:
### import linear regression
from pyspark.ml.regression import LinearRegression
### train and test data split
# randomSplit normalizes weights that do not sum to 1, so the previous
# [0.9, 0.2] silently became roughly [0.82, 0.18]. State the intended
# 80/20 split explicitly instead.
# A fixed seed makes the split, and therefore the fitted model and its
# metrics, reproducible across runs.
# NOTE: rows are assigned to a side at random, so with only a handful of rows
# either side can end up tiny or even empty; choose another seed if that happens.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=42)

In [64]:
### Build the linear-regression estimator and fit it on the training split.
# `regressor` is bound once, directly to the fitted model.
regressor = LinearRegression(
    featuresCol='Independent feature',
    labelCol='Salary',
).fit(train_data)

In [65]:
### Coefficients of the fitted model, one per assembled input feature (Age, Experience)
regressor.coefficients

DenseVector([-252.9989, -4064.3402])

In [66]:
### Intercept (constant term) of the fitted model
regressor.intercept

61456.37949836427

In [68]:
### Prediction of test data
# evaluate() scores the held-out split; the result's .predictions DataFrame
# shows the input columns alongside a 'prediction' column.
prediction_result = regressor.evaluate(test_data)
prediction_result.predictions.show()

+-------------------+------+-----------------+
|Independent feature|Salary|       prediction|
+-------------------+------+-----------------+
|         [24.0,5.0]| 35000|35062.70447110146|
+-------------------+------+-----------------+



In [69]:
### Prediction error on the test data, displayed as a tuple:
### (mean absolute error, mean squared error)
prediction_result.meanAbsoluteError,prediction_result.meanSquaredError

(62.70447110146051, 3931.8506961138964)