# PySpark MLlib

**Example**

In [1]:
from pyspark.sql import SparkSession

# Get the active SparkSession, or create one, under the application name 'mlib'.
spark = (
    SparkSession.builder
    .appName('mlib')
    .getOrCreate()
)

In [2]:
import pandas as pd

# Small hand-written sample, one tuple per person: (Name, Age, Experience, Salary).
data = pd.DataFrame(
    [
        ('Krish', 31, 10, 30000),
        ('Sudhanshu', 30, 8, 25000),
        ('Sunny', 29, 4, 20000),
        ('Paul', 24, 3, 20000),
        ('Harsha', 21, 1, 15000),
        ('Shubham', 23, 2, 18000),
    ],
    columns=['Name', 'Age', 'Experience', 'Salary'],
)

# Write the sample to disk without the pandas index so the CSV holds only the
# four data columns.
data.to_csv('test5_pyspark.csv', index=False)

In [3]:
# Load the sample CSV into a Spark DataFrame: the first line supplies the
# column names (header=True) and column types are inferred (inferSchema=True).
df_pyspark = spark.read.csv(
    'test5_pyspark.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [4]:
# List the column names of the loaded DataFrame.
df_pyspark.columns

['Name', 'Age', 'Experience', 'Salary']

In [5]:
# Combine the independent features 'Age' and 'Experience' into a single new vector column using VectorAssembler.

In [6]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the 'Age' and 'Experience' columns into one vector
# column named 'Independent Features'.
featureassembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent Features',
)

In [7]:
# Apply the assembler to df_pyspark; the result keeps the original columns and
# gains the 'Independent Features' vector column.
output=featureassembler.transform(df_pyspark)
output.show()

+---------+---+----------+------+--------------------+
|     Name|Age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|  Shubham| 23|         2| 18000|          [23.0,2.0]|
+---------+---+----------+------+--------------------+



In [8]:
# List the column names after assembling the feature vector.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [9]:
# Keep only what the regression needs: the assembled feature vector and the
# 'Salary' label column.
finalized_dt = output.select(
    'Independent Features',
    'Salary',
)
finalized_dt.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



**Train/test split and model fitting**

In [10]:
from pyspark.ml.regression import LinearRegression

# Split the rows at random into roughly 75% training / 25% test data.
# A fixed seed makes the split repeatable, so the fitted model and the metrics
# computed from it do not change between runs of the notebook.
train_data, test_data = finalized_dt.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name; `regressor` is bound only to
# the fitted model, which the later cells query (coefficients, intercept,
# evaluate).
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')
regressor = lr_estimator.fit(train_data)

In [11]:
# Coefficients of the fitted model, one per entry of the feature vector
# (assembled from 'Age' and 'Experience').
regressor.coefficients

DenseVector([-258.8832, 1670.0508])

In [12]:
# Intercept term of the fitted model.
regressor.intercept

20543.147208121274

In [13]:
# Evaluate the fitted model on the held-out test split. The returned object
# exposes the predictions and the error metrics inspected in the next cells.
pred_results = regressor.evaluate(test_data)

In [14]:
# Show the test rows together with the model's 'prediction' column.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [21.0,1.0]| 15000|16776.649746192812|
|          [24.0,3.0]| 20000|19340.101522842593|
+--------------------+------+------------------+



In [15]:
# Mean absolute error and mean squared error on the test split, shown as a tuple.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(1218.2741116751095, 1795975.1604008244)

- Note: Databricks is an open, unified data analytics platform for data engineering, data science, machine learning, and analytics.