In [1]:
!pip install pyspark



In [2]:
import pyspark
import pandas as pd

In [3]:
pd.read_csv('dataset.csv')

Unnamed: 0,Name,Age,Experience,Salary
0,Krish,31.0,10.0,30000.0
1,Sudhanshu,30.0,8.0,25000.0
2,Sunny,29.0,6.0,15000.0
3,Ishan,32.0,4.0,10000.0
4,Nishan,26.0,,
5,Rahamatullah,62.0,60.0,
6,,,,


In [4]:
# we have to create a spark session
from pyspark.sql import SparkSession

In [8]:
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [9]:
spark

In [10]:
# Read the CSV with Spark's default reader options: the header line is treated as
# an ordinary data row and every column is loaded as a string (see the output below).
df_pyspark = spark.read.csv('dataset.csv')

In [11]:
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string]

In [12]:
df_pyspark.show()

+------------+----+----------+------+
|         _c0| _c1|       _c2|   _c3|
+------------+----+----------+------+
|        Name| Age|Experience|Salary|
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [13]:
spark.read.option('header','true').csv('dataset.csv')

DataFrame[Name: string, Age: string, Experience: string, Salary: string]

In [14]:
# show() only prints the table and returns None, so keep the DataFrame in
# df_pyspark and display it in a separate step.
df_pyspark = spark.read.option('header','true').csv('dataset.csv')
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [15]:
df_pyspark = spark.read.option('header','true').csv('dataset.csv')

In [16]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [17]:
df_pyspark.head(3)

[Row(Name='Krish', Age='31', Experience='10', Salary='30000'),
 Row(Name='Sudhanshu', Age='30', Experience='8', Salary='25000'),
 Row(Name='Sunny', Age='29', Experience='6', Salary='15000')]

In [18]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



## PySpark DataFrame - Part 1

In [19]:
from pyspark.sql import SparkSession

In [20]:
# getOrCreate() returns the session that is already running in this kernel if there is one, otherwise it starts a new one.
# NOTE(review): `spark` was already created earlier with appName 'Practise'; presumably this call just returns that session — confirm whether the new app name takes effect.
spark = SparkSession.builder.appName('DataFrame').getOrCreate()

In [21]:
spark

In [54]:
## read dataset
# header=True takes the column names from the first line; inferSchema=True lets Spark infer the column types from the data.
df_pyspark = spark.read.csv('dataset.csv', header=True, inferSchema=True)

In [55]:
## check the schema
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [24]:
# Read the CSV with a header row and inferred column types, then print it.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('dataset.csv')
)
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [25]:
# NOTE(review): the output recorded below lists Age/Experience/Salary as string,
# which does not match the inferSchema = True read in the previous cell; it looks
# like stale output from an earlier run — re-run the notebook to refresh it.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



In [26]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [27]:
df_pyspark.columns

['Name', 'Age', 'Experience', 'Salary']

In [28]:
df_pyspark.head(3)

[Row(Name='Krish', Age='31', Experience='10', Salary='30000'),
 Row(Name='Sudhanshu', Age='30', Experience='8', Salary='25000'),
 Row(Name='Sunny', Age='29', Experience='6', Salary='15000')]

In [29]:
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [30]:
df_pyspark.select('Name').show()

+------------+
|        Name|
+------------+
|       Krish|
|   Sudhanshu|
|       Sunny|
|       Ishan|
|      Nishan|
|Rahamatullah|
|        NULL|
+------------+



In [31]:
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [32]:
# Select several columns at once by passing their names as separate arguments.
df_pyspark.select('Name', 'Experience').show()

+------------+----------+
|        Name|Experience|
+------------+----------+
|       Krish|        10|
|   Sudhanshu|         8|
|       Sunny|         6|
|       Ishan|         4|
|      Nishan|      null|
|Rahamatullah|        60|
|        NULL|      NULL|
+------------+----------+



In [33]:
df_pyspark.dtypes

[('Name', 'string'),
 ('Age', 'string'),
 ('Experience', 'string'),
 ('Salary', 'string')]

In [34]:
df_pyspark.describe().show()

+-------+-----+------------------+------------------+-----------------+
|summary| Name|               Age|        Experience|           Salary|
+-------+-----+------------------+------------------+-----------------+
|  count|    7|                 7|                 6|                5|
|   mean| null|              35.0|              17.6|          20000.0|
| stddev| null|13.386560424545209|23.807561823924768|9128.709291752768|
|    min|Ishan|                26|                10|            10000|
|    max|Sunny|              NULL|              NULL|             NULL|
+-------+-----+------------------+------------------+-----------------+



In [35]:
### Adding columns in DataFrame
# withColumn returns a new DataFrame, so the result is assigned back to df_pyspark.
df_pyspark = df_pyspark.withColumn('Experience After 2 years',df_pyspark['Experience']+2)

In [36]:
df_pyspark.show()

+------------+----+----------+------+------------------------+
|        Name| Age|Experience|Salary|Experience After 2 years|
+------------+----+----------+------+------------------------+
|       Krish|  31|        10| 30000|                    12.0|
|   Sudhanshu|  30|         8| 25000|                    10.0|
|       Sunny|  29|         6| 15000|                     8.0|
|       Ishan|  32|         4| 10000|                     6.0|
|      Nishan|  26|      null|  null|                    null|
|Rahamatullah|  62|        60|  null|                    62.0|
|        NULL|NULL|      NULL|  NULL|                    null|
+------------+----+----------+------+------------------------+



In [37]:
### Drop the column
# Use the exact name (including capitalisation) that withColumn created, so the
# drop does not depend on Spark's case-insensitive column resolution.
df_pyspark = df_pyspark.drop('Experience After 2 years')

In [38]:
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [39]:
### rename the column
# Only the displayed result carries the new name; nothing is assigned, so df_pyspark still has a 'Name' column.
df_pyspark.withColumnRenamed('Name','New Name').show()

+------------+----+----------+------+
|    New Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



## PySpark: Handling Missing Values

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [3]:
spark

In [4]:
df_pyspark = spark.read.csv('dataset.csv', header = True, inferSchema = True)

In [5]:
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|          23|null|      null|  null|
|        null|null|         2|  null|
+------------+----+----------+------+



In [6]:
## Dropping a column: drop('Name') removes the Name column (not a row); the result is only displayed, df_pyspark is not reassigned
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         6| 15000|
|  32|         4| 10000|
|  26|      null|  null|
|  62|        60|  null|
|null|      null|  null|
|null|         2|  null|
+----+----------+------+



In [7]:
## Drop every row that contains at least one null value (na.drop() with its default arguments)
df_pyspark.na.drop().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         6| 15000|
|    Ishan| 32|         4| 10000|
+---------+---+----------+------+



In [8]:
### how='any' is the default for na.drop(), which is why the previous call removed every row containing a null
### Show the frame again: that result was not assigned, so df_pyspark is unchanged
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|          23|null|      null|  null|
|        null|null|         2|  null|
+------------+----+----------+------+



In [9]:
# how="all" drops a row only when every column in it is null; the output below still has all 8 rows, so no row met that condition.
df_pyspark.na.drop(how = "all").show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|          23|null|      null|  null|
|        null|null|         2|  null|
+------------+----+----------+------+



In [10]:
## threshold
# thresh=4 keeps only rows with at least 4 non-null values. When thresh is given it takes precedence over `how`, so how="any" has no effect here.
df_pyspark.na.drop(how="any", thresh = 4).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         6| 15000|
|    Ishan| 32|         4| 10000|
+---------+---+----------+------+



In [11]:
# subset limits the null check to the listed columns: a row is dropped only when its Experience value is null.
df_pyspark.na.drop(how = "any", subset = ['Experience']).show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|Rahamatullah|  62|        60|  null|
|        null|null|         2|  null|
+------------+----+----------+------+



In [12]:
# A string fill value is applied to string columns only: in the output below just the null Name is replaced, while the nulls in the numeric columns remain.
df_pyspark.na.fill('Missing Values').show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|     Sudhanshu|  30|         8| 25000|
|         Sunny|  29|         6| 15000|
|         Ishan|  32|         4| 10000|
|        Nishan|  26|      null|  null|
|  Rahamatullah|  62|        60|  null|
|            23|null|      null|  null|
|Missing Values|null|         2|  null|
+--------------+----+----------+------+



In [13]:
# The second argument is the subset of columns to fill. Experience was read as an integer column, so the string value is not applied and the output is unchanged.
# NOTE(review): pass a numeric value (e.g. 0) if the intent is to fill the nulls in Experience.
df_pyspark.na.fill('Missing Value','Experience').show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|          23|null|      null|  null|
|        null|null|         2|  null|
+------------+----+----------+------+



In [14]:
# Same call with an explicit subset list. Both listed columns are integer-typed, so the string fill value is ignored and the nulls remain (see the output below).
df_pyspark.na.fill('Missing Value',subset = ['Experience','Salary']).show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|          23|null|      null|  null|
|        null|null|         2|  null|
+------------+----+----------+------+



In [56]:
spark.read.option('header','true').csv('dataset.csv', inferSchema = True)

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [57]:
df_pyspark = spark.read.option('header','true').csv('dataset.csv', inferSchema = True)

In [58]:
df_pyspark

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

## Handling missing values with the mean of each column


In [59]:
from pyspark.ml.feature import Imputer

# Mean-imputer for the three numeric columns; each result is written to a new "<column>_imputed" column.
imputer = Imputer(
    strategy="mean",
    inputCols=['Age','Experience','Salary'],
    outputCols=list(map("{}_imputed".format, ['Age','Experience','Salary'])),
)

In [61]:
# Fit the imputer on df_pyspark, then show the frame with the three *_imputed columns added.
# The result is only displayed; df_pyspark itself is not reassigned.
imputer.fit(df_pyspark).transform(df_pyspark).show()

+------------+----+----------+------+-----------+------------------+--------------+
|        Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+------------+----+----------+------+-----------+------------------+--------------+
|       Krish|  31|        10| 30000|         31|                10|         30000|
|   Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|       Sunny|  29|         6| 15000|         29|                 6|         15000|
|       Ishan|  32|         4| 10000|         32|                 4|         10000|
|      Nishan|  26|      null|  null|         26|                15|         20000|
|Rahamatullah|  62|        60|  null|         62|                60|         20000|
|          23|null|      null|  null|         35|                15|         20000|
|        null|null|         2|  null|         35|                 2|         20000|
+------------+----+----------+------+-----------+------------------+--------------+

In [29]:
# NOTE(review): the execution count (29) is out of sequence and the recorded output has _c0.._c3 headers,
# which suggests this cell was run against a frame read without header=True. The output looks stale — re-run or remove this cell.
df_pyspark.show()

+------------+----+----------+------+
|         _c0| _c1|       _c2|   _c3|
+------------+----+----------+------+
|        Name| Age|Experience|Salary|
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|          23|null|      null|  null|
|        null|null|         2|  null|
+------------+----+----------+------+



## PySpark DataFrames
## - Filter operations
## - Operators: &, |, ==
## - Negation: ~

In [62]:
from pyspark.sql import SparkSession

In [63]:
spark = SparkSession.builder.appName("Dataframe").getOrCreate()

In [64]:
spark

In [68]:
#df_pyspark = spark.read.option('header','true').csv('dataset.csv', inferSchema = True)
df_pyspark = spark.read.csv('dataset.csv',header = True, inferSchema = True)

In [69]:
df_pyspark

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [70]:
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|          23|null|      null|  null|
|        null|null|         2|  null|
+------------+----+----------+------+



### Filter Operations

In [71]:
from pyspark.ml.feature import Imputer
# Mean-imputer for Age, Experience and Salary; results go to new "<column>_imputed" columns.
# NOTE(review): this repeats the imputer definition from the missing-values section earlier in the notebook.
imputer = Imputer(
    inputCols = ['Age','Experience','Salary'],
    outputCols = ["{}_imputed".format(c) for c in ['Age','Experience','Salary']]
).setStrategy("mean")

In [74]:
df_sample = imputer.fit(df_pyspark).transform(df_pyspark)

In [76]:
df_sample.show()

+------------+----+----------+------+-----------+------------------+--------------+
|        Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+------------+----+----------+------+-----------+------------------+--------------+
|       Krish|  31|        10| 30000|         31|                10|         30000|
|   Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|       Sunny|  29|         6| 15000|         29|                 6|         15000|
|       Ishan|  32|         4| 10000|         32|                 4|         10000|
|      Nishan|  26|      null|  null|         26|                15|         20000|
|Rahamatullah|  62|        60|  null|         62|                60|         20000|
|          23|null|      null|  null|         35|                15|         20000|
|        null|null|         2|  null|         35|                 2|         20000|
+------------+----+----------+------+-----------+------------------+--------------+

In [92]:
df_pyspark = spark.read.csv('testset.csv', header = True, inferSchema = True)

In [93]:
df_pyspark.show()

+---------+------------+------+
|     Name|  Department|Salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [95]:
 ### Rows whose Salary is less than or equal to 3000 (condition written as a SQL expression string)
df_pyspark.filter("Salary <= 3000").show()

+------+------------+------+
|  Name|  Department|Salary|
+------+------------+------+
|Mahesh|Data Science|  3000|
| Sunny|    Big Data|  2000|
+------+------------+------+



In [97]:
# Keep the rows with Salary <= 3000, then show only the Name and Department columns.
(
    df_pyspark
    .filter("Salary <= 3000")
    .select('Name', 'Department')
    .show()
)

+------+------------+
|  Name|  Department|
+------+------------+
|Mahesh|Data Science|
| Sunny|    Big Data|
+------+------------+



In [98]:
# The Salary <= 3000 filter expressed as a Column condition instead of a SQL string.
df_pyspark.filter(df_pyspark.Salary <= 3000).show()

+------+------------+------+
|  Name|  Department|Salary|
+------+------------+------+
|Mahesh|Data Science|  3000|
| Sunny|    Big Data|  2000|
+------+------------+------+



In [100]:
# Combine conditions with & (and). Each comparison needs its own parentheses
# because & binds more tightly than <= and == in Python.
df_pyspark.filter(
    (df_pyspark.Salary <= 3000)
    & (df_pyspark.Department == 'Big Data')
).show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
|Sunny|  Big Data|  2000|
+-----+----------+------+



In [101]:
# ~ negates the condition: show the rows that do NOT satisfy Salary <= 3000.
df_pyspark.filter(~(df_pyspark['Salary'] <= 3000)).show()

+---------+------------+------+
|     Name|  Department|Salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
+---------+------------+------+



## PySpark Groupby and Aggregate functions

In [102]:
from pyspark.sql import SparkSession

In [103]:
spark = SparkSession.builder.appName("Groupby_and_aggregate_functions").getOrCreate()

In [104]:
spark

In [108]:
df_pyspark = spark.read.csv('testset.csv', header = True, inferSchema = True)

In [109]:
df_pyspark

DataFrame[Name: string, Department: string, Salary: int]

In [110]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [111]:
## Groupby
### Group by Name and sum the numeric columns (total salary per person)
df_pyspark.groupby('Name').sum().show()

+---------+-----------+
|     Name|sum(Salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [112]:
### Group by Department and sum the numeric columns (total salary per department)
df_pyspark.groupBy('Department').sum().show()

+------------+-----------+
|  Department|sum(Salary)|
+------------+-----------+
|         IOT|      15000|
|    Big Data|      15000|
|Data Science|      43000|
+------------+-----------+



In [113]:
### Average salary per department
df_pyspark.groupBy('Department').mean().show()

+------------+-----------+
|  Department|avg(Salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



In [114]:
### Number of rows per department
df_pyspark.groupBy('Department').count().show()

+------------+-----+
|  Department|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [115]:
### Total salary over the whole frame: aggregate over a single, un-keyed group
df_pyspark.groupBy().agg({'Salary': 'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+



In [116]:
### Highest salary per person
df_pyspark.groupBy('Name').max().show()

+---------+-----------+
|     Name|max(Salary)|
+---------+-----------+
|Sudhanshu|      20000|
|    Sunny|      10000|
|    Krish|      10000|
|   Mahesh|       4000|
+---------+-----------+



## Examples of PySpark ML

In [169]:
from pyspark.sql import SparkSession

In [170]:
spark = SparkSession.builder.appName("Missing").getOrCreate()

In [171]:
spark

In [172]:
training = spark.read.csv('test1.csv', header = True, inferSchema = True)

In [173]:
training.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|   Subham| 23|         2| 10000|
+---------+---+----------+------+



In [174]:
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [175]:
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [None]:
# Plan: [Age, Experience] -----> new feature -----> independent feature
# (the two columns are combined into one vector column that serves as the model input)

In [149]:
from pyspark.ml.feature import VectorAssembler

In [150]:
# Pack Age and Experience into a single vector column named "Independent Feature";
# this is the features column the regression further down is fitted on.
featureassembler = VectorAssembler(inputCols = ['Age','Experience'], outputCol = "Independent Feature")

In [151]:
output = featureassembler.transform(training)

In [152]:
output

DataFrame[Name: string, Age: int, Experience: int, Salary: int, Independent Feature: vector]

In [153]:
output.show()

+---------+---+----------+------+-------------------+
|     Name|Age|Experience|Salary|Independent Feature|
+---------+---+----------+------+-------------------+
|    Krish| 31|        10| 30000|        [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|         [30.0,8.0]|
|    Sunny| 29|         4| 20000|         [29.0,4.0]|
|     Paul| 24|         3| 20000|         [24.0,3.0]|
|   Harsha| 21|         1| 15000|         [21.0,1.0]|
|   Subham| 23|         2| 10000|         [23.0,2.0]|
+---------+---+----------+------+-------------------+



In [154]:
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Feature']

In [155]:
# Keep only the feature vector and the label column for model training.
finalized_data = output.select(["Independent Feature", "Salary"])

In [156]:
finalized_data.show()

+-------------------+------+
|Independent Feature|Salary|
+-------------------+------+
|        [31.0,10.0]| 30000|
|         [30.0,8.0]| 25000|
|         [29.0,4.0]| 20000|
|         [24.0,3.0]| 20000|
|         [21.0,1.0]| 15000|
|         [23.0,2.0]| 10000|
+-------------------+------+



In [157]:
from pyspark.ml.regression import LinearRegression

In [162]:
## train test split
# Fix the seed so the 75/25 split (and therefore the fitted model and its metrics)
# can be reproduced on re-run; the outputs recorded below came from an unseeded run.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)
regressor = LinearRegression(featuresCol = 'Independent Feature', labelCol = 'Salary')
# fit() returns the trained model; the name `regressor` is rebound to it because
# the following cells read regressor.coefficients and regressor.intercept.
regressor = regressor.fit(train_data)

In [163]:
### coefficients of the fitted model, one per entry of the feature vector (Age, Experience)
regressor.coefficients

DenseVector([500.5382, 1501.6146])

In [164]:
### intercepts
regressor.intercept

-823.4660925731338

In [165]:
### Evaluate the fitted model on the held-out test rows; the returned summary exposes
### the predictions and the error metrics read in the following cells
pred_results = regressor.evaluate(test_data)

In [167]:
pred_results.predictions.show()

+-------------------+------+------------------+
|Independent Feature|Salary|        prediction|
+-------------------+------+------------------+
|         [21.0,1.0]| 15000|11189.451022604904|
+-------------------+------+------------------+



In [168]:
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(3810.548977395096, 14520283.50912681)