In [4]:
!pip install pyspark



In [5]:
import pyspark

In [6]:
import pandas as pd
# Read the CSV with pandas first, as a baseline to compare with the Spark reader used in the following cells.
pd.read_csv('test1.csv')

Unnamed: 0,Name,Age,Experience
0,Shashank,23,10
1,Mihir,22,8
2,Pranav,23,4


In [7]:
# Re-reads the file only to show the type pandas returns (a pandas DataFrame).
type(pd.read_csv('test1.csv'))

pandas.core.frame.DataFrame

In [8]:
from pyspark.sql import SparkSession

In [9]:
# Get the active SparkSession, or create one named 'Practise' if none exists yet.
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [10]:
spark

In [11]:
# Read with no options: the header line is treated as a data row and the
# columns get default names (_c0, _c1, ...), all typed as string.
df_pyspark = spark.read.csv('test1.csv')

In [12]:
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string]

In [13]:
df_pyspark.show()

+--------+---+----------+
|     _c0|_c1|       _c2|
+--------+---+----------+
|    Name|Age|Experience|
|Shashank| 23|        10|
|   Mihir| 22|         8|
|  Pranav| 23|         4|
+--------+---+----------+



In [14]:
# header='true' makes the first CSV line the column names instead of a data row.
df_pyspark = spark.read.option('header', 'true').csv('test1.csv')

In [15]:
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Shashank| 23|        10|
|   Mihir| 22|         8|
|  Pranav| 23|         4|
+--------+---+----------+



In [16]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [17]:
df_pyspark.head(3)

[Row(Name='Shashank', Age='23', Experience='10'),
 Row(Name='Mihir', Age='22', Experience='8'),
 Row(Name='Pranav', Age='23', Experience='4')]

In [18]:
# Without inferSchema every column, including Age and Experience, is read as a string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



## PART-1

In [19]:
from pyspark.sql import SparkSession

In [20]:
# NOTE(review): a session was already created earlier in this notebook, so getOrCreate()
# will presumably hand back that same session rather than build a new 'Dataframe' one — confirm.
spark = SparkSession.builder.appName('Dataframe').getOrCreate()

In [21]:
spark

### PySpark dataframe and Reading the dataset

In [22]:
# Load test1.csv: the first line is the header and column types are inferred from the data.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv', inferSchema=True)
)
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Shashank| 23|        10|
|   Mihir| 22|         8|
|  Pranav| 23|         4|
+--------+---+----------+



### Checking the datatypes of the columns (schema)

In [23]:
### Check the schema
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [24]:
# header and inferSchema can also be passed directly as keyword arguments to csv().
df_pyspark = spark.read.csv('test1.csv', header=True, inferSchema=True)
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Shashank| 23|        10|
|   Mihir| 22|         8|
|  Pranav| 23|         4|
+--------+---+----------+



In [25]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



### Selecting columns and indexing

In [26]:
df_pyspark.columns

['Name', 'Age', 'Experience']

In [27]:
df_pyspark.select('Name').show()

+--------+
|    Name|
+--------+
|Shashank|
|   Mihir|
|  Pranav|
+--------+



In [28]:
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [29]:
# Without .show() the cell only displays the resulting DataFrame's column/type summary, not its rows.
df_pyspark.select(['Name', 'Experience'])

DataFrame[Name: string, Experience: int]

In [30]:
df_pyspark.select(['Name', 'Experience']).show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|Shashank|        10|
|   Mihir|         8|
|  Pranav|         4|
+--------+----------+



In [31]:
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

### describe(): summary statistics, similar to pandas

In [32]:
# describe() also covers the string column: count/min/max are filled in, mean/stddev are null for Name.
df_pyspark.describe().show()

+-------+--------+------------------+-----------------+
|summary|    Name|               Age|       Experience|
+-------+--------+------------------+-----------------+
|  count|       3|                 3|                3|
|   mean|    null|22.666666666666668|7.333333333333333|
| stddev|    null|0.5773502691896258|3.055050463303893|
|    min|   Mihir|                22|                4|
|    max|Shashank|                23|               10|
+-------+--------+------------------+-----------------+



### Adding columns

In [33]:
# withColumn returns a new DataFrame with the derived column; the result is assigned back to df_pyspark.
df_pyspark = df_pyspark.withColumn('Experience after 2 years', df_pyspark['Experience']+2)
df_pyspark.show()

+--------+---+----------+------------------------+
|    Name|Age|Experience|Experience after 2 years|
+--------+---+----------+------------------------+
|Shashank| 23|        10|                      12|
|   Mihir| 22|         8|                      10|
|  Pranav| 23|         4|                       6|
+--------+---+----------+------------------------+



### Dropping columns

In [34]:
# Remove the derived column again; drop() returns a new DataFrame, so reassign to keep the change.
df_pyspark = df_pyspark.drop('Experience after 2 years')
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Shashank| 23|        10|
|   Mihir| 22|         8|
|  Pranav| 23|         4|
+--------+---+----------+



### Rename the column

In [35]:
# Rename 'Name' to 'Updated Name'; from here on df_pyspark no longer has a 'Name' column.
df_pyspark = df_pyspark.withColumnRenamed('Name', 'Updated Name')
df_pyspark.show()

+------------+---+----------+
|Updated Name|Age|Experience|
+------------+---+----------+
|    Shashank| 23|        10|
|       Mihir| 22|         8|
|      Pranav| 23|         4|
+------------+---+----------+



# PART-2: PySpark Handling Missing Values

In [36]:
from pyspark.sql import SparkSession
# Session handle for the missing-values section; getOrCreate() returns the active session if one exists.
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [37]:
# Load test2.csv, which has null values in each of its columns (see the output below).
df_pyspark = spark.read.csv('test2.csv',header=True, inferSchema=True)
df_pyspark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Krish|  31|        10| 30000|
|  Mihir|  28|        12| 45000|
| Pranav|  25|        15| 50000|
|Devansh|null|      null| 75000|
|   null|  27|        13| 40000|
|   null|  29|      null|  null|
+-------+----+----------+------+



Dropping a column

In [38]:
# The result is only displayed; df_pyspark is not reassigned, so it still contains 'Name'.
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  28|        12| 45000|
|  25|        15| 50000|
|null|      null| 75000|
|  27|        13| 40000|
|  29|      null|  null|
+----+----------+------+



Dropping rows that contain null values

In [39]:
# na.drop() with no arguments removes every row that has at least one null value.
df_pyspark.na.drop().show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Krish| 31|        10| 30000|
| Mihir| 28|        12| 45000|
|Pranav| 25|        15| 50000|
+------+---+----------+------+



In [40]:
# how="all": drop a row only if every value in it is null.
# No row here is entirely null, so nothing is removed.
df_pyspark.na.drop(how="all").show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Krish|  31|        10| 30000|
|  Mihir|  28|        12| 45000|
| Pranav|  25|        15| 50000|
|Devansh|null|      null| 75000|
|   null|  27|        13| 40000|
|   null|  29|      null|  null|
+-------+----+----------+------+



In [41]:
# how="any": drop a row if any of its values is null.
# The result is the same as calling na.drop() with no arguments.
df_pyspark.na.drop(how="any").show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Krish| 31|        10| 30000|
| Mihir| 28|        12| 45000|
|Pranav| 25|        15| 50000|
+------+---+----------+------+



In [42]:
# thresh: keep only rows that have at least this many non-null values
# (here 2, so the row whose only non-null value is Age is removed).
# When thresh is given it takes precedence over `how`, so `how` is not passed here.
df_pyspark.na.drop(thresh=2).show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Krish|  31|        10| 30000|
|  Mihir|  28|        12| 45000|
| Pranav|  25|        15| 50000|
|Devansh|null|      null| 75000|
|   null|  27|        13| 40000|
+-------+----+----------+------+



In [43]:
# subset: only the listed columns are checked for nulls, so a row is dropped
# only when its Experience is null; nulls in other columns are kept.
df_pyspark.na.drop(how="any", subset=['Experience']).show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Krish| 31|        10| 30000|
| Mihir| 28|        12| 45000|
|Pranav| 25|        15| 50000|
|  null| 27|        13| 40000|
+------+---+----------+------+



### Filling missing values

In [47]:
# A string fill value only replaces nulls in string columns, so only Name changes;
# nulls in the numeric columns are left as they are.
df_pyspark.na.fill('Missing Values').show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|         Mihir|  28|        12| 45000|
|        Pranav|  25|        15| 50000|
|       Devansh|null|      null| 75000|
|Missing Values|  27|        13| 40000|
|Missing Values|  29|      null|  null|
+--------------+----+----------+------+



In [53]:
# Fill nulls in the Experience column only, using a numeric placeholder value.
df_pyspark.na.fill(100000000,'Experience').show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Krish|  31|        10| 30000|
|  Mihir|  28|        12| 45000|
| Pranav|  25|        15| 50000|
|Devansh|null| 100000000| 75000|
|   null|  27|        13| 40000|
|   null|  29| 100000000|  null|
+-------+----+----------+------+



In [54]:
# A list of column names fills several columns at once (Experience and Age); Salary and Name are untouched.
df_pyspark.na.fill(1000000000,['Experience','Age']).show()

+-------+----------+----------+------+
|   Name|       Age|Experience|Salary|
+-------+----------+----------+------+
|  Krish|        31|        10| 30000|
|  Mihir|        28|        12| 45000|
| Pranav|        25|        15| 50000|
|Devansh|1000000000|1000000000| 75000|
|   null|        27|        13| 40000|
|   null|        29|1000000000|  null|
+-------+----------+----------+------+



### Imputer: replace null values with the mean, median, or mode

In [55]:
from pyspark.ml.feature import Imputer

# Columns to impute, defined once so the input and output column lists cannot drift apart.
impute_cols = ['Age', 'Experience', 'Salary']

# Each input column gets a companion "<name>_imputed" output column;
# nulls are replaced using the "mean" strategy.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(name) for name in impute_cols],
).setStrategy("mean")

Add the imputed columns to the DataFrame

In [57]:
# Fit the imputer on the data, then transform the same data: the *_imputed columns
# are appended next to the originals, which keep their nulls.
imputer.fit(df_pyspark).transform(df_pyspark).show()

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|  Krish|  31|        10| 30000|         31|                10|         30000|
|  Mihir|  28|        12| 45000|         28|                12|         45000|
| Pranav|  25|        15| 50000|         25|                15|         50000|
|Devansh|null|      null| 75000|         28|                12|         75000|
|   null|  27|        13| 40000|         27|                13|         40000|
|   null|  29|      null|  null|         29|                12|         48000|
+-------+----+----------+------+-----------+------------------+--------------+



# PART-4

In [58]:
from pyspark.sql import SparkSession
# Session handle for the filter section; getOrCreate() returns the active session if one exists.
spark = SparkSession.builder.appName('dataframe').getOrCreate()

In [60]:
# Reload test1.csv with a header and inferred column types for the filter examples.
df_pyspark = spark.read.csv('test1.csv', header = True, inferSchema=True)
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Shashank| 23|        10|
|   Mihir| 22|         8|
|  Pranav| 23|         4|
+--------+---+----------+



## Filter Operations

People whose age is less than 23

In [61]:
# Filter using a SQL-style condition string.
df_pyspark.filter("Age<23").show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Mihir| 22|         8|
+-----+---+----------+



In [63]:
# Filter first, then keep only the Name column of the matching rows.
df_pyspark.filter("Age<23").select("Name").show()

+-----+
| Name|
+-----+
|Mihir|
+-----+



In [65]:
# The same condition written as a Column comparison instead of a string.
df_pyspark.filter(df_pyspark['Age']<23).show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Mihir| 22|         8|
+-----+---+----------+



In [67]:
# Combine conditions with & (logical AND). Each comparison needs its own parentheses
# because & binds more tightly than <= in Python.
df_pyspark.filter((df_pyspark['Age']<=23) & (df_pyspark['Experience']<=8)).show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Mihir| 22|         8|
|Pranav| 23|         4|
+------+---+----------+



In [69]:
# ~ negates the condition: keep rows where Age is NOT less than 23.
df_pyspark.filter(~(df_pyspark['Age']<23)).show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Shashank| 23|        10|
|  Pranav| 23|         4|
+--------+---+----------+



## PySpark GroupBy and Aggregate Functions

In [71]:
from pyspark.sql import SparkSession 
# Session handle for the groupBy/aggregate section; getOrCreate() returns the active session if one exists.
spark = SparkSession.builder.appName("Practise").getOrCreate()

In [77]:
# Load test3.csv: each row is a (Name, Departments, Salary) record; names and departments repeat.
df_pyspark = spark.read.csv('test3.csv', header=True, inferSchema=True)
df_pyspark.show()

+--------+------------+------+
|    Name| Departments|Salary|
+--------+------------+------+
|   Krish|Data Science|100000|
|Shashank|          ML|200000|
|   Mihir|         IOT|240000|
|  Pranav|     Andriod|300000|
|   Mihir|Data Science|450000|
|   Krish|          ML|670000|
|  Pranav|         IOT|240000|
|Shashank|     Andriod|300000|
+--------+------------+------+



In [78]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- Salary: integer (nullable = true)



### GroupBy

Total salary for each individual

In [79]:
# sum() is called without column names; Salary is the only numeric column here, so the result is sum(Salary).
df_pyspark.groupBy('Name').sum().show()

+--------+-----------+
|    Name|sum(Salary)|
+--------+-----------+
|   Mihir|     690000|
|  Pranav|     540000|
|   Krish|     770000|
|Shashank|     500000|
+--------+-----------+



In [85]:
df_pyspark.groupBy('Name').max().show()

+--------+-----------+
|    Name|max(Salary)|
+--------+-----------+
|   Mihir|     450000|
|  Pranav|     300000|
|   Krish|     670000|
|Shashank|     300000|
+--------+-----------+



Group by department to get the total salary per department

In [80]:
df_pyspark.groupBy('Departments').sum().show()

+------------+-----------+
| Departments|sum(Salary)|
+------------+-----------+
|         IOT|     480000|
|          ML|     870000|
|     Andriod|     600000|
|Data Science|     550000|
+------------+-----------+



In [84]:
df_pyspark.groupBy('Departments').max().show()

+------------+-----------+
| Departments|max(Salary)|
+------------+-----------+
|         IOT|     240000|
|          ML|     670000|
|     Andriod|     300000|
|Data Science|     450000|
+------------+-----------+



In [81]:
# mean() is reported under the column name avg(Salary) in the result.
df_pyspark.groupBy('Departments').mean().show()

+------------+-----------+
| Departments|avg(Salary)|
+------------+-----------+
|         IOT|   240000.0|
|          ML|   435000.0|
|     Andriod|   300000.0|
|Data Science|   275000.0|
+------------+-----------+



In [82]:
# count() gives the number of rows in each department.
df_pyspark.groupBy('Departments').count().show()

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|          ML|    2|
|     Andriod|    2|
|Data Science|    2|
+------------+-----+



In [83]:
# agg() directly on the DataFrame (no groupBy) aggregates over all rows: the overall salary total.
df_pyspark.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|    2500000|
+-----------+



## Examples of PysparkML

In [88]:
from pyspark.sql import SparkSession
# Session handle for the ML examples; getOrCreate() returns the active session if one exists.
spark = SparkSession.builder.appName('Missing').getOrCreate()

In [89]:
# Load the training data: Age and Experience will be the features, Salary the value to predict.
training = spark.read.csv('mldemo1.csv', header=True, inferSchema=True)
training.show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 30000|
|Shashank| 30|         8| 25000|
|   Mihir| 29|         4| 20000|
| Devansh| 24|         3| 20000|
|  Pranav| 21|         1| 15000|
| Shubham| 23|         2| 18000|
+--------+---+----------+------+



In [90]:
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [91]:
training.columns

['Name', 'Age', 'Experience', 'Salary']

Combine [Age, Experience] into a single vector column that serves as the independent features.

In [96]:
from pyspark.ml.feature import VectorAssembler
# Packs the Age and Experience columns into one vector column named "Independent Features",
# which is later passed to the regressor as its featuresCol.
featureassembler = VectorAssembler(inputCols = ["Age", "Experience"], outputCol = "Independent Features")

In [97]:
output = featureassembler.transform(training)

In [98]:
output.show()

+--------+---+----------+------+--------------------+
|    Name|Age|Experience|Salary|Independent Features|
+--------+---+----------+------+--------------------+
|   Krish| 31|        10| 30000|         [31.0,10.0]|
|Shashank| 30|         8| 25000|          [30.0,8.0]|
|   Mihir| 29|         4| 20000|          [29.0,4.0]|
| Devansh| 24|         3| 20000|          [24.0,3.0]|
|  Pranav| 21|         1| 15000|          [21.0,1.0]|
| Shubham| 23|         2| 18000|          [23.0,2.0]|
+--------+---+----------+------+--------------------+



In [99]:
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [101]:
# Keep only the feature vector and the label (Salary); the other columns are not used for training.
finalized_data = output.select("Independent Features", "Salary")
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



Train/test split

In [102]:
# Split roughly 75% / 25% into train and test sets.
# A fixed seed keeps the split, and therefore the coefficients and metrics that follow,
# reproducible when the notebook is re-run.
# NOTE(review): 42 is an arbitrary seed; with only six rows the split sizes can still be uneven.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

Regression Model

In [103]:
from pyspark.ml.regression import LinearRegression

# Keep the unfitted estimator and the fitted model under separate names instead of
# overwriting one with the other; `regressor` remains the fitted model used by the following cells.
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')
regressor = lr_estimator.fit(train_data)

Coefficients

In [104]:
regressor.coefficients

DenseVector([-250.0, 1750.0])

Intercept

In [105]:
regressor.intercept

20250.000000000553

Prediction

In [106]:
# Evaluate the fitted model on the held-out rows; .predictions is the test data
# with a `prediction` column added next to the true Salary.
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()



+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [21.0,1.0]| 15000| 16750.00000000008|
|          [24.0,3.0]| 20000|19500.000000000044|
|          [30.0,8.0]| 25000|26749.999999999993|
+--------------------+------+------------------+



In [108]:
# Mean absolute error and mean squared error of the predictions on the test split.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(1333.333333333343, 2125000.0000000703)

## Multiple Linear Regression

In [109]:
from pyspark.sql import SparkSession
# No appName is set here; getOrCreate() returns the active session if one exists.
spark = SparkSession.builder.getOrCreate()

In [111]:
# Load mldemo2.csv: numeric columns (total_bill, tip, size) plus string categoricals (sex, smoker, day, time).
df = spark.read.csv('mldemo2.csv', header=True, inferSchema=True)
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [112]:
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [113]:
df.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

### Handling categorical features

In [115]:
from pyspark.ml.feature import StringIndexer
# Map each categorical string column to a numeric index column (e.g. sex -> sex_indexed)
# so it can be used as a model feature; the original string columns are kept.
indexer = StringIndexer(inputCols=["sex","smoker","day","time"], outputCols = ["sex_indexed", "smoker_indexed","day_indexed","time_indexed"])
df_r = indexer.fit(df).transform(df)
df_r.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|


In [120]:
from pyspark.ml.feature import VectorAssembler
# Assemble the numeric columns and the indexed categoricals into one feature vector.
# total_bill is deliberately not an input: it is the value the model will predict.
featureAssembler = VectorAssembler(inputCols=['tip','size','sex_indexed','smoker_indexed','day_indexed','time_indexed'],
               outputCol = "Independent Features")
output = featureAssembler.transform(df_r)
output.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|Independent Features|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|[1.01,2.0,1.0,0.0...|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|[1.66,3.0,0.0,0.0...|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|[3.5,3.0,0.0,0.0,...|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|[3.31,2.0,0.0,0.0...|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|[3.61,4.0,1.0,0.0...|
|     25.29|4.71|  Male|    No|S

In [124]:
output.select("Independent Features").show()

+--------------------+
|Independent Features|
+--------------------+
|[1.01,2.0,1.0,0.0...|
|[1.66,3.0,0.0,0.0...|
|[3.5,3.0,0.0,0.0,...|
|[3.31,2.0,0.0,0.0...|
|[3.61,4.0,1.0,0.0...|
|[4.71,4.0,0.0,0.0...|
|[2.0,2.0,0.0,0.0,...|
|[3.12,4.0,0.0,0.0...|
|[1.96,2.0,0.0,0.0...|
|[3.23,2.0,0.0,0.0...|
|[1.71,2.0,0.0,0.0...|
|[5.0,4.0,1.0,0.0,...|
|[1.57,2.0,0.0,0.0...|
|[3.0,4.0,0.0,0.0,...|
|[3.02,2.0,1.0,0.0...|
|[3.92,2.0,0.0,0.0...|
|[1.67,3.0,1.0,0.0...|
|[3.71,3.0,0.0,0.0...|
|[3.5,3.0,1.0,0.0,...|
|(6,[0,1],[3.35,3.0])|
+--------------------+
only showing top 20 rows



In [126]:
# Keep only the feature vector and the label (total_bill) for training.
finalizedData = output.select("Independent Features","total_bill")
finalizedData.show()

+--------------------+----------+
|Independent Features|total_bill|
+--------------------+----------+
|[1.01,2.0,1.0,0.0...|     16.99|
|[1.66,3.0,0.0,0.0...|     10.34|
|[3.5,3.0,0.0,0.0,...|     21.01|
|[3.31,2.0,0.0,0.0...|     23.68|
|[3.61,4.0,1.0,0.0...|     24.59|
|[4.71,4.0,0.0,0.0...|     25.29|
|[2.0,2.0,0.0,0.0,...|      8.77|
|[3.12,4.0,0.0,0.0...|     26.88|
|[1.96,2.0,0.0,0.0...|     15.04|
|[3.23,2.0,0.0,0.0...|     14.78|
|[1.71,2.0,0.0,0.0...|     10.27|
|[5.0,4.0,1.0,0.0,...|     35.26|
|[1.57,2.0,0.0,0.0...|     15.42|
|[3.0,4.0,0.0,0.0,...|     18.43|
|[3.02,2.0,1.0,0.0...|     14.83|
|[3.92,2.0,0.0,0.0...|     21.58|
|[1.67,3.0,1.0,0.0...|     10.33|
|[3.71,3.0,0.0,0.0...|     16.29|
|[3.5,3.0,1.0,0.0,...|     16.97|
|(6,[0,1],[3.35,3.0])|     20.65|
+--------------------+----------+
only showing top 20 rows



In [127]:
from pyspark.ml.regression import LinearRegression

# Fixed seed so the 75/25 split, and the coefficients and metrics derived from it,
# can be reproduced on a re-run.
# NOTE(review): 42 is an arbitrary seed value.
train_data, test_data = finalizedData.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator and the fitted model under separate names;
# `regressor` remains the fitted model that the following cells use.
lr_estimator = LinearRegression(featuresCol="Independent Features", labelCol="total_bill")
regressor = lr_estimator.fit(train_data)

In [128]:
regressor.coefficients

DenseVector([2.9429, 3.4637, -1.2533, 0.8428, 0.0573, -1.6568])

In [129]:
regressor.intercept

2.403276150548062

In [131]:
# Evaluate the fitted model on the held-out rows; .predictions adds a `prediction`
# column next to the true total_bill.
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()



+--------------------+----------+------------------+
|Independent Features|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.25,2.0])|     10.51|13.009362154457783|
| (6,[0,1],[2.0,2.0])|     12.69|15.216521403313156|
|(6,[0,1],[2.31,3.0])|     18.69| 19.59255752074876|
|(6,[0,1],[2.64,3.0])|     17.59|20.563707590245123|
| (6,[0,1],[3.0,4.0])|     20.45|25.086887657604418|
|(6,[0,1],[3.15,3.0])|     20.08|22.064575879466776|
|(6,[0,1],[3.35,3.0])|     20.65|22.653151679161542|
|(6,[0,1],[3.39,2.0])|     11.61|19.307123211191783|
|(6,[0,1],[3.76,2.0])|     18.24|  20.3959884406271|
|(6,[0,1],[4.08,2.0])|     17.92| 21.33770972013873|
| (6,[0,1],[4.3,2.0])|      21.7|21.985143099802972|
| (6,[0,1],[5.0,3.0])|     31.27|27.508902026643366|
|[1.0,1.0,1.0,0.0,...|      7.25| 7.556587529606366|
|[1.0,1.0,1.0,1.0,...|      3.07| 8.399425326309618|
|[1.17,2.0,0.0,1.0...|     32.83|13.616769631283127|
|[1.32,2.0,0.0,0.0...|      9.68|13.2726330648

In [132]:
# Mean absolute error and mean squared error of the predictions on the test split.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(4.934769318637713, 48.24977436965849)