We are using spark to try a hands on multi-linear regression with the tips dataset. We will try to do some exploratory data analytics and fit our model

In [0]:
import pandas as pd
from pyspark.sql import SparkSession



In [0]:
spark = SparkSession.builder.appName('ml').getOrCreate()

In [0]:
file_location = "/FileStore/tables/tips.csv"
file_type = "csv"

df = spark.read.csv(file_location, header = True, inferSchema = True)

In [0]:
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [0]:
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [0]:
df.columns

Out[7]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [0]:
### Handling categorical features

In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
indexer = StringIndexer(inputCol="sex", outputCol='sex_indexed')

In [0]:
df_r = indexer.fit(df).transform(df)

In [0]:
df_r.show()

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|
+----------+----+------+------+---+------+----+-----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|        0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|        0.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|        0.0|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|        0.0|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|        1.0|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|        0.0|
|     18.43| 3.0|  Male|    No|Sun|Dinne

In [0]:
indexer = StringIndexer(inputCol="smoker", outputCol='smoker_indexed')
df_r = indexer.fit(df_r).transform(df_r)

In [0]:
df_r.show()

+----------+----+------+------+---+------+----+-----------+--------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|
|     10.27|1.71|  Male| 

In [0]:
indexer = StringIndexer(inputCols=["day","time"], outputCols=['day_indexed','time_indexed'])
df_r = indexer.fit(df_r).transform(df_r)

In [0]:
df_r.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|


In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
df_r.columns

Out[20]: ['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'sex_indexed',
 'smoker_indexed',
 'day_indexed',
 'time_indexed']

In [0]:
featureAssembler = VectorAssembler(inputCols=['tip','sex_indexed','smoker_indexed','size','day_indexed','time_indexed','total_bill'],outputCol='Independent_Features')

In [0]:
output = featureAssembler.transform(df_r)

In [0]:
output.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|Independent_Features|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|[1.01,1.0,0.0,2.0...|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|[1.66,0.0,0.0,3.0...|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|[3.5,0.0,0.0,3.0,...|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|[3.31,0.0,0.0,2.0...|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|[3.61,1.0,0.0,4.0...|
|     25.29|4.71|  Male|    No|S

In [0]:
finalized_data = output.select('Independent_Features','total_bill')

In [0]:
from pyspark.ml.regression import LinearRegression
##train test split
train_data,test_data=finalized_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='Independent_Features', labelCol='total_bill')
regressor=regressor.fit(train_data)

In [0]:
regressor.coefficients

Out[34]: DenseVector([0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 1.0])

In [0]:
regressor.intercept

Out[35]: 0.0

In [0]:
pred = regressor.evaluate(test_data)

In [0]:
pred.predictions.show()

+--------------------+----------+------------------+
|Independent_Features|total_bill|        prediction|
+--------------------+----------+------------------+
|(7,[0,3,6],[1.25,...|     10.07|             10.07|
|(7,[0,3,6],[1.25,...|     10.51|10.509999999999998|
|(7,[0,3,6],[2.5,4...|     18.35|18.350000000000005|
|(7,[0,3,6],[3.35,...|     20.65|20.650000000000006|
|(7,[0,3,6],[4.3,2...|      21.7|21.700000000000017|
|[1.0,1.0,0.0,1.0,...|      7.25| 7.250000000000001|
|[1.0,1.0,1.0,2.0,...|      5.75| 5.749999999999989|
|[1.01,1.0,0.0,2.0...|     16.99| 16.98999999999997|
|[1.25,1.0,0.0,2.0...|      8.51| 8.510000000000002|
|[1.5,1.0,0.0,2.0,...|      8.35| 8.350000000000007|
|[1.76,0.0,1.0,2.0...|     11.24| 11.24000000000001|
|[1.8,1.0,0.0,2.0,...|     12.43|             12.43|
|[2.0,0.0,0.0,4.0,...|     16.49| 16.48999999999999|
|[2.0,0.0,1.0,2.0,...|     14.48|14.479999999999999|
|[2.0,0.0,1.0,2.0,...|     13.51|13.510000000000007|
|[2.0,0.0,1.0,2.0,...|      16.0|             

In [0]:
### Well predition was good

In [0]:
pred.r2

Out[39]: 1.0

In [0]:
pred.meanAbsoluteError, pred.meanSquaredError

Out[40]: (1.006116865594732e-14, 1.677687298923309e-28)

In [0]:
### With this we end a basic ML in PySpark