In [1]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session for this notebook under the app name 'Practice'.
# NOTE(review): the variable name `pyspark` shadows the package name in this
# namespace; it is kept because the data-loading cell reads the session through it.
pyspark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)
pyspark

In [4]:
# Load the tips CSV: the first row supplies the column names and the column
# types are inferred from the values instead of defaulting to strings.
data = pyspark.read.csv(
    'tips.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Quick look at the first five rows of the raw data.
data.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [8]:
# Print the schema to confirm which column types were inferred when the CSV was read.
data.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [11]:
# Summary statistics (count, mean, stddev, min, max) for the regression target.
total_bill_stats = data.select('total_bill').describe()
total_bill_stats.show()

+-------+------------------+
|summary|        total_bill|
+-------+------------------+
|  count|               244|
|   mean|19.785942622950824|
| stddev| 8.902411954856857|
|    min|              3.07|
|    max|             50.81|
+-------+------------------+



In [12]:
# List the column names of the raw frame (used to pick the columns to index below).
data.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [14]:
from pyspark.ml.feature import StringIndexer

# String-valued columns and the numeric index column each one is mapped to.
categorical_cols = ['sex', 'smoker', 'day', 'time']
indexed_cols = ['sex_idx', 'smoker_idx', 'day_idx', 'time_idx']

si = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)

# Fit the indexer on the full dataset, then append the index columns to it.
indexer_model = si.fit(data)
df = indexer_model.transform(data)

In [15]:
# Preview the indexed frame: the four *_idx columns sit alongside the original string columns.
df.show()

+----------+----+------+------+---+------+----+-------+----------+-------+--------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_idx|smoker_idx|day_idx|time_idx|
+----------+----+------+------+---+------+----+-------+----------+-------+--------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    1.0|       0.0|    1.0|     0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    1.0|       0.0|    1.0|     0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|    1.0|     0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|    1.0|     0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|  

In [16]:
# List the columns after indexing; the feature columns for the assembler are chosen from these.
df.columns

['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'sex_idx',
 'smoker_idx',
 'day_idx',
 'time_idx']

In [19]:
from pyspark.ml.feature import VectorAssembler

# Columns packed, in this order, into the single vector column the regressor reads.
feature_cols = ['tip', 'size', 'sex_idx', 'smoker_idx', 'day_idx', 'time_idx']
vas = VectorAssembler(inputCols=feature_cols, outputCol='Independent Features')

# NOTE(review): this rebinds `df`, so running the cell twice without re-running
# the indexing cell will presumably fail because the output column already
# exists — confirm.
df = vas.transform(df)

In [20]:
# Preview the frame with the assembled 'Independent Features' vector column appended.
df.show()

+----------+----+------+------+---+------+----+-------+----------+-------+--------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_idx|smoker_idx|day_idx|time_idx|Independent Features|
+----------+----+------+------+---+------+----+-------+----------+-------+--------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    1.0|       0.0|    1.0|     0.0|[1.01,2.0,1.0,0.0...|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|[1.66,3.0,0.0,0.0...|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|[3.5,3.0,0.0,0.0,...|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|[3.31,2.0,0.0,0.0...|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    1.0|       0.0|    1.0|     0.0|[3.61,4.0,1.0,0.0...|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|    1.0|     0.0|[4.71,4.0,0.0,0.0...|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|    0.0|

In [21]:
# Keep only what the model needs: the assembled feature vector and the target.
model_columns = ['Independent Features', 'total_bill']
df = df.select(*model_columns)

In [22]:
from pyspark.ml.regression import LinearRegression

# 75/25 train/test split.
# A fixed seed keeps the split the same on every run. Without it the rows in
# each partition change each time, and so do the coefficients and metrics
# computed from them in the cells below.
train, test = df.randomSplit([0.75, 0.25], seed=42)

In [24]:
# Configure a linear regression on the assembled features, then train it.
# `lr` ends up bound to the fitted model, which the following cells query.
lr_estimator = LinearRegression(
    featuresCol='Independent Features',
    labelCol='total_bill',
)
lr = lr_estimator.fit(train)

In [25]:
# Fitted weights of the model: one entry per component of the 'Independent Features' vector.
lr.coefficients

DenseVector([2.9103, 3.3801, -1.9694, 1.8601, 0.1955, -1.0319])

In [26]:
# Intercept term of the fitted model.
lr.intercept

2.4832700236910585

In [27]:
# Evaluate the fitted model on the held-out split.
# Despite its name, `pred` is the evaluation result object: the cells below read
# its `.predictions` frame and its r2 / meanAbsoluteError / meanSquaredError metrics.
pred = lr.evaluate(test)

In [29]:
# Show the test rows with the model's `prediction` column next to the true total_bill.
pred.predictions.show()

+--------------------+----------+------------------+
|Independent Features|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.25,2.0])|     10.07|12.881332398116312|
| (6,[0,1],[2.0,2.0])|     13.37|15.064036287191465|
| (6,[0,1],[2.5,4.0])|     18.35|23.279394772541565|
| (6,[0,1],[3.0,2.0])|      14.0| 17.97430813929167|
| (6,[0,1],[3.0,4.0])|     20.45|24.734530698591666|
|(6,[0,1],[3.18,2.0])|     19.82| 18.49815707266971|
|(6,[0,1],[3.27,2.0])|     17.78|18.760081539358726|
| (6,[0,1],[3.6,3.0])|     24.06| 23.10058253020179|
|(6,[0,1],[3.76,2.0])|     18.24|20.186114746887824|
|[1.0,1.0,1.0,0.0,...|      7.25|6.8042802641430615|
|[1.0,2.0,0.0,1.0,...|      12.6| 14.01388169823838|
|[1.0,2.0,1.0,1.0,...|      5.75|12.631106672983574|
|[1.1,2.0,1.0,1.0,...|      12.9|  12.3355359921502|
|[1.5,2.0,0.0,0.0,...|     12.46|14.195498227184755|
|[1.57,2.0,0.0,0.0...|     15.42|14.008152012802842|
|[1.64,2.0,0.0,1.0...|     15.36|15.8764556835

In [32]:
# Report the held-out metrics from the evaluation summary, one per line.
metrics_template = "R2 score: {}\nMean Absolute Error: {}\nMean Squared error: {}"
metrics_report = metrics_template.format(
    pred.r2,
    pred.meanAbsoluteError,
    pred.meanSquaredError,
)
print(metrics_report)

R2 score: 0.6053632529726868
Mean Absolute Error: 4.2037175231212975
Mean Squared error: 35.9030336098997
