In [0]:
# we are going to use the Tips data set to create a machine learning model which would further the tip based on several features. This problem stateme is a regression problem statement as the target variable is a continuous variable.
# we will be applying linear regression algorithm to train the model.

In [0]:
# initializing a spark session
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Practice').getOrCreate()

In [0]:
file_location="/FileStore/tables/tips.csv"

In [0]:
df=spark.read.csv(file_location,header=True,inferSchema=True)

In [0]:
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [0]:
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [0]:
df.columns

Out[8]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [0]:
#Handling the categorical features in the dataset
from pyspark.ml.feature import StringIndexer #for ordinal encoding


In [0]:
# for ordinal encoding
indexer=StringIndexer(inputCols=['sex','smoker','day','time'],outputCols=['sex_indexed','smoker_indexed','day_indexed','time_indexed'])
df_r=indexer.fit(df).transform(df)

In [0]:
display(df_r)

total_bill,tip,sex,smoker,day,time,size,sex_indexed,smoker_indexed,day_indexed,time_indexed
16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0
10.34,1.66,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
21.01,3.5,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
23.68,3.31,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0
25.29,4.71,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
8.77,2.0,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
26.88,3.12,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
15.04,1.96,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
14.78,3.23,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0


In [0]:
# grouping independent features together
from pyspark.ml.feature import VectorAssembler
assembler=VectorAssembler(inputCols=['total_bill','size','sex_indexed','smoker_indexed','day_indexed','time_indexed'],outputCol='Independent Feature')
independent_features=assembler.transform(df_r)

In [0]:
display(independent_features)

total_bill,tip,sex,smoker,day,time,size,sex_indexed,smoker_indexed,day_indexed,time_indexed,Independent Feature
16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(16.99, 2.0, 1.0, 0.0, 1.0, 0.0))"
10.34,1.66,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(10.34, 3.0, 0.0, 0.0, 1.0, 0.0))"
21.01,3.5,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(21.01, 3.0, 0.0, 0.0, 1.0, 0.0))"
23.68,3.31,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(23.68, 2.0, 0.0, 0.0, 1.0, 0.0))"
24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(24.59, 4.0, 1.0, 0.0, 1.0, 0.0))"
25.29,4.71,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(25.29, 4.0, 0.0, 0.0, 1.0, 0.0))"
8.77,2.0,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(8.77, 2.0, 0.0, 0.0, 1.0, 0.0))"
26.88,3.12,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(26.88, 4.0, 0.0, 0.0, 1.0, 0.0))"
15.04,1.96,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(15.04, 2.0, 0.0, 0.0, 1.0, 0.0))"
14.78,3.23,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(14.78, 2.0, 0.0, 0.0, 1.0, 0.0))"


In [0]:
independent_features.select('Independent Feature').show() # independent Features

+--------------------+
| Independent Feature|
+--------------------+
|[16.99,2.0,1.0,0....|
|[10.34,3.0,0.0,0....|
|[21.01,3.0,0.0,0....|
|[23.68,2.0,0.0,0....|
|[24.59,4.0,1.0,0....|
|[25.29,4.0,0.0,0....|
|[8.77,2.0,0.0,0.0...|
|[26.88,4.0,0.0,0....|
|[15.04,2.0,0.0,0....|
|[14.78,2.0,0.0,0....|
|[10.27,2.0,0.0,0....|
|[35.26,4.0,1.0,0....|
|[15.42,2.0,0.0,0....|
|[18.43,4.0,0.0,0....|
|[14.83,2.0,1.0,0....|
|[21.58,2.0,0.0,0....|
|[10.33,3.0,1.0,0....|
|[16.29,3.0,0.0,0....|
|[16.97,3.0,1.0,0....|
|(6,[0,1],[20.65,3...|
+--------------------+
only showing top 20 rows



In [0]:
Finalised_data=independent_features.select('Independent Feature','tip')

In [0]:
Finalised_data.show() #Independent and dependent variables

+--------------------+----+
| Independent Feature| tip|
+--------------------+----+
|[16.99,2.0,1.0,0....|1.01|
|[10.34,3.0,0.0,0....|1.66|
|[21.01,3.0,0.0,0....| 3.5|
|[23.68,2.0,0.0,0....|3.31|
|[24.59,4.0,1.0,0....|3.61|
|[25.29,4.0,0.0,0....|4.71|
|[8.77,2.0,0.0,0.0...| 2.0|
|[26.88,4.0,0.0,0....|3.12|
|[15.04,2.0,0.0,0....|1.96|
|[14.78,2.0,0.0,0....|3.23|
|[10.27,2.0,0.0,0....|1.71|
|[35.26,4.0,1.0,0....| 5.0|
|[15.42,2.0,0.0,0....|1.57|
|[18.43,4.0,0.0,0....| 3.0|
|[14.83,2.0,1.0,0....|3.02|
|[21.58,2.0,0.0,0....|3.92|
|[10.33,3.0,1.0,0....|1.67|
|[16.29,3.0,0.0,0....|3.71|
|[16.97,3.0,1.0,0....| 3.5|
|(6,[0,1],[20.65,3...|3.35|
+--------------------+----+
only showing top 20 rows



In [0]:
#doing train-test split and applying Linear Regression Algorithm
from pyspark.ml.regression import LinearRegression
train_data,test_data=Finalised_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='Independent Feature',labelCol='tip')
model=regressor.fit(train_data)

In [0]:
#finding the coefficients
model.coefficients


Out[19]: DenseVector([0.0773, 0.2306, -0.0851, -0.0997, 0.1237, -0.2121])

In [0]:
# finding the intercept value
model.intercept

Out[20]: 0.8225217312516332

In [0]:
#predictions
predictions=model.evaluate(test_data)

In [0]:
#final comaprison
predictions.predictions.show()

+--------------------+----+------------------+
| Independent Feature| tip|        prediction|
+--------------------+----+------------------+
|(6,[0,1],[10.07,2...|1.25|2.0617681960093726|
|(6,[0,1],[10.77,2...|1.47| 2.115858467765755|
|(6,[0,1],[12.02,2...|1.97| 2.212448238759295|
|(6,[0,1],[12.69,2...| 2.0| 2.264220356011833|
|(6,[0,1],[18.35,4...| 2.5| 3.162698108704362|
|(6,[0,1],[31.27,3...| 5.0|3.9304903468767036|
|(6,[0,1],[39.42,4...|7.58| 4.790815288571476|
|[3.07,1.0,1.0,1.0...| 1.0|1.1054480113420584|
|[5.75,2.0,1.0,1.0...| 1.0|1.9141343821687338|
|[7.51,2.0,0.0,0.0...| 2.0|1.8991861610200589|
|[7.74,2.0,0.0,1.0...|1.44|1.7819947886981655|
|[8.77,2.0,0.0,0.0...| 2.0|2.0849942565093027|
|[10.27,2.0,0.0,0....|1.71|2.2009019817015507|
|[10.33,2.0,1.0,0....| 2.0|2.0319649262741346|
|[11.69,2.0,0.0,0....|2.31|2.2221823552224573|
|[12.16,2.0,0.0,1....| 2.2|2.2824494572699923|
|[12.46,2.0,0.0,0....| 1.5| 2.617486105148657|
|[13.39,2.0,1.0,0....|2.61| 2.356862291994076|
|[13.42,2.0,0

In [0]:
#model Evaluation--> finding the R2 value and MAE value

predictions.r2,predictions.meanAbsoluteError

Out[29]: (0.5127273249851549, 0.7781916762284858)

Out[28]: 0.7781916762284858

In [0]:
#Saving a model:
model.save("LR.model")

In [0]:
#loading the model
from pyspark.ml.regression import LinearRegressionModel
loaded_model=LinearRegressionModel.load('/LR.model')