# Importing the Libraries and Loading the Data

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse, if one already exists) the Spark session that every
# later cell works through.
spark = (
    SparkSession.builder
    .appName('ML_Builder')
    .getOrCreate()
)

In [5]:
# Load the food-demand CSV: the first row supplies the column names and
# the column types are inferred from the data rather than read as strings.
df_pyspark = spark.read.csv(
    'Food demand.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the first rows of the loaded data as a text table.
df_pyspark.show()

+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|     id|week|center_id|meal_id|checkout_price|base_price|emailer_for_promotion|homepage_featured|num_orders|
+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+
|1000000|   3|      157|   2760|        233.83|    231.83|                    0|                0|       149|
|1000001| 100|      104|   2956|        486.03|    583.03|                    0|                0|       161|
|1000002| 143|       75|   1971|        328.86|    327.86|                    0|                0|       149|
|1000003|  41|       24|   2539|        145.53|    145.53|                    0|                0|       540|
|1000004|  45|       83|   2539|         95.06|    120.34|                    0|                0|       271|
|1000005| 101|       65|   1754|        291.03|    290.03|                    0|                0|       541|
|1000006| 

In [7]:
# Print each column's name, inferred type and nullability to confirm that
# schema inference produced numeric columns.
df_pyspark.printSchema()

root
 |-- id: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- center_id: integer (nullable = true)
 |-- meal_id: integer (nullable = true)
 |-- checkout_price: double (nullable = true)
 |-- base_price: double (nullable = true)
 |-- emailer_for_promotion: integer (nullable = true)
 |-- homepage_featured: integer (nullable = true)
 |-- num_orders: integer (nullable = true)



# Grouping Independent Features

In [8]:
# List the column names so the predictor columns can be picked out below.
df_pyspark.columns

['id',
 'week',
 'center_id',
 'meal_id',
 'checkout_price',
 'base_price',
 'emailer_for_promotion',
 'homepage_featured',
 'num_orders']

In [9]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns: every column of the dataset except the `id` row key
# and `num_orders`, which is the value being predicted.
ind_features = [
    'week', 'center_id', 'meal_id',
    'checkout_price', 'base_price',
    'emailer_for_promotion', 'homepage_featured',
]

# Packs the predictor columns into a single vector column, which is the
# input format the regression estimator is given later.
feature_assembler = VectorAssembler(
    inputCols=ind_features,
    outputCol="Independent Features",
)

In [10]:
# Apply the assembler: the result keeps all original columns and gains the
# "Independent Features" vector column.
output = feature_assembler.transform(df_pyspark)

In [12]:
# Preview the assembled frame to check the new vector column.
output.show()

+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+--------------------+
|     id|week|center_id|meal_id|checkout_price|base_price|emailer_for_promotion|homepage_featured|num_orders|Independent Features|
+-------+----+---------+-------+--------------+----------+---------------------+-----------------+----------+--------------------+
|1000000|   3|      157|   2760|        233.83|    231.83|                    0|                0|       149|[3.0,157.0,2760.0...|
|1000001| 100|      104|   2956|        486.03|    583.03|                    0|                0|       161|[100.0,104.0,2956...|
|1000002| 143|       75|   1971|        328.86|    327.86|                    0|                0|       149|[143.0,75.0,1971....|
|1000003|  41|       24|   2539|        145.53|    145.53|                    0|                0|       540|[41.0,24.0,2539.0...|
|1000004|  45|       83|   2539|         95.06|    120.34|                    0|   

In [13]:
# Keep only what the model needs: the feature vector and the target.
finalized = output.select(
    "Independent Features",
    "num_orders",
)

In [14]:
# Preview the modelling frame (feature vector + target); prints the top 20 rows.
finalized.show()

+--------------------+----------+
|Independent Features|num_orders|
+--------------------+----------+
|[3.0,157.0,2760.0...|       149|
|[100.0,104.0,2956...|       161|
|[143.0,75.0,1971....|       149|
|[41.0,24.0,2539.0...|       540|
|[45.0,83.0,2539.0...|       271|
|[101.0,65.0,1754....|       541|
|[107.0,153.0,2126...|        53|
|[11.0,50.0,1062.0...|       432|
|[114.0,57.0,1962....|       486|
|[68.0,36.0,1216.0...|        28|
|[10.0,76.0,2760.0...|       108|
|[18.0,104.0,2867....|        53|
|[33.0,36.0,2494.0...|        94|
|[140.0,88.0,1571....|        81|
|[105.0,81.0,2139....|        55|
|[112.0,43.0,2290....|      1863|
|[131.0,77.0,2290....|       418|
|[9.0,80.0,2826.0,...|       771|
|[101.0,92.0,1445....|        27|
|[18.0,52.0,1311.0...|       742|
+--------------------+----------+
only showing top 20 rows



# Implementing the Machine Learning Model

In [17]:
from pyspark.ml.regression import LinearRegression

# 75/25 train/test split. The seed is fixed so that the split -- and with it
# the fitted coefficients and the evaluation metrics -- is the same on every
# Restart & Run All instead of changing from run to run.
train, test = finalized.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `model` only ever
# refers to the fitted LinearRegressionModel used by the following cells.
lr = LinearRegression(featuresCol='Independent Features', labelCol='num_orders')
model = lr.fit(train)

In [18]:
# Fitted weights, one per assembled feature (seven here).
# NOTE(review): presumably in the same order as `ind_features` -- confirm.
model.coefficients

DenseVector([-0.0407, -0.3366, -0.0022, -0.828, 0.2239, 268.2066, 171.4212])

In [19]:
# Fitted intercept (constant term) of the linear model.
model.intercept

448.27303587090074

# Making Predictions

In [20]:
# Score the held-out split. Despite its name, `preds` is an evaluation
# summary object rather than a table of predictions: later cells read
# `.predictions`, `.meanAbsoluteError` and `.meanSquaredError` from it.
preds = model.evaluate(test)

In [22]:
# Show test rows with the model's `prediction` next to the actual `num_orders`.
preds.predictions.show()

+--------------------+----------+------------------+
|Independent Features|num_orders|        prediction|
+--------------------+----------+------------------+
|[1.0,36.0,1971.0,...|       337|   561.61074441895|
|[1.0,52.0,1878.0,...|       121|257.67563830052256|
|[1.0,61.0,1216.0,...|        95|379.62621279568765|
|[2.0,108.0,1543.0...|        53|115.65270831442422|
|[2.0,132.0,1803.0...|       338|  290.521969122043|
|[3.0,43.0,2306.0,...|       404|472.20980007693913|
|[3.0,61.0,1558.0,...|       134| 54.99913427821269|
|[3.0,106.0,1754.0...|       377|239.24735429315857|
|[3.0,161.0,1754.0...|       136| 220.1327976463236|
|[4.0,23.0,2704.0,...|        94|296.57997080062745|
|[4.0,41.0,2640.0,...|        54| 280.7528977503619|
|[4.0,80.0,1878.0,...|        95| 279.4515028559203|
|[4.0,104.0,1885.0...|      1026|498.28108396039823|
|[4.0,139.0,1248.0...|        28|355.06482945893003|
|[5.0,89.0,2704.0,...|        94| 236.3175399408886|
|[5.0,99.0,2290.0,...|     12137| 784.44536953

# Evaluating the model

In [23]:
# Mean absolute error on the test split, in units of orders.
preds.meanAbsoluteError

200.50416646600706

In [24]:
# Mean squared error on the test split; squaring makes large misses weigh
# far more heavily than they do in the mean absolute error.
preds.meanSquaredError

306988.70949752745