# Import Libraries and Data

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from pyspark.context import SparkContext
sc = spark.sparkContext

In [2]:
train = spark.read.csv("train.csv",inferSchema=True, header=True)
meal_info = spark.read.csv("meal_info.csv",inferSchema=True, header=True)
fulfillment_center_info = spark.read.csv("fulfilment_center_info.csv",inferSchema=True, header=True)

In [3]:
train.columns

['id',
 'week',
 'center_id',
 'meal_id',
 'checkout_price',
 'base_price',
 'emailer_for_promotion',
 'homepage_featured',
 'num_orders']

In [4]:
meal_info.columns

['meal_id', 'category', 'cuisine']

In [5]:
fulfillment_center_info.columns

['center_id', 'city_code', 'region_code', 'center_type', 'op_area']

In [6]:
meal_info.createOrReplaceTempView("meal_info_view")
fulfillment_center_info.createOrReplaceTempView("center_info_view")
train.createOrReplaceTempView("train_view")

# Question 1

### What are the distinct number of meal categories and cuisines?

In [7]:
meal_info.printSchema()

root
 |-- meal_id: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- cuisine: string (nullable = true)



In [8]:
spark.sql("select count(distinct category) from meal_info_view")

count(DISTINCT category)
14


In [9]:
spark.sql("select count(distinct cuisine) from meal_info_view")

count(DISTINCT cuisine)
4


# Question 2

### Which center_id has the highest num_orders?

In [10]:
spark.sql(
    """
    select center_id, sum(num_orders) from train_view
    group by center_id
    order by sum(num_orders) desc
    limit 1
    """)

center_id,sum(num_orders)
13,1742220


The expanded list:

In [11]:
spark.sql(
    """
    select center_id, sum(num_orders) from train_view
    group by center_id
    order by sum(num_orders) desc
    """)

center_id,sum(num_orders)
13,1742220
43,1557942
10,1346533
137,1287312
52,1188327
174,1158331
67,1104886
11,1088253
27,955839
104,951334


# Question 3

### What is the top selling cuisine at the center_id that had the highest num_orders?

In [12]:
df = spark.sql(
            """
            select center_id, num_orders, train_view.meal_id from train_view where center_id = 13
            """)

# the below would not work with the where clause
#            left join meal_info_view on train_view.meal_id = meal_info_view.meal_id
#            """)

df2 = spark.sql(
            """
            select meal_id, cuisine from meal_info_view
            """)

df3 = df.join(df2, on='meal_id', how='left')

In [13]:
df3.select('cuisine', 'num_orders').groupBy('Cuisine').sum('num_orders')

Cuisine,sum(num_orders)
Thai,654724
Indian,377658
Continental,169600
Italian,540238


The thai cuisine sold the most at the center with the highest number of orders (13).

# Question 4

### What is the average op_area per center_type?

In [14]:
fulfillment_center_info.columns

['center_id', 'city_code', 'region_code', 'center_type', 'op_area']

In [15]:
fulfillment_center_info.select('center_id', 'op_area').groupBy('center_id').avg('op_area')

center_id,avg(op_area)
137,4.4
65,4.8
53,3.8
108,4.4
34,4.2
101,2.8
126,2.7
81,4.0
76,3.0
27,4.5


# Question 5

### Which center_type had the highest revenue? (Revenue is total sum of checkout_price*num_orders)

In [17]:
df = train.join(fulfillment_center_info, on='center_id', how='left')

In [18]:
from pyspark.sql.functions import col
df2 = df.withColumn('revenue', col('num_orders') * col('checkout_price'))

In [19]:
df2.select('center_type', 'revenue').groupBy('center_type').sum('revenue').sort('sum(revenue)', ascending=False)

center_type,sum(revenue)
TYPE_A,7276203201.869873
TYPE_B,3172968529.4000454
TYPE_C,2251833991.370026


Center type A had the highest revenue.

# Question 6

### Which is the top ordered cuisine in terms of num_orders?

In [22]:
df = train.join(meal_info, on='meal_id', how='left')

In [24]:
df.select('cuisine', 'num_orders').groupBy('cuisine').sum('num_orders').sort('sum(num_orders)', ascending=False)

cuisine,sum(num_orders)
Italian,17166334
Thai,14058488
Indian,10979934
Continental,6766188


Italian had the greatest number of orders.

# Question 7

### What are the num_orders per cuisine per week?

In [34]:
df.select('cuisine', 'num_orders', 'week').groupBy('week', 'cuisine').sum('num_orders').sort('week', 'cuisine')

week,cuisine,sum(num_orders)
1,Continental,146020
1,Indian,175317
1,Italian,228836
1,Thai,242088
2,Continental,133570
2,Indian,177109
2,Italian,202627
2,Thai,273778
3,Continental,97977
3,Indian,150148


# Question 8

### Which center_id gave the highest number of discounts?
(Discount is considered when checkout_price is less than base_price)

In [36]:
discount_df = train[train['checkout_price'] < train['base_price']]

In [37]:
df = discount_df.join(fulfillment_center_info, on='center_id', how='left')

In [39]:
df.columns

['center_id',
 'id',
 'week',
 'meal_id',
 'checkout_price',
 'base_price',
 'emailer_for_promotion',
 'homepage_featured',
 'num_orders',
 'city_code',
 'region_code',
 'center_type',
 'op_area']

In [43]:
df.select('center_id', 'num_orders').groupBy('center_id').sum('num_orders').sort('sum(num_orders)', ascending=False).limit(5)

center_id,sum(num_orders)
13,1092266
43,991737
137,871196
10,827907
174,753408


Center 13 gave the most discounts.