# Golden Ticket to Big Data: Exploring Wonka’s Candy Sales with Spark

## Simulating Data

In [1]:
import pandas as pd
import numpy as np

# Candidate values each simulated record is drawn from.
countries = ["USA", "MEX", "CAN", "DEU", "ITA", "FRA",
             "CHN", "RUS", "SAU", "ARE", "GBR", "TUR", "IND", "BRA"]
candy_types = ["chocolate bar", "white chocolate bar", "dark chocolate bar", "blueberry bubblegum",
               "caramel popcorn", "peanut butter pops", "chocolate cookies", "butter cookies", "gummy bears", "lollipops"]

# A fixed seed makes the simulation reproducible: re-running the notebook on a
# fresh kernel regenerates exactly the same CSV.
SEED = 42
N_ROWS = 1000
rng = np.random.default_rng(SEED)

# Draw each column in a single vectorized call rather than one scalar draw per
# row. `integers` excludes the upper bound, so sales lie in [100000, 10000000).
country_draws = rng.choice(countries, size=N_ROWS)
candy_draws = rng.choice(candy_types, size=N_ROWS)
sales_draws = rng.integers(100000, 10000000, size=N_ROWS)

# One dict per record; values are converted to plain Python str/int so the
# records do not carry NumPy scalar types.
data = [
    {'country': str(country), 'candy': str(candy), 'sales': int(sales)}
    for country, candy, sales in zip(country_draws, candy_draws, sales_draws)
]

df = pd.DataFrame(data)
df.to_csv('candy_sales.csv', index=False)  # We don't need to store the index


## Working with Data

In [None]:
#imports
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

# Spark session for this notebook: getOrCreate() returns an existing session
# if one is active, otherwise it starts a new one under this app name.
session_builder = SparkSession.builder.appName("CandySalesCount")
spark = session_builder.getOrCreate()

# Path of the CSV file to analyse
candy_sales_file = "./candy_sales.csv"

# Read the CSV, using the first row as column names and letting Spark infer
# each column's type from the data.
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
candy_sales_df = csv_reader.load(candy_sales_file)

Let’s see the resulting aggregation for each country and its sales: a count of candy sales records per country and candy type. Remember that show() is an action, so calling it triggers the query we wrote and executes it.

In [3]:
# Group the sales records by (country, candy) pair.
sales_by_country_candy = (
    candy_sales_df
    .select("country", "candy", "sales")
    .groupBy("country", "candy")
)

# Count the sales entries in each group and put the largest groups first.
record_count = count("sales").alias("Total")
count_candy_sales_df = (
    sales_by_country_candy
    .agg(record_count)
    .orderBy("Total", ascending=False)
)

# show() and count() are both actions, so each one runs the query.
count_candy_sales_df.show(n=60, truncate=False)
print(f"Total Rows = {count_candy_sales_df.count()}")

                                                                                

+-------+-------------------+-----+
|country|candy              |Total|
+-------+-------------------+-----+
|FRA    |dark chocolate bar |17   |
|ITA    |peanut butter pops |14   |
|RUS    |caramel popcorn    |14   |
|BRA    |chocolate cookies  |13   |
|RUS    |butter cookies     |13   |
|MEX    |white chocolate bar|12   |
|SAU    |chocolate bar      |12   |
|IND    |gummy bears        |12   |
|CAN    |peanut butter pops |12   |
|FRA    |gummy bears        |12   |
|MEX    |chocolate cookies  |11   |
|MEX    |butter cookies     |11   |
|DEU    |peanut butter pops |11   |
|SAU    |gummy bears        |11   |
|IND    |caramel popcorn    |11   |
|CHN    |chocolate cookies  |11   |
|ARE    |caramel popcorn    |10   |
|IND    |butter cookies     |10   |
|MEX    |caramel popcorn    |10   |
|GBR    |butter cookies     |10   |
|DEU    |white chocolate bar|10   |
|IND    |peanut butter pops |10   |
|IND    |lollipops          |10   |
|RUS    |blueberry bubblegum|10   |
|RUS    |chocolate bar      

If we want to dig a bit deeper, we can look at the data for a specific country with the following code.

In [4]:
# Restrict the records to Italy before aggregating.
is_italy = candy_sales_df["country"] == "ITA"
ita_sales = candy_sales_df.select("country", "candy", "sales").filter(is_italy)

# Count the sales entries per candy type, largest count first.
ita_count_candy_sales_df = (
    ita_sales
    .groupBy("country", "candy")
    .agg(count("sales").alias("Total"))
    .orderBy("Total", ascending=False)
)

ita_count_candy_sales_df.show(n=10, truncate=False)

+-------+-------------------+-----+
|country|candy              |Total|
+-------+-------------------+-----+
|ITA    |peanut butter pops |14   |
|ITA    |blueberry bubblegum|9    |
|ITA    |lollipops          |8    |
|ITA    |caramel popcorn    |7    |
|ITA    |gummy bears        |7    |
|ITA    |white chocolate bar|7    |
|ITA    |chocolate cookies  |6    |
|ITA    |butter cookies     |5    |
|ITA    |dark chocolate bar |5    |
|ITA    |chocolate bar      |4    |
+-------+-------------------+-----+



In [5]:
# Stop the Spark session now that the analysis is complete.
spark.stop()