# Data Setup
This example uses two datasets: customers and sales transactions. The data setup is done with Spark SQL and user-defined functions (UDFs).
The number of customers and the number of sales transactions are both configurable. The customer set is small relative to the sales set, so each customer ID is reused across many sales transactions.

# Analysis
Aggregate sales value by transaction date for the customers of a single state (IL).

# Execution
Run the analysis query once with Adaptive Query Execution (AQE) disabled and once with AQE enabled, then compare the execution times.

In [None]:
%%configure -f
{
  "conf": {
    "spark.sql.adaptive.enabled": "false",
    "spark.executor.instances": "5"
  }
}

In [None]:
// Start from a clean slate: drop the tables left behind by an earlier run, if any.
for (tableName <- Seq("customer", "sale")) {
  sql(s"DROP table IF EXISTS $tableName")
}

## Data Setup

In [None]:
// Dataset sizes: number of customer rows and number of sales-transaction rows to generate.
val noOfCustomers = 1000000
val noOfSaleTx = 1000000000

// State codes from which each customer's address_state is picked.
val possibleCustAddState = List(
  "AL", "AK", "AZ", "AR", "AS", "CA", "CO", "CT",
  "DE", "DC", "FL", "GA", "GU", "HI", "ID", "IL", "IN", "IA", "KS", "KY"
)
// Random index into the state list.
// NOTE(review): not referenced by any cell visible here; confirm whether it is still needed.
val stateRandom = scala.util.Random.nextInt(possibleCustAddState.size)

// Random-data UDFs, registered by name so they can be called from Spark SQL.
// Each one is marked nondeterministic: Spark treats a UDF as deterministic by
// default, which allows the optimizer to duplicate or eliminate invocations.
// That is not safe for functions that return a new random value on every call.

// UDF to generate a random alphanumeric string of the given length
val randomString = udf((length: Int) => scala.util.Random.alphanumeric.take(length).mkString)
  .asNondeterministic()
spark.udf.register("randomString", randomString)

// UDF to generate a random float (scala.util.Random.nextFloat)
val randomFloat = udf(() => scala.util.Random.nextFloat()).asNondeterministic()
spark.udf.register("randomFloat", randomFloat)

// UDF to pick a random entry of possibleCustAddState
val randomAddState = udf(() =>
  possibleCustAddState(scala.util.Random.nextInt(possibleCustAddState.length))
).asNondeterministic()
spark.udf.register("randomAddState", randomAddState)


/* Customer table: one row per id in range(noOfCustomers).
root
 |-- name: string (nullable = true)           -- randomString(10)
 |-- address_state: string (nullable = true)  -- state code from randomAddState()
 |-- customer_id: long (nullable = false)     -- the range id
*/
val customerDF = sql(
  s"""SELECT randomString(10) AS name,
     |       randomAddState() AS address_state,
     |       id AS customer_id
     |FROM range($noOfCustomers)""".stripMargin)

// Persist as a parquet-backed table, replacing any previous contents.
customerDF.write
  .format("parquet")
  .mode("overwrite")
  .saveAsTable("customer")

/* Sales transaction table: one row per id in range(noOfSaleTx).
   customer_id - randomFloat() * noOfCustomers cast to INT
   tx_value    - randomFloat() * 100 rounded to 2 decimal places (a fractional
                 value, not an integer; confirm the exact Spark type with printSchema)
   tx_id       - the range id (long)
   tx_date     - current_date() minus a random number of days below 90
*/
val salesDF = sql(
  s"""SELECT CAST(randomFloat() * $noOfCustomers AS INT) AS customer_id,
     |       round(randomFloat() * 100, 2) AS tx_value,
     |       id AS tx_id,
     |       DATE_ADD(current_date(), - CAST(randomFloat() * 90 AS INT)) AS tx_date
     |FROM range($noOfSaleTx)""".stripMargin)

// Persist as a parquet-backed table, replacing any previous contents.
salesDF.write
  .format("parquet")
  .mode("overwrite")
  .saveAsTable("sale")

## Analysis

In [None]:
%%sql
-- Sanity check: number of generated customers whose address_state is IL.
-- The string literal uses single quotes so it is never parsed as an identifier.
SELECT count(*), address_state
FROM customer
WHERE address_state = 'IL'
GROUP BY address_state

## AQE Disabled

In [None]:
%%sql
-- Total sales value per transaction date for customers whose address_state is IL.
-- This run uses the session default from the configure cell (adaptive execution off).
SELECT tx_date, sum(tx_value) AS total_sales
FROM sale
JOIN customer ON customer.customer_id = sale.customer_id
WHERE customer.address_state = 'IL'
GROUP BY tx_date

## AQE Enabled

In [None]:
// Switch Adaptive Query Execution on for this session, along with the
// local shuffle reader option, before re-running the analysis query.
Seq(
  "SET spark.sql.adaptive.enabled=true",
  "set spark.sql.adaptive.localShuffleReader.enabled=true"
).foreach(statement => sql(statement))

In [None]:
%%sql
-- Total sales value per transaction date for customers whose address_state is IL.
-- Same aggregation as the AQE-disabled run, for comparing execution time.
SELECT tx_date, sum(tx_value) AS total_sales
FROM sale
JOIN customer ON customer.customer_id = sale.customer_id
WHERE customer.address_state = 'IL'
GROUP BY tx_date