# Data Setup
This example uses two data sets: items and sales transactions. The data setup is done with Spark SQL and user-defined functions (UDFs).
The number of items and the number of sales transactions are both configurable.
# Analysis
Total sales (sold quantity * unit price) are aggregated per transaction date.
# Execution
Execute this notebook once with Adaptive Query Execution (AQE) disabled and once with AQE enabled, then compare the execution times.

In [None]:
%%configure -f
{
    "conf": {
        "spark.sql.adaptive.enabled": "false",
        "spark.executor.instances": "5"
    }
}

## Data Setup

In [None]:
// AQE toggle for the comparison run. Both statements are left commented out, so
// the session keeps the spark.sql.adaptive.* values it was started with.
// Uncomment them to switch AQE and its skew-join handling on from inside this cell.
//sql("SET spark.sql.adaptive.enabled=true")
//sql("SET spark.sql.adaptive.skewJoin.enabled=true")

// Sizes of the two generated data sets.
val noOfItems: Int = 2000000      // rows generated for the item table
val noOfSaleTx: Int = 1000000000  // rows generated for the sale table
// The three generators below return a different value on every call, so each is
// flagged with asNondeterministic(). Spark treats UDFs as deterministic by default
// and may then eliminate duplicate invocations, or invoke the function more often
// than it appears in the query, which would distort the generated random data.

// UDF to generate a random alphanumeric string with the given length
val randomString = udf((length: Int) => scala.util.Random.alphanumeric.take(length).mkString).asNondeterministic()
spark.udf.register("randomString", randomString)
// UDF to generate a random float in [0.0, 1.0)
val randomFloat = udf(() => scala.util.Random.nextFloat()).asNondeterministic()
spark.udf.register("randomFloat", randomFloat)
// UDF to generate a random integer quantity in [0, max); max must be positive
// because scala.util.Random.nextInt rejects a non-positive bound
val randomQty = udf((max: Int) => scala.util.Random.nextInt(max)).asNondeterministic()
spark.udf.register("randomQty", randomQty)


/*
 * Item catalogue: one row per id produced by range(noOfItems).
 *
 * Schema as recorded by the notebook author:
 * root
 *  |-- name: string (nullable = true)
 *  |-- unit_price: float (nullable = true)
 *  |-- item_id: long (nullable = false)
 */
val itemDF = {
  // Query text kept verbatim: every row gets a randomString(10) name and a
  // random unit price rounded to two decimals.
  val itemQuery = s"""SELECT randomString(10) as name,
                        round(randomFloat()*10,2) as unit_price,
                        id as item_id FROM range($noOfItems)"""
  sql(itemQuery)
}

// Persist as a parquet-backed table named "item", overwriting any earlier copy.
itemDF.write.
  format("parquet").
  mode("overwrite").
  saveAsTable("item")

/*
 * Sales transactions, deliberately skewed towards item 18.
 *
 * Schema as recorded by the notebook author:
 * root
 *  |-- item_id: integer (nullable = true)
 *  |-- soldQty: integer (nullable = false)
 *  |-- tx_id: long (nullable = false)
 *  |-- tx_date: date (nullable = true)
 */
val salesDF = {
  // Query text kept verbatim. item_id is 18 whenever the random draw is below
  // 0.9, otherwise a random id below noOfItems; tx_date is the current date
  // minus a random whole number of days below 90.
  val salesQuery = s"""SELECT CASE WHEN randomFloat() < 0.9 THEN 18 ELSE CAST(randomFloat() * $noOfItems AS INT) END AS item_id,
                      randomQty(10) as soldQty,
                      id as tx_id,
                      DATE_ADD(current_date(), - CAST(randomFloat() * 90 AS INT)) AS tx_date FROM range($noOfSaleTx)"""
  sql(salesQuery)
}

// Persist as a parquet-backed table named "sale", overwriting any earlier copy.
salesDF.write.
  format("parquet").
  mode("overwrite").
  saveAsTable("sale")

## Analysis

In [None]:
%%sql
-- Total sales value (sold quantity * unit price) per transaction date.
SELECT
    s.tx_date,
    SUM(s.soldQty * i.unit_price) AS total_sales
FROM sale AS s
INNER JOIN item AS i
    ON i.item_id = s.item_id
GROUP BY s.tx_date