<h1>Test Cases for retail jobs:</h1>
<p> <font color=red>* Test analysis done on a sample of the actual dataset </font></p>

**Spark Session creation**

In [1]:
#initial config work: make the PySpark libraries that ship with the local
#Spark installation importable from this kernel.

import glob
import os
import sys

# Root of the local Spark installation (raises KeyError when SPARK_HOME is unset)
SPARK_HOME = os.environ['SPARK_HOME']

# Python bindings live under <SPARK_HOME>/python, zipped libraries under .../python/lib
spark_python_dir = os.path.join(SPARK_HOME, "python")
spark_lib_dir = os.path.join(spark_python_dir, "lib")

# The py4j archive name embeds its version (e.g. py4j-0.10.7-src.zip), so it is
# discovered here instead of being pinned to a single Spark release.
py4j_archives = sorted(glob.glob(os.path.join(spark_lib_dir, "py4j-*-src.zip")))

# Add the following paths to the system path. Every entry is inserted at
# position 0, so the entry inserted last is the one searched first.
spark_paths = [spark_python_dir,
               spark_lib_dir,
               os.path.join(spark_lib_dir, "pyspark.zip")] + py4j_archives

for spark_path in spark_paths:
    # Skip paths that are already registered so re-running the cell does not
    # keep stacking duplicates onto sys.path.
    if spark_path not in sys.path:
        sys.path.insert(0, spark_path)

In [2]:
#create spark session
from pyspark.sql import SparkSession

# Configure the session one setting at a time: local master, application
# name, executor memory and the core cap.
# NOTE(review): the application name is "Word Count" although this notebook
# covers the retail jobs - confirm whether that name is intentional.
session_builder = SparkSession.builder.master("local").appName("Word Count")
session_builder = session_builder.config("spark.executor.memory", "1g")
session_builder = session_builder.config("spark.cores.max", "2")

# getOrCreate() hands back an already running session when there is one,
# otherwise it starts a new session with the settings above.
spark = session_builder.getOrCreate()

In [3]:
# Smoke test: derive an "idBy10" column (id multiplied by ten) from a
# 100-row range and print the first five rows to confirm Spark can run a job.
smoke_test_df = spark.range(100).selectExpr("id*10 as idBy10")
smoke_test_df.show(5)

+------+
|idBy10|
+------+
|     0|
|    10|
|    20|
|    30|
|    40|
+------+
only showing top 5 rows



In [4]:
#fetch phase - load a small sample of the retail test data

# Reader settings: comma-separated file whose first line is a header. No
# explicit schema is supplied, so the column types are inferred from the data.
csv_read_options = dict(schema=None, sep=",", inferSchema=True, header=True)

retail_df = spark.read.csv('./../input-data/test-data/retail.csv', **csv_read_options)

# Print the inferred column names and types for a quick sanity check
retail_df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



***JOB2 : color-based aggregation on quantity and products:***

In [5]:
from pyspark.sql.functions import regexp_extract, col, count, sum, expr, regexp_replace

# Colour names looked for in the product description (capture group 1)
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"

# Stage 1: tag every row with the colour found in its Description.
color_tagged = retail_df.withColumn(
    'product_color',
    regexp_extract(col("Description"), extract_str, 1),
)

# Stage 2: keep only the columns needed for the aggregation and relabel an
# empty colour (pattern '^$') as "NOCOLOR".
color_labelled = color_tagged.select(
    'Country',
    'Quantity',
    'UnitPrice',
    regexp_replace(col("product_color"), '^$', "NOCOLOR").alias('product_color'),
)

# Stage 3: totals per (Country, product_color). `sum` here is the Spark SQL
# aggregate imported from pyspark.sql.functions, not the Python builtin.
# NOTE(review): total_price adds up UnitPrice without multiplying by Quantity -
# confirm this is the figure the job is meant to report.
color_totals = (color_labelled
                .groupBy('Country', 'product_color')
                .agg(sum('Quantity').alias('total_quantity'),
                     sum('UnitPrice').alias('total_price')))

# Stage 4: ratio of the two totals
transformed_retail = color_totals.withColumn('avg_spent (dollars)',
                                             expr('total_price/total_quantity'))

In [6]:
# Preview three rows of the colour aggregation (no ordering is applied here)
transformed_retail.show(n=3)

+--------------+-------------+--------------+------------------+-------------------+
|       Country|product_color|total_quantity|       total_price|avg_spent (dollars)|
+--------------+-------------+--------------+------------------+-------------------+
|       Germany|          RED|             8|              8.35|            1.04375|
|United Kingdom|      NOCOLOR|         17366|10082.310000000005| 0.5805775653575956|
|        France|        GREEN|            12|              3.75|             0.3125|
+--------------+-------------+--------------+------------------+-------------------+
only showing top 3 rows



***JOB3 : max spent on a day:***

In [35]:
#create a window function

from pyspark.sql.window import Window
from pyspark.sql.functions import col, date_format, desc, dense_rank, rank, max

# Render InvoiceDate as a "MM/dd/yyyy H:mm" string; this frame is the input
# of the ranking query below.
transform_step1 = (retail_df.withColumn('InvoiceDate',
                                      date_format(col("InvoiceDate"), "MM/dd/yyyy H:mm")))

# Window: one partition per customer, largest Quantity first, with a frame
# running from the start of the partition up to the current row.
window_function = (Window.partitionBy("CustomerId")
                   .orderBy(desc("Quantity"))
                   .rowsBetween(Window.unboundedPreceding, Window.currentRow))


# Aggregate over the window. Rows are ordered by descending Quantity, so the
# running maximum equals the customer's largest Quantity on every row.
# `max` is the Spark SQL aggregate imported above, not the Python builtin.
max_purchase_quantity = max(col("Quantity")).over(window_function)


# Rank functions over the same window: rank() leaves gaps after ties,
# dense_rank() does not.
purchase_dense_rank = dense_rank().over(window_function)
purchase_rank = rank().over(window_function)

# Rows without a customer id are dropped before ranking. The final sort runs
# AFTER the window columns are computed: the window operator repartitions and
# sorts the data itself, so an ordering applied before it is not guaranteed to
# survive into the result.
transformed_df = (transform_step1
                  .where("CustomerId IS NOT NULL")
                  .select(col("CustomerId"),
                          col("InvoiceDate"),
                          col("Quantity"),
                          purchase_rank.alias("quantityRank"),
                          purchase_dense_rank.alias("quantityDenseRank"),
                          max_purchase_quantity.alias("maxPurchaseQuantity"))
                  .orderBy(col("CustomerId"), desc("Quantity")))

transformed_df.show(10)

+----------+----------------+--------+------------+-----------------+-------------------+
|CustomerId|     InvoiceDate|Quantity|quantityRank|quantityDenseRank|maxPurchaseQuantity|
+----------+----------------+--------+------------+-----------------+-------------------+
|   12431.0|12/01/2010 10:03|      24|           1|                1|                 24|
|   12431.0|12/01/2010 10:03|      24|           1|                1|                 24|
|   12431.0|12/01/2010 10:03|      12|           3|                2|                 24|
|   12431.0|12/01/2010 10:03|       8|           4|                3|                 24|
|   12431.0|12/01/2010 10:03|       6|           5|                4|                 24|
|   12431.0|12/01/2010 10:03|       6|           5|                4|                 24|
|   12431.0|12/01/2010 10:03|       6|           5|                4|                 24|
|   12431.0|12/01/2010 10:03|       4|           8|                5|                 24|
|   12431.