<h1>Test Cases for retail jobs:</h1>
<p> <font color=red>* Test analysis done on a sample of the actual dataset.</font></p>

**Spark Session creation**

In [None]:
#initial config work

import os
import sys
        
# Make the PySpark sources that ship inside the Spark distribution importable.
import glob

# Root of the Spark installation. A KeyError here means SPARK_HOME is not
# exported in the environment the notebook kernel was started from.
SPARK_HOME = os.environ['SPARK_HOME']

spark_python_dir = os.path.join(SPARK_HOME, "python")
spark_lib_dir = os.path.join(spark_python_dir, "lib")

# The py4j archive name carries a version number that changes between Spark
# releases, so look it up instead of hard-coding it. If nothing is found,
# fall back to the name this notebook was originally written against.
py4j_archives = sorted(glob.glob(os.path.join(spark_lib_dir, "py4j-*-src.zip"))) or [
    os.path.join(spark_lib_dir, "py4j-0.10.7-src.zip")
]

# Each entry is pushed to the front of sys.path, so the last one inserted
# (the py4j archive) ends up with the highest import priority.
spark_path_entries = [
    spark_python_dir,
    spark_lib_dir,
    os.path.join(spark_lib_dir, "pyspark.zip"),
] + py4j_archives

for path_entry in spark_path_entries:
    sys.path.insert(0, path_entry)

In [None]:
#create spark session
from pyspark.sql import SparkSession

# Configure the session builder one setting at a time, then create the
# session (or reuse one that already exists in this kernel).
# NOTE(review): the app name "Word Count" looks copied from another example;
# confirm whether it should describe the retail jobs instead.
session_builder = SparkSession.builder
session_builder = session_builder.master("local")
session_builder = session_builder.appName("Word Count")
session_builder = session_builder.config("spark.executor.memory", "1g")
session_builder = session_builder.config("spark.cores.max", "2")

spark = session_builder.getOrCreate()

In [None]:
# Smoke test: build a 100-row range, scale the ids by 10 and print 5 rows
# to confirm the session can run a job.
smoke_df = spark.range(100).selectExpr("id*10 as idBy10")
smoke_df.show(n=5)

In [None]:
# Fetch phase: load the small sample CSV and let Spark infer the column
# types, then print the inferred schema for inspection.

inferred_reader = spark.read.options(sep=",", inferSchema=True, header=True)
retail_df = inferred_reader.csv('./../input-data/test-data/retail.csv')

retail_df.printSchema()

In [None]:
# Peek at the first two rows of the inferred-schema frame.
retail_df.show(n=2)

***JOB0: preprocessing/cleaning phase***

In [None]:
#extract

from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType,DoubleType, TimestampType

# Explicit schema for the retail CSV; every column is nullable.
# NOTE(review): UnitPrice is read as a string although later cells sum it;
# confirm whether DoubleType was intended.
retail_schema = StructType([
            StructField("InvoiceNo", LongType(), True),
            StructField("StockCode", StringType(), True),
            StructField("Description", StringType(), True),
            StructField("Quantity", IntegerType(), True),
            StructField("InvoiceDate", TimestampType(), True),
            StructField("UnitPrice", StringType(), True),
            StructField("CustomerID", DoubleType(), True),
            StructField("Country", StringType(), True)
])

# Re-read the sample file with the explicit schema instead of inference.
# The schema argument must reference `retail_schema` as defined above;
# the previous `retailSchema` spelling raised a NameError.
retail_df = (spark.read
                  .csv('./../input-data/test-data/retail.csv',
                       schema=retail_schema,
                       sep=",",
                       header=True,
                       mode='permissive'))
retail_df.printSchema()

In [None]:
#transformations

from pyspark.sql.functions import year, month, dayofmonth, col
# Cleaning step:
#   1. replace nulls in numeric columns with 0 and in string columns with 'NOVALUE'
#   2. derive year / month / day-of-month columns from InvoiceDate.
# The day column is named 'invoiceDay' (camelCase like its siblings) so it
# matches the 'invoiceDay' partition column used when the frame is written.
retail_df = (retail_df.na.fill(0)
                      .na.fill('NOVALUE')
                      .withColumn('invoiceYear', year(col('InvoiceDate')))
                      .withColumn('invoiceMonth', month(col('InvoiceDate')))
                      .withColumn('invoiceDay', dayofmonth(col('InvoiceDate'))))

retail_df.show(2)

In [None]:
#load

# Persist the cleaned frame as the 'retail_cleaned' table: JSON files,
# overwriting any previous run, partitioned by invoice date parts and
# bucketed into a single bucket on Country.
cleaned_writer = retail_df.coalesce(2).write
cleaned_writer = cleaned_writer.format('json').mode('overwrite')
cleaned_writer = cleaned_writer.partitionBy('invoiceYear', 'invoiceMonth', 'invoiceDay')
cleaned_writer = cleaned_writer.bucketBy(1, 'Country')
cleaned_writer.saveAsTable('retail_cleaned')

In [None]:
# Display the first 14 rows of the cleaned frame.
retail_df.show(n=14)

***JOB1 : color-based aggregation on quantity and products:***

In [None]:
from pyspark.sql.functions import regexp_extract, col, count, sum, expr, regexp_replace

# Colour keywords to look for in the product description.
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"

# Colour captured from Description; an empty capture is relabelled "NOCOLOR".
color_column = regexp_replace(
    regexp_extract(col("Description"), extract_str, 1),
    '^$',
    "NOCOLOR",
).alias('product_color')

# Keep only the columns needed for the aggregation.
color_rows = retail_df.select('Country', 'Quantity', 'UnitPrice', color_column)

# Totals per (country, colour).
# NOTE(review): total_price sums UnitPrice without multiplying by Quantity;
# confirm this is the intended definition of the amount spent.
color_totals = color_rows.groupBy('Country', 'product_color').agg(
    sum('Quantity').alias('total_quantity'),
    sum('UnitPrice').alias('total_price'),
)

transformed_retail = color_totals.withColumn(
    'avg_spent (dollars)',
    expr('total_price/total_quantity'),
)

In [None]:
transformed_retail.show(3)

***JOB2 : per-customer purchase quantity ranking (max quantity per customer):***

In [None]:
#create a window function

from pyspark.sql.window import Window
from pyspark.sql.functions import col, date_format, desc, dense_rank, rank, max

# Render InvoiceDate as a "MM/dd/yyyy H:mm" string once; the final projection
# below reuses this frame instead of repeating the same conversion.
transform_step1 = (retail_df.withColumn('InvoiceDate',
                                      date_format(col("InvoiceDate"), "MM/dd/yyyy H:mm")))

# Window: one partition per customer, rows ordered by Quantity descending,
# frame running from the start of the partition up to the current row.
window_function = (Window.partitionBy("CustomerId")
                   .orderBy(desc("Quantity"))
                   .rowsBetween(Window.unboundedPreceding, Window.currentRow))


# Aggregate over the window. `max` here is the pyspark.sql.functions import
# from this cell, not Python's builtin. Because rows are ordered by Quantity
# descending, the running max equals the customer's largest quantity.
max_purchase_quantity = max(col("Quantity")).over(window_function)


# Ranking functions over the same window.
purchase_dense_rank = dense_rank().over(window_function)
purchase_rank = rank().over(window_function)

# NOTE(review): if retail_df already went through `na.fill(0)` in the cleaning
# cell, CustomerId is never NULL here and this filter removes nothing;
# confirm whether rows with a filled-in customer id of 0 should be excluded.
transformed_df = (transform_step1
                           .where("CustomerId IS NOT NULL")
                           .orderBy("CustomerId")
                           .select(col("CustomerId"),
                                   col("InvoiceDate"),
                                   col("Quantity"),
                                   purchase_rank.alias("quantityRank"),
                                   purchase_dense_rank.alias("quantityDenseRank"),
                                   max_purchase_quantity.alias("maxPurchaseQuantity")))

transformed_df.show(10)

In [None]:
# Small hand-made sample with one missing id and one missing name, used to
# try out null filling.
dataList = [
    [1, 'saurabh'],
    [2, 'shaunak'],
    [3, 'sampad'],
    [4, 'anuj'],
    [None, 'sanil'],
    [6, None],
]

test = spark.createDataFrame(data=dataList)

In [None]:
# Fill missing strings with 'test' first, then missing numbers with 0.
test = test.fillna('test')
test = test.fillna(0)

In [None]:
# Print the filled sample frame (up to the default 20 rows).
test.show(n=20)