# Retail Business Analytics
## Task 1: customers-tab-delimited
    Show the client information for those who live in California
    The final output must be in text format
    Save the results in the result/scenario1/solution folder
    Only records with the state value "CA" should be included in the result
    Only the customer's entire name should be included in the output

In [56]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('spark').getOrCreate()
spark

In [57]:
df = spark.read.option('delimiter','\t').csv('part-m-00000',inferSchema=True)

In [13]:
df.show(5)

+---+-------+---------+---------+---------+--------------------+-----------+---+-----+
|_c0|    _c1|      _c2|      _c3|      _c4|                 _c5|        _c6|_c7|  _c8|
+---+-------+---------+---------+---------+--------------------+-----------+---+-----+
|  1|Richard|Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|Brownsville| TX|78521|
|  2|   Mary|  Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|  Littleton| CO|80126|
|  3|    Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|     Caguas| PR|  725|
|  4|   Mary|    Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common| San Marcos| CA|92069|
|  5| Robert|   Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|     Caguas| PR|  725|
+---+-------+---------+---------+---------+--------------------+-----------+---+-----+
only showing top 5 rows



In [None]:
df = df.toDF('index','first_name','last_name','x1','x2','street','city','state','zip_code')

In [19]:
df.show(5)

+-----+----------+---------+---------+---------+--------------------+-----------+-----+--------+
|index|first_name|last_name|       x1|       x2|              street|       city|state|zip_code|
+-----+----------+---------+---------+---------+--------------------+-----------+-----+--------+
|    1|   Richard|Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|Brownsville|   TX|   78521|
|    2|      Mary|  Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|  Littleton|   CO|   80126|
|    3|       Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|     Caguas|   PR|     725|
|    4|      Mary|    Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common| San Marcos|   CA|   92069|
|    5|    Robert|   Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|     Caguas|   PR|     725|
+-----+----------+---------+---------+---------+--------------------+-----------+-----+--------+
only showing top 5 rows



In [24]:
ca_df = df.filter('state == "CA"')

In [29]:
ca_df.write.format('csv').option('header',True).option('sep','\t').save('output')

In [None]:
# final code for submitting spark job (scenario 1)
from pyspark.sql import SparkSession

#create spark session
spark = SparkSession.builder.appName('spark').getOrCreate()

#input data
df = spark.read.option('delimiter','\t').csv(r'data-files/customers-tab-delimited/part-m-00000',inferSchema=True)

#apply schema on data, making it a dataframe 
df = df.toDF('index','first_name','last_name','x1','x2','street','city','state','zip_code')

#using filter function, select only rows with state == CA
ca_df = df.filter('state == "CA"')

#now we combine first and last name into full name
from pyspark.sql import functions as F

""" this is if you want to add the new column at the end
final_df = ca_df.withColumn('full_name',F.concat(F.col('first_name'),\
                               F.lit(' '),\
                               F.col('last_name')))
"""
# this is for select the new column onlh
final_df = ca_df.select(F.concat(F.col('first_name'),F.lit(' '),F.col('last_name')).alias('full_name'))

#save file to the correct directory
final_df.write.format('csv').option('header',True).option('sep','\t').save(r'results/sc1/task1/')

#close out app to save resource
spark.stop()

### submit job via
spark3-submit --conf spark.ui.port=6065 --deploy-mode client ca.py

## Task 2 orders parquet
    • Show all orders with the order status value "COMPLETE“
    • Save the data in the "result/scenario2/solution" directory on HDFS
    • Include order number, order date, and current situation in the output
    The "order date" column should be in the "YYYY-MM-DD" format 
    Use GZIP compression to compress the output 

In [110]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .appName('spark')\
        .getOrCreate()
spark

Spark SQL caches Parquet metadata for better performance. When Hive metastore Parquet table conversion is enabled, metadata of those converted tables are also cached. If these tables are updated by Hive or other external tools, you need to refresh them manually to ensure consistent metadata.

In [109]:
from pyspark.sql.functions import from_utc_timestamp, from_unixtime
from pyspark.sql.types import StringType

d = ['1374710400000']
df = spark.createDataFrame(d, StringType())
df.show()


df.withColumn('new_date',from_utc_timestamp(from_unixtime(df.value/1000,"yyyy-MM-dd hh:mm:ss"),'UTC-8')).show()

+-------------+
|        value|
+-------------+
|1374710400000|
+-------------+

+-------------+-------------------+
|        value|           new_date|
+-------------+-------------------+
|1374710400000|2013-07-25 04:00:00|
+-------------+-------------------+



In [111]:
df = spark.read.format('parquet').load('short.parquet')

df.show(5)

+--------+-------------+-----------------+---------------+
|order_id|   order_date|order_customer_id|   order_status|
+--------+-------------+-----------------+---------------+
|       1|1374710400000|            11599|         CLOSED|
|       2|1374710400000|              256|PENDING_PAYMENT|
|       3|1374710400000|            12111|       COMPLETE|
|       4|1374710400000|             8827|         CLOSED|
|       5|1374710400000|            11318|       COMPLETE|
+--------+-------------+-----------------+---------------+
only showing top 5 rows



In [117]:
normalized_date_df = df.withColumn('new_date',from_utc_timestamp(from_unixtime(df['order_date']/1000),'UTC-8'))
normalized_date_df.show(5)

+--------+-------------+-----------------+---------------+-------------------+
|order_id|   order_date|order_customer_id|   order_status|           new_date|
+--------+-------------+-----------------+---------------+-------------------+
|       1|1374710400000|            11599|         CLOSED|2013-07-24 16:00:00|
|       2|1374710400000|              256|PENDING_PAYMENT|2013-07-24 16:00:00|
|       3|1374710400000|            12111|       COMPLETE|2013-07-24 16:00:00|
|       4|1374710400000|             8827|         CLOSED|2013-07-24 16:00:00|
|       5|1374710400000|            11318|       COMPLETE|2013-07-24 16:00:00|
+--------+-------------+-----------------+---------------+-------------------+
only showing top 5 rows



In [126]:
from pyspark.sql.functions import date_format, col
final_df = normalized_date_df.withColumn('date',date_format(col('new_date'),'yyyy-MM-dd').cast('date'))

In [127]:
final_df = final_df['order_id','order_customer_id','date']
final_df.show(5)

+--------+-----------------+----------+
|order_id|order_customer_id|      date|
+--------+-----------------+----------+
|       1|            11599|2013-07-24|
|       2|              256|2013-07-24|
|       3|            12111|2013-07-24|
|       4|             8827|2013-07-24|
|       5|            11318|2013-07-24|
+--------+-----------------+----------+
only showing top 5 rows



In [128]:
final_df.write.format('csv').option('header',True).option('sep','\t').option('compression','gzip').save(r'results/sc1/task2/')

In [89]:
spark.stop()

In [62]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: long (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [40]:
df.filter('order_status == "COMPLETE"').show(5)

+--------+-------------+-----------------+------------+
|order_id|   order_date|order_customer_id|order_status|
+--------+-------------+-----------------+------------+
|       3|1374710400000|            12111|    COMPLETE|
|       5|1374710400000|            11318|    COMPLETE|
|       6|1374710400000|             7130|    COMPLETE|
|       7|1374710400000|             4530|    COMPLETE|
|      15|1374710400000|             2568|    COMPLETE|
+--------+-------------+-----------------+------------+
only showing top 5 rows



In [44]:
complete_df = df.filter('order_status == "COMPLETE"')
spark.stop()

In [None]:
# final code for submitting spark job (scenario 2)
from pyspark.sql import SparkSession

#create spark session
spark = SparkSession.builder.appName('spark').getOrCreate()

#input data
df = spark.read.parquet(r'data-files/orders_parquet/shortName.parquet',inferSchema=True)

#apply schema on data is not needed because header is included

#using filter function, select only rows with order_status == COMPLETE
complete_df = df.filter('order_status == "COMPLETE"')

# after applying the first filter, convert time stamp into standard date format
from pyspark.sql.functions import from_utc_timestamp, from_unixtime, date_format, col
# convert timestamp
normalized_date_df = complete_df.withColumn('new_date',from_utc_timestamp(from_unixtime(df['order_date']/1000),'UTC-8'))
# extract date
final_df = normalized_date_df.withColumn('date',date_format(col('new_date'),'yyyy-MM-dd').cast('date'))
# select columns we need
final_df = final_df['order_id','order_customer_id','date','order_status']

# save file to the correct directory using gzip compression
final_df.write.format('csv').option('header',True).option('sep','\t').option('compression','gzip').save(r'results/sc1/task2/')

#close out app to save resource
spark.stop()

### submit job via
spark3-submit --conf spark.ui.port=6065 --deploy-mode client complete.py

## Task 3 customer-tab-delimited
    • Produce a list of all consumers who live in the city of "Caguas"
    • Save the results in the result/scenario3/solution folder
    • The result should only contain records with the value "Caguas" for the 
    customer city
    Use snappy compression to compress the output 
    Save the file in the orc format 


In [None]:
# final code for submitting spark job (scenario 3)
from pyspark.sql import SparkSession

#create spark session
spark = SparkSession.builder.appName('spark').getOrCreate()

#input data
df = spark.read.option('delimiter','\t').csv(r'data-files/customers-tab-delimited/part-m-00000',inferSchema=True)

#apply schema on data, making it a dataframe 
df = df.toDF('index','first_name','last_name','x1','x2',\
              'street','city','state','zip_code')

#using filter function, select only rows with city = Caguas
ca_df = df.filter('city == "Caguas"')

#save file to the correct directory
ca_df.write.format('orc').option('compression','snappy').option('header',True).save(r'results/sc1/task3/')

#close out app to save resource
spark.stop()

### submit job via
spark3-submit --conf spark.ui.port=6065 --deploy-mode client caguas.py

## Task 4 categories
    . Explore the order records saved in the “categories” directory on HDFS
    • Save the result files in CSV format
    • Save the data in the result/scenario4/solution directory on HDFS
    • Use lz4 compression to compress the output

In [None]:
# final code for submitting spark job (scenario 4)
from pyspark.sql import SparkSession

#create spark session
spark = SparkSession.builder.appName('spark').getOrCreate()

#input data
df = spark.read.option('delimiter',',').csv(r'data-files/categories/part-m-00000',inferSchema=True)

#apply schema on data, making it a dataframe 
df = df.toDF('index','type','category')

#save file to the correct directory
df.write.format('csv').option('header',True).option('compression','lz4')\
        .option('sep',',').save(r'results/sc1/task4/')

#close out app to save resource
spark.stop()

### submit job via
spark3-submit --conf spark.ui.port=6065 --deploy-mode client categories.py

for compressing, I could not find a way to compressfile straight in the hdfs system, so I resorted to copying it to the local linux file system, compress it then copy it back. Your kernel did not have lz4 library installed

hdfs dfs -copyToLocal /user/antnguyen72gmail/results/sc1/task4/part-00000-58f13e38-67f9-44f3-9ba9-858686dc369c-c000.csv

lz4 part-00000-58f13e38-67f9-44f3-9ba9-858686dc369c-c000.csv

hdfs dfs -put part-00000-58f13e38-67f9-44f3-9ba9-858686dc369c-c000.csv.lz4 /user/antnguyen72gmail/results/sc1/task4/part-00000-58f13e38-67f9-44f3-9ba9-858686dc369c-c000.csv.lz4


## Task 5 products_avro
    Explore the customer records saved in the “products_avro" directory on HDFS
    • Include the products with a price of more than 1000.0 in the output 
    • Remove data from the table if the product price is greater than 1000.0
    • Save the results in the result/scenario5/solution folde

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('spark').getOrCreate()

# read all files

# get a list of all file names from the directory
import os
file_list = os.listdir(r'data-files/products_avro/')

# using loop, add each dataframe objects onto a list
df_list = []
for line in file_list:
    df = spark.read.format("avro").load(f'data-files/products_avro/{line}')
    df_list.append(df)

# use reduce functional tool to apply a lambda function to am iterable list
# in this case, we unioned all the dataframes togther
from functools import reduce
combined_df = reduce(lambda x,y: x.union(y),df_list)

# output rows with product_price is greater than 1000.0
combined_df.filter('product_price > 1000.0').show()

# remove data from table if the product price is greater than 1000.0
final_df = combined_df.filter('product_price <= 1000.0')

# save file
# tried to save as avro with compression but avro is not compat with snappy
# turns out orc is the only compatiable with snappy?? and also avro
final_df.write.format('orc').option('compression','snappy').save(r'results/sc1/task5/')

# close out spark session
spark.stop()

### Job submit

spark3-submit --conf spark.ui.port=6065 --deploy-mode client avro_1.py

## Task 6
    Explore the “products_avro” stored in product records

    REQUIREMENT:

    Only products with a price of more than 1000.0 should be in the output
    The pattern "Treadmill" appears in the product name
    Save the output files in parquet format
    Save the data in the result/scenario6/solution directory on HDFS
    Use GZIP compression to compress the output

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('spark').getOrCreate()

# read all files

# get a list of all file names from the directory
import os
file_list = os.listdir(r'data-files/products_avro/')

# using loop, add each dataframe objects onto a list
df_list = []
for line in file_list:
    df = spark.read.format("avro").load(f'data-files/products_avro/{line}')
    df_list.append(df)

# use reduce functional tool to apply a lambda function to am iterable list
# in this case, we unioned all the dataframes togther
from functools import reduce
combined_df = reduce(lambda x,y: x.union(y),df_list)

# output rows with product_price is greater than 1000.0
greater_df = combined_df.filter('product_price > 1000.0')

# create a temp view for easier querying
greater_df.createOrReplaceTempView('table')

# query rows with the pattern "Treadmill"
final_df = spark.sql("select * from table where lower(product_name) like '%treadmill%'")

# save file as parquet format using gzip compression
final_df.write.format('parquet').option('compression','gzip').save(r'results/sc1/task6/')

# close out spark session
spark.stop()

spark3-submit --conf spark.ui.port=6065 --deploy-mode client avro_2.py

## Task 7
    Explore the order records that are saved in the “orders parquet” table on HDFS

    REQUIREMENT:

    Output all PENDING orders in July 2013
    Output files should be in JSON format
    Save the data in the result/scenario7/solution directory on HDFS.
    Only entries with the order status value of "PENDING" should be included in the result
    Order date should be in the YYY-MM-DD format
    Use snappy compression to compress the output, which should just contain the order date and order status

In [None]:
# final code for submitting spark job (scenario 7)
from pyspark.sql import SparkSession

#create spark session
spark = SparkSession.builder.appName('spark').getOrCreate()

#input data
df = spark.read.parquet('data-files/orders_parquet/shortName.parquet',inferSchema=True)

#apply schema on data is not needed because header is included

#using filter function, select only rows with order_status == PENDING
pending_df = df.filter('order_status == "PENDING"')

# after applying the first filter, convert time stamp into standard date format
from pyspark.sql.functions import from_utc_timestamp, from_unixtime, date_format, col
# convert timestamp
normalized_date_df = pending_df.withColumn('new_date',from_utc_timestamp(from_unixtime(df['order_date']/1000),'UTC-8'))
# extract date
final_df = normalized_date_df.withColumn('date',date_format(col('new_date'),'yyyy-MM-dd').cast('date'))

# select columns we need
final_df = final_df['date','order_status']

# save file
final_df.write.format('json').option('compression','snappy').save(r'results/sc1/task7/')

spark.stop()

spark3-submit --conf spark.ui.port=6065 --deploy-mode client pending.py

In [131]:
final_df.write.format('json').option('compression','snappy').save(r'results/sc1/task7/')

In [134]:
# final code for submitting spark job (scenario 2)
from pyspark.sql import SparkSession

#create spark session
spark = SparkSession.builder.appName('spark').getOrCreate()

#input data
df = spark.read.parquet(r'short.parquet',inferSchema=True)

#apply schema on data is not needed because header is included

#using filter function, select only rows with order_status == PENDING
pending_df = df.filter('order_status == "PENDING"')

# after applying the first filter, convert time stamp into standard date format
from pyspark.sql.functions import from_utc_timestamp, from_unixtime, date_format, col
# convert timestamp
normalized_date_df = pending_df.withColumn('new_date',from_utc_timestamp(from_unixtime(df['order_date']/1000),'UTC-8'))
# extract date
final_df = normalized_date_df.withColumn('date',date_format(col('new_date'),'yyyy-MM-dd').cast('date'))

# select columns we need
final_df = final_df['date','order_status']

# save file
final_df.write.format('json').option('compression','snappy').save(r'results/sc1/task7/')

spark.stop()