In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import *
import datetime
# Reuse the already-running Spark session if there is one, otherwise start it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

22/03/22 20:49:57 WARN Utils: Your hostname, LAPTOP-V8EALT7T resolves to a loopback address: 127.0.1.1; using 172.20.10.2 instead (on interface wifi0)
22/03/22 20:49:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/22 20:49:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read the sales CSV (first row is the header, column types are inferred)
# and register it as the temporary SQL view 'business_sales'.
# The chain is wrapped in parentheses instead of using backslash
# continuations: a backslash followed by a trailing comment is a SyntaxError.
# NOTE(review): the path is an absolute local path; consider a configurable
# data directory.
(
    spark.read
    .option('header', True)
    .option('inferSchema', 'true')  # get types
    .csv('/home/dmitry/Business Sales Transaction.csv')
    .createOrReplaceTempView('business_sales')
)

                                                                                

In [29]:
# Tasks:
# 1. How much money did each customer spend per month?
# 2. How many different products did each customer buy?
# 3. Which products did each customer buy most often?

In [4]:
# Load the 'business_sales' temp view as a DataFrame and preview 10 rows.
df = spark.table('business_sales')
df.show(n=10)

+-------------+----------+---------+--------------------+-----+--------+----------+--------------+
|TransactionNo|      Date|ProductNo|         ProductName|Price|Quantity|CustomerNo|       Country|
+-------------+----------+---------+--------------------+-----+--------+----------+--------------+
|       536365|2018-12-01|   85123A|Cream Hanging Hea...| 1.88|       6|     17850|United Kingdom|
|       536365|2018-12-01|    71053|White Moroccan Me...| 2.01|       6|     17850|United Kingdom|
|       536365|2018-12-01|   84406B|Cream Cupid Heart...| 1.91|       8|     17850|United Kingdom|
|       536365|2018-12-01|   84029G|Knitted Union Fla...| 2.01|       6|     17850|United Kingdom|
|       536365|2018-12-01|   84029E|Red Woolly Hottie...| 2.01|       6|     17850|United Kingdom|
|       536365|2018-12-01|    22752|Set 7 Babushka Ne...| 2.65|       2|     17850|United Kingdom|
|       536365|2018-12-01|    21730|Glass Star Froste...| 2.14|       6|     17850|United Kingdom|
|       53

In [5]:
# Rename the source columns to snake_case, one (old, new) pair at a time,
# in the same order as listed.
COLUMN_RENAMES = (
    ('TransactionNo', 'transaction_id'),
    ('Date', 'date'),
    ('ProductNo', 'product_number'),
    ('ProductName', 'product_name'),
    ('Price', 'product_price'),
    ('Quantity', 'quantity'),
    ('CustomerNo', 'customer_id'),
    ('Country', 'country'),
)
for source_name, target_name in COLUMN_RENAMES:
    df = df.withColumnRenamed(source_name, target_name)

In [6]:
# Drop rows that contain a null in any column.
# DataFrames are immutable: dropna() returns a new DataFrame, so the result
# must be assigned back, otherwise later cells still see the null rows.
df = df.dropna()
# Display the schema summary as the cell output.
df

DataFrame[transaction_id: string, date: string, product_number: string, product_name: string, product_price: double, quantity: int, customer_id: string, country: string]

In [30]:
# Per month and customer: total amount spent and the number of distinct
# products bought. Only rows with a positive quantity are counted.
# NOTE(review): F.month() keeps only the month number, so the same month of
# different years falls into one group - confirm the data spans a single
# year, or add the year to the grouping keys here and in the cells that join
# on 'order_month'.
df_month_cusid = (
    df
    .withColumn('order_month', F.month(F.col('date')))
    .where(F.col('quantity') > 0)
    .groupBy(F.col('order_month'), F.col('customer_id'))
    .agg(
        F.sum(F.col('product_price') * F.col('quantity')).alias('total_price'),
        F.countDistinct(F.col('product_name')).alias('products_cnt_unique'),
    )
    # The monthly total is exposed under the name 'product_price'; the join
    # cell further down reads it by that name.
    .withColumn('product_price', F.col('total_price').cast('float'))
    .select(
        F.col('order_month'),
        F.col('customer_id'),
        F.col('product_price'),
        F.col('products_cnt_unique'),
    )
    .orderBy('customer_id', 'order_month')
)

# show() only prints and returns None, so it is called separately; chaining it
# into the assignment would rebind df_month_cusid to None and break the join.
df_month_cusid.show(10)



+-----------+-----------+-------------+-------------------+
|order_month|customer_id|product_price|products_cnt_unique|
+-----------+-----------+-------------+-------------------+
|          4|      12004|       227.14|                 56|
|          5|      12006|         3.62|                  1|
|          3|      12008|       838.54|                203|
|         12|      12013|        10.23|                  1|
|          6|      12024|        21.84|                  5|
|          2|      12025|       176.26|                 46|
|          1|      12026|      3032.83|                406|
|          9|      12031|         36.2|                  7|
|          3|      12042|       929.33|                223|
|          7|      12043|       207.29|                 36|
+-----------+-----------+-------------+-------------------+
only showing top 10 rows



                                                                                

In [22]:
# Total units bought of every product, per month and customer
# (rows with a non-positive quantity are left out).
find_product = (
    df
    .filter(F.col('quantity') > 0)
    .withColumn('order_month', F.month('date'))
    .groupBy('order_month', 'customer_id', 'product_name')
    .agg(F.sum('quantity').alias('product_cnt'))
)

# Preview the first 70 rows, sorted by customer and month.
find_product.orderBy('customer_id', 'order_month').show(70)



+-----------+-----------+--------------------+-----------+
|order_month|customer_id|        product_name|product_cnt|
+-----------+-----------+--------------------+-----------+
|          4|      12004|Set Of 4 Polkadot...|          1|
|          4|      12004| Photo Frame Cornice|          1|
|          4|      12004|Jumbo Bag Woodlan...|          1|
|          4|      12004|French Blue Metal...|          1|
|          4|      12004|Set/6 Red Spotty ...|          4|
|          4|      12004|Citronella Candle...|          2|
|          4|      12004|Skull Lunch Box W...|          2|
|          4|      12004|Picnic Basket Wic...|          1|
|          4|      12004|Jumbo Storage Bag...|          2|
|          4|      12004|Tea Time Party Bu...|          4|
|          4|      12004|Set Of Picture Fr...|          1|
|          4|      12004|  Red Retrospot Bowl|          1|
|          4|      12004|Jumbo Bag Pink Po...|          1|
|          4|      12004|Gingerbread Man C...|          

                                                                                

In [23]:
# Most-bought product per month and customer.
# Sorting and then taking first() inside groupBy().agg() is not reliable: the
# shuffle performed by groupBy does not preserve the preceding sort, so
# first() may return an arbitrary product. Taking max() over a
# (product_cnt, product_name) struct instead picks the row with the highest
# count deterministically; ties on the count are broken by the greatest
# product name.
most_popular_product = (
    find_product
    .groupBy(F.col('order_month'), F.col('customer_id'))
    .agg(
        F.max(F.struct(F.col('product_cnt'), F.col('product_name')))
        .alias('top_product')
    )
    .select(
        F.col('order_month'),
        F.col('customer_id'),
        F.col('top_product.product_name').alias('popular_product_name'),
    )
)

# popular_product_name for each month and customer_id
most_popular_product.orderBy('customer_id', 'order_month').show(70)



+-----------+-----------+--------------------+
|order_month|customer_id|popular_product_name|
+-----------+-----------+--------------------+
|          4|      12004|Red Enchanted For...|
|          5|      12006|Jumbo Storage Bag...|
|          3|      12008|Fluted Antique Ca...|
|         12|      12013|Cabin Bag Vintage...|
|          6|      12024|Vintage Caravan G...|
|          2|      12025|Jumbo Bag Red Ret...|
|          1|      12026|Metal Sign Cupcak...|
|          9|      12031|      Jumbo Bag Owls|
|          3|      12042|Black Mini Tape M...|
|          7|      12043|Jumbo Shopper Vin...|
|          7|      12050|Jumbo Shopper Vin...|
|          9|      12057|Jumbo Storage Bag...|
|         11|      12057|Spaceboy Mini Bac...|
|         12|      12060|Pack 3 Boxes Chri...|
|          5|      12063|Charlotte Bag App...|
|         11|      12067|Girls Alphabet Ir...|
|         12|      12071|Hot Water Bottle ...|
|          2|      12078|Hanging Heart Jar...|
|          6|

                                                                                

In [28]:
# Combine the monthly totals with the most popular product into one
# DataFrame, 'updated_df', using a full outer join on customer and month.
updated_df = (
    df_month_cusid
    .join(most_popular_product, ['customer_id', 'order_month'], 'outer')
    .select(
        df_month_cusid['customer_id'],
        df_month_cusid['order_month'],
        df_month_cusid['product_price'],
        df_month_cusid['products_cnt_unique'],
        most_popular_product['popular_product_name'],
    )
)

# Preview the first 30 rows, sorted by customer and month.
updated_df.orderBy('customer_id', 'order_month').show(30)



+-----------+-----------+-------------+-------------------+--------------------+
|customer_id|order_month|product_price|products_cnt_unique|popular_product_name|
+-----------+-----------+-------------+-------------------+--------------------+
|      12004|          4|       227.14|                 56|Red Enchanted For...|
|      12006|          5|         3.62|                  1|Jumbo Storage Bag...|
|      12008|          3|       838.54|                203|Fluted Antique Ca...|
|      12013|         12|        10.23|                  1|Cabin Bag Vintage...|
|      12024|          6|        21.84|                  5|Vintage Caravan G...|
|      12025|          2|       176.26|                 46|Jumbo Bag Red Ret...|
|      12026|          1|      3032.83|                406|Metal Sign Cupcak...|
|      12031|          9|         36.2|                  7|      Jumbo Bag Owls|
|      12042|          3|       929.33|                223|Black Mini Tape M...|
|      12043|          7|   

                                                                                