## DATASET

Source: https://www.kaggle.com/puneetbhaya/online-retail/

## Initialization

In [1]:
# Use findspark to locate the local Spark installation and make `pyspark`
# importable; this must run before the `pyspark` import in the next cell.
# NOTE(review): the original comment said findspark reads both SPARK_HOME and
# HADOOP_HOME; findspark's documented lookup is SPARK_HOME — confirm whether
# HADOOP_HOME is actually required in this environment.

import findspark
findspark.init()

In [2]:
# Spark SQL entry point
from pyspark.sql import SparkSession

# Build the session used by every later cell (an already-running session is
# reused by getOrCreate instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

In [3]:
# Sanity check that the session exists: prints the object's default repr
# (class name and memory address), not a Spark application ID.
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000211BA645198>


## Loading Data & Pre Processing

In [4]:
# Read the Online Retail CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
data = spark.read.csv(
    "OnlineRetail.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Number of rows loaded from the CSV, before any cleaning
data.count()

541909

In [6]:
# Preview the first rows to see the available columns and their formats
data.show()

+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|01/12/2010 08:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|01/12/2010 08:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|01/12/2010 08:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|01/12/2010 08:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|01/12/2010 08:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|01/12/2010 08:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|01/1

In [7]:
# Drop every row that has a null in ANY column (dropna with no arguments).
# NOTE(review): only InvoiceNo and StockCode are used after this point, so
# rows are also being discarded for nulls in unrelated columns (the count
# falls from 541909 to 406829). Consider dropna(subset=[...]) if those rows
# should stay in the baskets — confirm intent before changing, since it would
# alter every result below.
df = data.dropna()

In [8]:
# Rows remaining after removing those with missing values
df.count()

406829

In [9]:
# Keep only what is needed to build baskets: the invoice identifier and the
# product (stock) code of each line item.
basket_columns = ['InvoiceNo', 'StockCode']
df = df.select(*basket_columns)
df.show()

+---------+---------+
|InvoiceNo|StockCode|
+---------+---------+
|   536365|   85123A|
|   536365|    71053|
|   536365|   84406B|
|   536365|   84029G|
|   536365|   84029E|
|   536365|    22752|
|   536365|    21730|
|   536366|    22633|
|   536366|    22632|
|   536367|    84879|
|   536367|    22745|
|   536367|    22748|
|   536367|    22749|
|   536367|    22310|
|   536367|    84969|
|   536367|    22623|
|   536367|    22622|
|   536367|    21754|
|   536367|    21755|
|   536367|    21777|
+---------+---------+
only showing top 20 rows



In [10]:
from pyspark.sql import functions as F

# Build one basket per invoice: collect_set gathers the invoice's stock codes
# into an array with duplicates removed (FPGrowth needs unique items per row).
items_per_invoice = F.collect_set("StockCode").alias('Items')
df = df.groupby("InvoiceNo").agg(items_per_invoice)
df.show()

+---------+--------------------+
|InvoiceNo|               Items|
+---------+--------------------+
|   536938|[22112, 21931, 84...|
|   537691|[22505, 46000R, 2...|
|   538184|[22561, 22147, 21...|
|   538517|[22749, 21212, 22...|
|   538879|[21212, 22759, 22...|
|   539275|[22083, 22150, 22...|
|   539630|[22111, 22971, 22...|
|   540499|[22697, 22796, 21...|
|   540540|[22111, 22834, 22...|
|   540976|[22413, 21212, 22...|
|   541432|[22113, 22457, 21...|
|   541518|[21212, 22432, 22...|
|   541783|[22561, 22697, 22...|
|   542026|[22398, 22194, 22...|
|   542375|[22629, 21731, 22...|
|   543641|[22645, 75131, 22...|
|   544303|[84596L, 22931, 8...|
|   545583|[22090, 21931, 22...|
|   547122|[22090, 22434, 21...|
|   548542|[22357, 21586, 22...|
+---------+--------------------+
only showing top 20 rows



In [11]:
# Number of baskets: after the groupby there is one row per distinct InvoiceNo
df.count()

22190

## FP Growth

In [12]:
from pyspark.ml.fpm import FPGrowth

# Model 1: minimum support 0.01, minimum confidence 0.3
fg1 = FPGrowth(
    itemsCol="Items",
    minSupport=0.01,
    minConfidence=0.3,
)
model1 = fg1.fit(df)

In [13]:
# Frequent itemsets found by model 1, each with its occurrence count (`freq`)
model1.freqItemsets.show()

+----------------+----+
|           items|freq|
+----------------+----+
|        [15056N]| 384|
|         [22846]| 266|
|        [85123A]|2020|
|         [22423]|1884|
| [22423, 85123A]| 239|
|         [22616]| 384|
|         [21154]| 266|
|        [85099B]|1643|
|[85099B, 85123A]| 255|
|         [21479]| 382|
|         [22840]| 265|
|         [22690]| 265|
|         [47566]|1399|
|  [47566, 22423]| 232|
| [47566, 85123A]| 280|
|         [21314]| 382|
|         [84879]|1385|
|  [84879, 22423]| 231|
| [84879, 85123A]| 271|
|         [22750]| 380|
+----------------+----+
only showing top 20 rows



In [14]:
# Association rules (antecedent -> consequent) generated by model 1, with the
# confidence of each rule
model1.associationRules.show()

+---------------+----------+-------------------+
|     antecedent|consequent|         confidence|
+---------------+----------+-------------------+
|        [22554]|   [22551]| 0.4966216216216216|
|        [22554]|   [22556]|0.43243243243243246|
|        [22960]|   [22961]|  0.322279792746114|
| [20726, 22382]|   [20725]| 0.6356968215158925|
|        [21977]|   [21212]| 0.5007407407407407|
|        [21977]|   [84991]| 0.4148148148148148|
|        [22699]|   [22423]| 0.5167464114832536|
|        [22699]|   [22697]|  0.666267942583732|
|        [22699]|   [22698]| 0.5394736842105263|
|        [22866]|   [22867]| 0.5073684210526316|
|        [22866]|   [22865]| 0.5915789473684211|
|        [20723]|   [22355]|0.47023809523809523|
|        [20723]|   [20724]| 0.5952380952380952|
|[22386, 85099B]|   [21931]|0.40252707581227437|
|[22386, 85099B]|  [85099F]|0.42057761732851984|
| [23202, 23203]|  [85099B]| 0.5302325581395348|
| [20726, 22384]|   [20725]| 0.7430340557275542|
|        [22386]|   

In [15]:
# For each invoice, transform() compares its items with the antecedents of
# every association rule and collects the consequents of the rules that apply
# into the `prediction` column.
model1.transform(df).show()

+---------+--------------------+--------------------+
|InvoiceNo|               Items|          prediction|
+---------+--------------------+--------------------+
|   536938|[22112, 21931, 84...|[23203, 85099B, 8...|
|   537691|[22505, 46000R, 2...|[22720, 21755, 85...|
|   538184|[22561, 22147, 21...|                  []|
|   538517|[22749, 21212, 22...|[21977, 21975, 84...|
|   538879|[21212, 22759, 22...|      [84991, 21094]|
|   539275|[22083, 22150, 22...|                  []|
|   539630|[22111, 22971, 22...|[22386, 22112, 22...|
|   540499|[22697, 22796, 21...|[22698, 21094, 21...|
|   540540|[22111, 22834, 22...|[22112, 21485, 22...|
|   540976|[22413, 21212, 22...|[22556, 85099B, 2...|
|   541432|[22113, 22457, 21...|             [22111]|
|   541518|[21212, 22432, 22...|[20725, 22355, 20...|
|   541783|[22561, 22697, 22...|[22698, 22554, 22...|
|   542026|[22398, 22194, 22...|             [21755]|
|   542375|[22629, 21731, 22...|             [22630]|
|   543641|[22645, 75131, 22

In [16]:
# Model 2: minimum support 0.05, minimum confidence 0.5
fg2 = FPGrowth(
    itemsCol="Items",
    minSupport=0.05,
    minConfidence=0.5,
)
model2 = fg2.fit(df)

In [17]:
# Frequent itemsets found by model 2, each with its occurrence count (`freq`).
# In the recorded output only eight single-item itemsets reach this support.
model2.freqItemsets.show()

+--------+----+
|   items|freq|
+--------+----+
|[85123A]|2020|
| [22423]|1884|
|[85099B]|1643|
| [47566]|1399|
| [84879]|1385|
| [20725]|1330|
| [22720]|1218|
|  [POST]|1194|
+--------+----+



In [18]:
# Association rules generated by model 2. Empty in the recorded output: every
# frequent itemset at this support is a single item, so there is no
# antecedent/consequent pair to form a rule from.
model2.associationRules.show()

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



In [19]:
# For each invoice, transform() compares its items with the antecedents of
# every association rule and collects the consequents of the rules that apply
# into `prediction`. Model 2 produced no rules, so every prediction is empty.
model2.transform(df).show()

+---------+--------------------+----------+
|InvoiceNo|               Items|prediction|
+---------+--------------------+----------+
|   536938|[22112, 21931, 84...|        []|
|   537691|[22505, 46000R, 2...|        []|
|   538184|[22561, 22147, 21...|        []|
|   538517|[22749, 21212, 22...|        []|
|   538879|[21212, 22759, 22...|        []|
|   539275|[22083, 22150, 22...|        []|
|   539630|[22111, 22971, 22...|        []|
|   540499|[22697, 22796, 21...|        []|
|   540540|[22111, 22834, 22...|        []|
|   540976|[22413, 21212, 22...|        []|
|   541432|[22113, 22457, 21...|        []|
|   541518|[21212, 22432, 22...|        []|
|   541783|[22561, 22697, 22...|        []|
|   542026|[22398, 22194, 22...|        []|
|   542375|[22629, 21731, 22...|        []|
|   543641|[22645, 75131, 22...|        []|
|   544303|[84596L, 22931, 8...|        []|
|   545583|[22090, 21931, 22...|        []|
|   547122|[22090, 22434, 21...|        []|
|   548542|[22357, 21586, 22...|

In [20]:
# Model 3: minimum support 0.1, minimum confidence 0.2
fg3 = FPGrowth(
    itemsCol="Items",
    minSupport=0.1,
    minConfidence=0.2,
)
model3 = fg3.fit(df)

In [21]:
# Frequent itemsets found by model 3. Empty in the recorded output: even the
# most frequent single item (85123A, in 2020 of 22190 invoices, about 9.1%)
# falls short of the 10% support threshold.
model3.freqItemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



In [22]:
# Association rules generated by model 3. Empty in the recorded output, since
# this model found no frequent itemsets to derive rules from.
model3.associationRules.show()

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



In [23]:
# For each invoice, transform() compares its items with the antecedents of
# every association rule and collects the consequents of the rules that apply
# into `prediction`. Model 3 produced no rules, so every prediction is empty.
model3.transform(df).show()

+---------+--------------------+----------+
|InvoiceNo|               Items|prediction|
+---------+--------------------+----------+
|   536938|[22112, 21931, 84...|        []|
|   537691|[22505, 46000R, 2...|        []|
|   538184|[22561, 22147, 21...|        []|
|   538517|[22749, 21212, 22...|        []|
|   538879|[21212, 22759, 22...|        []|
|   539275|[22083, 22150, 22...|        []|
|   539630|[22111, 22971, 22...|        []|
|   540499|[22697, 22796, 21...|        []|
|   540540|[22111, 22834, 22...|        []|
|   540976|[22413, 21212, 22...|        []|
|   541432|[22113, 22457, 21...|        []|
|   541518|[21212, 22432, 22...|        []|
|   541783|[22561, 22697, 22...|        []|
|   542026|[22398, 22194, 22...|        []|
|   542375|[22629, 21731, 22...|        []|
|   543641|[22645, 75131, 22...|        []|
|   544303|[84596L, 22931, 8...|        []|
|   545583|[22090, 21931, 22...|        []|
|   547122|[22090, 22434, 21...|        []|
|   548542|[22357, 21586, 22...|

In [24]:
# Model 4: minimum support 0.03, minimum confidence 0.3
fg4 = FPGrowth(
    itemsCol="Items",
    minSupport=0.03,
    minConfidence=0.3,
)
model4 = fg4.fit(df)

In [25]:
# Frequent itemsets found by model 4, each with its occurrence count (`freq`);
# show() prints only the first 20 rows
model4.freqItemsets.show()

+--------+----+
|   items|freq|
+--------+----+
|[85123A]|2020|
| [22423]|1884|
|[85099B]|1643|
| [47566]|1399|
| [84879]|1385|
| [20725]|1330|
| [22720]|1218|
|  [POST]|1194|
| [23203]|1097|
| [22197]|1085|
| [20727]|1073|
| [22383]|1063|
| [21212]|1041|
| [23298]|1021|
| [23209]|1017|
| [22382]|1001|
| [22086]| 990|
| [20728]| 989|
| [22457]| 984|
| [22469]| 972|
+--------+----+
only showing top 20 rows



In [26]:
# Association rules generated by model 4 (none in the recorded output)
model4.associationRules.show()

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



In [27]:
# For each invoice, transform() compares its items with the antecedents of
# every association rule and collects the consequents of the rules that apply
# into `prediction`. Model 4 produced no rules, so every prediction is empty.
model4.transform(df).show()

+---------+--------------------+----------+
|InvoiceNo|               Items|prediction|
+---------+--------------------+----------+
|   536938|[22112, 21931, 84...|        []|
|   537691|[22505, 46000R, 2...|        []|
|   538184|[22561, 22147, 21...|        []|
|   538517|[22749, 21212, 22...|        []|
|   538879|[21212, 22759, 22...|        []|
|   539275|[22083, 22150, 22...|        []|
|   539630|[22111, 22971, 22...|        []|
|   540499|[22697, 22796, 21...|        []|
|   540540|[22111, 22834, 22...|        []|
|   540976|[22413, 21212, 22...|        []|
|   541432|[22113, 22457, 21...|        []|
|   541518|[21212, 22432, 22...|        []|
|   541783|[22561, 22697, 22...|        []|
|   542026|[22398, 22194, 22...|        []|
|   542375|[22629, 21731, 22...|        []|
|   543641|[22645, 75131, 22...|        []|
|   544303|[84596L, 22931, 8...|        []|
|   545583|[22090, 21931, 22...|        []|
|   547122|[22090, 22434, 21...|        []|
|   548542|[22357, 21586, 22...|

In [28]:
# Model 5: minimum support 0.02, minimum confidence 0.3
fg5 = FPGrowth(
    itemsCol="Items",
    minSupport=0.02,
    minConfidence=0.3,
)
model5 = fg5.fit(df)

In [29]:
# Frequent itemsets found by model 5, each with its occurrence count (`freq`);
# at this support some two-item itemsets appear alongside the single items
model5.freqItemsets.show()

+---------------+----+
|          items|freq|
+---------------+----+
|       [85123A]|2020|
|        [22423]|1884|
|       [85099B]|1643|
|        [47566]|1399|
|        [84879]|1385|
|        [20725]|1330|
|        [22720]|1218|
|         [POST]|1194|
|        [23203]|1097|
|[23203, 85099B]| 473|
|        [22197]|1085|
|        [20727]|1073|
| [20727, 20725]| 523|
|        [22383]|1063|
| [22383, 20725]| 526|
| [22383, 20727]| 467|
|        [21212]|1041|
|        [23298]|1021|
|        [23209]|1017|
| [23209, 23203]| 444|
+---------------+----+
only showing top 20 rows



In [30]:
# Association rules (antecedent -> consequent) generated by model 5, with the
# confidence of each rule
model5.associationRules.show()

+----------+----------+-------------------+
|antecedent|consequent|         confidence|
+----------+----------+-------------------+
|   [22699]|   [22697]|  0.666267942583732|
|   [22699]|   [22698]| 0.5394736842105263|
|   [22386]|  [85099B]| 0.6266968325791855|
|   [20727]|   [20725]| 0.4874184529356943|
|   [20727]|   [22383]|0.43522833178005593|
|   [20727]|   [22384]|0.43522833178005593|
|   [22382]|   [22383]|0.45054945054945056|
|   [22382]|   [20725]| 0.4695304695304695|
|   [20725]|   [20727]|0.39323308270676693|
|   [20725]|   [22383]| 0.3954887218045113|
|   [20725]|   [22382]| 0.3533834586466165|
|   [20725]|   [20728]|0.35037593984962406|
|   [20725]|   [22384]|0.39849624060150374|
|   [22384]|   [20725]| 0.5573080967402734|
|   [22384]|   [20727]|  0.491062039957939|
|   [22910]|   [22086]| 0.6403385049365303|
|   [23209]|   [23203]| 0.4365781710914454|
|  [85099B]|   [22386]| 0.3371880706025563|
|   [22726]|   [22727]| 0.6625463535228677|
|   [22383]|   [20725]|0.4948259

In [31]:
# For each invoice, transform() compares its items with the antecedents of
# every association rule and collects the consequents of the rules that apply
# into the `prediction` column.
model5.transform(df).show()

+---------+--------------------+--------------------+
|InvoiceNo|               Items|          prediction|
+---------+--------------------+--------------------+
|   536938|[22112, 21931, 84...|            [85099B]|
|   537691|[22505, 46000R, 2...|                  []|
|   538184|[22561, 22147, 21...|                  []|
|   538517|[22749, 21212, 22...|                  []|
|   538879|[21212, 22759, 22...|                  []|
|   539275|[22083, 22150, 22...|                  []|
|   539630|[22111, 22971, 22...|             [22386]|
|   540499|[22697, 22796, 21...|             [22698]|
|   540540|[22111, 22834, 22...|                  []|
|   540976|[22413, 21212, 22...|                  []|
|   541432|[22113, 22457, 21...|                  []|
|   541518|[21212, 22432, 22...|[22386, 22383, 20...|
|   541783|[22561, 22697, 22...|             [22698]|
|   542026|[22398, 22194, 22...|                  []|
|   542375|[22629, 21731, 22...|                  []|
|   543641|[22645, 75131, 22

### Summary

Using the `count()` function, we obtained the number of frequent itemsets and association rules produced for each combination of minimum support and minimum confidence:

| Minimal Support | Minimal Confidence | Frequent Itemset | Association Rules |
| --- | --- | --- | --- |
| 0.01 | 0.3 | 777 | 418 |
| 0.02 | 0.3 | 182 | 38 |
| 0.03 | 0.3 | 63 | 0 |
| 0.05 | 0.5 | 8 | 0 |
| 0.1 | 0.2 | 0 | 0 |