# Recommendation Systems - Frequent Pattern Mining



*   Rafi Akbar Rafsanjani
*   05111942000004



In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=7017c821558fdedb12ffdbbc43cc34a3665496572dece18e689388cd8311f3a0
  Stored in directory: /root/.cache/pip/wheels/9f/34/a4/159aa12d0a510d5ff7c8f0220abbea42e5d81ecf588c4fd884
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [28]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.fpm import FPGrowth

In [31]:
# Create a Spark session
spark = SparkSession.builder \
    .master("local") \
    .appName("Frequent Pattern Mining") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spark.read.options(header = 'True', delimiter = ';').csv("/content/market-basket.csv")
df.show()

+------+--------------------+--------+----------------+-----+----------+----------------+
|BillNo|            Itemname|Quantity|            Date|Price|CustomerID|      Country,,,|
+------+--------------------+--------+----------------+-----+----------+----------------+
|536365|WHITE HANGING HEA...|       6|01.12.2010 08:26| 2,55|     17850|United Kingdom,,|
|536365| WHITE METAL LANTERN|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom,,|
|536365|CREAM CUPID HEART...|       8|01.12.2010 08:26| 2,75|     17850|United Kingdom,,|
|536365|KNITTED UNION FLA...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom,,|
|536365|RED WOOLLY HOTTIE...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom,,|
|536365|SET 7 BABUSHKA NE...|       2|01.12.2010 08:26| 7,65|     17850|United Kingdom,,|
|536365|GLASS STAR FROSTE...|       6|01.12.2010 08:26| 4,25|     17850|United Kingdom,,|
|536366|HAND WARMER UNION...|       6|01.12.2010 08:28| 1,85|     17850|United Kingdom,,|
|536366|HA

In [35]:
# Select only the first two column
# Select  BillNo and Itemname 
ds= df.select("BillNo", "Itemname")
ds.show()

+------+--------------------+
|BillNo|            Itemname|
+------+--------------------+
|536365|WHITE HANGING HEA...|
|536365| WHITE METAL LANTERN|
|536365|CREAM CUPID HEART...|
|536365|KNITTED UNION FLA...|
|536365|RED WOOLLY HOTTIE...|
|536365|SET 7 BABUSHKA NE...|
|536365|GLASS STAR FROSTE...|
|536366|HAND WARMER UNION...|
|536366|HAND WARMER RED P...|
|536367|ASSORTED COLOUR B...|
|536367|POPPY'S PLAYHOUSE...|
|536367|POPPY'S PLAYHOUSE...|
|536367|FELTCRAFT PRINCES...|
|536367|IVORY KNITTED MUG...|
|536367|BOX OF 6 ASSORTED...|
|536367|BOX OF VINTAGE JI...|
|536367|BOX OF VINTAGE AL...|
|536367|HOME BUILDING BLO...|
|536367|LOVE BUILDING BLO...|
|536367|RECIPE BOX WITH M...|
+------+--------------------+
only showing top 20 rows



In [44]:
# Use GroupBy, agg, and collect list methods
dg = ds.groupBy("BillNo").agg(collect_list('Itemname').alias('Itemname'))
dg.show()

+--------------------+--------+
|              BillNo|Itemname|
+--------------------+--------+
|"536477;""RECORD ...|    [48]|
|"536544;""LETTER ...|     [1]|
|"536544;""LETTER ...|     [1]|
|"536544;""LETTER ...|     [1]|
|"536544;""LETTER ...|     [1]|
|"536544;""LETTER ...|     [1]|
|"536557;""RECORD ...|     [4]|
|"536576;""RECORD ...|   [144]|
|"536591;""LETTER ...|     [2]|
|"536591;""LETTER ...|     [1]|
|"536591;""LETTER ...|     [1]|
|"536591;""LETTER ...|     [1]|
|"536592;""FLOWER ...|     [1]|
|"536592;""LETTER ...|     [2]|
|"536592;""LETTER ...|     [1]|
|"536592;""LETTER ...|     [1]|
|"536592;""LETTER ...|     [1]|
|"536595;""RECORD ...|    [24]|
|"536876;""LETTER ...|     [1]|
|"536876;""LETTER ...|     [3]|
+--------------------+--------+
only showing top 20 rows



In [37]:
# Experiment with minSupport and minConfidence values
min_support= [0.5, 0.6, 0.7]
min_confidence = [0.5, 0.7]

In [45]:
#
dg_new = dg.withColumn("Itemname", array(dg["Itemname"]))
dg_new.show()

+--------------------+--------+
|              BillNo|Itemname|
+--------------------+--------+
|"536477;""RECORD ...|  [[48]]|
|"536544;""LETTER ...|   [[1]]|
|"536544;""LETTER ...|   [[1]]|
|"536544;""LETTER ...|   [[1]]|
|"536544;""LETTER ...|   [[1]]|
|"536544;""LETTER ...|   [[1]]|
|"536557;""RECORD ...|   [[4]]|
|"536576;""RECORD ...| [[144]]|
|"536591;""LETTER ...|   [[2]]|
|"536591;""LETTER ...|   [[1]]|
|"536591;""LETTER ...|   [[1]]|
|"536591;""LETTER ...|   [[1]]|
|"536592;""FLOWER ...|   [[1]]|
|"536592;""LETTER ...|   [[2]]|
|"536592;""LETTER ...|   [[1]]|
|"536592;""LETTER ...|   [[1]]|
|"536592;""LETTER ...|   [[1]]|
|"536595;""RECORD ...|  [[24]]|
|"536876;""LETTER ...|   [[1]]|
|"536876;""LETTER ...|   [[3]]|
+--------------------+--------+
only showing top 20 rows



In [46]:
# Loop for different minSupport and minConfidence value
for a in min_support:
  for b in min_confidence:
    print(f"minSupport: {a}, minConfidence: {b}")
    fpGrowth = FPGrowth(itemsCol="Itemname", minSupport=a, minConfidence=b)
    model = fpGrowth.fit(dg_new)

    # Display frequent itemsets.
    model.freqItemsets.show()

    # Display generated association rules.
    model.associationRules.show()

    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    model.transform(dg_new).show()

minSupport: 0.5, minConfidence: 0.5
+-----+----+
|items|freq|
+-----+----+
+-----+----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

+--------------------+--------+----------+
|              BillNo|Itemname|prediction|
+--------------------+--------+----------+
|"536477;""RECORD ...|  [[48]]|        []|
|"536544;""LETTER ...|   [[1]]|        []|
|"536544;""LETTER ...|   [[1]]|        []|
|"536544;""LETTER ...|   [[1]]|        []|
|"536544;""LETTER ...|   [[1]]|        []|
|"536544;""LETTER ...|   [[1]]|        []|
|"536557;""RECORD ...|   [[4]]|        []|
|"536576;""RECORD ...| [[144]]|        []|
|"536591;""LETTER ...|   [[2]]|        []|
|"536591;""LETTER ...|   [[1]]|        []|
|"536591;""LETTER ...|   [[1]]|        []|
|"536591;""LETTER ...|   [[1]]|        []|
|"536592;""FLOWER ...|   [[1]]|        []|
|"536592;""LETTER ...|   [[2]]

In [48]:
# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = model.select("*").toPandas()
result_pdf.head()

AttributeError: ignored

# Conclusion

The minimal frequency that an itemset must have in order to be deemed frequent is determined by the minimal Support parameter. Itemsets that don't meet the minimum support criterion are trimmed because they aren't likely to contain any useful or instructive information. More frequent itemsets are generated when the minimum support value is decreased as opposed to when it is increased.
The minimum amount of confidence that an association rule must possess in order to be deemed interesting or instructive is determined by the Minimum Confidence parameter. Pruning is done to association rules that are deemed uninteresting or have confidence levels below the minimum confidence threshold. More association rules are produced when the minimum confidence value is increased, while more association rules are produced when it is decreased.
