# Init spark

In [None]:
import findspark
findspark.init()

from pyspark import SparkContext

# Init sparkcontext
sc = SparkContext(master="local", appName="association")

from pyspark.sql import SparkSession
spark = SparkSession(sc)
spark

25/01/27 10:44:18 WARN Utils: Your hostname, Sophies-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.6 instead (on interface en0)
25/01/27 10:44:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/27 10:44:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Import library

In [2]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd
import pyspark.sql.types as t

from pyspark.ml.fpm import FPGrowth
# Convert list array to string
from pyspark.sql.types import StringType


# Import DF

In [3]:
df = spark.read.csv('/Users/phungtran/Library/Mobile Documents/com~apple~CloudDocs/Documents/Learning/Data_science/DL06_v2_BigData/Bai_Thi_Cuoi_ky/Data/75000/75000i.csv', inferSchema=True)
df.show(5)

+---+---+---+
|_c0|_c1|_c2|
+---+---+---+
|  1|  1| 21|
|  1|  5| 11|
|  2|  1|  7|
|  2|  3| 11|
|  2|  4| 37|
+---+---+---+
only showing top 5 rows



In [4]:
# Rename columns for df
df = df.toDF("receipt_id", "quantity", "item")
df.show(5)

+----------+--------+----+
|receipt_id|quantity|item|
+----------+--------+----+
|         1|       1|  21|
|         1|       5|  11|
|         2|       1|   7|
|         2|       3|  11|
|         2|       4|  37|
+----------+--------+----+
only showing top 5 rows



In [5]:
# check info
df.printSchema()

root
 |-- receipt_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- item: integer (nullable = true)



In [6]:
# count distinct values of receipt_id
df.select('receipt_id').distinct().count()

75000

In [8]:
# create array for each receipt ID
baskets = df.groupBy('receipt_id').agg(f.collect_set('item').alias('items'))

In [9]:
baskets.show(5, truncate=False)

+----------+---------------+
|receipt_id|items          |
+----------+---------------+
|1         |[21, 11]       |
|2         |[45, 37, 7, 11]|
|3         |[33, 42, 3]    |
|4         |[12, 5, 17, 47]|
|5         |[42, 6, 18]    |
+----------+---------------+
only showing top 5 rows



# Build association model using ID

In [None]:
# Initialize the FP-Growth model with minimum support and confidence thresholds.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.005)

# Train (fit) the FP-Growth model on the basket dataset.
model = fpGrowth.fit(baskets)

In [None]:
# Extract association rules and sort them by confidence (descending) and lift (descending).
rules = model.associationRules.sort(f.col("confidence").desc(), f.col("lift").desc())

# Display the sorted association rules.
rules.show()

[Stage 23:>                                                         (0 + 1) / 1]

+----------------+----------+------------------+------------------+--------------------+
|      antecedent|consequent|        confidence|              lift|             support|
+----------------+----------+------------------+------------------+--------------------+
|[43, 41, 24, 40]|      [23]|               1.0| 14.78415138971023|0.020733333333333333|
|[43, 23, 24, 40]|      [41]|               1.0|14.760873843731549|0.020733333333333333|
|[43, 23, 41, 24]|      [40]|               1.0|14.654161781946074|0.020733333333333333|
|[43, 23, 41, 40]|      [24]|0.9993573264781491| 14.69355018346622|0.020733333333333333|
|    [43, 23, 24]|      [41]|0.9980744544287549|14.732451108474045|0.020733333333333333|
|    [43, 23, 24]|      [40]|0.9980744544287549|14.625944525626538|0.020733333333333333|
|    [43, 41, 40]|      [23]|0.9974358974358974|14.746243309223793|0.020746666666666667|
|    [41, 24, 40]|      [23]|0.9973985431841832|14.745691058311401|             0.02556|
|    [43, 41, 40]|   

                                                                                

In [12]:
# Display frequent itemsets.
model.freqItemsets.show()

+------------+----+
|       items|freq|
+------------+----+
|        [19]|5685|
|    [19, 27]| 359|
|[19, 27, 28]| 113|
|    [19, 33]| 334|
| [19, 33, 1]|  89|
|[19, 33, 42]| 110|
|     [19, 1]|2764|
| [19, 1, 27]|  99|
| [19, 1, 28]|  96|
| [19, 1, 35]|  91|
|  [19, 1, 4]|  98|
| [19, 1, 22]|  91|
| [19, 1, 45]| 108|
| [19, 1, 32]|  85|
| [19, 1, 14]|  98|
| [19, 1, 18]| 115|
| [19, 1, 42]| 100|
|  [19, 1, 7]| 110|
|    [19, 28]| 408|
|    [19, 37]| 274|
+------------+----+
only showing top 20 rows



In [13]:
# transform examines the input items against all the association rules and summarize the consequents as prediction
mostPopularItemInABasket = model.transform(baskets)

In [None]:
# Display the first 5 rows of the 'mostPopularItemInABasket' DataFrame.
# 'truncate=False' ensures full column values are shown without shortening.
# 'vertical=True' prints each row in a vertical format for easier reading.
mostPopularItemInABasket.show(5, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 receipt_id | 1                                                                                                                                                                                      
 items      | [21, 11]                                                                                                                                                                               
 prediction | [19, 41, 33, 27, 31, 17, 1, 39, 24, 28, 37, 35, 16, 34, 46, 4, 15, 29, 22, 5, 23, 2, 48, 32, 45, 12, 47, 43, 3, 40, 14, 0, 18, 20, 49, 9, 36, 44, 42, 7, 38, 13, 6, 25, 8, 26, 10, 30] 
-RECORD 1--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 receipt_i

# Build association model using product name

In [None]:
# Read the product data from the CSV file, enabling header parsing and automatic schema inference.
product_data = spark.read.csv('Data/75000/goods.csv', header=True, inferSchema=True)
product_data.show(5)

+---+------------+------+-----+------+
| Id|      Flavor|  Food|Price|  Type|
+---+------------+------+-----+------+
|  0| 'Chocolate'|'Cake'| 8.95|'Food'|
|  1|     'Lemon'|'Cake'| 8.95|'Food'|
|  2|    'Casino'|'Cake'|15.95|'Food'|
|  3|     'Opera'|'Cake'|15.95|'Food'|
|  4|'Strawberry'|'Cake'|11.95|'Food'|
+---+------------+------+-----+------+
only showing top 5 rows



In [None]:
# Display the full schema to understand column names and data types.
product_data.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Flavor: string (nullable = true)
 |-- Food: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Type: string (nullable = true)



In [None]:
# Create a new column 'product_name' by concatenating 'Food' and 'Flavor' with a hyphen.
product_data = product_data.withColumn('product_name', f.concat(f.col('Food'), f.lit('-'), f.col('Flavor')))
product_data.show(5)

+---+------------+------+-----+------+-------------------+
| Id|      Flavor|  Food|Price|  Type|       product_name|
+---+------------+------+-----+------+-------------------+
|  0| 'Chocolate'|'Cake'| 8.95|'Food'| 'Cake'-'Chocolate'|
|  1|     'Lemon'|'Cake'| 8.95|'Food'|     'Cake'-'Lemon'|
|  2|    'Casino'|'Cake'|15.95|'Food'|    'Cake'-'Casino'|
|  3|     'Opera'|'Cake'|15.95|'Food'|     'Cake'-'Opera'|
|  4|'Strawberry'|'Cake'|11.95|'Food'|'Cake'-'Strawberry'|
+---+------------+------+-----+------+-------------------+
only showing top 5 rows



In [None]:
# Join the main dataframe 'df' with the product information dataframe 'product_data'.
# The join condition matches df.item with product_data.Id, using a left join to keep all rows from df.
merge_data = df.join(product_data, on = [df.item == product_data.Id], how='left')
merge_data.show(5)

+----------+--------+----+---+---------+--------+-----+------+------------------+
|receipt_id|quantity|item| Id|   Flavor|    Food|Price|  Type|      product_name|
+----------+--------+----+---+---------+--------+-----+------+------------------+
|         1|       1|  21| 21|'Ganache'|'Cookie'| 1.15|'Food'|'Cookie'-'Ganache'|
|         1|       5|  11| 11|  'Apple'|   'Pie'| 5.25|'Food'|     'Pie'-'Apple'|
|         2|       1|   7|  7| 'Coffee'|'Eclair'|  3.5|'Food'| 'Eclair'-'Coffee'|
|         2|       3|  11| 11|  'Apple'|   'Pie'| 5.25|'Food'|     'Pie'-'Apple'|
|         2|       4|  37| 37| 'Almond'| 'Twist'| 1.15|'Food'|  'Twist'-'Almond'|
+----------+--------+----+---+---------+--------+-----+------+------------------+
only showing top 5 rows



In [None]:
# Group data by receipt_id to create transaction baskets.
# 'collect_set' gathers unique product names into an array representing items purchased together.
baskets_1 = merge_data.groupBy('receipt_id').agg(f.collect_set('product_name').alias('items'))

In [22]:
baskets_1.show(5, False)

+----------+-------------------------------------------------------------------------------+
|receipt_id|items                                                                          |
+----------+-------------------------------------------------------------------------------+
|1         |['Cookie'-'Ganache', 'Pie'-'Apple']                                            |
|2         |['Coffee'-'Hot', 'Eclair'-'Coffee', 'Pie'-'Apple', 'Twist'-'Almond']           |
|3         |['Juice'-'Orange', 'Cake'-'Opera', 'Croissant'-'Cheese']                       |
|4         |['Frappuccino'-'Vanilla', 'Tart'-'Chocolate', 'Tart'-'Apple', 'Cake'-'Truffle']|
|5         |['Juice'-'Orange', 'Eclair'-'Chocolate', 'Tart'-'Cherry']                      |
+----------+-------------------------------------------------------------------------------+
only showing top 5 rows



In [None]:
# Initialize a new FP-Growth model using the basket dataset that contains product names.
# Set minimum support and confidence thresholds for generating association rules.
fpGrowth_1 = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.003)

# Fit (train) the model on baskets_1 to learn frequent itemsets and association rules.
model_1 = fpGrowth_1.fit(baskets_1)

In [None]:
# Retrieve the association rules and sort them by confidence (descending) and lift (descending)
# to highlight the strongest and most meaningful rules first.
rules_1 = model_1.associationRules.sort(f.col("confidence").desc(), f.col("lift").desc())
rules_1.show(10, False)

[Stage 60:>                                                         (0 + 1) / 1]

+---------------------------------------------------------------------------------+------------------------+------------------+------------------+--------------------+
|antecedent                                                                       |consequent              |confidence        |lift              |support             |
+---------------------------------------------------------------------------------+------------------------+------------------+------------------+--------------------+
|['Tea'-'Green', 'Lemonade'-'Raspberry', 'Cookie'-'Lemon', 'Lemonade'-'Lemon']    |['Cookie'-'Raspberry']  |1.0               |14.78415138971023 |0.020733333333333333|
|['Tea'-'Green', 'Cookie'-'Raspberry', 'Cookie'-'Lemon', 'Lemonade'-'Lemon']      |['Lemonade'-'Raspberry']|1.0               |14.760873843731549|0.020733333333333333|
|['Tea'-'Green', 'Cookie'-'Raspberry', 'Lemonade'-'Raspberry', 'Cookie'-'Lemon']  |['Lemonade'-'Lemon']    |1.0               |14.654161781946074|0.020733333333

                                                                                

In [None]:
# Use the trained FP-Growth model to generate predicted consequents (recommendations)
mostPopularItemInABasket_1 = model_1.transform(baskets_1)

In [None]:
# Display the first 5 rows of the transformed DataFrame.
mostPopularItemInABasket_1.show(5, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 receipt_id | 1                            

# Recommend a product

In [None]:
from pyspark.sql.functions import array_contains, explode

In [None]:
def get_associated_products(product_id, model, data, top_n=3):
    """
    Return the top_n products most frequently purchased together with the given product_id.

    Parameters:
    - product_id: ID of the target product.
    - model: Trained FPGrowth model.
    - data: DataFrame containing product information (including product_name).
    - top_n: Number of associated products to return.

    Returns:
    - list: A list of rows containing associated product IDs and names.
    """
    
    # Extract association rules generated by the FP-Growth model.
    association_rules = model.associationRules

    # Filter rules where the antecedent contains the target product_id.
    filtered_rules = association_rules.filter(array_contains(association_rules["antecedent"], product_id))

    # Sort by confidence in descending order, take the top_n rules,
    # and extract product IDs from the consequent array.
    top_associated_products = (filtered_rules
                               .orderBy("confidence", ascending=False)
                               .limit(top_n)
                               .select(explode("consequent").alias("associated_product_id")))

    # Join with product metadata to retrieve product_name and other details.
    associated_product_details = (top_associated_products
                                  .join(data, top_associated_products["associated_product_id"] == data["Id"])
                                  .select("Id", "product_name"))

    # Collect and return results as a Python list.
    return associated_product_details.collect()

# Note: product_id must be an integer when calling this function.

Predict recommendation for product no.20

In [None]:
# Get the top 5 associated products for product ID = 20 using the trained FP-Growth model.
associated_products = get_associated_products(20, model, product_data, top_n = 5)

In [None]:
# Print the reference product's name.
print('Recommend for product no.20', product_data[product_data['Id']==40].select('product_name').collect()[0][0])
print(' ')

# Print the list of recommended associated products.
for product in associated_products:
    print(f"Product ID: {product['Id']}, Product Name: {product['product_name']}")

Recommend for product no.20 'Lemonade'-'Lemon'
 
Product ID: 4, Product Name: 'Cake'-'Strawberry'
Product ID: 14, Product Name: 'Tart'-'Berry'
Product ID: 27, Product Name: 'Cookie'-'Marzipan'
Product ID: 28, Product Name: 'Cookie'-'Tuile'
Product ID: 42, Product Name: 'Juice'-'Orange'
