In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.4 MB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstalling mlxtend-0.23.3:
      Successfully uninstalled mlxtend-0.23.3
Successfully installed mlxtend-0.23.1


# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [3]:
# load the data set ans show the first five transaction
url = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"

# Load the dataset
data = pd.read_csv(url)

# Display the first few rows of the dataset
data.head()



Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


In [4]:
purchased_products = set(data.values.flatten())  # Flatten and get unique products

print("Purchased Products:", purchased_products)


Purchased Products: {'Bread', 'Diaper', 'Eggs', 'Wine', 'Pencil', 'Milk', 'Cheese', 'Bagel', 'Meat', nan}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [6]:
# Initialize the dictionary with all unique products set to 0
purchased_product_status = dict.fromkeys(purchased_products, 0)

# Encode the first row by setting 1 for purchased products
for product in data.iloc[0].dropna():
    purchased_product_status[product] = 1

purchased_product_status


{'Bread': 1,
 'Diaper': 1,
 'Eggs': 1,
 'Wine': 1,
 'Pencil': 1,
 'Milk': 0,
 'Cheese': 1,
 'Bagel': 0,
 'Meat': 1,
 nan: 0}

In [12]:
# Initialize a DataFrame with zeros and columns as unique products
encoded_df = pd.DataFrame(0, index=range(len(data)), columns=purchased_product_status)
for i, row in data.iterrows():
    for item in row.dropna():
        encoded_df.loc[i, item] = 1
encoded_df.head()


Unnamed: 0,Bread,Diaper,Eggs,Wine,Pencil,Milk,Cheese,Bagel,Meat,NaN
0,1,1,1,1,1,0,1,0,1,0
1,1,1,0,1,1,1,1,0,1,0
2,0,0,1,1,0,1,1,0,1,0
3,0,0,1,1,0,1,1,0,1,0
4,0,0,0,1,1,0,0,0,1,0


In [13]:
# Check if the last column is NaN and drop it
if encoded_df.columns[-1] is np.nan or encoded_df.columns[-1] == 'nan':
    encoded_df = encoded_df.iloc[:, :-1]
encoded_df.head()


Unnamed: 0,Bread,Diaper,Eggs,Wine,Pencil,Milk,Cheese,Bagel,Meat
0,1,1,1,1,1,0,1,0,1
1,1,1,0,1,1,1,1,0,1
2,0,0,1,1,0,1,1,0,1
3,0,0,1,1,0,1,1,0,1
4,0,0,0,1,1,0,0,0,1


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products.
For this case study, we will min_support=0.2

In [15]:
#Set threshold value untuk digunakan dalam penghitungan support
from mlxtend.frequent_patterns import apriori, association_rules
mostly = apriori(encoded_df, min_support=0.2, use_colnames=True)
mostly

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.504762,(Bread)
1,0.406349,(Diaper)
2,0.438095,(Eggs)
3,0.438095,(Wine)
4,0.361905,(Pencil)
5,0.501587,(Milk)
6,0.501587,(Cheese)
7,0.425397,(Bagel)
8,0.47619,(Meat)
9,0.231746,"(Bread, Diaper)"


The we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [17]:
association_rules_data = association_rules(mostly, metric="confidence", min_threshold=0.6)

# Show the association rules
association_rules_data

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
2,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624,0.387409
3,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754,0.330409
4,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
5,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
6,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891
7,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891,0.526414
8,"(Meat, Eggs)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667,0.518717
9,"(Meat, Cheese)",(Eggs),0.32381,0.438095,0.215873,0.666667,1.521739,0.074014,1.685714,0.507042


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__, __conviction__ and the interpretation from the case above (please use text section)

Metrics for Evaluating Association Rules

Antecedent Support

This metric shows how often the antecedent (the items appearing on the left side of the rule) occurs in the dataset, providing insight into its frequency.

Consequent Support

This metric measures the frequency of the consequent (the items on the right side of the rule) in the dataset, indicating how often these items are purchased.

Support

Support measures the co-occurrence of both the antecedent and consequent in transactions, reflecting how often both items appear together in the data.

Confidence

Confidence evaluates the reliability of a rule by indicating how often the consequent follows the antecedent. Higher confidence values imply stronger rules.

Lift

Lift assesses the correlation between the antecedent and consequent. A lift greater than 1 indicates a positive association, meaning the items appear together more often than expected by chance. A lift below 1 suggests a negative relationship.

Leverage

Leverage helps gauge the extent of the relationship between items. Values close to zero suggest independence, while positive values indicate a stronger association. Negative values reflect an inverse relationship.

Conviction

Conviction measures how dependent the consequent is on the antecedent. A value of 1 indicates independence, while higher values suggest a stronger relationship.

Zhang’s Metric

Zhang’s metric ranges between 0 and 1, with higher values denoting stronger associations between the items.

Interpreting the Results

Confidence: High-confidence rules, such as (Eggs → Meat) and (Milk → Cheese), demonstrate significant associations. For example, a confidence score of 60.87% for (Eggs → Meat) suggests that when customers purchase eggs, there is a 60.87% likelihood they will also buy meat.

Lift: A lift score above 1, like 1.28 for (Eggs → Meat), means that the purchase of eggs increases the chances of buying meat beyond what would be expected randomly.

Leverage: The leverage score, such as 0.058 for (Eggs → Meat), highlights that this combination occurs more frequently than chance would predict, suggesting a positive correlation.

Conviction: A conviction value of 1.34 for (Eggs → Meat) points to a notable dependency between these items, implying a stronger association than random chance.

Zhang’s Metric: With a score of 0.39, Zhang’s metric confirms the association between (Eggs → Meat), supporting the conclusions drawn from other metrics like lift and confidence.

These metrics offer valuable insights into product relationships, enabling businesses to identify key item pairings. Understanding these associations can drive decisions around cross-selling opportunities, marketing strategies, and promotional campaigns, ultimately enhancing sales and customer satisfaction