In [83]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

! pip install mlxtend



# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [84]:
# load the data set and show the first five transaction

url = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"
data = pd.read_csv(url)

print(data.head())

        0       1     2       3       4       5       6
0   Bread    Wine  Eggs    Meat  Cheese  Pencil  Diaper
1   Bread  Cheese  Meat  Diaper    Wine    Milk  Pencil
2  Cheese    Meat  Eggs    Milk    Wine     NaN     NaN
3  Cheese    Meat  Eggs    Milk    Wine     NaN     NaN
4    Meat  Pencil  Wine     NaN     NaN     NaN     NaN


# Get the set of product that has been purchased


Get the unique product that has been purchased

In [85]:
unique_products = set(data['6'].unique())

print(unique_products)

{'Diaper', 'Bagel', 'Meat', 'Milk', 'Wine', 'Eggs', 'Pencil', nan, 'Cheese', 'Bread'}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [86]:
#create an itemset based on the products

transactions = data.groupby(['1'])['1'].apply(list)

itemset = pd.get_dummies(data['2']).groupby(data['5']).max()

print(itemset)
print()
print()


# encoding the feature

encoded_data = pd.get_dummies(data, columns=['4'])

print(encoded_data.head())

        Bagel  Bread  Cheese  Diaper  Eggs  Meat  Milk  Pencil  Wine
5                                                                   
Bagel       0      0       0       0     0     1     1       1     1
Bread       0      0       0       0     1     0     0       0     0
Cheese      0      1       0       0     0     0     1       0     1
Diaper      0      0       0       0     1     1     0       1     1
Eggs        0      0       1       1     0     1     1       0     0
Meat        1      0       0       0     1     0     0       1     1
Milk        1      1       0       1     0     1     0       1     0
Pencil      0      0       1       0     1     1     0       0     1
Wine        0      0       0       0     0     1     0       0     0


        0       1     2       3       5       6  4_Bagel  4_Bread  4_Cheese  \
0   Bread    Wine  Eggs    Meat  Pencil  Diaper        0        0         1   
1   Bread  Cheese  Meat  Diaper    Milk  Pencil        0        0         0   
2 

In [87]:
  # create new dataframe from the encoded features

  # show the new dataframe

encoded_data = pd.get_dummies(data, columns=['0','1','2','3','4','5','6'])

new_dataframe = encoded_data.drop([], axis=1)

print(new_dataframe.head())

   0_Bagel  0_Bread  0_Cheese  0_Diaper  0_Eggs  0_Meat  0_Milk  0_Pencil  \
0        0        1         0         0       0       0       0         0   
1        0        1         0         0       0       0       0         0   
2        0        0         1         0       0       0       0         0   
3        0        0         1         0       0       0       0         0   
4        0        0         0         0       0       1       0         0   

   0_Wine  1_Bagel  ...  5_Wine  6_Bagel  6_Bread  6_Cheese  6_Diaper  6_Eggs  \
0       0        0  ...       0        0        0         0         1       0   
1       0        0  ...       0        0        0         0         0       0   
2       0        0  ...       0        0        0         0         0       0   
3       0        0  ...       0        0        0         0         0       0   
4       0        0  ...       0        0        0         0         0       0   

   6_Meat  6_Milk  6_Pencil  6_Wine  
0       0   

Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

In [88]:

encoded_data = pd.get_dummies(data, columns=['0','1','2','3','4','5','6'])

encoded_data.dropna(axis=1, how='any', inplace=True)

print(encoded_data.head())


   0_Bagel  0_Bread  0_Cheese  0_Diaper  0_Eggs  0_Meat  0_Milk  0_Pencil  \
0        0        1         0         0       0       0       0         0   
1        0        1         0         0       0       0       0         0   
2        0        0         1         0       0       0       0         0   
3        0        0         1         0       0       0       0         0   
4        0        0         0         0       0       1       0         0   

   0_Wine  1_Bagel  ...  5_Wine  6_Bagel  6_Bread  6_Cheese  6_Diaper  6_Eggs  \
0       0        0  ...       0        0        0         0         1       0   
1       0        0  ...       0        0        0         0         0       0   
2       0        0  ...       0        0        0         0         0       0   
3       0        0  ...       0        0        0         0         0       0   
4       0        0  ...       0        0        0         0         0       0   

   6_Meat  6_Milk  6_Pencil  6_Wine  
0       0   

In [89]:
encoded_data = pd.get_dummies(data, columns=['0','1','2','3','4','5','6'])

selected_columns = encoded_data.iloc[:, 1:]

print(selected_columns.head())


   0_Bread  0_Cheese  0_Diaper  0_Eggs  0_Meat  0_Milk  0_Pencil  0_Wine  \
0        1         0         0       0       0       0         0       0   
1        1         0         0       0       0       0         0       0   
2        0         1         0       0       0       0         0       0   
3        0         1         0       0       0       0         0       0   
4        0         0         0       0       1       0         0       0   

   1_Bagel  1_Bread  ...  5_Wine  6_Bagel  6_Bread  6_Cheese  6_Diaper  \
0        0        0  ...       0        0        0         0         1   
1        0        0  ...       0        0        0         0         0   
2        0        0  ...       0        0        0         0         0   
3        0        0  ...       0        0        0         0         0   
4        0        0  ...       0        0        0         0         0   

   6_Eggs  6_Meat  6_Milk  6_Pencil  6_Wine  
0       0       0       0         0       0  
1     

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [90]:
encoded_data = pd.get_dummies(data, columns=['0','1','2','3','4','5','6'])

frequent_itemsets = apriori(encoded_data, min_support=0.05, use_colnames=True)

frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    print(frequent_itemsets)

     support                            itemsets
1   0.234921                           (0_Bread)
2   0.177778                          (0_Cheese)
21  0.165079                            (2_Eggs)
13  0.149206                            (1_Meat)
8   0.149206                           (1_Bagel)
30  0.142857                            (3_Milk)
23  0.142857                            (2_Milk)
34  0.114286                            (4_Wine)
10  0.111111                          (1_Cheese)
45  0.111111                    (3_Milk, 2_Eggs)
3   0.107937                          (0_Diaper)
43  0.104762                    (1_Meat, 3_Milk)
36  0.104762                   (2_Milk, 0_Bread)
50  0.101587          (0_Cheese, 3_Milk, 2_Eggs)
39  0.101587                  (0_Cheese, 3_Milk)
38  0.101587                  (0_Cheese, 2_Eggs)
5   0.101587                            (0_Meat)
48  0.098413          (0_Cheese, 1_Meat, 2_Eggs)
49  0.098413          (0_Cheese, 1_Meat, 3_Milk)
42  0.098413        



Then, we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [91]:
encoded_data = pd.get_dummies(data, columns=['0','1','2','3','4','5','6'])

frequent_itemsets = apriori(encoded_data, min_support=0.05, use_colnames=True)

association_rules_output = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

print(association_rules_output)

                   antecedents                 consequents  \
0                    (1_Bagel)                   (0_Bread)   
1                     (2_Milk)                   (0_Bread)   
2                     (1_Meat)                  (0_Cheese)   
3                     (2_Eggs)                  (0_Cheese)   
4                     (3_Milk)                  (0_Cheese)   
5                     (2_Milk)                   (1_Bagel)   
6                     (1_Meat)                    (2_Eggs)   
7                     (1_Meat)                    (3_Milk)   
8                     (3_Milk)                    (1_Meat)   
9                     (3_Milk)                    (2_Eggs)   
10                    (2_Eggs)                    (3_Milk)   
11           (2_Milk, 0_Bread)                   (1_Bagel)   
12           (2_Milk, 1_Bagel)                   (0_Bread)   
13          (0_Bread, 1_Bagel)                    (2_Milk)   
14                    (2_Milk)          (0_Bread, 1_Bagel)   
15      



Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__

In [92]:
#Antecedent Support: The support of the antecedent part of the rule.

#Consequent Support: The support of the consequent part of the rule.

#Support: The support of an itemset measures how frequently the itemset appears in the dataset. Higher support indicates a more frequently occurring itemset.

#Confidence: measures how often a rule is found to be true. Higher confidence signifies that when {A} is present, the likelihood of {B} being present as well.

#Lift: measures how much more often the antecedent (if) and consequent (then) of a rule occur together than we would expect if they were statistically independent.

#Leverage: measures the difference between the observed frequency of {A, B} appearing together and what would be expected if A and B were independent.

#Conviction: measures the ratio of the expected frequency that A occurs without B (if they were independent) to the observed frequency of incorrect predictions. High conviction means that the consequent is highly dependent on the antecedent.

In [82]:
#Markdown and Latex: aº