In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,association_rules

# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [2]:
url = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


Get the unique product that has been purchased

In [3]:
unique = df['0'].unique()
print(unique)

['Bread' 'Cheese' 'Meat' 'Eggs' 'Wine' 'Bagel' 'Pencil' 'Diaper' 'Milk']


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [4]:
#create an itemset based on the products
te = TransactionEncoder()
itemset = te.fit_transform(df.apply(lambda x: x.dropna().tolist()))
df_itemset = pd.DataFrame(itemset, columns=te.columns_)

In [5]:
# One-hot encode the data
df_encoded = pd.get_dummies(df.stack().reset_index(level=1, drop=True)).groupby(level=0).max()

Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

In [6]:
df_encoded_nan = df_encoded.dropna(axis=1, how='all')

# Display the new dataframe without NaN columns
print(df_encoded_nan.head())

   Bagel  Bread  Cheese  Diaper  Eggs  Meat  Milk  Pencil  Wine
0      0      1       1       1     1     1     0       1     1
1      0      1       1       1     0     1     1       1     1
2      0      0       1       0     1     1     1       0     1
3      0      0       1       0     1     1     1       0     1
4      0      0       0       0     0     1     0       1     1


In [7]:
df_encoded_withoutFirst = df_encoded.iloc[:, 1:]

# Display the new dataframe without the first column
print(df_encoded_withoutFirst.head())

   Bread  Cheese  Diaper  Eggs  Meat  Milk  Pencil  Wine
0      1       1       1     1     1     0       1     1
1      1       1       1     0     1     1       1     1
2      0       1       0     1     1     1       0     1
3      0       1       0     1     1     1       0     1
4      0       0       0     0     1     0       1     1


## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [8]:
frequent_itemsets = apriori(df_encoded, min_support=0.2, use_colnames=True)

frequent_itemsets



Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.279365,"(Bagel, Bread)"


In [9]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
2,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891
3,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891,0.526414
4,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
5,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754,0.330409
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624,0.387409
8,"(Eggs, Meat)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667,0.518717
9,"(Eggs, Cheese)",(Meat),0.298413,0.47619,0.215873,0.723404,1.519149,0.073772,1.893773,0.487091


Then, we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__

In [10]:
#Antecedent support itu Mengukur frekuensi himpunan barang pada bagian depan aturan.
#Consequent support itu Mengukur frekuensi himpunan barang pada bagian belakang aturan.
#Support itu Mengukur kemunculan bersama himpunan barang pada bagian depan dan belakang aturan.
#Confidence Menunjukkan seberapa besar kemungkinan himpunan barang pada bagian belakang dibeli ketika himpunan barang pada bagian depan sudah dibeli.
#Lift > 1 menunjukkan adanya asosiasi positif, lift = 1 menunjukkan independensi, dan lift < 1 menunjukkan asosiasi negatif.
#Leverage positif menunjukkan frekuensi yang lebih tinggi dari yang diharapkan, sedangkan leverage negatif menunjukkan frekuensi yang lebih rendah dari yang diharapkan.
#Kalau nilai conviction tinggi menunjukkan implikasi yang kuat, sedangkan nilai rendah menunjukkan implikasi yang lebih lemah.