In [1]:
!pip install mlxtend
!pip install xlrd

Collecting mlxtend
  Downloading mlxtend-0.21.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.21.0


In [1]:
# Importing Libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


In [2]:
# Preparing the dataset
# Real-world market-basket data can easily run to hundreds of thousands of records.
# To keep the mechanics easy to follow, a tiny dataset is built by hand instead.
# Items used: Onion, Sausages, Cheese, Water, Butter, Sugar, Eggs
# Each inner list is one transaction (basket); baskets may differ in length,
# so pandas pads the shorter rows with missing values.
df = pd.DataFrame(
    [
        ['Onion', 'Sausages', 'Cheese', 'Butter'],
        ['Onion', 'Sausages', 'Water', 'Sugar'],
        ['Onion', 'Water', 'Sausages'],
        ['Butter', 'Sugar', 'Eggs'],
        ['Butter', 'Sugar', 'Eggs', 'Cheese'],
        ['Water', 'Cheese', 'Eggs'],
        ['Water', 'Butter'],
        ['Onion', 'Butter', 'Sugar'],
        ['Onion', 'Butter', 'Cheese'],
        ['Onion', 'Butter', 'Water'],
    ]
)
df


Unnamed: 0,0,1,2,3
0,Onion,Sausages,Cheese,Butter
1,Onion,Sausages,Water,Sugar
2,Onion,Water,Sausages,
3,Butter,Sugar,Eggs,
4,Butter,Sugar,Eggs,Cheese
5,Water,Cheese,Eggs,
6,Water,Butter,,
7,Onion,Butter,Sugar,
8,Onion,Butter,Cheese,
9,Onion,Butter,Water,


In [4]:
# Converting the DataFrame into a plain list of transactions.
# In a real project the data would normally already be in list form before it
# is ever wrapped in a DataFrame. The round trip is shown here on purpose:
# data usually arrives as a pandas DataFrame, so knowing how to get back to a
# list of baskets is a useful building block of Market Basket Analysis.

# Note: the padding added for the shorter baskets survives the conversion
# as missing values inside each row.
df = df.to_numpy().tolist()
df
[['Onion', 'Sausages', 'Cheese', 'Butter'],
 ['Onion', 'Sausages', 'Water', 'Sugar'],
 ['Onion', 'Water', 'Sausages', None],
 ['Butter', 'Sugar', 'Eggs', None],
 ['Butter', 'Sugar', 'Eggs', 'Cheese'],
 ['Water', 'Cheese', 'Eggs', None],
 ['Water', 'Butter', None, None],
 ['Onion', 'Butter', 'Sugar', None],
 ['Onion', 'Butter', 'Cheese', None],
 ['Onion', 'Butter', 'Water', None]]


[['Onion', 'Sausages', 'Cheese', 'Butter'],
 ['Onion', 'Sausages', 'Water', 'Sugar'],
 ['Onion', 'Water', 'Sausages', None],
 ['Butter', 'Sugar', 'Eggs', None],
 ['Butter', 'Sugar', 'Eggs', 'Cheese'],
 ['Water', 'Cheese', 'Eggs', None],
 ['Water', 'Butter', None, None],
 ['Onion', 'Butter', 'Sugar', None],
 ['Onion', 'Butter', 'Cheese', None],
 ['Onion', 'Butter', 'Water', None]]

In [5]:
# Removing the missing-value padding from every transaction (2-D list).
# pd.notna() is used instead of an `is not None` test so that both None and
# NaN fillers are dropped; a surviving NaN would otherwise be treated as an
# item by the encoder. Descriptive loop names are used because `_` is
# IPython's "last output" variable and should not be overwritten.
df = [
    [item for item in transaction if pd.notna(item)]
    for transaction in df
]
df
[['Onion', 'Sausages', 'Cheese', 'Butter'],
 ['Onion', 'Sausages', 'Water', 'Sugar'],
 ['Onion', 'Water', 'Sausages'],
 ['Butter', 'Sugar', 'Eggs'],
 ['Butter', 'Sugar', 'Eggs', 'Cheese'],
 ['Water', 'Cheese', 'Eggs'],
 ['Water', 'Butter'],
 ['Onion', 'Butter', 'Sugar'],
 ['Onion', 'Butter', 'Cheese'],
 ['Onion', 'Butter', 'Water']]


[['Onion', 'Sausages', 'Cheese', 'Butter'],
 ['Onion', 'Sausages', 'Water', 'Sugar'],
 ['Onion', 'Water', 'Sausages'],
 ['Butter', 'Sugar', 'Eggs'],
 ['Butter', 'Sugar', 'Eggs', 'Cheese'],
 ['Water', 'Cheese', 'Eggs'],
 ['Water', 'Butter'],
 ['Onion', 'Butter', 'Sugar'],
 ['Onion', 'Butter', 'Cheese'],
 ['Onion', 'Butter', 'Water']]

In [6]:
# scikit-learn does not ship an apriori implementation, so mlxtend is used here.
# TransactionEncoder converts the two-dimensional list of transactions into a
# one-hot encoded DataFrame: one boolean column per distinct item.
# This boolean table is what gets passed to apriori in the next step.

# One-hot encoding (boolean output)
te = TransactionEncoder()
te.fit(df)                   # learn the set of distinct items
te_ary = te.transform(df)    # boolean array, one row per transaction
df = pd.DataFrame(te_ary, columns=te.columns_)
df


Unnamed: 0,Butter,Cheese,Eggs,Onion,Sausages,Sugar,Water
0,True,True,False,True,True,False,False
1,False,False,False,True,True,True,True
2,False,False,False,True,True,False,True
3,True,False,True,False,False,True,False
4,True,True,True,False,False,True,False
5,False,True,True,False,False,False,True
6,True,False,False,False,False,False,True
7,True,False,False,True,False,True,False
8,True,True,False,True,False,False,False
9,True,False,False,True,False,False,True


## 5. Extract Frequent Itemsets

In [7]:
# Keep only the itemsets whose support is at least 0.4 (4 of the 10 baskets).
# With use_colnames=True the itemsets are reported by item name.
frequent_itemsets = apriori(
    df,
    use_colnames=True,
    min_support=0.4,
)

frequent_itemsets


Unnamed: 0,support,itemsets
0,0.7,(Butter)
1,0.4,(Cheese)
2,0.6,(Onion)
3,0.4,(Sugar)
4,0.5,(Water)
5,0.4,"(Onion, Butter)"


## 6. Extract Association Rules

In [8]:
# From the frequent itemsets, derive the association rules whose confidence
# is at least 0.4:
association_rules(
    frequent_itemsets,
    min_threshold=0.4,
    metric="confidence",
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Onion),(Butter),0.6,0.7,0.4,0.666667,0.952381,-0.02,0.9
1,(Butter),(Onion),0.7,0.6,0.4,0.571429,0.952381,-0.02,0.933333


- Butter and Onion appear together in 40% of the transactions in our dataset (support = 0.4).
- Given that Onion is already in the basket, Butter appears in ~67% of those transactions (confidence ≈ 0.67).
- Given that Butter is already in the basket, Onion appears in ~57% of those transactions (confidence ≈ 0.57).

## 7. Extract Rules

In [9]:
# Generate the rules again, this time keeping those with a lift of at least 0.7:
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.7,
    metric="lift",
)

In [11]:
# Display the lift-filtered rules. The antecedent_len column in the saved
# output below is added by the cell under section 8, which was executed
# earlier (In [10]) than this cell (In [11]).
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(Onion),(Butter),0.6,0.7,0.4,0.666667,0.952381,-0.02,0.9,1
1,(Butter),(Onion),0.7,0.6,0.4,0.571429,0.952381,-0.02,0.933333,1


##  8. Define Threshold and extract the final associations

In [10]:
# Record, for every rule, how many items its antecedent contains.
rules["antecedent_len"] = rules["antecedents"].apply(len)

## 9.	Make a selection based on specifics

In [12]:
# Filtering the rules on several thresholds at once: keep rules with a lift
# above 0.9, a confidence above 0.6 and at least one item in the antecedent.
rules.loc[
    (rules['lift'] > 0.9)
    & (rules['confidence'] > 0.6)
    & (rules['antecedent_len'] >= 1)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(Onion),(Butter),0.6,0.7,0.4,0.666667,0.952381,-0.02,0.9,1


## Make a selection based on ingredients

In [13]:
# Keep only the rules whose antecedent is exactly the item set {'Onion'};
# swap in other item names to select different rules.
rules.loc[rules['antecedents'] == {'Onion'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(Onion),(Butter),0.6,0.7,0.4,0.666667,0.952381,-0.02,0.9,1
