# Market Basket Analysis with Python

## 1. Load the Dataset

In [2]:
# Import Pandas and Numpy
import pandas as pd
import numpy as np

# Load the Dataset
df = pd.read_csv("Market_Basket_Optimisation.csv", header=None)

In [3]:
# Show the shape of the data: the number of rows and columns
df.shape

(7501, 20)

In [4]:
# Show the header
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


## 2. Data Preparation

In [5]:
# Create a list of transaction
df['Transactions']= df.values.tolist()

In [6]:
df['Transactions']

0       [shrimp, almonds, avocado, vegetables mix, gre...
1       [burgers, meatballs, eggs, nan, nan, nan, nan,...
2       [chutney, nan, nan, nan, nan, nan, nan, nan, n...
3       [turkey, avocado, nan, nan, nan, nan, nan, nan...
4       [mineral water, milk, energy bar, whole wheat ...
                              ...                        
7496    [butter, light mayo, fresh bread, nan, nan, na...
7497    [burgers, frozen vegetables, eggs, french frie...
7498    [chicken, nan, nan, nan, nan, nan, nan, nan, n...
7499    [escalope, green tea, nan, nan, nan, nan, nan,...
7500    [eggs, frozen smoothie, yogurt cake, low fat y...
Name: Transactions, Length: 7501, dtype: object

In [7]:
# Delete NaN from the transaction list
df['Transactions'] = df['Transactions'].apply(lambda x: [i for i in x if str(i) != "nan"])

In [8]:
df['Transactions']

0       [shrimp, almonds, avocado, vegetables mix, gre...
1                              [burgers, meatballs, eggs]
2                                               [chutney]
3                                       [turkey, avocado]
4       [mineral water, milk, energy bar, whole wheat ...
                              ...                        
7496                    [butter, light mayo, fresh bread]
7497    [burgers, frozen vegetables, eggs, french frie...
7498                                            [chicken]
7499                                [escalope, green tea]
7500    [eggs, frozen smoothie, yogurt cake, low fat y...
Name: Transactions, Length: 7501, dtype: object

In [9]:
# Convert the transaction list from a DataFrame column into a list of strings
transactions = list(df['Transactions'])

In [10]:
# Count a transaction which contains burgers, meatballs, and eggs
transactions.count(['burgers', 'meatballs', 'eggs'])

1

In [11]:
# Count the number of rules

# Import library to count the number of permutations
from itertools import permutations

# Extract unique items.
unique_items = [item for transaction in transactions for item in transaction]

# Convert the unique item list from a string to a list
unique_item_list = list(set(unique_items))

# Compute and print rules.
rules = list(permutations(unique_item_list, 2))
print(rules)

[('bacon', 'strawberries'), ('bacon', 'herb & pepper'), ('bacon', 'fresh bread'), ('bacon', 'flax seed'), ('bacon', 'oatmeal'), ('bacon', 'whole weat flour'), ('bacon', 'burgers'), ('bacon', 'cake'), ('bacon', 'water spray'), ('bacon', 'french wine'), ('bacon', 'milk'), ('bacon', 'gums'), ('bacon', 'cookies'), ('bacon', 'cauliflower'), ('bacon', 'blueberries'), ('bacon', 'napkins'), ('bacon', 'eggplant'), ('bacon', 'candy bars'), ('bacon', 'mint green tea'), ('bacon', 'bug spray'), ('bacon', 'spaghetti'), ('bacon', 'yogurt cake'), ('bacon', 'turkey'), ('bacon', 'escalope'), ('bacon', 'tomato sauce'), ('bacon', 'mashed potato'), ('bacon', 'salad'), ('bacon', 'corn'), ('bacon', 'green grapes'), ('bacon', 'yams'), ('bacon', 'gluten free bar'), ('bacon', 'fresh tuna'), ('bacon', 'ground beef'), ('bacon', 'chicken'), ('bacon', 'green beans'), ('bacon', 'shampoo'), ('bacon', 'tea'), ('bacon', 'cereals'), ('bacon', 'ketchup'), ('bacon', 'rice'), ('bacon', 'honey'), ('bacon', 'barbecue sauce')

In [12]:
# Print the number of rules with length 2
print(len(rules))

14280


## 3. Basic Metrics

### 3.1. Support

In [13]:
# Import the library for encoding
from mlxtend.preprocessing import TransactionEncoder

# Instantiate transaction encoder
encoder = TransactionEncoder().fit(transactions)

In [14]:
# One-hot encode itemsets by applying fit and transform
onehot = encoder.transform(transactions)

# Convert one-hot encoded data to DataFrame
onehot = pd.DataFrame(onehot, columns = encoder.columns_)
print(onehot)

       asparagus  almonds  antioxydant juice  asparagus  avocado  babies food  \
0          False     True               True      False     True        False   
1          False    False              False      False    False        False   
2          False    False              False      False    False        False   
3          False    False              False      False     True        False   
4          False    False              False      False    False        False   
...          ...      ...                ...        ...      ...          ...   
7496       False    False              False      False    False        False   
7497       False    False              False      False    False        False   
7498       False    False              False      False    False        False   
7499       False    False              False      False    False        False   
7500       False    False              False      False    False        False   

      bacon  barbecue sauce

In [15]:
# Computing Support for Single Items
print(onehot.mean())

 asparagus           0.000133
almonds              0.020397
antioxydant juice    0.008932
asparagus            0.004666
avocado              0.033329
                       ...   
whole wheat pasta    0.029463
whole wheat rice     0.058526
yams                 0.011465
yogurt cake          0.027330
zucchini             0.009465
Length: 120, dtype: float64


In [16]:
# Define itemset that contains both eggs and ground beef
onehot['eggs_&_ground beef'] = np.logical_and(onehot['eggs'], onehot['ground beef'])

# Compute Support for itemset that contains both eggs and ground beef 
print(onehot['eggs_&_ground beef'].mean())

0.019997333688841486


In [17]:
# Drop the column of "eggs_&_ground beef" to keep the dataset simple 
onehot=onehot.drop('eggs_&_ground beef', axis=1)
onehot

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7497,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7498,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7499,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### 3.2. Confidence

In [18]:
# Compute Support for the itemsets that contains eggs and ground beef
support_eggs_groundbeef = np.logical_and(onehot['eggs'], onehot['ground beef']).mean()
support_eggs = onehot['eggs'].mean()
support_groundbeef = onehot['ground beef'].mean()

In [19]:
# Compute and print Confidence {eggs -> ground beef}
confidence_eggs_to_groundbeef = support_eggs_groundbeef / support_eggs
print(confidence_eggs_to_groundbeef)

0.11127596439169138


### 3.3. Lift

In [20]:
# Compute and print Lift {eggs -> ground beef}
lift_eggs_to_groundbeef = support_eggs_groundbeef / (support_eggs * support_groundbeef)
print(lift_eggs_to_groundbeef)

1.1325386823637411


In [21]:
lift2 = confidence_eggs_to_groundbeef / support_groundbeef
print(lift2)

1.132538682363741


## 4. Apriori Algorithm

In [22]:
# Import Apriori algorithm
from mlxtend.frequent_patterns import apriori

In [23]:
# Compute frequent itemsets
frequent_itemsets = apriori(onehot, min_support = 0.0005,max_len = 4, use_colnames = True)

# Print number of itemsets
print(len(frequent_itemsets))

19788


In [24]:
# Print frequent itemsets
print(frequent_itemsets.head())

    support             itemsets
0  0.020397            (almonds)
1  0.008932  (antioxydant juice)
2  0.004666          (asparagus)
3  0.033329            (avocado)
4  0.004533        (babies food)


In [39]:
# Import Apriori algorithm
from mlxtend.frequent_patterns import apriori, association_rules

# Compute association rules
Rules = association_rules(frequent_itemsets,
                          metric = "support",
                          min_threshold = 0.005)

In [45]:
# Print association rules
Rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(almonds),(burgers),0.020397,0.087188,0.005199,0.254902,2.923577,0.003421,1.225089
1,(burgers),(almonds),0.087188,0.020397,0.005199,0.059633,2.923577,0.003421,1.041724
2,(almonds),(chocolate),0.020397,0.163845,0.005999,0.294118,1.795099,0.002657,1.184553
3,(chocolate),(almonds),0.163845,0.020397,0.005999,0.036615,1.795099,0.002657,1.016834
4,(almonds),(eggs),0.020397,0.179709,0.006532,0.320261,1.782108,0.002867,1.206774
...,...,...,...,...,...,...,...,...,...
1935,"(spaghetti, olive oil)",(pancakes),0.022930,0.095054,0.005066,0.220930,2.324260,0.002886,1.161572
1936,"(pancakes, olive oil)",(spaghetti),0.010799,0.174110,0.005066,0.469136,2.694478,0.003186,1.555746
1937,(spaghetti),"(pancakes, olive oil)",0.174110,0.010799,0.005066,0.029096,2.694478,0.003186,1.018846
1938,(pancakes),"(spaghetti, olive oil)",0.095054,0.022930,0.005066,0.053296,2.324260,0.002886,1.032075


In [40]:
# Print the rules.
print(Rules)

                 antecedents             consequents  antecedent support  \
0                  (almonds)               (burgers)            0.020397   
1                  (burgers)               (almonds)            0.087188   
2                  (almonds)             (chocolate)            0.020397   
3                (chocolate)               (almonds)            0.163845   
4                  (almonds)                  (eggs)            0.020397   
...                      ...                     ...                 ...   
1935  (spaghetti, olive oil)              (pancakes)            0.022930   
1936   (pancakes, olive oil)             (spaghetti)            0.010799   
1937             (spaghetti)   (pancakes, olive oil)            0.174110   
1938              (pancakes)  (spaghetti, olive oil)            0.095054   
1939             (olive oil)   (spaghetti, pancakes)            0.065858   

      consequent support   support  confidence      lift  leverage  conviction  
0     

In [43]:
filtered_rules = Rules[(Rules['antecedent support'] > 0.01) & 
                       (Rules['support'] > 0.009) & 
                       (Rules['confidence'] > 0.5) & 
                       (Rules['lift'] > 1.00)]

In [44]:
filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1406,"(ground beef, eggs)",(mineral water),0.019997,0.238368,0.010132,0.506667,2.125563,0.005365,1.543848
1593,"(ground beef, frozen vegetables)",(mineral water),0.016931,0.238368,0.009199,0.543307,2.279277,0.005163,1.667711
1737,"(ground beef, milk)",(mineral water),0.021997,0.238368,0.011065,0.50303,2.110308,0.005822,1.532552


In [None]:
# Computing support.
supportASAL = np.logical_and(onehot['asparagus'],onehot['almonds']).mean()
supportAS = onehot['asparagus'].mean()
supportAL = onehot['almonds'].mean()

# Compute and print confidence and lift.
confidence = supportASAL / supportAS
lift = supportASAL / (supportAS * supportAL)

# Print results.
print(supportAL, confidence, lift)