In [1]:
# apriori implementation example
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# read the dataset
data = pd.read_csv('../market_basket_example.csv')

# shape of dataset
data.shape

(7500, 20)

In [3]:
data.head()

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,


In [4]:
# checking the random entries in the data

data.sample(10)

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
2153,milk,eggs,champagne,,,,,,,,,,,,,,,,,
930,champagne,,,,,,,,,,,,,,,,,,,
1887,red wine,mineral water,muffins,pancakes,honey,cake,cooking oil,whole weat flour,french fries,frozen smoothie,mushroom cream sauce,hot dogs,sparkling water,green tea,,,,,,
2143,spaghetti,mineral water,olive oil,strong cheese,honey,cereals,chocolate,,,,,,,,,,,,,
3394,butter,extra dark chocolate,,,,,,,,,,,,,,,,,,
4607,chocolate,milk,olive oil,,,,,,,,,,,,,,,,,
672,yams,mineral water,eggs,cake,,,,,,,,,,,,,,,,
1558,eggs,frozen smoothie,,,,,,,,,,,,,,,,,,
2531,soup,,,,,,,,,,,,,,,,,,,
1183,protein bar,,,,,,,,,,,,,,,,,,,


In [5]:
# preprocessing
# making each customers shopping items an identical list
trans = []
for i in range(0, 7501):
    trans.append([str(data.values[i,j]) for j in range(0, 20)])

# conveting it into an numpy array
trans = np.array(trans)

# checking the shape of the array
print(trans.shape)

IndexError: index 7500 is out of bounds for axis 0 with size 7500

In [6]:
# using transacton encoder
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
data = te.fit_transform(trans)
data = pd.DataFrame(data, columns = te.columns_)

# getting the shape of the data
data.shape

(7500, 121)

In [7]:
# getting correlations for 121 items would be messy 
# so let's reduce the items from 121 to 50

data = data.loc[:, ['mineral water', 'burgers', 'turkey', 'chocolate', 'frozen vegetables', 'spaghetti',
                    'shrimp', 'grated cheese', 'eggs', 'cookies', 'french fries', 'herb & pepper', 'ground beef',
                    'tomatoes', 'milk', 'escalope', 'fresh tuna', 'red wine', 'ham', 'cake', 'green tea',
                    'whole wheat pasta', 'pancakes', 'soup', 'muffins', 'energy bar', 'olive oil', 'champagne', 
                    'avocado', 'pepper', 'butter', 'parmesan cheese', 'whole wheat rice', 'low fat yogurt', 
                    'chicken', 'vegetables mix', 'pickles', 'meatballs', 'frozen smoothie', 'yogurt cake']]

# checking the shape
data.shape

(7500, 40)

In [8]:
# columns
data.columns

Index(['mineral water', 'burgers', 'turkey', 'chocolate', 'frozen vegetables',
       'spaghetti', 'shrimp', 'grated cheese', 'eggs', 'cookies',
       'french fries', 'herb & pepper', 'ground beef', 'tomatoes', 'milk',
       'escalope', 'fresh tuna', 'red wine', 'ham', 'cake', 'green tea',
       'whole wheat pasta', 'pancakes', 'soup', 'muffins', 'energy bar',
       'olive oil', 'champagne', 'avocado', 'pepper', 'butter',
       'parmesan cheese', 'whole wheat rice', 'low fat yogurt', 'chicken',
       'vegetables mix', 'pickles', 'meatballs', 'frozen smoothie',
       'yogurt cake'],
      dtype='object')

In [9]:
data.head()

Unnamed: 0,mineral water,burgers,turkey,chocolate,frozen vegetables,spaghetti,shrimp,grated cheese,eggs,cookies,...,butter,parmesan cheese,whole wheat rice,low fat yogurt,chicken,vegetables mix,pickles,meatballs,frozen smoothie,yogurt cake
0,False,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [10]:
# applying apriori with minimum support of 5 %
apriori(data, min_support = 0.01, use_colnames = True)

Unnamed: 0,support,itemsets
0,0.238267,(mineral water)
1,0.087200,(burgers)
2,0.062533,(turkey)
3,0.163867,(chocolate)
4,0.095333,(frozen vegetables)
...,...,...
206,0.010133,"(eggs, mineral water, ground beef)"
207,0.013067,"(eggs, mineral water, milk)"
208,0.011067,"(mineral water, milk, ground beef)"
209,0.010533,"(spaghetti, chocolate, eggs)"


In [11]:
# selecting and filtering the results
frequent_itemsets = apriori(data, min_support = 0.05, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.238267,(mineral water),1
1,0.0872,(burgers),1
2,0.062533,(turkey),1
3,0.163867,(chocolate),1
4,0.095333,(frozen vegetables),1
5,0.174133,(spaghetti),1
6,0.071333,(shrimp),1
7,0.0524,(grated cheese),1
8,0.179733,(eggs),1
9,0.0804,(cookies),1


In [12]:
# getting th item sets with length = 2 and support more han 10%

frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.01) ]

Unnamed: 0,support,itemsets,length
24,0.052667,"(chocolate, mineral water)",2
25,0.059733,"(spaghetti, mineral water)",2
26,0.050933,"(eggs, mineral water)",2


In [13]:
# getting th item sets with length = 2 and support more han 10%

frequent_itemsets[ (frequent_itemsets['length'] == 1) &
                   (frequent_itemsets['support'] >= 0.01) ]



Unnamed: 0,support,itemsets,length
0,0.238267,(mineral water),1
1,0.0872,(burgers),1
2,0.062533,(turkey),1
3,0.163867,(chocolate),1
4,0.095333,(frozen vegetables),1
5,0.174133,(spaghetti),1
6,0.071333,(shrimp),1
7,0.0524,(grated cheese),1
8,0.179733,(eggs),1
9,0.0804,(cookies),1


In [14]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'eggs', 'mineral water'} ]

Unnamed: 0,support,itemsets,length
26,0.050933,"(eggs, mineral water)",2


In [15]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'mineral water'} ]

Unnamed: 0,support,itemsets,length
0,0.238267,(mineral water),1


In [16]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'milk'} ]

Unnamed: 0,support,itemsets,length
13,0.1296,(milk),1


In [17]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'chicken'} ]

Unnamed: 0,support,itemsets,length
22,0.06,(chicken),1


In [18]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'frozen vegetables'} ]

Unnamed: 0,support,itemsets,length
4,0.095333,(frozen vegetables),1


In [19]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'chocolate'} ]

Unnamed: 0,support,itemsets,length
3,0.163867,(chocolate),1
