In [1]:
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Example of converting lists into a dataframe

In [2]:
from mlxtend.preprocessing import TransactionEncoder

dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df1 = pd.DataFrame(te_ary, columns=te.columns_)
df1

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


# Read in Data

In [3]:
df = pd.read_csv('data/groceries.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9835 entries, 0 to 9834
Columns: 169 entries, frankfurter to bags
dtypes: bool(169)
memory usage: 1.6 MB


In [4]:
df.head()

Unnamed: 0,frankfurter,sausage,liver loaf,ham,meat,finished products,organic sausage,chicken,turkey,pork,...,candles,light bulbs,sound storage medium,newspapers,photo/film,pot plants,flower soil/fertilizer,flower (seeds),shopping bags,bags
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)

In [6]:
type(frequent_itemsets)

pandas.core.frame.DataFrame

In [7]:
frequent_itemsets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13492 entries, 0 to 13491
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   support   13492 non-null  float64
 1   itemsets  13492 non-null  object 
dtypes: float64(1), object(1)
memory usage: 210.9+ KB


In [8]:
frequent_itemsets.head(10)

Unnamed: 0,support,itemsets
0,0.058973,(frankfurter)
1,0.09395,(sausage)
2,0.005084,(liver loaf)
3,0.026029,(ham)
4,0.025826,(meat)
5,0.006507,(finished products)
6,0.002237,(organic sausage)
7,0.042908,(chicken)
8,0.008134,(turkey)
9,0.057651,(pork)


In [9]:
frequent_itemsets.tail(10)

Unnamed: 0,support,itemsets
13482,0.001118,"(whole milk, beef, other vegetables, root vege..."
13483,0.001423,"(whole milk, other vegetables, root vegetables..."
13484,0.001017,"(whole milk, other vegetables, root vegetables..."
13485,0.001322,"(pip fruit, whole milk, other vegetables, root..."
13486,0.001118,"(whole milk, other vegetables, butter, root ve..."
13487,0.001118,"(whole milk, other vegetables, root vegetables..."
13488,0.001322,"(whole milk, other vegetables, root vegetables..."
13489,0.001017,"(whole milk, oil, other vegetables, root veget..."
13490,0.001118,"(whole milk, other vegetables, root vegetables..."
13491,0.001017,"(whole milk, domestic eggs, other vegetables, ..."


In [10]:
frequent_itemsets.sort_values(by=['support'], ascending=False).head()

Unnamed: 0,support,itemsets
24,0.255516,(whole milk)
22,0.193493,(other vegetables)
53,0.183935,(rolls/buns)
99,0.174377,(soda)
29,0.139502,(yogurt)


In [11]:
frequent_itemsets.iloc[0,1]

frozenset({'frankfurter'})

In [12]:
len(frequent_itemsets.iloc[0,1])

1

In [13]:
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

In [14]:
frequent_itemsets.head()

Unnamed: 0,support,itemsets,length
0,0.058973,(frankfurter),1
1,0.09395,(sausage),1
2,0.005084,(liver loaf),1
3,0.026029,(ham),1
4,0.025826,(meat),1


In [15]:
frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.02) ]

Unnamed: 0,support,itemsets,length
173,0.020539,"(whole milk, frankfurter)",2
255,0.026945,"(sausage, other vegetables)",2
257,0.029893,"(sausage, whole milk)",2
281,0.030605,"(sausage, rolls/buns)",2
303,0.024301,"(sausage, soda)",2
...,...,...,...
2387,0.024199,"(rolls/buns, bottled water)",2
2388,0.038332,"(soda, rolls/buns)",2
2524,0.021047,"(pastry, soda)",2
2856,0.028978,"(soda, bottled water)",2


In [16]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'pastry', 'soda'} ]

Unnamed: 0,support,itemsets,length
2524,0.021047,"(pastry, soda)",2


In [17]:
from mlxtend.frequent_patterns import association_rules
rules = association_rules(frequent_itemsets, min_threshold=0.1)

In [18]:
rules.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43732 entries, 0 to 43731
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         43732 non-null  object 
 1   consequents         43732 non-null  object 
 2   antecedent support  43732 non-null  float64
 3   consequent support  43732 non-null  float64
 4   support             43732 non-null  float64
 5   confidence          43732 non-null  float64
 6   lift                43732 non-null  float64
 7   leverage            43732 non-null  float64
 8   conviction          43732 non-null  float64
dtypes: float64(7), object(2)
memory usage: 3.0+ MB


In [19]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(sausage),(frankfurter),0.09395,0.058973,0.010066,0.107143,1.81681,0.004526,1.05395
1,(frankfurter),(sausage),0.058973,0.09395,0.010066,0.17069,1.81681,0.004526,1.092534
2,(meat),(frankfurter),0.025826,0.058973,0.003254,0.125984,2.136302,0.001731,1.07667
3,(pork),(frankfurter),0.057651,0.058973,0.005897,0.102293,1.734568,0.002497,1.048256
4,(hamburger meat),(frankfurter),0.033249,0.058973,0.003355,0.100917,1.711246,0.001395,1.046652


In [20]:
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))

In [21]:
rules[ (rules['antecedent_len'] >= 2) &
       (rules['confidence'] > 0.75) &
       (rules['lift'] > 1.2) ].sort_values(by="support", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
24171,"(citrus fruit, root vegetables, tropical fruit)",(other vegetables),0.005694,0.193493,0.004474,0.785714,4.060694,0.003372,3.763701,3
27999,"(curd, tropical fruit, yogurt)",(whole milk),0.005287,0.255516,0.003965,0.750000,2.935237,0.002614,2.977936,3
38977,"(citrus fruit, whole milk, root vegetables, tr...",(other vegetables),0.003559,0.193493,0.003152,0.885714,4.577509,0.002463,7.056940,4
30694,"(brown bread, root vegetables, other vegetables)",(whole milk),0.004067,0.255516,0.003152,0.775000,3.033078,0.002113,3.308818,3
31657,"(butter, root vegetables, yogurt)",(whole milk),0.003864,0.255516,0.003050,0.789474,3.089723,0.002063,3.536299,3
...,...,...,...,...,...,...,...,...,...,...
34390,"(long life bakery product, salty snack, other ...",(whole milk),0.001220,0.255516,0.001017,0.833333,3.261374,0.000705,4.466904,3
34168,"(chocolate, margarine, other vegetables)",(whole milk),0.001322,0.255516,0.001017,0.769231,3.010499,0.000679,3.226097,3
33593,"(detergent, whipped/sour cream, other vegetables)",(whole milk),0.001220,0.255516,0.001017,0.833333,3.261374,0.000705,4.466904,3
32854,"(herbs, fruit/vegetable juice, other vegetables)",(whole milk),0.001220,0.255516,0.001017,0.833333,3.261374,0.000705,4.466904,3


In [22]:
rules[ (rules['antecedents'].apply(lambda x: 'Eggs' in x)) & (rules['antecedent_len'] >=2) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
