In [1]:
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (later cells rely on this to show several results at once).
InteractiveShell.ast_node_interactivity = "all"
# Do not truncate long cell values (e.g. multi-item itemsets) when rendering DataFrames.
pd.set_option('display.max_colwidth', None)

# Example of converting lists into a dataframe

In [2]:
from mlxtend.preprocessing import TransactionEncoder

# Toy baskets: each inner list is one transaction (an item may repeat inside a basket).
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

# Learn the item vocabulary first, then one-hot encode the same baskets.
# fit() hands back the encoder itself, so `te` is the fitted encoder.
te = TransactionEncoder().fit(dataset)
te_ary = te.transform(dataset)

# One boolean column per distinct item, labelled with the names the encoder learned.
df1 = pd.DataFrame(data=te_ary, columns=te.columns_)
df1

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


# Read in Data

In [3]:
# Load the grocery data. Per the info() output this is already a one-hot table
# (every column is boolean, one column per item) -- presumably one row per basket.
df = pd.read_csv('../data/groceries.csv')
# Quick structural check: row count, column count and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9835 entries, 0 to 9834
Columns: 169 entries, frankfurter to bags
dtypes: bool(169)
memory usage: 1.6 MB


In [4]:
df.head()

Unnamed: 0,frankfurter,sausage,liver loaf,ham,meat,finished products,organic sausage,chicken,turkey,pork,...,candles,light bulbs,sound storage medium,newspapers,photo/film,pot plants,flower soil/fertilizer,flower (seeds),shopping bags,bags
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# Keep the item names (the column labels) around as a plain list for later lookups.
items = df.columns.tolist()

In [6]:
from mlxtend.frequent_patterns import apriori

# Mine every itemset present in at least 0.1% of the rows (min_support=0.001).
# use_colnames=True labels the itemsets with the column names rather than column positions.
frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)

In [7]:
type(frequent_itemsets)

pandas.core.frame.DataFrame

In [8]:
frequent_itemsets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13492 entries, 0 to 13491
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   support   13492 non-null  float64
 1   itemsets  13492 non-null  object 
dtypes: float64(1), object(1)
memory usage: 210.9+ KB


In [9]:
frequent_itemsets.head(10)

Unnamed: 0,support,itemsets
0,0.058973,(frankfurter)
1,0.09395,(sausage)
2,0.005084,(liver loaf)
3,0.026029,(ham)
4,0.025826,(meat)
5,0.006507,(finished products)
6,0.002237,(organic sausage)
7,0.042908,(chicken)
8,0.008134,(turkey)
9,0.057651,(pork)


In [10]:
frequent_itemsets.tail(10)

Unnamed: 0,support,itemsets
13482,0.001118,"(whole milk, beef, rolls/buns, tropical fruit, root vegetables, other vegetables)"
13483,0.001423,"(citrus fruit, whole milk, yogurt, tropical fruit, root vegetables, other vegetables)"
13484,0.001017,"(citrus fruit, whole milk, yogurt, whipped/sour cream, root vegetables, other vegetables)"
13485,0.001322,"(whole milk, yogurt, pip fruit, tropical fruit, root vegetables, other vegetables)"
13486,0.001118,"(whole milk, butter, yogurt, tropical fruit, root vegetables, other vegetables)"
13487,0.001118,"(whole milk, yogurt, whipped/sour cream, tropical fruit, root vegetables, other vegetables)"
13488,0.001322,"(whole milk, yogurt, rolls/buns, tropical fruit, root vegetables, other vegetables)"
13489,0.001017,"(oil, whole milk, yogurt, tropical fruit, root vegetables, other vegetables)"
13490,0.001118,"(bottled water, whole milk, yogurt, tropical fruit, root vegetables, other vegetables)"
13491,0.001017,"(whole milk, butter, yogurt, domestic eggs, tropical fruit, other vegetables)"


In [11]:
# The five most common itemsets, highest support first.
frequent_itemsets.sort_values(by='support', ascending=False).head(5)

Unnamed: 0,support,itemsets
24,0.255516,(whole milk)
22,0.193493,(other vegetables)
53,0.183935,(rolls/buns)
99,0.174377,(soda)
29,0.139502,(yogurt)


In [12]:
# Peek at the raw object stored in the first row's 'itemsets' cell.
frequent_itemsets['itemsets'].iloc[0]

frozenset({'frankfurter'})

In [13]:
# Number of items in that first itemset.
len(frequent_itemsets['itemsets'].iloc[0])

1

In [14]:
# Add the size of each frequent itemset (how many items it contains).
# These are plain itemsets, not rules, so there is no antecedent/consequent split yet.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

In [15]:
frequent_itemsets.head()

Unnamed: 0,support,itemsets,length
0,0.058973,(frankfurter),1
1,0.09395,(sausage),1
2,0.005084,(liver loaf),1
3,0.026029,(ham),1
4,0.025826,(meat),1


In [16]:
# Two-item itemsets that show up in at least 2% of the rows.
frequent_itemsets.query("length == 2 and support >= 0.02")

Unnamed: 0,support,itemsets,length
173,0.020539,"(whole milk, frankfurter)",2
255,0.026945,"(other vegetables, sausage)",2
257,0.029893,"(whole milk, sausage)",2
281,0.030605,"(sausage, rolls/buns)",2
303,0.024301,"(soda, sausage)",2
...,...,...,...
2387,0.024199,"(bottled water, rolls/buns)",2
2388,0.038332,"(soda, rolls/buns)",2
2524,0.021047,"(soda, pastry)",2
2856,0.028978,"(soda, bottled water)",2


In [17]:
# Look up one specific itemset. The cells hold frozensets, and a frozenset compares equal
# to a plain set with the same members, so the order the items are written in is irrelevant.
frequent_itemsets[ frequent_itemsets['itemsets'] == {'pastry', 'soda'} ]

Unnamed: 0,support,itemsets,length
2524,0.021047,"(soda, pastry)",2


In [18]:
from mlxtend.frequent_patterns import association_rules
# Turn the frequent itemsets into association rules, keeping rules whose confidence
# is at least 0.1. The metric is spelled out here; "confidence" is mlxtend's default.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.1,
)

In [19]:
rules.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43732 entries, 0 to 43731
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         43732 non-null  object 
 1   consequents         43732 non-null  object 
 2   antecedent support  43732 non-null  float64
 3   consequent support  43732 non-null  float64
 4   support             43732 non-null  float64
 5   confidence          43732 non-null  float64
 6   lift                43732 non-null  float64
 7   leverage            43732 non-null  float64
 8   conviction          43732 non-null  float64
dtypes: float64(7), object(2)
memory usage: 3.0+ MB


In [20]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(sausage),(frankfurter),0.09395,0.058973,0.010066,0.107143,1.81681,0.004526,1.05395
1,(frankfurter),(sausage),0.058973,0.09395,0.010066,0.17069,1.81681,0.004526,1.092534
2,(meat),(frankfurter),0.025826,0.058973,0.003254,0.125984,2.136302,0.001731,1.07667
3,(pork),(frankfurter),0.057651,0.058973,0.005897,0.102293,1.734568,0.002497,1.048256
4,(hamburger meat),(frankfurter),0.033249,0.058973,0.003355,0.100917,1.711246,0.001395,1.046652


In [21]:
# Number of items on the left-hand side (antecedent) of each rule.
rules["antecedent_len"] = rules["antecedents"].apply(len)

## Exploring the Rules

This is where human creativity and "exploration" come in.

In [22]:
# Rules with 3 or more items on the LHS, confidence above 0.75 and lift above 1.2,
# ordered by support (highest first).
# NOTE(review): the confidence and lift comparisons are strict (>), so a rule sitting
# exactly on a threshold is excluded -- switch to >= if "at least" is what is wanted.

sub = (
    rules
    .query("antecedent_len >= 3 and confidence > 0.75 and lift > 1.2")
    .sort_values(by="support", ascending=False)
)
sub.shape
display(sub.head(10))

(700, 10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
24172,"(root vegetables, citrus fruit, tropical fruit)",(other vegetables),0.005694,0.193493,0.004474,0.785714,4.060694,0.003372,3.763701,3
27999,"(curd, tropical fruit, yogurt)",(whole milk),0.005287,0.255516,0.003965,0.75,2.935237,0.002614,2.977936,3
38974,"(root vegetables, citrus fruit, whole milk, tropical fruit)",(other vegetables),0.003559,0.193493,0.003152,0.885714,4.577509,0.002463,7.05694,4
30693,"(root vegetables, other vegetables, brown bread)",(whole milk),0.004067,0.255516,0.003152,0.775,3.033078,0.002113,3.308818,3
31657,"(root vegetables, butter, yogurt)",(whole milk),0.003864,0.255516,0.00305,0.789474,3.089723,0.002063,3.536299,3
33064,"(other vegetables, curd, domestic eggs)",(whole milk),0.003457,0.255516,0.002847,0.823529,3.223005,0.001964,4.218743,3
20264,"(root vegetables, sausage, tropical fruit)",(whole milk),0.003559,0.255516,0.002745,0.771429,3.019101,0.001836,3.257117,3
26604,"(root vegetables, domestic eggs, tropical fruit)",(whole milk),0.003559,0.255516,0.002745,0.771429,3.019101,0.001836,3.257117,3
31665,"(root vegetables, butter, whipped/sour cream)",(whole milk),0.003457,0.255516,0.002644,0.764706,2.99279,0.00176,3.164057,3
26480,"(root vegetables, fruit/vegetable juice, tropical fruit)",(other vegetables),0.003254,0.193493,0.002542,0.78125,4.037622,0.001912,3.686891,3


In [23]:
# mlxtend stores the itemsets as frozensets. If we want to filter rules by what items are in them, 
# we have to check for set membership.

# This little helper will check if any elements in set1 are in set2
def is_any_in(set1, set2) -> bool:
    """Return True when set1 and set2 share at least one element.

    Parameters
    ----------
    set1 : set or frozenset
        The items being searched for.
    set2 : iterable
        The collection to search in (e.g. a rule's frozenset of items).

    Returns
    -------
    bool
        True if any element of set1 also occurs in set2, otherwise False.
        An empty set on either side therefore yields False.
    """
    # isdisjoint answers the overlap question directly, without first building
    # the intersection just to measure its length.
    return not set1.isdisjoint(set2)

# Quick smoke checks of the helper; each call's result is echoed below the cell.
setb = frozenset({'cheese', 'soda', 'milk', 'ham'})
is_any_in({'cheese'}, setb)  # Expect True
is_any_in({'cheese', 'ice cream'}, setb)  # Expect True
is_any_in({'legos', 'ice cream'}, setb)  # Expect False

True

True

False

In [24]:
# Refresher: which item names contain "egg" (case-insensitive)?

[
    name
    for name in items
    if "egg" in name.lower()
]

['domestic eggs']

In [25]:
# Now pull every rule that has eggs on the LHS (antecedent), strongest lift first.

search_set = {'domestic eggs'}

# The antecedent_len >= 1 condition is a knob for requiring longer antecedents;
# presumably it filters nothing at 1, since a rule needs at least one antecedent item.
sub = (
    rules[
        rules['antecedents'].apply(lambda lhs: is_any_in(search_set, lhs))
        & (rules['antecedent_len'] >= 1)
    ]
    .sort_values(by='lift', ascending=False)
)

sub.shape
display(sub.head(10))

(2577, 10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
43708,"(other vegetables, whole milk, yogurt, domestic eggs)","(butter, tropical fruit)",0.003355,0.009964,0.001017,0.30303,30.411255,0.000983,1.420486,4
43728,"(other vegetables, yogurt, domestic eggs)","(whole milk, butter, tropical fruit)",0.005796,0.006202,0.001017,0.175439,28.285879,0.000981,1.205244,3
43719,"(whole milk, yogurt, domestic eggs)","(other vegetables, butter, tropical fruit)",0.007728,0.005491,0.001017,0.131579,23.964425,0.000974,1.145193,3
43720,"(whole milk, domestic eggs, tropical fruit)","(other vegetables, butter, yogurt)",0.006914,0.006406,0.001017,0.147059,22.957516,0.000972,1.164904,3
43710,"(other vegetables, whole milk, domestic eggs, tropical fruit)","(butter, yogurt)",0.00305,0.014642,0.001017,0.333333,22.766204,0.000972,1.478038,4
43729,"(other vegetables, domestic eggs, tropical fruit)","(whole milk, butter, yogurt)",0.004779,0.009354,0.001017,0.212766,22.745143,0.000972,1.258388,3
43727,"(yogurt, domestic eggs, tropical fruit)","(other vegetables, whole milk, butter)",0.00427,0.01149,0.001017,0.238095,20.722714,0.000968,1.29742,3
43721,"(butter, yogurt, domestic eggs)","(other vegetables, whole milk, tropical fruit)",0.002949,0.017082,0.001017,0.344828,20.186782,0.000966,1.500243,3
43724,"(butter, domestic eggs, tropical fruit)","(other vegetables, whole milk, yogurt)",0.002339,0.022267,0.001017,0.434783,19.525511,0.000965,1.729835,3
41102,"(other vegetables, yogurt, domestic eggs)","(butter, tropical fruit)",0.005796,0.009964,0.001118,0.192982,19.367168,0.001061,1.226783,3
