# Playground for Association Rule Learning slides

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime
print(datetime.datetime.now())

2020-10-05 13:32:06.144238


In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Example of converting lists into a dataframe

In [3]:
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df1 = pd.DataFrame(te_ary, columns=te.columns_)
df1

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


# Read in Data

In [4]:
df = pd.read_csv('../data/groceries.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9835 entries, 0 to 9834
Columns: 169 entries, frankfurter to bags
dtypes: bool(169)
memory usage: 1.6 MB


In [5]:
df.head()

Unnamed: 0,frankfurter,sausage,liver loaf,ham,meat,finished products,organic sausage,chicken,turkey,pork,...,candles,light bulbs,sound storage medium,newspapers,photo/film,pot plants,flower soil/fertilizer,flower (seeds),shopping bags,bags
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
from mlxtend.frequent_patterns import apriori

%time frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)

Wall time: 17 s


In [7]:
frequent_itemsets.head(10)

Unnamed: 0,support,itemsets
0,0.058973,(frankfurter)
1,0.09395,(sausage)
2,0.005084,(liver loaf)
3,0.026029,(ham)
4,0.025826,(meat)
5,0.006507,(finished products)
6,0.002237,(organic sausage)
7,0.042908,(chicken)
8,0.008134,(turkey)
9,0.057651,(pork)


In [8]:
frequent_itemsets.tail(10)

Unnamed: 0,support,itemsets
13482,0.001118,"(other vegetables, root vegetables, whole milk..."
13483,0.001423,"(other vegetables, root vegetables, whole milk..."
13484,0.001017,"(other vegetables, root vegetables, whole milk..."
13485,0.001322,"(other vegetables, pip fruit, root vegetables,..."
13486,0.001118,"(other vegetables, root vegetables, whole milk..."
13487,0.001118,"(other vegetables, root vegetables, whole milk..."
13488,0.001322,"(other vegetables, root vegetables, whole milk..."
13489,0.001017,"(other vegetables, oil, root vegetables, whole..."
13490,0.001118,"(other vegetables, root vegetables, whole milk..."
13491,0.001017,"(other vegetables, domestic eggs, whole milk, ..."


In [9]:
frequent_itemsets.sort_values(by=['support'], ascending=False).head()

Unnamed: 0,support,itemsets
24,0.255516,(whole milk)
22,0.193493,(other vegetables)
53,0.183935,(rolls/buns)
99,0.174377,(soda)
29,0.139502,(yogurt)


In [10]:
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

In [11]:
frequent_itemsets.head()

Unnamed: 0,support,itemsets,length
0,0.058973,(frankfurter),1
1,0.09395,(sausage),1
2,0.005084,(liver loaf),1
3,0.026029,(ham),1
4,0.025826,(meat),1


In [12]:
frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.02) ]

Unnamed: 0,support,itemsets,length
173,0.020539,"(whole milk, frankfurter)",2
255,0.026945,"(other vegetables, sausage)",2
257,0.029893,"(whole milk, sausage)",2
281,0.030605,"(rolls/buns, sausage)",2
303,0.024301,"(soda, sausage)",2
...,...,...,...
2387,0.024199,"(rolls/buns, bottled water)",2
2388,0.038332,"(soda, rolls/buns)",2
2524,0.021047,"(soda, pastry)",2
2856,0.028978,"(soda, bottled water)",2


In [13]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'pastry', 'soda'} ]

Unnamed: 0,support,itemsets,length
2524,0.021047,"(soda, pastry)",2


In [14]:
%time rules = association_rules(frequent_itemsets, min_threshold=0.1)

Wall time: 302 ms


In [15]:
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(frankfurter),(sausage),0.058973,0.093950,0.010066,0.170690,1.816810,0.004526,1.092534,1
1,(sausage),(frankfurter),0.093950,0.058973,0.010066,0.107143,1.816810,0.004526,1.053950,1
2,(meat),(frankfurter),0.025826,0.058973,0.003254,0.125984,2.136302,0.001731,1.076670,1
3,(pork),(frankfurter),0.057651,0.058973,0.005897,0.102293,1.734568,0.002497,1.048256,1
4,(hamburger meat),(frankfurter),0.033249,0.058973,0.003355,0.100917,1.711246,0.001395,1.046652,1
...,...,...,...,...,...,...,...,...,...,...
43727,"(whole milk, butter, tropical fruit)","(other vegetables, yogurt, domestic eggs)",0.006202,0.005796,0.001017,0.163934,28.285879,0.000981,1.189146,3
43728,"(whole milk, butter, yogurt)","(other vegetables, tropical fruit, domestic eggs)",0.009354,0.004779,0.001017,0.108696,22.745143,0.000972,1.116590,3
43729,"(yogurt, tropical fruit, butter)","(other vegetables, whole milk, domestic eggs)",0.004575,0.012303,0.001017,0.222222,18.062443,0.000960,1.269896,3
43730,"(domestic eggs, butter)","(other vegetables, yogurt, tropical fruit, who...",0.009659,0.007626,0.001017,0.105263,13.803509,0.000943,1.109124,2


In [16]:
rules[ (rules['antecedent_len'] >= 2) &
       (rules['confidence'] > 0.75) &
       (rules['lift'] > 1.2) ].sort_values(by="support", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
24171,"(root vegetables, citrus fruit, tropical fruit)",(other vegetables),0.005694,0.193493,0.004474,0.785714,4.060694,0.003372,3.763701,3
27999,"(yogurt, curd, tropical fruit)",(whole milk),0.005287,0.255516,0.003965,0.750000,2.935237,0.002614,2.977936,3
38978,"(root vegetables, citrus fruit, tropical fruit...",(other vegetables),0.003559,0.193493,0.003152,0.885714,4.577509,0.002463,7.056940,4
30695,"(root vegetables, other vegetables, brown bread)",(whole milk),0.004067,0.255516,0.003152,0.775000,3.033078,0.002113,3.308818,3
31658,"(root vegetables, butter, yogurt)",(whole milk),0.003864,0.255516,0.003050,0.789474,3.089723,0.002063,3.536299,3
...,...,...,...,...,...,...,...,...,...,...
34392,"(other vegetables, long life bakery product, s...",(whole milk),0.001220,0.255516,0.001017,0.833333,3.261374,0.000705,4.466904,3
34169,"(other vegetables, margarine, chocolate)",(whole milk),0.001322,0.255516,0.001017,0.769231,3.010499,0.000679,3.226097,3
33594,"(other vegetables, detergent, whipped/sour cream)",(whole milk),0.001220,0.255516,0.001017,0.833333,3.261374,0.000705,4.466904,3
32854,"(other vegetables, fruit/vegetable juice, herbs)",(whole milk),0.001220,0.255516,0.001017,0.833333,3.261374,0.000705,4.466904,3


In [17]:
rules[ (rules['antecedents'].apply(lambda x: 'Eggs' in x)) & (rules['antecedent_len'] >=2) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len


# Exec Ed Slides

In [18]:
df = pd.read_csv('data/groceries_exec.csv')

df = df.drop(['Date', 'Transaction ID'], axis=1)

df.Items = df.Items.astype(str)
df.Items = df.Items.apply(lambda x: x.split(', '))

te = TransactionEncoder()
te_ary = te.fit(df.Items).transform(df.Items)
df1 = pd.DataFrame(te_ary, columns=te.columns_)
#df1

frequent_itemsets = apriori(df1, min_support=0.06, use_colnames=True)
rules = association_rules(frequent_itemsets, min_threshold=0.1)

rules.sort_values(by="confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(peanut butter),(bacon),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25
9,(cake mix),(beans),0.2,0.3,0.15,0.75,2.5,0.09,2.8
23,(chicken),(cheese),0.15,0.15,0.1,0.666667,4.444444,0.0775,2.55
22,(cheese),(chicken),0.15,0.15,0.1,0.666667,4.444444,0.0775,2.55
24,(chicken),(milk),0.15,0.5,0.1,0.666667,1.333333,0.025,1.5
30,"(bacon, milk)",(bread),0.15,0.4,0.1,0.666667,1.666667,0.04,1.8
34,"(bread, peanut butter)",(bacon),0.15,0.35,0.1,0.666667,1.904762,0.0475,1.95
40,"(bread, eggs)",(milk),0.15,0.5,0.1,0.666667,1.333333,0.025,1.5
42,"(eggs, milk)",(bread),0.15,0.4,0.1,0.666667,1.666667,0.04,1.8
26,(eggs),(milk),0.25,0.5,0.15,0.6,1.2,0.025,1.25


In [19]:
df1.loc[df1['cake mix'] == True]

Unnamed: 0,apple,bacon,banana,beans,bread,cake mix,carrot,cheese,chicken,eggs,ice cream,milk,peanut butter
9,False,False,False,True,False,True,False,False,False,False,False,True,False
13,False,False,False,True,True,True,False,False,False,False,False,False,True
15,False,False,False,True,False,True,False,False,False,False,True,False,False
17,False,False,False,False,False,True,False,False,False,False,False,True,False


In [20]:
df1.loc[df1['peanut butter'] == True]

Unnamed: 0,apple,bacon,banana,beans,bread,cake mix,carrot,cheese,chicken,eggs,ice cream,milk,peanut butter
1,False,True,False,False,False,False,False,False,False,False,False,False,True
3,False,True,False,False,True,False,False,False,False,False,False,False,True
13,False,False,False,True,True,True,False,False,False,False,False,False,True
16,False,True,False,False,False,False,False,False,False,False,False,False,True
18,False,True,False,False,True,False,False,False,False,False,False,True,True


In [21]:
df1.loc[df1['chicken'] == True]

Unnamed: 0,apple,bacon,banana,beans,bread,cake mix,carrot,cheese,chicken,eggs,ice cream,milk,peanut butter
2,False,False,False,False,False,False,True,True,True,False,False,True,False
4,False,True,False,False,False,False,False,False,True,False,False,True,False
10,False,False,False,True,False,False,False,True,True,True,False,False,False


In [23]:
df1.loc[df1['peanut butter'] == True]

Unnamed: 0,apple,bacon,banana,beans,bread,cake mix,carrot,cheese,chicken,eggs,ice cream,milk,peanut butter
1,False,True,False,False,False,False,False,False,False,False,False,False,True
3,False,True,False,False,True,False,False,False,False,False,False,False,True
13,False,False,False,True,True,True,False,False,False,False,False,False,True
16,False,True,False,False,False,False,False,False,False,False,False,False,True
18,False,True,False,False,True,False,False,False,False,False,False,True,True
