# Playground for Association Rule Learning slides

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [37]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns 

from sklearn.metrics import silhouette_score, silhouette_samples
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

import itertools

import scipy

# Make IPython display the value of every top-level expression in a cell,
# not only the last one (the default setting is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Example of converting lists into a dataframe

In [61]:
# Five toy market-basket transactions: each inner list is the items of one basket.
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# Encode the baskets as a boolean array (one row per basket, one column per
# distinct item), then wrap it in a DataFrame labelled with the item names.
te = TransactionEncoder()
te_ary = te.fit_transform(dataset)
df1 = pd.DataFrame(data=te_ary, columns=te.columns_)
df1

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


# Read in Data

In [63]:
# Load the groceries data from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/groceries.csv')
# Preview only the first five rows rather than the whole frame.
df.head(n=5)

Unnamed: 0,frankfurter,sausage,liver loaf,ham,meat,finished products,organic sausage,chicken,turkey,pork,...,candles,light bulbs,sound storage medium,newspapers,photo/film,pot plants,flower soil/fertilizer,flower (seeds),shopping bags,bags
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [50]:
from mlxtend.frequent_patterns import apriori

%time frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

Wall time: 16.6 s


In [52]:
# Show the first ten rows of the mined itemsets.
frequent_itemsets.head(n=10)

Unnamed: 0,support,itemsets,length
0,0.058973,(frankfurter),1
1,0.09395,(sausage),1
2,0.005084,(liver loaf),1
3,0.026029,(ham),1
4,0.025826,(meat),1
5,0.006507,(finished products),1
6,0.002237,(organic sausage),1
7,0.042908,(chicken),1
8,0.008134,(turkey),1
9,0.057651,(pork),1


In [53]:
# Show the last ten rows of the mined itemsets.
frequent_itemsets.tail(n=10)

Unnamed: 0,support,itemsets,length
13482,0.001118,"(beef, other vegetables, root vegetables, trop...",6
13483,0.001423,"(other vegetables, root vegetables, yogurt, tr...",6
13484,0.001017,"(other vegetables, root vegetables, yogurt, ci...",6
13485,0.001322,"(other vegetables, root vegetables, yogurt, tr...",6
13486,0.001118,"(other vegetables, root vegetables, yogurt, bu...",6
13487,0.001118,"(other vegetables, root vegetables, yogurt, tr...",6
13488,0.001322,"(other vegetables, root vegetables, yogurt, tr...",6
13489,0.001017,"(oil, other vegetables, root vegetables, yogur...",6
13490,0.001118,"(bottled water, other vegetables, root vegetab...",6
13491,0.001017,"(other vegetables, yogurt, butter, tropical fr...",6


In [57]:
# Keep only two-item itemsets whose support is at least 0.02.
frequent_itemsets.loc[
    frequent_itemsets['length'].eq(2)
    & frequent_itemsets['support'].ge(0.02)
]

Unnamed: 0,support,itemsets,length
173,0.020539,"(whole milk, frankfurter)",2
255,0.026945,"(sausage, other vegetables)",2
257,0.029893,"(whole milk, sausage)",2
281,0.030605,"(rolls/buns, sausage)",2
303,0.024301,"(sausage, soda)",2
551,0.021657,"(pork, other vegetables)",2
553,0.022166,"(whole milk, pork)",2
627,0.021251,"(whole milk, beef)",2
762,0.028876,"(citrus fruit, other vegetables)",2
764,0.030503,"(citrus fruit, whole milk)",2


In [58]:
# Select the row(s) whose itemset is exactly {'pastry', 'soda'}: every entry of
# the 'itemsets' column is compared for equality against this set.
frequent_itemsets[ frequent_itemsets['itemsets'] == {'pastry', 'soda'} ]

Unnamed: 0,support,itemsets,length
2524,0.021047,"(pastry, soda)",2


In [59]:
from mlxtend.frequent_patterns import association_rules
%time rules = association_rules(frequent_itemsets, min_threshold=0.1)

Wall time: 422 ms


In [60]:
# Record how many items are on the antecedent side of each rule, then display.
rules["antecedent_len"] = rules["antecedents"].apply(len)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(sausage),(frankfurter),0.093950,0.058973,0.010066,0.107143,1.816810,0.004526,1.053950,1
1,(frankfurter),(sausage),0.058973,0.093950,0.010066,0.170690,1.816810,0.004526,1.092534,1
2,(meat),(frankfurter),0.025826,0.058973,0.003254,0.125984,2.136302,0.001731,1.076670,1
3,(pork),(frankfurter),0.057651,0.058973,0.005897,0.102293,1.734568,0.002497,1.048256,1
4,(hamburger meat),(frankfurter),0.033249,0.058973,0.003355,0.100917,1.711246,0.001395,1.046652,1
5,(frankfurter),(citrus fruit),0.058973,0.082766,0.006507,0.110345,1.333220,0.001626,1.031000,1
6,(frankfurter),(tropical fruit),0.058973,0.104931,0.009456,0.160345,1.528092,0.003268,1.065995,1
7,(frankfurter),(pip fruit),0.058973,0.075648,0.007219,0.122414,1.618198,0.002758,1.053289,1
8,(frankfurter),(root vegetables),0.058973,0.108998,0.010168,0.172414,1.581800,0.003740,1.076627,1
9,(onions),(frankfurter),0.031012,0.058973,0.003762,0.121311,2.057066,0.001933,1.070945,1


In [12]:
# Rules with at least two antecedent items, confidence above 0.75 and lift
# above 1.2.
# NOTE(review): the saved output for this cell lists toy-basket items such as
# 'Kidney Beans', so it appears to come from an earlier run on the small
# example rather than the groceries data -- re-run to refresh and confirm.
rules.loc[
    rules['antecedent_len'].ge(2)
    & rules['confidence'].gt(0.75)
    & rules['lift'].gt(1.2)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
12,"(Kidney Beans, Onion)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,2


In [36]:
# Rules whose antecedent contains 'Eggs' and has at least two items.
# NOTE(review): 'Eggs' is an item of the toy baskets; the groceries item names
# visible in this notebook are lowercase, so this filter presumably matches
# nothing when `rules` is built from the groceries data -- confirm intent.
rules.loc[
    rules['antecedents'].apply(lambda items: 'Eggs' in items)
    & rules['antecedent_len'].ge(2)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
10,"(Kidney Beans, Eggs)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,2
11,"(Onion, Eggs)",(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf,2
