In [1]:
%matplotlib inline 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("paper")

from itertools import combinations, groupby
from collections import Counter
from mlxtend.preprocessing import TransactionEncoder
import sys

# import

Detail
Perform ARM using product, aisle, and department as the item.

To perform an ARM with item=product we need to reduce the number of products (from the initial 50K products). A crude approach is to simply drop all product which appear less than some minimum frequency threshold (say PRODUCT_KEEP_MIN_FREQ=500).

The table orders_products needs to be one-hot encoded. Simplest way to do this is to first convert the order-product pairs to a transaction list and then (as in Practical B) to a one-hot encoding.

convert table of (order,product) pairs to list of transactions
transactions = orders_products.groupby('order_id').apply(lambda order: order['product_id'].tolist())
For each ARM with item (product, aisle, department) you should at least do the following subtasks:

Visualise the first 100 transactions vs items and comment (see Practical B).
Visualise the distribution of transaction size and comment (see Practical B).
Generate a few hundred frequent itemsets. You need to experiment (as discussed in class) to estimate a suitable value for the support threshold.
Generate a hundred+ rules and see if your can identify any interesting ones.
Disclaimers/Comments
When using aisle and department for item in the ARM model, we really should be using more advanced models than just (0=absent, 1=present). Feel free to explore the more advanced models, but we would be happy with just the one-hot analysis.

When you perform ARM with item=product we will find the your rule list will swamped by the organic fanboy/girl customers (my term not Bernard's) and I realy hope those customers buying the  {lemon,lime}{lemon,lime}  itemset were doing it for non-healthy reasons like drinking gin!

A more sophisticated analysis with item=product would be to roll up (merge) essentially similar products. So

All the organic fruit ( Organic Strawberries,Organic Raspberries,Bag of Organic Bananas,…looong list…Organic Strawberries,Organic Raspberries,Bag of Organic Bananas,…looong list… ) would just go to "organic fruit"
Merge products which differ only by brand name.
Grading Outline
3 x 25% for each of the separate basic ARM with item = (product, aisle, department) where the 25% consists of
10% Visualisation
5% Rule generation
10% Comments/analysis and identification of interesting rules.
25% for performing some roll-up of products and a more sophisticated analysis.

In [3]:
PRODUCT_KEEP_MIN_FREQ=3

orders_products_prior_full = pd.read_csv('instacart_2017_05_01/my_order_products__prior.csv')
orders_products_train_full = pd.read_csv('instacart_2017_05_01/my_order_products__train.csv')
orders_products = pd.concat([orders_products_prior_full, orders_products_train_full])

c = orders_products["product_id"].value_counts().gt(PRODUCT_KEEP_MIN_FREQ)
orders_products = orders_products.loc[orders_products["product_id"].isin(c[c].index)]
products = pd.read_csv('instacart_2017_05_01/products.csv')
aisles = pd.read_csv('instacart_2017_05_01/aisles.csv')
departments = pd.read_csv('instacart_2017_05_01/departments.csv')


In [4]:
orders_products.head()

Unnamed: 0.1,Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,83830,8938,49235,1,1
1,83831,8938,13032,2,1
2,83832,8938,26348,3,1
5,83835,8938,37710,6,1
7,333401,35204,8859,1,1


In [5]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
#groupby returns a GroupByDataFrame - the lamba afterwards collects each item, turning itback into a dataframe
order_products_a = pd.merge(orders_products, products, on='product_id', how='left').apply(lambda x: x)
order_products_b = pd.merge(order_products_a, departments, on='department_id', how='left').apply(lambda x: x)
orders_products = pd.merge(order_products_b, aisles, on='aisle_id', how='left').groupby("order_id",as_index=False).apply(lambda x: x)
orders_products.head()

Unnamed: 0.1,Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,department,aisle
0,83830,8938,49235,1,1,Organic Half & Half,53,16,dairy eggs,cream
1,83831,8938,13032,2,1,Cinnamon Toast Crunch,121,14,breakfast,cereal
2,83832,8938,26348,3,1,Mixed Fruit Fruit Snacks,50,19,snacks,fruit vegetable snacks
3,83835,8938,37710,6,1,Trail Mix,125,19,snacks,trail mix snack mix
4,333401,35204,8859,1,1,Natural Spring Water,115,7,beverages,water seltzer sparkling water


In [None]:


for index, row in df.iterrows():
   print row['c1'], row['c2']

In [20]:
#each order item in a sepearte list as list of lists
#orders_products_prior.head()
orders_ids = orders_products.order_id.unique()
list_of_lists = []

for order_id in orders_ids:
    inner_list = orders_products.loc[orders_products['order_id'] == order_id, 'product_name']
    list_of_lists.append([item.replace("Organic", "").strip() for item in inner_list])



In [21]:
list_of_lists[0:5]

[['Half & Half',
  'Cinnamon Toast Crunch',
  'Mixed Fruit Fruit Snacks',
  'Trail Mix'],
 ['Natural Spring Water', 'Pure Irish Butter', 'Salted Butter', 'Dried Mango'],
 ['Apple Honeycrisp',
  'Banana',
  'Hass Avocados',
  'Greek Lowfat Yogurt With Cherries',
  'Peach Yoghurt',
  'Nonfat Greek Yogurt With Peaches',
  'Uncured Genoa Salami',
  'Reduced Fat 2% Milk',
  'Orange Juice',
  'Grilled Chicken Breast Strips'],
 ['Pretzel Slider Buns'],
 ['Mini Peeled Carrots', 'Milano']]

In [9]:
te = TransactionEncoder()
te_ary = te.fit(list_of_lists).transform(list_of_lists,sparse=False)

df = pd.DataFrame(te_ary, columns=te.columns_)
df.head()

Unnamed: 0,100% Whole Wheat Bread,2% Reduced Fat Milk,3 Toddler Next Step Natural Milk Flavor Milk Drink,Almond Breeze Unsweetened Almond Coconut Milk Blend,"Almondmilk Creamer, Vanilla",Almonds,Animal Crackers,Apple Honeycrisp,Avocado,Baby Arugula,...,Whole Milk,Whole Wheat Bread,XL Pick-A-Size Paper Towel Rolls,Yams,Yellow Onion,Yellow Onions,Z Mixed Berry Fruit Rope,Zero Rise Orange,Zucchini,"s Pop's Corn Microwave Popcorn, , Light Butter"
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
from mlxtend.frequent_patterns import apriori
a = apriori(df, min_support=0.05,use_colnames=True).sort_values(by='support',ascending=False)
a.head()

Unnamed: 0,support,itemsets
5,0.214286,[Banana]
4,0.105442,[Bag of Bananas]
7,0.105442,[Blueberries]
13,0.085034,[Natural Pure Sparkling Water]
11,0.081633,[Half & Half]


In [11]:
frequent_itemsets = apriori(df, min_support=0.01,use_colnames=True)
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.020408,[100% Whole Wheat Bread]
1,0.020408,[2% Reduced Fat Milk]
2,0.020408,[3 Toddler Next Step Natural Milk Flavor Milk ...
3,0.017007,[Almond Breeze Unsweetened Almond Coconut Milk...
4,0.013605,"[Almondmilk Creamer, Vanilla]"


In [12]:
from mlxtend.frequent_patterns import association_rules

frequent_itemsets = apriori(df, min_support=0.05,use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.25)
rules.head()

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Apple Honeycrisp),(Banana),0.078231,0.214286,0.078231,1.0,4.666667,0.061467,inf
1,(Banana),(Apple Honeycrisp),0.214286,0.078231,0.078231,0.365079,4.666667,0.061467,1.451786
2,(Apple Honeycrisp),(Hass Avocados),0.078231,0.068027,0.05102,0.652174,9.586957,0.045699,2.679422
3,(Hass Avocados),(Apple Honeycrisp),0.068027,0.078231,0.05102,0.75,9.586957,0.045699,3.687075
4,(Bartlett Pear),(Banana),0.05102,0.214286,0.05102,1.0,4.666667,0.040087,inf


In [13]:
rules.sort_values(by='lift',ascending=False).head(n=20)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
17,(Apple Honeycrisp),"(Banana, Hass Avocados)",0.078231,0.061224,0.05102,0.652174,10.652174,0.046231,2.69898
14,"(Banana, Hass Avocados)",(Apple Honeycrisp),0.061224,0.078231,0.05102,0.833333,10.652174,0.046231,5.530612
3,(Hass Avocados),(Apple Honeycrisp),0.068027,0.078231,0.05102,0.75,9.586957,0.045699,3.687075
16,(Hass Avocados),"(Apple Honeycrisp, Banana)",0.068027,0.078231,0.05102,0.75,9.586957,0.045699,3.687075
2,(Apple Honeycrisp),(Hass Avocados),0.078231,0.068027,0.05102,0.652174,9.586957,0.045699,2.679422
15,"(Apple Honeycrisp, Banana)",(Hass Avocados),0.078231,0.068027,0.05102,0.652174,9.586957,0.045699,2.679422
11,(Sweet Mini Peppers),(Banana),0.05102,0.214286,0.05102,1.0,4.666667,0.040087,inf
13,"(Apple Honeycrisp, Hass Avocados)",(Banana),0.05102,0.214286,0.05102,1.0,4.666667,0.040087,inf
12,(Tart Cherry Yoghurt),(Banana),0.05102,0.214286,0.05102,1.0,4.666667,0.040087,inf
0,(Apple Honeycrisp),(Banana),0.078231,0.214286,0.078231,1.0,4.666667,0.061467,inf


If a basket contains all the items in set a (antecedants) then it is likely to contain all the items in set b (consequents) 
support is the percent of the of transactions that contain all elements in set a and set b


In [16]:
#aisles
orders_ids = orders_products.order_id.unique()
list_of_lists = []

for order_id in orders_ids:
    inner_list = orders_products.loc[orders_products['order_id'] == order_id, 'aisle']
    list_of_lists.append(list(set(inner_list))) #remove duplicates

list_of_lists[0:5]

[['trail mix snack mix', 'fruit vegetable snacks', 'cream', 'cereal'],
 ['water seltzer sparkling water', 'bulk dried fruits vegetables', 'butter'],
 ['fresh fruits',
  'packaged poultry',
  'packaged produce',
  'yogurt',
  'lunch meat',
  'refrigerated',
  'milk'],
 ['buns rolls'],
 ['cookies cakes', 'packaged vegetables fruits']]

In [17]:
te = TransactionEncoder()
te_ary = te.fit(list_of_lists).transform(list_of_lists,sparse=False)

df = pd.DataFrame(te_ary, columns=te.columns_)
a = apriori(df, min_support=0.1,use_colnames=True).sort_values(by='support',ascending=False)
frequent_itemsets = apriori(df, min_support=0.1,use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.25)
rules.sort_values(by='lift',ascending=False).head(n=20)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10,"(fresh fruits, packaged vegetables fruits)",(yogurt),0.193878,0.227891,0.102041,0.526316,2.309505,0.057858,1.630008
12,(yogurt),"(fresh fruits, packaged vegetables fruits)",0.227891,0.193878,0.102041,0.447761,2.309505,0.057858,1.459735
11,"(yogurt, packaged vegetables fruits)",(fresh fruits),0.102041,0.465986,0.102041,1.0,2.145985,0.054491,inf
9,"(fresh fruits, yogurt)",(packaged vegetables fruits),0.173469,0.292517,0.102041,0.588235,2.010944,0.051298,1.718173
13,(packaged vegetables fruits),"(fresh fruits, yogurt)",0.292517,0.173469,0.102041,0.348837,2.010944,0.051298,1.269315
1,(fresh vegetables),(fresh fruits),0.170068,0.465986,0.142857,0.84,1.802628,0.063608,3.337585
0,(fresh fruits),(fresh vegetables),0.465986,0.170068,0.142857,0.306569,1.802628,0.063608,1.196849
5,(fresh fruits),(yogurt),0.465986,0.227891,0.173469,0.372263,1.633511,0.067275,1.229987
6,(yogurt),(fresh fruits),0.227891,0.465986,0.173469,0.761194,1.633511,0.067275,2.236182
7,(yogurt),(packaged vegetables fruits),0.227891,0.292517,0.102041,0.447761,1.530719,0.035379,1.281118


In [18]:
#departments
orders_ids = orders_products.order_id.unique()
list_of_lists = []

for order_id in orders_ids:
    inner_list = orders_products.loc[orders_products['order_id'] == order_id, 'department']
    list_of_lists.append(list(set(inner_list))) #remove duplicates

list_of_lists[0:5]

[['snacks', 'breakfast', 'dairy eggs'],
 ['beverages', 'bulk', 'dairy eggs'],
 ['meat seafood', 'dairy eggs', 'produce', 'beverages', 'deli'],
 ['bakery'],
 ['produce', 'snacks']]

In [19]:
te = TransactionEncoder()
te_ary = te.fit(list_of_lists).transform(list_of_lists,sparse=False)

df = pd.DataFrame(te_ary, columns=te.columns_)
a = apriori(df, min_support=0.1,use_colnames=True).sort_values(by='support',ascending=False)
frequent_itemsets = apriori(df, min_support=0.1,use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.25)
rules.sort_values(by='lift',ascending=False).head(n=20)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
20,(bakery),"(produce, dairy eggs)",0.193878,0.360544,0.12585,0.649123,1.800397,0.055949,1.822449
19,"(produce, dairy eggs)",(bakery),0.360544,0.193878,0.12585,0.349057,1.800397,0.055949,1.238391
14,(deli),(produce),0.159864,0.602041,0.146259,0.914894,1.519654,0.050014,4.67602
18,"(bakery, dairy eggs)",(produce),0.14966,0.602041,0.12585,0.840909,1.396764,0.035749,2.501458
17,"(bakery, produce)",(dairy eggs),0.159864,0.568027,0.12585,0.787234,1.385909,0.035043,2.030272
2,(bakery),(produce),0.193878,0.602041,0.159864,0.824561,1.36961,0.043142,2.268367
3,(produce),(bakery),0.602041,0.193878,0.159864,0.265537,1.36961,0.043142,1.097567
0,(bakery),(dairy eggs),0.193878,0.568027,0.14966,0.77193,1.358966,0.039532,1.894035
1,(dairy eggs),(bakery),0.568027,0.193878,0.14966,0.263473,1.358966,0.039532,1.094491
21,"(produce, snacks)",(dairy eggs),0.20068,0.568027,0.142857,0.711864,1.253222,0.028865,1.4992
