Instacart: Department-Level Market Basket Analysis (Apriori)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Seed NumPy's legacy global random state so any NumPy-based randomness is repeatable.
# NOTE(review): no cell visible in this notebook draws random numbers — confirm the seed is still needed.
np.random.seed(210508)

In [3]:
# Load the order line items from CSV.
# The preview of this frame shows the columns order_id, product_id and add_to_cart_order.
transactions = pd.read_csv('all_order_products.csv')

In [4]:
# Preview the first 10 rows to check the column layout.
transactions.head(10)

Unnamed: 0,order_id,product_id,add_to_cart_order
0,2,33120,1
1,2,28985,2
2,2,9327,3
3,2,45918,4
4,2,30035,5
5,2,17794,6
6,2,40141,7
7,2,1819,8
8,2,43668,9
9,3,33754,1


In [5]:
# (rows, columns) of the raw line-item table.
transactions.shape

(33819106, 3)

In [6]:
# Collapse the line items to one row per order_id (with a per-order product
# count); the first element of the resulting shape is the number of orders.
(
    transactions
    .groupby(['order_id'], as_index=False)['product_id']
    .count()
    .rename(columns={'product_id': 'count'})
    .shape
)

(3346083, 2)

In [7]:
# Number of distinct (non-null) product_id values that occur in the line items.
transactions.product_id.nunique()

49685

In [8]:
# Load the product catalogue from CSV.
# The preview of this frame shows product_id, product_name, aisle_id and department_id.
products = pd.read_csv('products.csv')

In [9]:
# Preview the first 10 catalogue rows.
products.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
5,6,Dry Nose Oil,11,11
6,7,Pure Coconut Water With Orange,98,7
7,8,Cut Russet Potatoes Steam N' Mash,116,1
8,9,Light Strawberry Blueberry Yogurt,120,16
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7


In [10]:
# (rows, columns) of the product catalogue.
products.shape

(49688, 4)

In [11]:
# Number of distinct (non-null) department_id values in the catalogue.
products.department_id.nunique()

21

In [12]:
# Attach the catalogue attributes (name, aisle, department) to every line item.
# how="inner" drops any line item whose product_id is missing from `products`.
# validate="many_to_one" makes pandas raise a MergeError if `products` ever
# contains a duplicated product_id, instead of silently duplicating line items.
df_merged = pd.merge(
    transactions,
    products,
    how="inner",
    on="product_id",
    validate="many_to_one",
)

In [13]:
# Preview the merged frame: line items enriched with the product columns.
df_merged.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,product_name,aisle_id,department_id
0,2,33120,1,Organic Egg Whites,86,16
1,26,33120,5,Organic Egg Whites,86,16
2,120,33120,13,Organic Egg Whites,86,16
3,327,33120,5,Organic Egg Whites,86,16
4,390,33120,28,Organic Egg Whites,86,16


In [14]:
# Build one basket per order: the sorted, de-duplicated department ids of the
# products in that order (np.unique both sorts and removes repeats).
txs = (
    df_merged
    .groupby(['order_id'])['department_id']
    .apply(lambda departments: list(np.unique(departments)))
)

In [15]:
# Preview the first baskets (Series indexed by order_id, values are lists of department ids).
txs.head()

order_id
1                                 [4, 15, 16]
2                                 [4, 13, 16]
3                              [3, 4, 12, 16]
4                          [3, 7, 11, 14, 19]
5    [4, 6, 7, 9, 11, 12, 13, 16, 17, 19, 20]
Name: department_id, dtype: object

In [16]:
# Plain Python list of baskets (one inner list per order), as input for the encoder.
txs_list = txs.tolist()

In [17]:
from mlxtend.preprocessing import TransactionEncoder

In [18]:
# Learn the item vocabulary from the baskets and one-hot encode them in a
# single call; `te` keeps the learned column labels in `te.columns_`.
te = TransactionEncoder()
txs_formatted = te.fit_transform(txs_list)

In [19]:
# (number of baskets, number of encoded items) of the one-hot matrix.
txs_formatted.shape

(3346083, 21)

In [20]:
# Display the encoded boolean matrix (the repr abbreviates large arrays with "...").
txs_formatted

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False,  True, ..., False, False, False],
       ...,
       [False, False,  True, ...,  True,  True, False],
       [False, False, False, ...,  True, False, False],
       [ True, False, False, ...,  True, False, False]])

In [21]:
# Wrap the boolean matrix in a DataFrame whose column labels are the items
# learned by the encoder, then preview the first five baskets.
df = pd.DataFrame(data=txs_formatted, columns=te.columns_)
df.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
0,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,False
2,False,False,True,True,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False
3,False,False,True,False,False,False,True,False,False,False,...,False,False,True,False,False,False,False,True,False,False
4,False,False,False,True,False,True,True,False,True,False,...,True,True,False,False,True,True,False,True,True,False


In [22]:
from mlxtend.frequent_patterns import apriori

In [23]:
# Mine frequent itemsets with a minimum support of 0.1, labelling itemsets with
# the DataFrame's column names, then list them from most to least frequent.
apriori_df = apriori(
    df,
    min_support=0.1,
    use_colnames=True,
)
apriori_df.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
2,0.749009,(4)
9,0.676833,(16)
32,0.550223,"(16, 4)"
3,0.453914,(7)
11,0.432969,(19)
...,...,...
33,0.102139,"(17, 4)"
93,0.101744,"(16, 1, 3, 4)"
42,0.101234,"(9, 19)"
88,0.100386,"(19, 13, 7)"


In [24]:
from mlxtend.frequent_patterns import association_rules

In [25]:
# Derive association rules with confidence >= 0.5 from the frequent itemsets,
# then show only the rules with lift above 1.4, strongest lift first.
rules_df = association_rules(
    apriori_df,
    metric="confidence",
    min_threshold=0.5,
)
rules_df[rules_df['lift'] > 1.4].sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
163,"(16, 20)","(19, 4)",0.193277,0.337303,0.104436,0.540346,1.601961,0.039244,1.44173
165,"(20, 4)","(16, 19)",0.206267,0.322617,0.104436,0.506317,1.569408,0.037891,1.372102
107,(20),"(19, 4)",0.239857,0.337303,0.120562,0.502642,1.490181,0.039658,1.332435
145,"(16, 3)","(19, 4)",0.225124,0.337303,0.112843,0.501247,1.486045,0.036908,1.328709
27,(15),(13),0.212404,0.348315,0.109272,0.514452,1.476974,0.035288,1.342164
122,"(16, 3, 4)",(1),0.194753,0.368218,0.101744,0.522425,1.418792,0.030032,1.322895
135,"(16, 19, 4)",(1),0.270548,0.368218,0.140453,0.519142,1.409875,0.040832,1.313862
58,"(16, 20)",(1),0.193277,0.368218,0.100155,0.518196,1.407305,0.028987,1.311283
161,"(16, 20, 4)",(19),0.171983,0.432969,0.104436,0.60725,1.402527,0.029973,1.443747
