Instacart market-basket analysis: aisle-level association rules (Apriori)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Fix NumPy's legacy global RNG so any random draw is repeatable across runs.
# NOTE(review): none of the cells visible here draws random numbers - confirm
# whether the seed is still needed.
RANDOM_SEED = 210508
np.random.seed(RANDOM_SEED)

In [3]:
# Raw order lines: one row per (order, product) pair, read from the working
# directory.
transactions = pd.read_csv(filepath_or_buffer='all_order_products.csv')

In [4]:
# Preview the first ten order lines.
transactions.head(n=10)

Unnamed: 0,order_id,product_id,add_to_cart_order
0,2,33120,1
1,2,28985,2
2,2,9327,3
3,2,45918,4
4,2,30035,5
5,2,17794,6
6,2,40141,7
7,2,1819,8
8,2,43668,9
9,3,33754,1


In [5]:
# (rows, columns) of the raw order-line table.
transactions.shape

(33819106, 3)

In [6]:
# Collapse to one row per order (with its product count); the first element of
# the resulting shape is therefore the number of distinct orders.
(
    transactions
    .groupby(['order_id'], as_index=False)['product_id']
    .count()
    .rename(columns={'product_id': 'count'})
    .shape
)

(3346083, 2)

In [7]:
# How many distinct products appear in at least one order line.
transactions['product_id'].nunique()

49685

In [8]:
# Product catalogue: maps each product_id to its name, aisle and department.
products = pd.read_csv(filepath_or_buffer='products.csv')

In [9]:
# Preview the first ten catalogue entries.
products.head(n=10)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
5,6,Dry Nose Oil,11,11
6,7,Pure Coconut Water With Orange,98,7
7,8,Cut Russet Potatoes Steam N' Mash,116,1
8,9,Light Strawberry Blueberry Yogurt,120,16
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7


In [10]:
# (rows, columns) of the product catalogue.
products.shape

(49688, 4)

In [11]:
# Number of distinct aisles in the catalogue; the association analysis below
# works at aisle granularity rather than per product.
products['aisle_id'].nunique()

134

In [12]:
# Attach catalogue metadata (product name, aisle, department) to every order
# line. The inner join keeps only order lines whose product_id exists in the
# catalogue.
#
# `how` is passed by keyword so the join type is explicit at the call site.
# `validate="many_to_one"` makes pandas raise MergeError if `products` contains
# a repeated product_id; without it, a duplicated catalogue row would silently
# multiply the matching order lines.
df_merged = pd.merge(
    transactions,
    products,
    how="inner",
    on="product_id",
    validate="many_to_one",
)

In [13]:
# Spot-check the joined table: each order line now carries its aisle.
df_merged.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,product_name,aisle_id,department_id
0,2,33120,1,Organic Egg Whites,86,16
1,26,33120,5,Organic Egg Whites,86,16
2,120,33120,13,Organic Egg Whites,86,16
3,327,33120,5,Organic Egg Whites,86,16
4,390,33120,28,Organic Egg Whites,86,16


In [14]:
def _sorted_unique(values):
    """Return the distinct entries of ``values`` as an ascending list."""
    # np.unique both de-duplicates and sorts.
    return list(np.unique(values))


# One basket per order: the de-duplicated, sorted aisle ids the order touches.
txs = df_merged.groupby(['order_id'])['aisle_id'].apply(_sorted_unique)

In [15]:
# Peek at the first few baskets (order_id -> list of aisle ids).
txs.head(n=5)

order_id
1                           [21, 24, 83, 95, 108, 120]
2                  [17, 19, 83, 86, 88, 104, 105, 123]
3                          [35, 83, 91, 112, 120, 123]
4                [3, 11, 31, 48, 64, 78, 93, 107, 125]
5    [1, 4, 21, 24, 32, 33, 45, 49, 54, 61, 78, 83,...
Name: aisle_id, dtype: object

In [16]:
# Plain list of baskets (a list of lists), which is the input handed to the
# TransactionEncoder further down.
txs_list = txs.tolist()

In [17]:
from mlxtend.preprocessing import TransactionEncoder

In [18]:
# One-hot encode the baskets: learn the item vocabulary and build the boolean
# order-by-aisle matrix in a single call.
te = TransactionEncoder()
txs_formatted = te.fit_transform(txs_list)

In [19]:
# (orders, aisles) dimensions of the encoded matrix.
txs_formatted.shape

(3346083, 134)

In [20]:
# Show the encoded boolean matrix; NumPy abbreviates the repr for large arrays.
txs_formatted

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False,  True, ..., False, False, False]])

In [21]:
# Wrap the boolean matrix in a DataFrame whose column labels are the aisle ids
# learned by the encoder, which is the layout `apriori` is given below.
df = pd.DataFrame(data=txs_formatted, columns=te.columns_)
df.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,125,126,127,128,129,130,131,132,133,134
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,True,False,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,False


In [22]:
from mlxtend.frequent_patterns import apriori

In [23]:
# Frequent itemsets: keep aisle combinations present in at least 10% of
# orders, labelled by aisle id rather than column position.
apriori_df = apriori(
    df,
    min_support=0.1,
    use_colnames=True,
)
# Most common itemsets first.
apriori_df.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
1,0.556755,(24)
5,0.444341,(83)
15,0.367445,(123)
19,0.318137,"(24, 83)"
26,0.270937,"(24, 123)"
14,0.263093,(120)
6,0.243671,(84)
29,0.235156,"(123, 83)"
0,0.22985,(21)
12,0.191574,(115)


In [24]:
from mlxtend.frequent_patterns import association_rules

In [25]:
# Derive rules from the frequent itemsets, keeping those with confidence of at
# least 0.5.
rules_df = association_rules(
    apriori_df,
    metric="confidence",
    min_threshold=0.5,
)
# Show only rules whose lift exceeds 1.4, strongest lift first.
has_high_lift = rules_df['lift'] > 1.4
rules_df.loc[has_high_lift].sort_values("lift", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
21,"(24, 83)",(123),0.318137,0.367445,0.187271,0.588649,1.602006,0.070373,1.537749
22,(123),"(24, 83)",0.367445,0.318137,0.187271,0.509658,1.602006,0.070373,1.390585
19,"(24, 123)",(83),0.270937,0.444341,0.187271,0.691198,1.555556,0.066882,1.7994
23,"(120, 24)",(123),0.187954,0.367445,0.105528,0.561458,1.528007,0.036466,1.442405
15,"(24, 21)",(83),0.15521,0.444341,0.10465,0.674247,1.517407,0.035684,1.705765
24,"(120, 123)",(24),0.127933,0.556755,0.105528,0.824872,1.48157,0.034301,2.530974
18,"(120, 83)",(24),0.144146,0.556755,0.117873,0.817736,1.468754,0.037619,2.431887
13,(123),(83),0.367445,0.444341,0.235156,0.639977,1.440283,0.071885,1.543399
14,(83),(123),0.444341,0.367445,0.235156,0.529224,1.440283,0.071885,1.343645
20,"(83, 123)",(24),0.235156,0.556755,0.187271,0.796368,1.430374,0.056347,2.176699
