# Association rules using Instacart Market Basket Analysis Dataset

### Motivation:
Understand consumer behaviour using a machine learning algorithm. 

## 1 - Importing the libraries and preparing the data

In [None]:
import numpy as np 
import pandas as pd 



import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
!pip install apyori

In [None]:
from apyori import apriori
from collections import Counter
from datetime import datetime
from itertools import combinations
import matplotlib.pyplot as plt

> ### Preparing the data

In [None]:
import zipfile

# Unpack every zipped CSV of the dataset into the local 'output' folder,
# in the same order as they are listed here.
archives = (
    '../input/instacart-market-basket-analysis/aisles.csv.zip',
    '../input/instacart-market-basket-analysis/departments.csv.zip',
    '../input/instacart-market-basket-analysis/orders.csv.zip',
    '../input/instacart-market-basket-analysis/order_products__prior.csv.zip',
    '../input/instacart-market-basket-analysis/order_products__train.csv.zip',
    '../input/instacart-market-basket-analysis/products.csv.zip',
)
for archive_path in archives:
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('output')

Checking if the files are in the folder 'output'

In [None]:
from subprocess import check_output
print(check_output(["ls", "./output"]).decode("utf8"))

### Reading the datasets
Checking data types and other information about the Aisles, Departments, Products and Orders Datasets. 

The aisles dataset lists all the aisles in the supermarket. Each aisle is assigned an ID.

In [None]:
aisles = pd.read_csv('./output/aisles.csv')
aisles.head()

In [None]:
aisles.info()

In [None]:
aisles.isna().sum(axis = 0)

The departments dataset lists all the departments in the supermarket. Each department is assigned an ID.

In [None]:
departments = pd.read_csv('./output/departments.csv')
departments.head()

In [None]:
departments.info()

In [None]:
departments.shape

There are 21 departments in this supermarket

The products dataset lists every product in the supermarket, classified by aisle ID and department ID.

In [None]:
products = pd.read_csv('./output/products.csv') 
products.head()

In [None]:
products.info()

In [None]:
products.shape

In [None]:
products.isna().sum(axis = 0)

In [None]:
products.describe()

The orders dataset tells which evaluation set (prior, train, test) each order belongs to; this split was defined for training and evaluating a machine learning model.

In [None]:
orders = pd.read_csv('./output/orders.csv') 
orders.head()

In [None]:
orders.info()

In [None]:
orders.shape

Let's check how many data points are associated to each label of the eval_set column:

In [None]:
orders.eval_set.value_counts()

Deleting unnecessary data for this analysis:

In [None]:
orders.drop('eval_set', axis = 1, inplace=True)

Checking for missing values

In [None]:
orders.describe()

In [None]:
orders.isna().sum(axis = 0)

In [None]:
orders.loc[orders.days_since_prior_order.isna()]

The column days_since_prior_order has NaN values because a user's first order (order_number = 1) has no prior order to measure the interval from.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(18, 6))
ax[0].boxplot(orders.order_dow, patch_artist=True)
ax[0].set_title('Boxplot day of week')
ax[0].set_ylabel('day of week')
n_bins=12
ax[1].hist(orders.order_hour_of_day, color='c', bins=n_bins)
ax[1].set_title('Histogram hour of day')
ax[1].set_xlabel('hour')
ax[1].set_ylabel('count');

In [None]:
plt.figure(figsize = (15,5))
plt.bar(range(200), orders.days_since_prior_order[:200] + 1)
plt.title('Days since prior order')
plt.xlabel('index')
plt.ylabel('days since prior order + 1');

In [None]:
orders_prior = pd.read_csv('./output/order_products__prior.csv') 
orders_prior.head()

This dataframe holds an important piece of information: the 'reordered' column is a 0/1 flag telling us whether the product in that order line is a reorder (1) or not (0).

In [None]:
orders_prior.info()

In [None]:
orders_prior.shape

In [None]:
orders_prior.isna().sum(axis=0)

There are no NaN values in this dataframe so we do not need to treat the data. 

### Creating a new dataframe for the study

In [None]:
orders_apriori = orders.copy()
orders_apriori.head()

In [None]:
#Visualizing number of orders by user id:
orders_by_user = orders.groupby('user_id')['order_number'].max()

In [None]:
orders_by_user.head()

User 1 made 11 orders, user 2 made 15 orders and so forth...

In [None]:
orders_prior.head(15)

Now we will merge two dataframes: orders (only the columns order_id and user_id) and orders_prior (order_id and the largest add_to_cart_order of each order, renamed to size_of_order).

In [None]:
products_by_user = orders[['order_id', 'user_id']].merge(
    orders_prior[['order_id', 'add_to_cart_order']].groupby('order_id').max().rename({'add_to_cart_order': 'size_of_order'}, axis = 1),
                                                                                        on = 'order_id')

In [None]:
products_by_user.head()

In [None]:
products_by_user = products_by_user.drop('order_id', axis = 1).groupby('user_id')['size_of_order'].sum()

Checking minimum and maximum orders by user

In [None]:
min_ord= min(orders_by_user)
max_ord= max(orders_by_user)
print("The maximum orders by user is {} and the minimum orders by user is {} " .format(max_ord, min_ord))


In [None]:
fig, ax = plt.subplots(1, 2, figsize=(16,5))
ax[0].hist(orders_by_user, bins = max(orders_by_user) - min(orders_by_user))
ax[0].set_title('Count of orders by user')
ax[0].set_xlabel('number of orders')
ax[0].set_ylabel('count')

ax[1].hist(products_by_user, bins = 100, color='k')
ax[1].set_title('Count of products by user')
ax[1].set_xlabel('number of products')
ax[1].set_ylabel('count');

In [None]:
orders_apriori.drop(['user_id', 'order_id'], axis = 1, inplace=True)

In [None]:
orders_apriori.head()

#### Orders by number

In [None]:
orders.head()

In [None]:
orders_by_order_number = orders.order_number.value_counts()
orders_by_order_number

In [None]:
plt.bar(orders_by_order_number.index, orders_by_order_number)
plt.title('Number of orders by order number')
plt.xlabel('order number')
plt.ylabel('number of orders');

In [None]:
def order_number_categorical(order_number):
    """Bucket an order's sequence number into a categorical label.

    The buckets are inclusive and match the label text: 1-3, 4-5, 6-10,
    11-20, 21-40, 41-60, and everything above 60.

    Parameters
    ----------
    order_number : int
        Position of the order in the user's history (1 = first order).

    Returns
    -------
    str or None
        One of the 'order_number_*' labels, or None when the value is not
        at least 1 (this also covers NaN).
    """
    # `not x >= 1` (rather than `x < 1`) also rejects NaN.
    if not order_number >= 1:
        return None

    # (inclusive upper bound, label), scanned in ascending order.
    buckets = (
        (3, 'order_number_1-3'),
        (5, 'order_number_4-5'),
        (10, 'order_number_6-10'),
        (20, 'order_number_11-20'),
        (40, 'order_number_21-40'),
        (60, 'order_number_41-60'),
    )
    for upper_bound, label in buckets:
        if order_number <= upper_bound:
            return label
    return 'order_number_60+'

In [None]:
orders_apriori.order_number = orders_apriori.order_number.map(order_number_categorical)

#### Orders by day of the week

In [None]:
orders_by_dow = orders.order_dow.value_counts()
orders_by_dow

In [None]:
products_by_dow = orders[['order_id', 'order_dow']].merge(
    orders_prior[['order_id', 'add_to_cart_order']].groupby('order_id').max().rename({'add_to_cart_order': 'order_size'}, axis = 1),
    on = 'order_id'
)

In [None]:
products_by_dow = products_by_dow.drop('order_id', axis=1).groupby('order_dow')['order_size'].sum()

In [None]:
products_by_dow

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].bar(orders_by_dow.index, orders_by_dow)
ax[0].set_title('Number of orders by day of week')
ax[0].set_xlabel('day of week')
ax[0].set_ylabel('number of orders')

ax[1].bar(products_by_dow.index, products_by_dow)
ax[1].set_title('Number of products by day of week')
ax[1].set_xlabel('day of week')
ax[1].set_ylabel('number of products');

In [None]:
def dow_categorical(dow):
    """Collapse a day-of-week code into 'weekend' or 'weekday'.

    Codes 0 and 1 map to 'weekend'; every other value maps to 'weekday'.
    NOTE(review): treating codes 0 and 1 as the weekend is an assumption
    about how the dataset encodes days - confirm.
    """
    return 'weekend' if dow in (0, 1) else 'weekday'

In [None]:
orders_apriori.order_dow = orders_apriori.order_dow.map(dow_categorical)
orders_apriori.head()

#### Hour of the day

In [None]:
orders_by_hour = orders.order_hour_of_day.value_counts()
orders_by_hour

In [None]:
products_by_hour = orders[['order_id', 'order_hour_of_day']].merge(
    orders_prior[['order_id', 'add_to_cart_order']].groupby('order_id').max().rename({'add_to_cart_order': 'order_size'}, axis = 1),
    on = 'order_id'
)
products_by_hour = products_by_hour.drop('order_id', axis = 1).groupby('order_hour_of_day')['order_size'].sum()

In [None]:
products_by_hour

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].bar(orders_by_hour.index, orders_by_hour, color='m')
ax[0].set_title('Number of orders by hour of day')
ax[0].set_xlabel('hour of day')
ax[0].set_ylabel('number of orders')

ax[1].bar(products_by_hour.index, products_by_hour, color='k')
ax[1].set_title('Number of products by hour of day')
ax[1].set_xlabel('hour of day')
ax[1].set_ylabel('number of products');

In [None]:
def hour_categorical(hour):
    """Map an hour of the day to a categorical label.

    Hours 0-6 become 'early_hours' and hours 10-16 become 'peak_hours'.
    Hours 7-9 and 17-23 keep an individual 'hour_<n>' label.
    Any value outside 0-23 yields None.
    """
    if hour in range(10, 17):
        return 'peak_hours'
    if hour in range(7):
        return 'early_hours'
    keeps_own_label = hour in range(7, 10) or hour in range(17, 24)
    if keeps_own_label:
        return 'hour_' + str(hour)
    return None

In [None]:
orders_apriori.order_hour_of_day = orders_apriori.order_hour_of_day.map(hour_categorical)
orders_apriori.head()

In [None]:
plt.hist(orders.days_since_prior_order, bins = 30)
plt.title('Histogram of days since prior order')
plt.xlabel('days')
plt.ylabel('count of days');

In [None]:
def interval_categorical(interval):
    """Map the number of days since the prior order to a categorical label.

    NaN becomes 'first_order', exactly 7, 14 or 21 days becomes
    'interval_weekly', exactly 30 days becomes 'interval_30+', and anything
    else becomes 'interval_others'.
    NOTE(review): the '30+' label suggests the dataset caps this column at
    30 - confirm.
    """
    if np.isnan(interval):
        return 'first_order'
    if interval == 30:
        return 'interval_30+'
    if interval in (7, 14, 21):
        return 'interval_weekly'
    return 'interval_others'

In [None]:
orders_apriori.days_since_prior_order = orders_apriori.days_since_prior_order.map(interval_categorical)
orders_apriori.head()

### Products bought again

In [None]:
products_id_to_name = {k: v for k, v in zip(products.product_id, products.product_name)}
print(products_id_to_name)

In [None]:
order_products_names = orders_prior.copy()
order_products_names['product_name'] = order_products_names.product_id.map(lambda x: products_id_to_name[x])
order_products_names

In [None]:
reorder_proportion = pd.crosstab(order_products_names.product_name, order_products_names.reordered)
reorder_proportion

In [None]:
reorder_proportion.sort_values(by = 0, ascending=False)

In [None]:
reorder_proportion.sort_values(by = 1, ascending=False)

In [None]:
reorder_proportion['total'] = reorder_proportion.sum(axis = 1)
reorder_proportion['0.perc'] = reorder_proportion[0] / reorder_proportion['total']
reorder_proportion['1.perc'] = reorder_proportion[1] / reorder_proportion['total']

In [None]:
reorder_proportion.head()

In [None]:
reorder_proportion.sort_values(by = ['0.perc', 'total'], ascending = False)[['0.perc', 'total']]

In [None]:
reorder_proportion.sort_values(by = ['1.perc', 'total'], ascending = False)[['1.perc', 'total']]

In [None]:
reorder_proportion.total.sort_values(ascending=False)

### Products which were not bought

In [None]:
products_bought = sorted(orders_prior.product_id.unique())
print(len(products_bought), len(products))

In [None]:
products_not_bought = list(products.product_id[~products.product_id.isin(products_bought)])
products_not_bought

In [None]:
[products_id_to_name[product] for product in products_not_bought]

In [None]:
products_not_registered = list(pd.Series(products_bought)[~pd.Series(products_bought).isin(products.product_id)])
print(len(products_not_registered), products_not_registered)

### Size of basket

In [None]:
cart_size = orders_prior.groupby('order_id')['add_to_cart_order'].max()
cart_size

In [None]:
cart_size = cart_size.value_counts()
cart_size

In [None]:
plt.bar(cart_size.index, cart_size)
plt.title('Count of order size')
plt.xlabel('order size')
plt.ylabel('count');

### Best sellers

In [None]:
add_to_cart = pd.crosstab(order_products_names.product_name, order_products_names.add_to_cart_order)
add_to_cart

In [None]:
for i in range(1,6):
  print('ORDER = ', i)
  print(add_to_cart.sort_values(by = i, ascending=False)[i][:5])
  print('\n')

## Association Rules

### Purchase behavior

In [None]:
orders_apriori.head()

In [None]:
orders_apriori.shape

In [None]:
# One transaction per order: the row's categorical values as a list of strings.
# Converting the whole frame at once avoids re-reading `orders_apriori.values`
# for every single cell, which is very slow on a frame with millions of rows.
transactions = orders_apriori.astype(str).values.tolist()

In [None]:
transactions[:4]

In [None]:
0.005 * 100

In [None]:
start = datetime.now()
rules = apriori(transactions, min_support = 0.005, min_confidence = 0.2, min_lift = 2)
results = list(rules)
print('Execution time: ', datetime.now() - start)

In [None]:
len(results)

In [None]:
results[0]

In [None]:
results[0][0]

In [None]:
results[0][1]

In [None]:
r = results[0][2]
r

In [None]:
type(r)

In [None]:
r[0]

In [None]:
r[1]

In [None]:
r[0][2]

In [None]:
r[0][3]

In [None]:
# Flatten the nested apriori results into parallel lists, one entry per rule:
# every rule inherits the support of the result (itemset) it belongs to.
A, B, support, confidence, lift = [], [], [], [], []

for result in results:
    for result_rule in result[2]:
        A.append(list(result_rule[0]))
        B.append(list(result_rule[1]))
        support.append(result[1])
        confidence.append(result_rule[2])
        lift.append(result_rule[3])

rules_df = pd.DataFrame({
    'A': A,
    'B': B,
    'support': support,
    'confidence': confidence,
    'lift': lift,
})

# Strongest associations first.
rules_df = rules_df.sort_values(by = 'lift', ascending = False)
rules_df = rules_df.reset_index(drop = True)
len(rules_df)

In [None]:
A[0]

In [None]:
B[0]

In [None]:
A[1], B[1]

In [None]:
rules_df.head()

### Associations between products

#### Pipeline

In [None]:
transactions_df = orders_prior[['order_id', 'product_id']][:5000]
transactions_df

In [None]:
n_orders = len(set(transactions_df.order_id))
n_products = len(set(transactions_df.product_id))
print(n_orders, n_products)

In [None]:
transactions_df.product_id.value_counts()

In [None]:
product_frequency = transactions_df.product_id.value_counts() / n_orders
product_frequency

In [None]:
min(product_frequency), max(product_frequency)

In [None]:
plt.hist(product_frequency, bins = 100)
plt.title('Number of times each product frequency occurs')
plt.xlabel('product frequency')
plt.ylabel('number of times');

In [None]:
plt.hist(product_frequency, bins = 100)
plt.title('Number of times each product frequency occurs')
plt.xlabel('product frequency')
plt.ylabel('number of times')
plt.ylim([0, 100]);

In [None]:
min_support = 0.01
products_apriori = product_frequency[product_frequency >= min_support]
print(products_apriori)

In [None]:
products_apriori.index

In [None]:
transactions_apriori = transactions_df[transactions_df.product_id.isin(products_apriori.index)]
transactions_apriori

In [None]:
order_sizes = transactions_apriori.order_id.value_counts()
order_sizes

In [None]:
max(order_sizes), min(order_sizes)

In [None]:
plt.hist(order_sizes, bins = max(order_sizes) - min(order_sizes))
plt.title('Number of times each order size occurs')
plt.xlabel('order size')
plt.ylabel('number of times');

In [None]:
min_lenght = 2
orders_apriori = order_sizes[order_sizes >= min_lenght]
print(orders_apriori)

In [None]:
transactions_apriori = transactions_apriori[transactions_apriori.order_id.isin(orders_apriori.index)]
print(transactions_apriori)

In [None]:
transactions_by_order = transactions_apriori.groupby('order_id')['product_id']
for order_id, order_list in transactions_by_order:
  print('Order_id:', order_id, '\nOrder_list: ', list(order_list))
  product_combinations = combinations(order_list, 2)
  print('Product combinations:')
  print([i for i in product_combinations])
  print('\n')

In [None]:
min(1, 4)

In [None]:
def product_combinations(transactions_df, max_length = 5):
    """Lazily yield every product combination found inside each order.

    Orders are taken from grouping `transactions_df` by 'order_id'. The
    product ids of each order are sorted, then all combinations of size 2 up
    to `max_length` (or the order size, whichever is smaller) are yielded as
    tuples, shorter combinations first.
    """
    for _, basket in transactions_df.groupby('order_id')['product_id']:
        items = sorted(basket)
        longest = min(max_length, len(items))
        for size in range(2, longest + 1):
            yield from combinations(items, size)

In [None]:
combs = product_combinations(transactions_apriori)

In [None]:
combs

In [None]:
next(iter(combs))

In [None]:
next(iter(combs))

In [None]:
for _ in range(100):
  print(next(iter(combs)))

In [None]:
n_orders

In [None]:
combs = product_combinations(transactions_apriori)
counter = Counter(combs).items()
combinations_count = pd.Series([x[1] for x in counter], index = [x[0] for x in counter])
combinations_frequency = combinations_count / n_orders
print(combinations_frequency)

In [None]:
min(combinations_frequency), max(combinations_frequency)

In [None]:
combinations_apriori = combinations_frequency[combinations_frequency >= min_support]
combinations_apriori = combinations_apriori[combinations_apriori.index.map(len) >= min_lenght]
print(combinations_apriori, len(combinations_apriori))

In [None]:
# Enumerate candidate rules A -> B: every frequent combination is split in all
# possible ways into a left-hand part (A) and the remaining products (B).
# The three lists are parallel: position i of A, B and AB describes one rule.
A = []
B = []
AB = []
# presumably each index entry is a tuple of product ids - verify the index type
for c in combinations_apriori.index:
  c_length = len(c)
  # l is the size of the left-hand part; it stops at c_length - 1 so that the
  # right-hand part is never empty.
  for l in range(1, c_length):
    comb = combinations(c, l)
    for a in comb:
      AB.append(c)
      # b = the members of c that are not in a (removed one occurrence at a time)
      b = list(c)
      for e in a:
        b.remove(e)
      b = tuple(b)
      # Single-product sides are unwrapped from their 1-tuple to the bare value.
      if len(a) == 1:
        a = a[0]
      A.append(a)
      if len(b) == 1:
        b = b[0]
      B.append(b)

In [None]:
apriori_df = pd.DataFrame({'A': A,
                           'B': B,
                           'AB': AB})

In [None]:
apriori_df.head()

In [None]:
products_apriori

In [None]:
combinations_frequency

In [None]:
support = {**{k: v for k, v in products_apriori.items()},
           **{k: v for k, v in combinations_frequency.items()}}

In [None]:
support

In [None]:
apriori_df[['support_A', 'support_B', 'support_AB']] = apriori_df[['A', 'B', 'AB']].applymap(lambda x: support[x])

In [None]:
apriori_df.head()

In [None]:
apriori_df.drop('AB', axis = 1, inplace=True)
apriori_df.head()

In [None]:
apriori_df['confidence'] = apriori_df.support_AB / apriori_df.support_A

In [None]:
apriori_df['lift'] = apriori_df.confidence / apriori_df.support_B

In [None]:
min_confidence = 0.2
min_lift = 1.0
apriori_df = apriori_df[apriori_df.confidence >= min_confidence]
apriori_df = apriori_df[apriori_df.lift >= min_lift]

In [None]:
apriori_df = apriori_df.sort_values(by = 'lift', ascending=False).reset_index(drop = True)
apriori_df.head()

In [None]:
products_id_to_name[12341]

In [None]:
def convert_product_id_to_name(product_ids, id_to_name = None):
    """Translate a product id, or a collection of product ids, into names.

    Parameters
    ----------
    product_ids : int, numpy integer, or iterable of those
        A single product id or several of them (e.g. a tuple).
    id_to_name : mapping, optional
        Lookup table from product id to product name. Defaults to the
        notebook-level `products_id_to_name` dictionary.

    Returns
    -------
    The name for a single id, or a tuple of names (in input order) for a
    collection of ids.

    Raises
    ------
    KeyError
        If an id is missing from the lookup table.
    """
    if id_to_name is None:
        id_to_name = products_id_to_name

    # Values coming out of pandas are often numpy integers, which are not
    # instances of `int`; treat both as a single id.
    if isinstance(product_ids, (int, np.integer)):
        return id_to_name[product_ids]
    return tuple(id_to_name[prod] for prod in product_ids)

In [None]:
apriori_df[['A', 'B']] = apriori_df[['A', 'B']].applymap(convert_product_id_to_name)
apriori_df

### Creating association rules

In [None]:
def association_rules(orders_prior, min_support, min_length = 2, max_length = 5, 
                      min_confidence = 0.2, min_lift = 1.0):
    """Mine association rules A -> B between products bought in the same order.

    Parameters
    ----------
    orders_prior : pandas.DataFrame
        One row per (order, product); must have the columns 'order_id' and
        'product_id'. Assumes a product appears at most once per order.
    min_support : float
        Minimum fraction of all orders that a product, and a combination of
        products, must appear in to be considered.
    min_length : int
        Minimum number of products in a combination for rules to be derived
        from it.
    max_length : int
        Maximum number of products in a combination.
    min_confidence : float
        Minimum support(A and B) / support(A) for a rule to be kept.
    min_lift : float
        Minimum confidence / support(B) for a rule to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns 'A', 'B', 'support_A', 'support_B', 'support_AB',
        'confidence' and 'lift', sorted by descending lift. 'A' and 'B' hold a
        bare product id when the side has one product and a tuple of product
        ids otherwise. The frame is empty when no rule passes the thresholds.
    """
    print('Loading data...')
    transactions_df = orders_prior[['order_id', 'product_id']]

    print('Calculating product supports...')
    n_orders = len(set(transactions_df.order_id))
    product_frequency = transactions_df.product_id.value_counts() / n_orders
    products_apriori = product_frequency[product_frequency >= min_support]
    transactions_apriori = transactions_df[transactions_df.product_id.isin(products_apriori.index)]

    # Keep every order that still holds at least two frequent products: those
    # are the only ones that can produce a combination. Filtering on
    # min_length here would under-count small combinations, which are needed
    # as the support of the rule sides.
    order_sizes = transactions_apriori.order_id.value_counts()
    orders_apriori = order_sizes[order_sizes >= 2]
    transactions_apriori = transactions_apriori[transactions_apriori.order_id.isin(orders_apriori.index)]

    print('Calculating product combinations and supports...')
    combination_counts = Counter()
    for _, order_products in transactions_apriori.groupby('order_id')['product_id']:
        # Sorting makes the same set of products always produce the same tuple.
        basket = sorted(order_products.tolist())
        for size in range(2, min(max_length, len(basket)) + 1):
            combination_counts.update(combinations(basket, size))

    # Support lookup keyed by tuple; single products use 1-tuples.
    support = {(product,): frequency for product, frequency in products_apriori.items()}
    for itemset, count in combination_counts.items():
        support[itemset] = count / n_orders

    columns = ['A', 'B', 'support_A', 'support_B', 'support_AB', 'confidence', 'lift']
    rows = []
    for itemset in combination_counts:
        support_ab = support[itemset]
        if len(itemset) < min_length or support_ab < min_support:
            continue
        # Split the itemset in every possible way into antecedent/consequent.
        for size in range(1, len(itemset)):
            for antecedent in combinations(itemset, size):
                remaining = list(itemset)
                for product in antecedent:
                    remaining.remove(product)
                consequent = tuple(remaining)

                support_a = support[antecedent]
                support_b = support[consequent]
                confidence = support_ab / support_a
                lift = confidence / support_b
                if confidence < min_confidence or lift < min_lift:
                    continue

                # Single-product sides are reported as the bare product id.
                side_a = antecedent[0] if len(antecedent) == 1 else antecedent
                side_b = consequent[0] if len(consequent) == 1 else consequent
                rows.append((side_a, side_b, support_a, support_b, support_ab, confidence, lift))

    rules_df = pd.DataFrame(rows, columns = columns)
    return rules_df.sort_values(by = 'lift', ascending = False).reset_index(drop = True)

#### Running the rules

In [None]:
orders_prior.head()

In [None]:
start = datetime.now()
rules = association_rules(orders_prior, min_support = 0.01)
print('Execution time: ', datetime.now() - start)

In [None]:
rules

In [None]:
# Timed run with a lower support threshold and combinations of at most 4 products.
# The transactions live in `orders_prior`; no `order_products` frame is defined.
start = datetime.now()
rules = association_rules(orders_prior, min_support = 0.005, max_length=4)
print('Execution time: ', datetime.now() - start)

In [None]:
# Timed run with a lower support threshold and combinations of at most 3 products.
# The transactions live in `orders_prior`; no `order_products` frame is defined.
start = datetime.now()
rules = association_rules(orders_prior, min_support = 0.002, max_length=3)
print('Execution time: ', datetime.now() - start)

In [None]:
rules.head()

In [None]:
# Timed run with the lowest support threshold, restricted to pairs of products.
# The transactions live in `orders_prior`; no `order_products` frame is defined.
start = datetime.now()
rules = association_rules(orders_prior, min_support = 0.001, max_length=2)
print('Execution time: ', datetime.now() - start)

In [None]:
rules.head()