In [1]:
import pandas as pd
import numpy as np
from itertools import combinations, groupby
from collections import Counter
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
aisles=pd.read_csv('../input/market-basket-analysis/aisles.csv')
departments=pd.read_csv('../input/market-basket-analysis/departments.csv')
orders=pd.read_csv('../input/market-basket-analysis/orders.csv')
products=pd.read_csv('../input/market-basket-analysis/products.csv')
sample_submission=pd.read_csv('../input/market-basket-analysis/sample_submission.csv')
order_products__prior=pd.read_csv('../input/market-basket-analysis/order_products__prior.csv')
order_products__train=pd.read_csv('../input/market-basket-analysis/order_products__train.csv')

In [4]:
orders['days_since_prior_order']=orders['days_since_prior_order'].fillna(-1)

In [5]:
product_dict = dict(zip(products['product_id'], products['product_name']))
product_dict_inv = dict(zip(products['product_name'], products['product_id']))

In [None]:
#Merging  train order and prior orders first (to check)

Dataset_orders = pd.concat([order_products__prior, order_products__train]).sort_values(by=['order_id'])

In [None]:
#Concatenating with other DFs

Dataset_orders = pd.merge(left = Dataset_orders, right = products,
                             left_on='product_id', right_on='product_id').sort_values(by=['order_id']).reset_index(drop=True)
Dataset_orders = pd.merge(left = Dataset_orders, right = aisles,
                             left_on='aisle_id', right_on='aisle_id').sort_values(by=['order_id']).reset_index(drop=True)
Dataset_orders = pd.merge(left = Dataset_orders, right = departments,
                             left_on='department_id', right_on='department_id').sort_values(by=['order_id']).reset_index(drop=True)
prior_train_orders = pd.merge(left = Dataset_orders, right = orders,
                             left_on='order_id', right_on='order_id').sort_values(by=['order_id']).reset_index(drop=True)

### Apriori

In [8]:
df1 = pd.merge(order_products__prior, orders, on= 'order_id')
prod_aisles = pd.merge(products, aisles, on = 'aisle_id')
df2 = pd.merge(prod_aisles, departments, on = 'department_id')
combined_df = pd.merge(df1, df2, on = 'product_id').reset_index(drop=True)

In [9]:
np.random.seed(942)
temp_df = prior_train_orders.sample(n=1000000)[['user_id','product_name']]
basket = pd.crosstab(temp_df['user_id'],temp_df['product_name']).astype('bool').astype('int')



In [41]:
orders_items = prior_train_orders.set_index('order_id')['product_id']
orders_items.head(10)

order_id
1    49302
1    49683
1    13176
1    43633
1    10246
1    47209
1    22035
1    11109
2    17794
2    30035
Name: product_id, dtype: int64

In [42]:
# frequency counts for items and item pairs
def frequency(iterable):
    return iterable.value_counts().rename("frequency") if type(iterable) == pd.core.series.Series else pd.Series(Counter(iterable)).rename("frequency")

# number of unique orders
def order_count(order_item):
    return len(set(order_item.index))

# generator function to yield item pairs
def build_item_pairs(order_item):
    order_item = order_item.reset_index().values
    for _, data in groupby(order_item, lambda x: x[0]):
        item_list = [item[1] for item in data]

        for item_pair in combinations(item_list, 2):
            yield item_pair

# frequency and support associated with item
def merge_item_stats(item_pairs, item_stats):
    return (item_pairs
            .merge(item_stats.rename(columns={'frequency': 'frequencyA', 'support': 'supportA'}), left_on='item_A', right_index=True)
            .merge(item_stats.rename(columns={'frequency': 'frequencyB', 'support': 'supportB'}), left_on='item_B', right_index=True))

# name associated with item
def merge_item_name(rules, item_name):
    columns = ['itemA', 'itemB', 'freqAB', 'supportAB', 'frequencyA', 'supportA', 'frequencyB', 'supportB',
               'confAtoB', 'confBtoA', 'lift']
    rules = (rules
             .merge(item_name.rename(columns={'item_name': 'itemA'}), left_on='item_A', right_on='item_id')
             .merge(item_name.rename(columns={'item_name': 'itemB'}), left_on='item_B', right_on='item_id'))
    return rules[columns]


In [43]:
def generate_association_rules(order_item, min_support):
    # item frequency and support
    item_stats = frequency(order_item).to_frame("frequency")
    item_stats['support'] = item_stats['frequency'] / order_count(order_item) * 100

    # order_item items below min support
    qualifying_items = item_stats[item_stats['support'] >= min_support].index
    order_item = order_item[order_item.isin(qualifying_items)]

    # orders with less than 2 items
    order_size = frequency(order_item.index)
    qualifying_orders = order_size[order_size >= 2].index
    order_item = order_item[order_item.index.isin(qualifying_orders)]

    # item frequency and support
    item_stats = frequency(order_item).to_frame("frequency")
    item_stats['support'] = item_stats['frequency'] / order_count(order_item) * 100

    # build item pairs
    item_pair_gen = build_item_pairs(order_item)

    # item pair frequency and support
    item_pairs = frequency(item_pair_gen).to_frame("freqAB")
    item_pairs['supportAB'] = item_pairs['freqAB'] / len(qualifying_orders) * 100

    # item_pairs below min support
    item_pairs = item_pairs[item_pairs['supportAB'] >= min_support]

    # association rules df
    item_pairs = item_pairs.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    item_pairs = merge_item_stats(item_pairs, item_stats)

    # association rules metrics
    item_pairs['confAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
    item_pairs['confBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
    item_pairs['lift'] = item_pairs['supportAB'] / (item_pairs['supportA'] * item_pairs['supportB'])

    return item_pairs.sort_values('lift', ascending=False)


In [44]:
rules = generate_association_rules(orders_items, 0.01)

In [45]:
rules[['item_A', 'item_B', 'lift']].sort_values('lift', ascending=False).head(10)

Unnamed: 0,item_A,item_B,lift
38192,11212,12820,5.218205
23112,44781,32201,4.751334
26347,32201,44781,4.649155
45670,93,32792,4.619217
43798,12060,21527,4.191511
22561,21527,12060,3.97601
38297,1577,35050,3.941266
36409,1577,12060,3.914005
31197,12060,1577,3.841016
36080,32792,93,3.76862


In [46]:
rules['item_A'] = [product_dict[id] for id in rules['item_A']]
rules['item_B'] = [product_dict[id] for id in rules['item_B']]

In [47]:
rules[['item_A', 'item_B', 'lift']].sort_values('lift', ascending=False).head(10)

Unnamed: 0,item_A,item_B,lift
38192,Apple Blueberry Fruit Yogurt Smoothie,Organic Fruit Yogurt Smoothie Mixed Berry,5.218205
23112,"0% Greek, Blueberry on the Bottom Yogurt",Nonfat Strawberry With Fruit On The Bottom Gre...,4.751334
26347,Nonfat Strawberry With Fruit On The Bottom Gre...,"0% Greek, Blueberry on the Bottom Yogurt",4.649155
45670,Uncured Cracked Pepper Beef,Chipotle Beef & Pork Realstick,4.619217
43798,Unsweetened Whole Milk Mixed Berry Greek Yogurt,Unsweetened Whole Milk Blueberry Greek Yogurt,4.191511
22561,Unsweetened Whole Milk Blueberry Greek Yogurt,Unsweetened Whole Milk Mixed Berry Greek Yogurt,3.97601
38297,Unsweetened Whole Milk Peach Greek Yogurt,Unsweetened Whole Milk Strawberry Yogurt,3.941266
36409,Unsweetened Whole Milk Peach Greek Yogurt,Unsweetened Whole Milk Mixed Berry Greek Yogurt,3.914005
31197,Unsweetened Whole Milk Mixed Berry Greek Yogurt,Unsweetened Whole Milk Peach Greek Yogurt,3.841016
36080,Chipotle Beef & Pork Realstick,Uncured Cracked Pepper Beef,3.76862
