In [None]:
import os
import numpy as np
import pandas as pd
from zipfile import ZipFile
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display
import sys

# Directory holding the zipped competition CSVs; the extract_zip(...) calls below prefix it to each archive name.
# NOTE(review): `input` shadows Python's builtin input(); the name is kept because later cells reference it.
input = '../input/instacart-market-basket-analysis/'
# Directory the CSVs are read from by the pd.read_csv calls below.
# NOTE(review): extract_zip() unpacks into the current working directory, so this presumably
# has to be the kernel's working directory — confirm when running outside Kaggle.
output = '/kaggle/working/'

In [None]:
def extract_zip(file_name: str, path=None) -> None:
    """Extract every member of a zip archive and print ``Done!``.

    Args:
        file_name: Path of the ``.zip`` archive to open.
        path: Directory to extract into. ``None`` (the default) extracts into
            the current working directory, which is what the original
            one-argument call did.

    Returns:
        None. (The previous ``-> str`` annotation was wrong: nothing is returned.)

    Raises:
        FileNotFoundError: if ``file_name`` does not exist.
        zipfile.BadZipFile: if the file is not a valid zip archive.
    """
    # `archive` rather than `zip`, so the builtin zip() is not shadowed.
    with ZipFile(file_name, 'r') as archive:
        archive.extractall(path)
        print('Done!')

In [None]:
def size(obj):
    """Format the size that ``sys.getsizeof`` reports for ``obj`` as megabytes.

    Uses decimal megabytes (1 MB = 1,000,000 bytes) and two decimal places,
    e.g. ``"2.00 MB"``.
    """
    bytes_per_megabyte = 1000 * 1000
    megabytes = sys.getsizeof(obj) / bytes_per_megabyte
    return "{0:.2f} MB".format(megabytes)

In [None]:
# Unpack each competition archive found under `input`, in the same order as before.
for archive_name in (
    'aisles.csv.zip',
    'departments.csv.zip',
    'order_products__prior.csv.zip',
    'order_products__train.csv.zip',
    'orders.csv.zip',
    'products.csv.zip',
    'sample_submission.csv.zip',
):
    extract_zip(input + archive_name)

In [None]:
# Read the two CSVs this notebook uses from `output`: the prior order/product rows
# first, then the product table.
order_products_prior, products = (
    pd.read_csv(output + csv_name)
    for csv_name in ('order_products__prior.csv', 'products.csv')
)

In [None]:
# Report the loaded table's shape together with its in-memory footprint.
print(
    'order_products_prior -- dimensions: {0};   size: {1}'.format(
        order_products_prior.shape,
        size(order_products_prior),
    )
)

In [None]:
# Collapse the table to a Series: index = order_id, values = product_id (renamed item_id).
# The isinstance guard makes the cell safe to re-run. Without it a second run would call
# set_index on the already-converted Series and raise AttributeError.
if isinstance(order_products_prior, pd.DataFrame):
    order_products_prior = (
        order_products_prior
        .set_index('order_id')['product_id']
        .rename('item_id')
    )
display(order_products_prior.head(10))
type(order_products_prior)

In [None]:
# Summarise the order -> item Series: shape, memory footprint, distinct orders, distinct items.
print(
    'dimensions: {0};   size: {1};   unique_orders: {2};   unique_items: {3}'.format(
        order_products_prior.shape,
        size(order_products_prior),
        # dropna=False mirrors len(index.unique()), which would count a missing id as a value
        order_products_prior.index.nunique(dropna=False),
        # value_counts() ignores missing values, and so does nunique() by default
        order_products_prior.nunique(),
    )
)

In [None]:
class AprioriAlg:
    """Mine pairwise association rules (support, confidence, lift) from order data.

    The methods operate on ``order_item``: a pandas Series whose index holds
    the order id and whose values hold the item id, one row per (order, item).
    Support values are percentages of the order count (0-100), so
    ``min_support=0.01`` means "present in at least 0.01 % of orders".
    Confidence and lift are plain ratios.
    """

    def freq(self, iterable):
        """Return occurrence counts as a Series named ``freq``.

        pandas Series and Index objects are counted with the vectorised
        ``value_counts``. Any other iterable (for example the item-pair
        generator) is consumed once through ``collections.Counter``.
        """
        if isinstance(iterable, (pd.Series, pd.Index)):
            return iterable.value_counts().rename("freq")
        return pd.Series(Counter(iterable)).rename("freq")

    def order_count(self, order_item):
        """Return the number of distinct order ids in the index of ``order_item``."""
        return order_item.index.nunique()

    def get_item_pairs(self, order_item):
        """Yield every unordered pair of items that share an order, one at a time.

        Each pair is emitted as ``(smaller, larger)`` so that the same two
        items always produce the same key no matter in which sequence they
        appear inside the order. Item ids must therefore be mutually
        sortable. An item repeated within one order still yields an
        ``(item, item)`` pair, so the input is assumed to hold each item at
        most once per order.
        """
        # itertools.groupby only merges *consecutive* equal keys, so rows of
        # one order must be adjacent. A stable sort keeps the within-order
        # sequence; it is skipped when the index is already ordered to avoid
        # copying a large Series.
        if not order_item.index.is_monotonic_increasing:
            order_item = order_item.sort_index(kind='mergesort')

        rows = zip(order_item.index, order_item.to_numpy())
        for _, order_rows in groupby(rows, key=lambda row: row[0]):
            basket = sorted(item for _, item in order_rows)
            yield from combinations(basket, 2)

    def merge_item_stats(self, item_pairs, item_stats):
        """Attach each side's item frequency and support to the pair table.

        ``item_stats`` is indexed by item id with ``freq`` and ``support``
        columns. The result gains ``freqA``/``supportA`` (looked up through
        ``item_A``) and ``freqB``/``supportB`` (looked up through ``item_B``).
        """
        stats_a = item_stats.rename(columns={'freq': 'freqA', 'support': 'supportA'})
        stats_b = item_stats.rename(columns={'freq': 'freqB', 'support': 'supportB'})
        return (item_pairs
                    .merge(stats_a, left_on='item_A', right_index=True)
                    .merge(stats_b, left_on='item_B', right_index=True))

    def merge_item_name(self, rules, item_name):
        """Replace item ids with item names in a rules table.

        ``item_name`` must provide ``item_id`` and ``item_name`` columns; any
        other columns it carries are ignored. Returns the rules with
        ``itemA``/``itemB`` name columns followed by the metric columns.
        """
        columns = ['itemA', 'itemB', 'freqAB', 'supportAB', 'freqA', 'supportA', 'freqB', 'supportB',
                   'confidenceAtoB', 'confidenceBtoA', 'lift']
        # Only the id/name columns take part, so unrelated lookup columns do
        # not get dragged through both merges as suffixed duplicates.
        names = item_name[['item_id', 'item_name']]
        rules = (rules
                    .merge(names.rename(columns={'item_name': 'itemA'}), left_on='item_A', right_on='item_id')
                    .drop(columns='item_id')
                    .merge(names.rename(columns={'item_name': 'itemB'}), left_on='item_B', right_on='item_id')
                    .drop(columns='item_id'))
        return rules[columns]

    def _item_stats(self, order_item):
        """Return a frame indexed by item with ``freq`` and percentage ``support``."""
        item_stats = self.freq(order_item).to_frame("freq")
        item_stats['support'] = item_stats['freq'] / self.order_count(order_item) * 100
        return item_stats

    def fit_transform(self, order_item, min_support):
        """Return pairwise association rules sorted by lift, highest first.

        Args:
            order_item: Series indexed by order id whose values are item ids.
            min_support: Minimum support, as a percentage of orders, applied
                first to single items and then to item pairs.

        Returns:
            DataFrame with one row per qualifying unordered pair and the
            columns ``item_A``, ``item_B``, ``freqAB``, ``supportAB``,
            ``freqA``, ``supportA``, ``freqB``, ``supportB``,
            ``confidenceAtoB``, ``confidenceBtoA`` and ``lift``.
            Progress counts are printed along the way.
        """
        print("Starting order_item: {:22d}".format(len(order_item)))

        # Calculate item frequency and support
        item_stats = self._item_stats(order_item)

        # Filter from order_item items below min support
        qualifying_items = item_stats[item_stats['support'] >= min_support].index
        order_item = order_item[order_item.isin(qualifying_items)]

        print("Items with support >= {}: {:15d}".format(min_support, len(qualifying_items)))
        print("Remaining order_item: {:21d}".format(len(order_item)))

        # Filter from order_item orders with less than 2 items
        order_size = self.freq(order_item.index)
        qualifying_orders = order_size[order_size >= 2].index
        order_item = order_item[order_item.index.isin(qualifying_orders)]

        print("Remaining orders with 2+ items: {:11d}".format(len(qualifying_orders)))
        print("Remaining order_item: {:21d}".format(len(order_item)))

        # Recalculate item frequency and support on the surviving rows
        item_stats = self._item_stats(order_item)

        # Count item pairs. The columns are named explicitly instead of
        # relying on the default level_0/level_1 names of a MultiIndex.
        pair_counts = Counter(self.get_item_pairs(order_item))
        item_pairs = pd.DataFrame({
            'item_A': [pair[0] for pair in pair_counts],
            'item_B': [pair[1] for pair in pair_counts],
            'freqAB': list(pair_counts.values()),
        })
        item_pairs['supportAB'] = item_pairs['freqAB'] / len(qualifying_orders) * 100

        print("Item pairs: {:31d}".format(len(item_pairs)))

        # Filter from item_pairs those below min support
        item_pairs = item_pairs[item_pairs['supportAB'] >= min_support].reset_index(drop=True)

        print("Item pairs with support >= {}: {:10d}\n".format(min_support, len(item_pairs)))

        # Create table of association rules and compute relevant metrics
        item_pairs = self.merge_item_stats(item_pairs, item_stats)

        # Percent / percent cancels out, so the confidences are plain ratios.
        item_pairs['confidenceAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
        item_pairs['confidenceBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
        # lift = P(A and B) / (P(A) * P(B)) = confidence(A -> B) / P(B).
        # supportB is a percentage, so it is converted back to a fraction;
        # dividing the raw percentages would give a value 100 times too small.
        item_pairs['lift'] = item_pairs['confidenceAtoB'] / (item_pairs['supportB'] / 100)

        return item_pairs.sort_values('lift', ascending=False)

In [None]:
# Mine only the first third of the rows: the full table is too much for memory allocation.
# NOTE(review): the cut is made by row position, so the last order in the slice may be
# truncated — confirm this is acceptable.
alg = AprioriAlg()
items = order_products_prior.iloc[:len(order_products_prior) // 3]
rules = alg.fit_transform(items, min_support=0.01)

In [None]:
# Replace item ID with item name and display association rules
# Replace item ID with item name and display association rules
item_name = products.rename(columns={'product_id': 'item_id', 'product_name': 'item_name'})
rules_final = (
    alg.merge_item_name(rules, item_name)
    .sort_values('lift', ascending=False)
    # reset_index returns a new frame. The original call discarded it, so the rows were
    # never renumbered. drop=True keeps the old index from being added as a column.
    .reset_index(drop=True)
)
display(rules_final)

In [None]:
# Every row describes a pair in both directions (confidenceAtoB and confidenceBtoA),
# so the product is matched on either side instead of only in the itemA column.
rules_final.loc[rules_final[['itemA', 'itemB']].eq('Organic Hass Avocado').any(axis=1)]