# About MBA with Association Rules

Market Basket Analysis with Association Rules is a technique that enables one to find sets of items that are often found together within a customer's basket (a transaction) across all orders.
It is primarily used in business (albeit recently less so) to create, augment or improve:
* bundles of products
* cashier suggestions for in-store clients after a completed product scan but before payment
* in-store product placement

# Setup

In [None]:
from zipfile import ZipFile  # working with zipped input
from mlxtend.frequent_patterns import fpgrowth, association_rules  # MBA
from scipy import sparse  # sparse matrices
import numpy as np
import pandas as pd
import os

In [None]:
# Loading & processing data

def preDot(text):
    """Return `text` without its last dot-separated suffix.

    Text that contains no dot is returned unchanged.
    """
    stem, dot, _suffix = text.rpartition('.')
    return stem if dot else text

np.random.seed(73)  # fixed seed so the random sample picked later is reproducible
pd.options.mode.chained_assignment = None  # turn off pandas' chained-assignment warning
dataDict = {}

# Every file under the input directory is opened as a zip archive holding one
# member named like the archive minus its last extension; the member is read
# as CSV and stored under its own name minus the extension.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        archive_path = os.path.join(dirname, filename)
        print(archive_path)
        member_name = preDot(filename)
        with ZipFile(archive_path, 'r') as archive, archive.open(member_name) as member:
            dataDict[preDot(member_name)] = pd.read_csv(member)

# One orders frame per eval_set value, with the now-constant column dropped.
train_orders, prior_orders, test_orders = (
    dataDict['orders'].loc[dataDict['orders']['eval_set'] == subset].drop(columns='eval_set')
    for subset in ('train', 'prior', 'test')
)

# Transactional Table Prep

In [None]:
# limiting and splitting the dataframe into three relatively equal parts for memory efficiency below
small_train = dataDict['order_products__train'][['order_id', 'product_id']]
# Split on order boundaries instead of hard-coded row offsets: every order has
# to land in exactly one part, otherwise the same order_id would produce a row
# in two different pivots. Parts are balanced by number of orders.
# NOTE(review): like the row slicing it replaces, this relies on the rows being
# sorted by order_id so that the stacked pivot rows line up with
# small_train.order_id.unique() -- confirm before reusing on other data.
small_train_split = tuple(
    small_train[small_train['order_id'].isin(order_chunk)]
    for order_chunk in np.array_split(small_train['order_id'].unique(), 3)
)

In [None]:
# heuristic prep of data
# sparse columns are used for memory efficiency

# One boolean order x product table per part: True where the order contains the product.
pivots = [
    part.pivot(index='order_id', columns='product_id', values='product_id')
    .notna()
    .astype(pd.SparseDtype(bool))
    for part in small_train_split
]

# full, sorted product column set shared by all parts
product_cols = sorted(small_train.product_id.unique())

In [None]:
bool_sparse = pd.SparseDtype(bool)
for slot, frame in enumerate(pivots):
    # reindex adds the missing product columns so that all parts share one layout for vstack;
    # the added columns would be regular booleans, hence the conversion back to sparse
    aligned = frame.reindex(columns=product_cols, fill_value=False).astype(bool_sparse)
    pivots[slot] = sparse.csr_matrix(aligned)
frame = aligned = None  # let go of the last intermediate frames
# stack the parts vertically into a single matrix
pivots = sparse.vstack(pivots)

In [None]:
# densify the stacked matrix and put the order / product labels back for the mining algorithms
truth_table = pd.DataFrame(
    data=pivots.todense(),
    index=small_train.order_id.unique(),
    columns=product_cols,
)

# Generating Frequent Item Sets

It essentially means removing infrequent itemsets (i.e., those below the minimum support, specified here as 5 occurrences in the transactional table).

Questions to keep in mind while mining rules:
* how to determine the minimum support value?
* how many item sets / rules should be obtained?
* what metric to pick for rules? what should be its threshold value?
* should one account only for the base popularity of antecedents (*confidence*), or should the popularity of consequents be involved as well (*lift*)?

[Here](https://paginas.fe.up.pt/~ec/files_0506/slides/04_AssociationRules.pdf) one can find a short summary of how association rule mining works.

In [None]:
# takes less than a minute to execute
min_occurrences = 5  # an itemset must appear in at least this many orders
frequent_itemsets = fpgrowth(
    truth_table,
    min_support=min_occurrences / len(truth_table),
    use_colnames=True,
)

In [None]:
# bare expression so that the notebook renders the mined frequent itemsets as the cell output
frequent_itemsets

# Generating Association Rules

Setting up rules from item sets with 80% confidence.

In [None]:
# derive rules from the frequent itemsets, using confidence with a 0.8 threshold
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.8,
)

In [None]:
# average size of the consequent sets, followed by the rules table as the cell output
mean_consequents = rules['consequents'].map(len).mean()
print("μ number of consequents:", mean_consequents)
rules

In [None]:
# keep only rules with lift above 1; the others might not enhance a suggestion
rules = rules.loc[rules['lift'].gt(1)]

# Recommendations

In [None]:
# simplified view of the rules: only the two item-set columns are needed below
rules_ante_cons = rules.loc[:, ['antecedents', 'consequents']]

In [None]:
# one frozenset of product ids per order: the customer's basket,
# later compared against rule antecedents
baskets = (
    small_train.groupby('order_id')['product_id']
    .apply(frozenset)
    .rename("basket")
)

In [None]:
# attach each training order's basket by order_id
recommendations = train_orders.join(baskets, on="order_id")
# every order starts out with an empty (immutable) recommendation set
recommendations["recommendations"] = [frozenset()] * len(recommendations)

We need to check if antecedents of each rule are **a subset** (<=) of some client's basket, e.g.

`recommendations.loc[frozenset({4605, 21903, 47626, 49683}) <= recommendations.basket, "recommendations"]`.

In [None]:
# computationally-intensive; might require an optimization
# Walk antecedents and consequents in lockstep. The rules' index labels are not
# guaranteed to be 0..n-1 once rows have been filtered out, so looking the
# consequents up by an enumerate() counter with .loc could raise KeyError or
# silently pair an antecedent with another rule's consequents.
for antecedent, consequent in zip(rules_ante_cons["antecedents"], rules_ante_cons["consequents"]):
    # rows whose basket contains every item of the antecedent (antecedent <= basket)
    lookup = antecedent <= recommendations.basket, "recommendations"
    # add this rule's consequents to the suggestions collected so far for those rows
    recommendations.loc[lookup] = recommendations.loc[lookup].apply(
        frozenset.union,
        args=(consequent,)
    )
# do not suggest products that are already in the basket
recommendations.loc[:, "recommendations"] = recommendations.recommendations - recommendations.basket

In [None]:
# non-empty recommendations
non_empty_recs = recommendations[recommendations.recommendations.apply(bool)]
if len(non_empty_recs):
    print("1 out of approx.", round(1/(len(non_empty_recs) / len(recommendations))), "transactions will result in a recommendation being suggested to a customer.")
else:
    # no rule matched any basket: the ratio above would divide by zero
    print("No transaction resulted in a recommendation being suggested to a customer.")
# mapping codes to product names
def map_products(codes):
    """Translate product codes into product names.

    A pandas Series is handled element by element (each element being a
    collection of codes); any other iterable of codes is turned into a
    frozenset of the values found in the module-level `products` mapping.
    Codes missing from the mapping come back as None.
    """
    if not isinstance(codes, pd.Series):
        return frozenset(products.get(code) for code in codes)
    return codes.apply(map_products)

# product_id -> product_name lookup used by map_products
products = dataDict["products"].set_index("product_id")["product_name"].to_dict()
# replace the product codes in both set-valued columns with product names
name_columns = ["basket", "recommendations"]
non_empty_recs.loc[:, name_columns] = non_empty_recs[name_columns].apply(map_products)
display(non_empty_recs)

# Practical single-basket MBA Example

In [None]:
def mba_diagram(sample_basket, sample_recommendation):
    """Draw a "basket ← suggestion" diagram for a single customer basket.

    Three text boxes are placed from left to right on an otherwise blank
    figure: the basket contents, a left-pointing "Add" arrow and the
    recommended products.

    Parameters
    ----------
    sample_basket : str
        Text shown in the basket box (e.g. product names joined by newlines).
    sample_recommendation : str
        Text shown in the recommendation box.

    Returns
    -------
    None
        The figure is created through pyplot; nothing is returned.
    """
    import matplotlib.pyplot as plt

    def get_text_box_coords(txt):
        # extent of the rendered text, mapped through the inverse of the axes transform
        we = txt.get_window_extent(renderer=fig.canvas.get_renderer())
        return ax.transAxes.inverted().transform(we)

    def get_rightmost_vmid(box):
        # presumably box is [[x0, y0], [x1, y1]]: right edge and vertical midpoint
        return box[1][0], (box[0][1] + box[1][1]) / 2

    fig, ax = plt.subplots(figsize=(20,10))
    ax.set_title("An illustration of a recommendation system for a sample customer basket\n(basket ← suggestion)", fontsize=18)
    ax.axis('off')
    basket_txt = ax.text(.05, .95, sample_basket, ha='left', va='top', wrap=True,size=12,
                  bbox=dict(boxstyle='round,pad=1', fc='w', ec='lightblue'))

    basket_rightmost, basket_vmid = get_rightmost_vmid(get_text_box_coords(basket_txt))

    # the arrow sits to the right of the basket box, level with its vertical middle
    arrow_txt = ax.text(
        basket_rightmost*1.4, basket_vmid, "Add", ha="center", va="center", size=35,
        bbox=dict(boxstyle="larrow,pad=0.6", fc="lightgreen", ec="g", lw=2))
    arrow_rightmost, arrow_vmid = get_rightmost_vmid(get_text_box_coords(arrow_txt))

    recommendation_txt = ax.text(arrow_rightmost * 1.14, arrow_vmid, sample_recommendation, ha='left', va='top', wrap=True, fontsize=25,
                  bbox=dict(boxstyle='round,pad=1', fc='w', ec='r'))
    # The text is anchored by its top edge (va='top'), so move it up by half of
    # its measured height to centre it on the arrow. Measure the box only once.
    recommendation_box = get_text_box_coords(recommendation_txt)
    recommendation_height = recommendation_box[1][1] - recommendation_box[0][1]
    anchor_x, anchor_y = recommendation_txt.get_position()
    recommendation_txt.set_position((anchor_x, anchor_y + recommendation_height / 2))

In [None]:
# pick one random order that received a recommendation and draw its diagram
sample_index = np.random.randint(len(non_empty_recs))
sample_row = non_empty_recs.iloc[sample_index]
sample_basket = "\n".join(sample_row["basket"])
sample_recommendation = "\n".join(sample_row["recommendations"])
mba_diagram(sample_basket, sample_recommendation)