# Association & Apriori

In [1]:
import pandas as pd

import numpy as np

import scipy

import os

import seaborn as sns

from matplotlib import pyplot as plt

from mlxtend.frequent_patterns import apriori, association_rules

import warnings
warnings.filterwarnings('ignore')

In [2]:
organic = pd.read_csv('organic.csv')

In [3]:
product_counts = organic.groupby('product_id')['order_id'].count().reset_index().rename(columns = {'order_id':'frequency'})
product_counts = product_counts.sort_values('frequency', ascending=False)[0:100].reset_index(drop=True)
product_counts

Unnamed: 0,product_id,frequency
0,13176,379245
1,21137,264586
2,21903,241837
3,47209,213561
4,47766,176697
...,...,...
95,26940,19284
96,37067,18577
97,18370,18449
98,41665,18360


In [4]:
freq_products = list(product_counts.product_id)
freq_products[:10]

[13176, 21137, 21903, 47209, 47766, 27845, 27966, 22935, 24964, 45007]

In [5]:
order_products = organic[organic['product_id'].isin(freq_products)]
order_products.shape

(5398189, 20)

In [6]:
basket = order_products.pivot_table(columns='product_name', values='reordered', index='order_id').reset_index().fillna(0).set_index('order_id')

In [7]:
basket1 = basket[:300000]

In [8]:
basket1.head(50)

product_name,apple honeycrisp organic,bag of organic bananas,frozen organic wild blueberries,michigan organic kale,"milk, organic, vitamin d",organic 2% reduced fat milk,organic avocado,organic baby arugula,organic baby broccoli,organic baby carrots,...,organic unsweetened almond milk,organic white onions,organic whole milk,organic whole strawberries,organic whole string cheese,organic yams,organic yellow onion,organic yellow peaches,organic zucchini,"yokids squeezers organic low-fat yogurt, strawberry"
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
frequent_items = apriori(basket1, min_support=0.01, use_colnames=True)
frequent_items.head()

Unnamed: 0,support,itemsets
0,0.03283,(apple honeycrisp organic)
1,0.165283,(bag of organic bananas)
2,0.01001,(frozen organic wild blueberries)
3,0.02441,(michigan organic kale)
4,0.06987,(organic avocado)


In [10]:
rules = association_rules(frequent_items, metric='lift', min_threshold=1)
rules.sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
19,(organic raspberries),(organic strawberries),0.054823,0.10888,0.01315,0.239861,2.202988,0.007181,1.172312,0.577745
18,(organic strawberries),(organic raspberries),0.10888,0.054823,0.01315,0.120775,2.202988,0.007181,1.075011,0.612792
4,(bag of organic bananas),(organic raspberries),0.165283,0.054823,0.01628,0.098498,1.796635,0.007219,1.048446,0.531203
5,(organic raspberries),(bag of organic bananas),0.054823,0.165283,0.01628,0.296954,1.796635,0.007219,1.187286,0.469123
2,(organic hass avocado),(bag of organic bananas),0.088843,0.165283,0.02587,0.291187,1.761743,0.011186,1.177626,0.47454
3,(bag of organic bananas),(organic hass avocado),0.165283,0.088843,0.02587,0.156519,1.761743,0.011186,1.080234,0.517996
17,(organic strawberries),(organic hass avocado),0.10888,0.088843,0.015987,0.146828,1.652666,0.006313,1.067964,0.443169
16,(organic hass avocado),(organic strawberries),0.088843,0.10888,0.015987,0.179942,1.652666,0.006313,1.086655,0.433424
11,(organic avocado),(organic baby spinach),0.06987,0.097807,0.011073,0.158485,1.620389,0.00424,1.072106,0.411624
10,(organic baby spinach),(organic avocado),0.097807,0.06987,0.011073,0.113217,1.620389,0.00424,1.048881,0.42437


### Key Metrics:

#### Support
- Percentage of orders that contains the item set.

#### Confidence
- Given two items, A and B, confidence measures the percentage of times that item B is purchased, given that item A was purchased.


#### Lift
- Given two items, A and B, lift indicates whether there is a relationship between A and B, or whether the two items are occuring together in the same orders simply by chance 
 * lift = 1 implies no relationship between A and B.
   (ie: A and B occur together only by chance)

 * lift > 1 implies that there is a positive relationship between A and B.
   (ie:  A and B occur together more often than random)

 * lift < 1 implies that there is a negative relationship between A and B.
   (ie:  A and B occur together less often than random)