In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# os.listdir returns the bare entry names found directly under ../input
# (no directory prefix); printing them shows which data files are available.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

There are three major components of the Apriori algorithm:

Support, Confidence, and Lift.

Support: Support refers to the default popularity of an item. It is calculated by dividing the number of transactions containing a particular item by the total number of transactions. Suppose we want to find the support for item B: Support(B) = (Transactions containing B) / (Total transactions)

Confidence: Confidence refers to the likelihood that item B is also bought if item A is bought. It is calculated by dividing the number of transactions where A and B are bought together by the total number of transactions where A is bought: Confidence(A→B) = (Transactions containing both A and B) / (Transactions containing A)

Lift: Lift(A→B) refers to the increase in the ratio of sale of B when A is sold. It is calculated by dividing Confidence(A→B) by Support(B): Lift(A→B) = Confidence(A→B) / Support(B)

A Lift of 1 means there is no association between products A and B. Lift of greater than 1 means products A and B are more likely to be bought together. Finally, Lift of less than 1 refers to the case where two products are unlikely to be bought together.



In [None]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd  

# we need to install mlxtend on anaconda prompt by typing 'pip install mlxtend'
from mlxtend.frequent_patterns import apriori  
from mlxtend.frequent_patterns import association_rules


In [None]:
# Load the BreadBasket_DMS.csv dataset into a DataFrame. Later cells rely on
# its 'Transaction' and 'Item' columns.
store_data = pd.read_csv('../input/BreadBasket_DMS.csv') 

In [None]:
# Let's visualize which items are more popular.

# Tally how many rows mention each item; the dict keeps items in order of
# first appearance.
Items = {}
for item in store_data['Item']:
    Items[item] = Items.get(item, 0) + 1

# Keep only the items that occur more than 30 times, as parallel lists of
# item names (keys) and their counts (vals).
keys = [name for name, count in Items.items() if count > 30]
vals = [count for count in Items.values() if count > 30]


In [None]:
# Set the default figure size *before* the figure is created: figure.figsize
# is read when a figure is instantiated, so assigning it after plt.bar() does
# not resize the chart drawn by this cell on a fresh kernel.
plt.rcParams["figure.figsize"] = [20,10]

# One bar per popular item; keys are the item names and vals their counts.
plt.bar(keys, vals, label="Items sold in 2017")
# vals holds raw occurrence counts, not percentages, so the axis label must
# not claim a percentage scale.
plt.ylabel('Number of Transactions')
plt.xlabel('Items Sold')
# Rotate the item names so they do not overlap along the x axis.
plt.xticks(list(keys), rotation=90)
plt.legend(bbox_to_anchor=(1, 1), loc="best", borderaxespad=0.)

plt.show()

This analysis requires that all the data for a transaction be included in one row, and that the items be one-hot encoded.

In [None]:
# Give every row a Quantity of 1 (this adds the column to store_data in place).
# The pivot further down sums this column per (Transaction, Item) pair.
store_data['Quantity']= 1

In [None]:
# Preview the first 7 rows, including the Quantity column added above.
store_data.head(7)

In [None]:
# Pivot to one row per Transaction and one column per Item: sum Quantity for
# each (Transaction, Item) pair, spread Item across the columns, and fill the
# pairs that never occur with 0.
basket = (
    store_data
    .groupby(['Transaction', 'Item'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
)
print(basket.head())

In [None]:
# There are a lot of zeros in the data, but we also need to make sure any positive values are converted to 1
# and anything less than or equal to 0 is set to 0. This step completes the one-hot encoding of the data.

def encode_units(x):
    """Collapse a quantity into a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Quantity of an item within a transaction.

    Returns
    -------
    int
        1 if ``x`` is strictly positive, otherwise 0.

    Notes
    -----
    Every input maps to 0 or 1. Values between 0 and 1 count as present,
    and NaN counts as absent because ``nan > 0`` is False. The function
    therefore never falls through and returns None.
    """
    return 1 if x > 0 else 0

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Pick whichever element-wise method this pandas version provides, so the cell
# runs without a deprecation warning on new versions and still works on old ones.
_elementwise = basket.map if hasattr(pd.DataFrame, "map") else basket.applymap
basket_sets = _elementwise(encode_units)

In [None]:
# min_support can be set anywhere between 0 and 1; the default value is 0.5.
# Our support values are all below 0.5, so the threshold is lowered to 0.03
# to include more itemsets. How useful those itemsets are will depend on the
# lift value obtained after applying the association rules.

frequent_itemsets = apriori(
    basket_sets,
    min_support=0.03,
    use_colnames=True,
)
print(frequent_itemsets)


In [None]:
# Derive association rules from the frequent itemsets, using lift as the
# metric with a threshold of 1, then show the first few rules.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)
rules.head()

As mentioned earlier:

A Lift of 1 means there is no association between products A and B. Lift of greater than 1 means products A and B are more likely to be bought together. Finally, Lift of less than 1 refers to the case where two products are unlikely to be bought together.

So with lift values of about 1.1, which is only slightly above 1, we can say that the items are at most weakly associated with each other.