In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preparation

In [None]:
# Load the Dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder

# Read the groceries purchase log into a DataFrame.
GROCERIES_CSV = "/kaggle/input/groceries-dataset/Groceries_dataset.csv"
df = pd.read_csv(GROCERIES_CSV)

# Print the first ten rows
preview = df.head(10)
print(preview)

In [None]:
# Group the purchased items into one basket per (Member_number, Date) pair.
# `.agg(list)` keeps the item names separate. The previous `.apply(sum)` relied on
# pandas remapping the builtin `sum` to its own groupby sum (deprecated since
# pandas 2.1), which glued the item names together with no separator.
df_grouped = df.groupby(['Member_number', 'Date'])['itemDescription'].agg(list)
df_grouped

In [None]:
# Create a list of transactions: one list of item names per (Member_number, Date) group.
# Iterating the groupby directly yields (key, sub-frame) pairs, so there is no need
# to materialize every group into an intermediate list first.
transactions = [
    basket['itemDescription'].tolist()
    for _, basket in df.groupby(['Member_number', 'Date'])
]
# Preview only a few baskets instead of echoing every transaction into the cell output.
transactions[:5]

In [None]:
# One-hot encode the transactions: learn the item vocabulary first, then
# transform the same transactions into the encoded array.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
# Show the item labels the encoder learned (one per encoded column).
te.columns_

In [None]:

# Wrap the encoded array in a DataFrame with one column per item.
# NOTE: this rebinds `transactions` from the list of baskets to the encoded
# DataFrame that the following cells work with.
transactions = pd.DataFrame(data=te_ary, columns=te.columns_)
# Per-column summary of the encoded frame.
pf = transactions.describe()
pf

In [None]:
# Number of transactions containing each item.
# Counting the True values directly is safer than `count - freq` from describe(),
# which only equals this number when False is the most frequent value in a column.
transactions.sum()

In [None]:
# Build a table of the 50 most frequently purchased items.
# The counts come straight from the one-hot frame (number of True values per column)
# rather than from `count - freq` of describe(), which is only correct while False
# is the most frequent value of every column.
item_counts = transactions.sum()
item = (
    item_counts.rename_axis('Item')
    .reset_index(name='Count')[['Count', 'Item']]
    .sort_values(['Count'], ascending=False)
    .head(50)
)
# Show the table that was just built (the cell previously echoed `transactions`).
item

## Simplest Metric

In [None]:
# Computing Support for Single Items
# The mean of each one-hot column is the share of transactions containing that item.
item_support = transactions.mean()
print(item_support.sort_values(ascending=False))

## Confidence & Lift

In [None]:
# Print the first five rows (transactions) of the encoded frame
print(transactions.head(5))

In [None]:
# Computing support.
# Pull the two item columns once, then take the share of transactions that
# contain each item and the share that contain both.
wm_col = transactions['whole milk']
ov_col = transactions['other vegetables']
supportWmOv = np.logical_and(wm_col, ov_col).mean()
supportWm = wm_col.mean()
supportOv = ov_col.mean()

# Compute confidence and lift for the rule {whole milk} -> {other vegetables}.
confidence = supportWmOv / supportWm
lift = supportWmOv / (supportWm * supportOv)

# Print results.
print(supportOv, confidence, lift)

## Computing Leverage

In [None]:
# Leverage: observed joint support minus the product of the two individual supports.
expected_joint = supportWm * supportOv
leverage = supportWmOv - expected_joint
print(leverage)

## Computing Conviction

In [None]:
# Compute support for NOT "other vegetables"
# (uses the one-hot `transactions` frame; `onehot` was never defined in this notebook)
support_n_Ov = 1.0 - transactions['other vegetables'].mean()

# Compute support for "whole milk" and NOT "other vegetables"
supportWm_n_Ov = supportWm - supportWmOv

# Compute conviction of {whole milk} -> {other vegetables}:
#   support(A) * support(not B) / support(A and not B)
# The denominator is the support of A *without* B, not the joint support of A and B.
conviction = supportWm * support_n_Ov / supportWm_n_Ov
print(conviction)

## Calculating Metrics

In [None]:
# Let's define the functions to calculate the metrics from the original data.
from itertools import permutations

def supportA(itemA, df):
    """Return the support of the antecedent item.

    Takes the mean of column ``itemA`` in ``df`` and returns it as a
    built-in float.
    """
    antecedent_share = df[itemA].mean()
    return float(antecedent_share)

def supportB(itemB, df):
    """Return the support of the consequent item.

    Takes the mean of column ``itemB`` in ``df`` and returns it as a
    built-in float.
    """
    consequent_share = df[itemB].mean()
    return float(consequent_share)

def confidence(itemA,itemB,df):
    """Return the confidence of the rule ``itemA -> itemB``.

    Confidence is the joint support of both columns divided by the support
    of ``itemA``; the result is a built-in float.
    """
    joint_support = np.logical_and(df[itemA], df[itemB]).mean()
    antecedent_support = df[itemA].mean()
    return float(joint_support / antecedent_support)

def lift(itemA,itemB,df):
    """Return the lift of the item pair ``(itemA, itemB)``.

    Lift is the joint support of both columns divided by the product of the
    two individual supports; the result is a built-in float.
    """
    joint_support = np.logical_and(df[itemA], df[itemB]).mean()
    independent_support = df[itemA].mean() * df[itemB].mean()
    return float(joint_support / independent_support)

def leverage(itemA,itemB,df):
    """Return the leverage of the item pair ``(itemA, itemB)``.

    Leverage is the joint support of both columns minus the product of the
    two individual supports. The result is converted to a built-in float so
    this metric returns the same type as the other metric helpers.
    """
    joint_support = np.logical_and(df[itemA], df[itemB]).mean()
    independent_support = df[itemA].mean() * df[itemB].mean()
    return float(joint_support - independent_support)

def conviction(itemA, itemB, df):
    """Return the conviction of the rule ``itemA -> itemB``.

    Conviction is ``support(A) * support(not B) / support(A and not B)``.

    When A never occurs without B the denominator is zero. That case is
    handled explicitly instead of relying on a NumPy divide-by-zero warning:
    the result is ``inf`` when the numerator is positive and ``nan`` when the
    numerator is zero as well (the same values the bare division produced).
    """
    # Support for A and B together
    support_ab = np.logical_and(df[itemA], df[itemB]).mean()
    # Support for A (local name chosen so it does not shadow the supportA helper)
    support_a = df[itemA].mean()
    # Support for not B
    support_not_b = 1.0 - df[itemB].mean()
    # Support for A without B
    support_a_not_b = support_a - support_ab

    numerator = support_a * support_not_b
    if support_a_not_b == 0:
        return float('inf') if numerator > 0 else float('nan')
    return float(numerator / support_a_not_b)

In [None]:
# Build one row of metrics for every ordered pair of distinct items (A -> B).
# Iterating a DataFrame yields its column labels, so permutations() pairs up item names.
# The one-hot `transactions` frame is used throughout; `onehot` was never defined
# in this notebook and raised a NameError on a fresh kernel.
item_pairs = []
for itemA, itemB in permutations(transactions, 2):
    item_pairs.append([
        itemA, itemB,  # names
        transactions[itemA].sum(), transactions[itemB].sum(),  # individual counts
        np.logical_and(transactions[itemA], transactions[itemB]).sum(),  # pair count
        supportA(itemA, transactions),  # antecedent support
        supportB(itemB, transactions),  # consequent support
        confidence(itemA, itemB, transactions),  # confidence
        lift(itemA, itemB, transactions),  # lift
        leverage(itemA, itemB, transactions),  # leverage
        conviction(itemA, itemB, transactions),  # conviction
    ])

In [None]:
# Column labels for the pair metrics, in the order each row was assembled.
pair_columns = [
    'itemA', 'itemB',
    'countItemA', 'countItemB',
    'countItemA&B',
    'Antecedent Support',
    'Consequent Support',
    'Confidence',
    'Lift',
    'Leverage',
    'Conviction',
]
# Turn the collected rows into a labelled DataFrame (rebinds `item_pairs`).
item_pairs = pd.DataFrame(item_pairs, columns=pair_columns)

# Show five randomly chosen pairs.
item_pairs.sample(5)

## Performing Multi-Metric Filtering

In [None]:
# Select subset of rules with low consequent support.
#rules = item_pairs[item_pairs['Consequent Support'] < 0.05]
#print(len(rules))
#rules

In [None]:
# Select subset of rules with lift > 1.5.
#rules_2 = rules[rules['Lift'] > 1.5]
#print(len(rules_2))
#rules_2

## Apriori Algorithm

In [None]:
# Import Apriori algorithm
from mlxtend.frequent_patterns import apriori

In [None]:
# Compute frequent itemsets: at most four items per set, minimum support 0.0005,
# reported with item names rather than column indices.
frequent_itemsets = apriori(
    transactions,
    min_support=0.0005,
    max_len=4,
    use_colnames=True,
)

# Print number of itemsets
n_itemsets = len(frequent_itemsets)
print(n_itemsets)

In [None]:
# Print the first five frequent itemsets
print(frequent_itemsets.head(5))

## Apriori and Computing Association Rules

In [None]:
# Import Apriori algorithm
from mlxtend.frequent_patterns import apriori, association_rules

# Derive association rules from the frequent itemsets, selecting on the
# "support" metric with a minimum threshold of 0.001.
Arules = association_rules(
    frequent_itemsets, metric="support", min_threshold=0.001
)

In [None]:
# Display the rules generated with the 0.001 support threshold.
Arules

In [None]:
# Same rule generation with the support threshold raised to 0.010.
Arules_2 = association_rules(
    frequent_itemsets, metric="support", min_threshold=0.010
)

Arules_2

In [None]:
# Same rule generation with the support threshold raised further, to 0.050.
Arules_3 = association_rules(
    frequent_itemsets, metric="support", min_threshold=0.050
)

Arules_3

In [None]:
# One boxen plot per rule metric, stacked vertically on a shared x-axis.
metric_columns = ['antecedent support', 'support', 'confidence']
fig, ax = plt.subplots(nrows=len(metric_columns), ncols=1, sharex=True, figsize=(10,5))
for panel, column in zip(ax, metric_columns):
    sns.boxenplot(x=column, data=Arules, linewidth=0.9, color="royalblue", ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# Keep only the rules that clear every threshold at once.
keep_rule = (
    (Arules['antecedent support'] > 0.06)
    & (Arules['support'] > 0.002)
    & (Arules['confidence'] > 0.04)
    & (Arules['lift'] > 1.00)
)
filtered_rules = Arules.loc[keep_rule]

filtered_rules