## Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from itertools import permutations

## Load Data

In [3]:
# Each CSV row is one transaction. The file has no header row, so the columns
# are simply numbered and shorter transactions are padded with NaN.
df = pd.read_csv(
    'Market_Basket.csv',
    header=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [4]:
# Trim leading/trailing whitespace from every item name, one column at a time
df = df.apply(lambda column: column.str.strip())

## Data Preprocessing

In [5]:
# Convert the frame to a list of transactions, each transaction being the
# list of cell values of one row (NaN padding included)
df_list = df.to_numpy().tolist()

In [6]:
# Flatten the transactions into a single list of items, dropping the
# missing-value padding of shorter rows. pd.notna() tests for missing values
# directly instead of comparing the string form of each cell with 'nan'.
all_items = [
    item
    for transaction in df_list
    for item in transaction
    if pd.notna(item)
]

In [7]:
# Get distinct items, sorted so the order is identical on every run
# (the iteration order of a set of strings can differ between sessions)
list_items = sorted(set(all_items))

In [8]:
# Get shape of data as (rows, columns): one row per transaction,
# one column per item position within a transaction
df.shape

(7501, 20)

## Association Analysis

In [9]:
# Total number of transactions = number of rows in the data
total_transaction = len(df)
total_transaction

7501

In [10]:
# Count how often each item occurs across all transactions.
# value_counts() tallies every item in a single pass and returns the counts
# already sorted in descending order, replacing one full list.count() scan
# per distinct item.
# NOTE(review): this counts occurrences, so an item repeated inside a single
# transaction is counted more than once, while get_frequency counts
# transactions - confirm the data has no within-transaction duplicates.
item_count = pd.Series(all_items).value_counts().to_frame('count')
item_count.head(20)

Unnamed: 0,count
mineral water,1788
eggs,1348
spaghetti,1306
french fries,1282
chocolate,1230
green tea,991
milk,972
ground beef,737
frozen vegetables,715
pancakes,713


In [11]:
# Names of the 20 most frequent items (item_count is sorted by count, descending)
top_items = item_count.index[:20].tolist()

In [12]:
def get_possible_permutations(items, r=2):
    '''
    Get every ordered arrangement of the items, taken r at a time.

    Order matters: for r=2 both (a, b) and (b, a) are returned, so each
    pair can be read as (antecedent, consequent) in either direction.

    input:
        items: list, items to arrange
        r: int, number of items per arrangement (default 2)
    returns: list of tuples, in the order produced by itertools.permutations
    '''
    return list(permutations(items, r))

In [13]:
def get_frequency(perm, transactions=None):
    '''
    Number of transactions such that:
    both the antecedent and consequent is present

    input:
        perm: tuple, (antecedent, consequent) item pair
        transactions: list of transactions, each a list of items;
            when omitted, the notebook-level df_list is used
    returns: int, count of transactions containing both items
    '''
    # Fall back to the notebook's transaction list so existing
    # single-argument calls keep working unchanged.
    if transactions is None:
        transactions = df_list
    antecedent = perm[0]
    consequent = perm[1]
    return sum(
        1
        for transaction in transactions
        if antecedent in transaction and consequent in transaction
    )

In [15]:
# One row per ordered (antecedent, consequent) pair of the top items
pairs = pd.Series(get_possible_permutations(top_items), name='paired items')
result = pairs.to_frame()

# Individual item counts, looked up from the item_count table
result['antecedent count'] = pairs.map(lambda pair: item_count.loc[pair[0], 'count'])
result['consequent count'] = pairs.map(lambda pair: item_count.loc[pair[1], 'count'])

# Number of transactions containing both items of the pair
result['paired item frequency'] = pairs.map(get_frequency)

# support = share of all transactions that contain both items
result['support'] = result['paired item frequency'] / total_transaction

# confidence = pair frequency relative to the antecedent's count
result['confidence'] = result['paired item frequency'] / result['antecedent count']

# lift = confidence relative to the consequent's share of all transactions
consequent_share = result['consequent count'] / total_transaction
result['lift'] = result['confidence'] / consequent_share

In [16]:
# Display the association table: one row per ordered item pair with its
# counts, support, confidence and lift
result

Unnamed: 0,paired items,antecedent count,consequent count,paired item frequency,support,confidence,lift
0,"(mineral water, eggs)",1788,1348,382,0.050927,0.213647,1.188845
1,"(mineral water, spaghetti)",1788,1306,448,0.059725,0.250559,1.439085
2,"(mineral water, french fries)",1788,1282,253,0.033729,0.141499,0.827912
3,"(mineral water, chocolate)",1788,1230,395,0.052660,0.220917,1.347236
4,"(mineral water, green tea)",1788,991,233,0.031063,0.130313,0.986357
...,...,...,...,...,...,...,...
375,"(turkey, low fat yogurt)",469,574,38,0.005066,0.081023,1.058810
376,"(turkey, shrimp)",469,536,48,0.006399,0.102345,1.432263
377,"(turkey, tomatoes)",469,513,49,0.006532,0.104478,1.527654
378,"(turkey, olive oil)",469,494,40,0.005333,0.085288,1.295029
