In [3]:
# import pandas for dataframe
import pandas as pd

# import numpy for computing functionality
import numpy as np

# import seaborn for visualizing data
import seaborn as sns

%matplotlib inline

#importing apriori from mlxtend
from mlxtend.frequent_patterns import apriori

#import association rules from mlxtend
from mlxtend.frequent_patterns import association_rules
warnings.filterwarnings("ignore")

In [4]:
#loading the online retail dataset
retail_df = pd.read_excel("Online Retail.xlsx")

# displaying first 5 rows of the dataset
retail_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
#remove additional spaces
retail_df['Description'] = retail_df['Description'].str.strip()
#remove NA values
retail_df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)
# converting InvoiceNo to string type
retail_df['InvoiceNo'] = retail_df['InvoiceNo'].astype('str')
#remove cancelled orders
retail_df = retail_df[~retail_df['InvoiceNo'].str.contains('C')]

In [6]:
# Market basket analysis for transactions that involve customers from France
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

def create_basket(country_filter):
    basket = (retail_df[retail_df['Country'] == country_filter]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))
    return basket

In [7]:
country_filter = "France"
basket_french = create_basket("France")
basket_sets = basket_french.applymap(encode_units)
basket_sets.drop('POSTAGE', inplace=True, axis=1)

In [8]:
# using apriori for frequent dataset mining for france market transactions
frequent_itemsets = apriori(basket_sets, min_support=0.05, use_colnames=True)

In [9]:
# Fetching association rules for the frequency dataset
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE PINK),0.096939,0.102041,0.07398,0.763158,7.478947,0.064088,3.791383
1,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE GREEN),0.102041,0.096939,0.07398,0.725,7.478947,0.064088,3.283859
2,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.094388,0.096939,0.079082,0.837838,8.642959,0.069932,5.568878
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.096939,0.094388,0.079082,0.815789,8.642959,0.069932,4.916181
4,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE PINK),0.094388,0.102041,0.07398,0.783784,7.681081,0.064348,4.153061


From the table we can notice that the probability of product ALARM CLOCK BAKELIKE PINK with ALARM CLOCK BAKELIKE GREEN being purchased together(support) is 7.3%. The probability of these two products purchased together is 7.3 %. Since the lift score > 1 for these two products, we can make use of ALARM CLOCK BAKELIKE PINK to make predictions on ALARM CLOCK BAKELIKE GREEN, since lift score >1 implies posistive impact. This rule can be incorrect 3.28 times more often if this association rule was purely a random chance.

In [10]:
# Market basket analysis for transactions that involve customers from Germany
basket_germany = create_basket("Germany")
basket2_sets = basket_germany.applymap(encode_units)
basket2_sets.drop('POSTAGE', inplace=True, axis=1)

In [11]:
# using apriori for frequent dataset mining for germany market transactions
frequent_itemsets_germany = apriori(basket2_sets, min_support=0.05, use_colnames=True)

In [12]:
# Fetching association rules for the frequency dataset
rules = association_rules(frequent_itemsets_germany, metric="lift", min_threshold=1.2)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(PLASTERS IN TIN CIRCUS PARADE),(PLASTERS IN TIN WOODLAND ANIMALS),0.115974,0.137856,0.067834,0.584906,4.242887,0.051846,2.076984
1,(PLASTERS IN TIN WOODLAND ANIMALS),(PLASTERS IN TIN CIRCUS PARADE),0.137856,0.115974,0.067834,0.492063,4.242887,0.051846,1.740427
2,(PLASTERS IN TIN CIRCUS PARADE),(ROUND SNACK BOXES SET OF 4 FRUITS),0.115974,0.157549,0.050328,0.433962,2.754455,0.032057,1.48833
3,(ROUND SNACK BOXES SET OF 4 FRUITS),(PLASTERS IN TIN CIRCUS PARADE),0.157549,0.115974,0.050328,0.319444,2.754455,0.032057,1.298977
4,(PLASTERS IN TIN CIRCUS PARADE),(ROUND SNACK BOXES SET OF4 WOODLAND),0.115974,0.245077,0.056893,0.490566,2.001685,0.02847,1.481887


From the table we can notice that the probability of product PLASTERS IN TIN CIRCUS PARADE with PLASTERS IN TIN WOODLAND ANIMALS being purchased together(support) is 6.7%. The probability of these two products purchased together is 58.4 %. Since the lift score > 1 for these two products, we can make use of PLASTERS IN TIN CIRCUS PARADE to make predictions on PLASTERS IN TIN WOODLAND ANIMALS, since lift score > 1 implies posistive impact. This rule can be incorrect 2.07 times more often if this association rule was purely a random chance.

In [13]:
# Market basket analysis for transactions that involve customers from United Kingdom
basket_uk = create_basket("United Kingdom")
basket3_sets = basket_uk.applymap(encode_units)
basket3_sets.drop('POSTAGE', inplace=True, axis=1)

In [14]:
# using apriori for frequent dataset mining for UK market transactions
frequent_itemsets_uk = apriori(basket3_sets, min_support=0.05, use_colnames=True)
# Fetching association rules for the frequency dataset
rules = association_rules(frequent_itemsets_uk, metric="lift", min_threshold=0.8)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


There are no rules for united kingdom for this filter with the specified support value so we will try few other filters.

In [15]:
retail_filtered = retail_df[retail_df['Country']=="United Kingdom"]

In [16]:
basket = (retail_filtered[retail_filtered['Quantity']<10]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))

In [17]:
basket4_sets = basket.applymap(encode_units)
basket4_sets.drop('POSTAGE', inplace=True, axis=1)

In [18]:
# using apriori for frequent dataset mining for UK market transactions on a different filter
frequent_itemsets_uk = apriori(basket3_sets, min_support=0.03, use_colnames=True)
# Fetching association rules for the frequency dataset
rules = association_rules(frequent_itemsets_uk, metric="lift", min_threshold=0.5)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.049821,0.046928,0.03016,0.605376,12.900183,0.027822,2.415142
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.046928,0.049821,0.03016,0.642694,12.900183,0.027822,2.659288
2,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.03766,0.050035,0.03091,0.820768,16.403939,0.029026,5.300203
3,(GREEN REGENCY TEACUP AND SAUCER),(PINK REGENCY TEACUP AND SAUCER),0.050035,0.03766,0.03091,0.617773,16.403939,0.029026,2.517719
4,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.051267,0.050035,0.037553,0.732497,14.639752,0.034988,3.551237


From the table we can notice that the probability of product ALARM CLOCK BAKELIKE RED with ALARM CLOCK BAKELIKE GREEN being purchased together(support) is 3.01%. The probability of these two products purchased together is 60.5 %. Since the lift score > 1 for these two products, we can make use of ALARM CLOCK BAKELIKE RED to make predictions on ALARM CLOCK BAKELIKE GREEN, since lift score > 1 implies posistive impact. This rule can be incorrect 2.41 times more often if this association rule was purely a random chance.