In [1]:
#step 1: import libraries

import os

import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
#step 2: import data

# Location of the Online Retail workbook. Set the ONLINE_RETAIL_XLSX environment
# variable to load a copy stored elsewhere; when it is unset, the original
# hard-coded location is used so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "ONLINE_RETAIL_XLSX",
    r"C:\Users\roche\Desktop\SUBJECT FOLDERS\3rd Year\2nd Semester\Data Mining\Apriori Algorithm in Jupyter\Online Retail.xlsx",
)

data = pd.read_excel(DATA_PATH)
# Preview the first rows as the cell output.
data.head()

In [3]:
# Show the column labels of the loaded frame; the later cells rely on
# 'InvoiceNo', 'Description', 'Quantity' and 'Country'.
data.columns 

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [4]:
#step 3: Cleaning the data
# Author's note (kept from the original cell): this step was earmarked to be
# executed in BigQuery instead; the pandas version stays here so the notebook
# can be run on its own.

# The cleaned frame is built as a single chain rather than by mutating the
# loaded frame in place (no inplace=True, no column assignment on the raw object).
data = (
    data
    # Stripping extra spaces in the description
    .assign(Description=lambda frame: frame['Description'].str.strip())
    # Dropping the rows without any invoice number
    .dropna(axis=0, subset=['InvoiceNo'])
    # Invoice numbers are matched as text below, so store them as strings
    .astype({'InvoiceNo': 'str'})
)

# Dropping cancelled transactions. In the UCI Online Retail data an invoice
# number carrying the letter 'C' marks a cancellation (not a credit sale).
# str.contains matches the letter anywhere in the invoice number.
data = data[~data['InvoiceNo'].str.contains('C')]

In [5]:
#step 4: Splitting the data according to the region of transaction

def _basket_for_country(transactions: pd.DataFrame, country: str) -> pd.DataFrame:
    """Pivot one country's transactions into an invoice-by-item quantity table.

    Parameters
    ----------
    transactions : pd.DataFrame
        Frame with 'Country', 'InvoiceNo', 'Description' and 'Quantity' columns.
    country : str
        Exact value of the 'Country' column to keep.

    Returns
    -------
    pd.DataFrame
        Indexed by 'InvoiceNo' with one column per 'Description'. Each cell is
        the summed 'Quantity' for that invoice/item pair, or 0 where the item
        does not appear on the invoice.
    """
    return (
        transactions[transactions['Country'] == country]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum()
        # unstack() already leaves 'InvoiceNo' as the index, so no
        # reset_index()/set_index() round trip is needed before filling gaps.
        .unstack()
        .fillna(0)
    )


# Transactions done in France
basket_France = _basket_for_country(data, "France")

# Transactions done in the United Kingdom
basket_UK = _basket_for_country(data, "United Kingdom")

# Transactions done in Portugal
basket_Por = _basket_for_country(data, "Portugal")

# Transactions done in Sweden
basket_Sweden = _basket_for_country(data, "Sweden")

In [6]:
#step 5: One-hot encoding of the data

# Defining the hot encoding function to make the data suitable  
# for the concerned libraries 
def hot_encode(x):
    """Collapse a basket quantity into a presence flag.

    Parameters
    ----------
    x : number
        Summed quantity of one item on one invoice.

    Returns
    -------
    int
        1 when the quantity is strictly positive, otherwise 0.

    Every input now yields 0 or 1. Values strictly between 0 and 1 count as
    present (1), and NaN counts as absent (0) because `NaN > 0` is False.
    Previously both cases fell through and returned None.
    """
    return 1 if x > 0 else 0
  
# Apply hot_encode cell by cell so every basket holds only 0/1 presence flags.
basket_France = basket_France.map(hot_encode)
basket_UK = basket_UK.map(hot_encode)
basket_Por = basket_Por.map(hot_encode)
basket_Sweden = basket_Sweden.map(hot_encode)

# The scratch name ends up bound to the last encoded basket, as before.
basket_encoded = basket_Sweden

In [7]:
#step 6: Building the models and analyzing the results

# Mine the itemsets that occur in at least 6% of the French invoices.
frq_items = apriori(
    basket_France.astype('bool'),
    min_support=0.06,
    use_colnames=True,
)

# Derive rules with lift >= 1, ordered by confidence and then lift (both descending).
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
display(rules)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
23,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.000000,1.306667,0.017961,inf,0.254144
182,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.099490,0.975000,7.644000,0.086474,34.897959,0.967949
183,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.099490,0.975000,7.077778,0.085433,34.489796,0.956294
188,"(SET/20 RED RETROSPOT PAPER NAPKINS, POSTAGE, ...",(SET/6 RED SPOTTY PAPER PLATES),0.084184,0.127551,0.081633,0.969697,7.602424,0.070895,28.790816,0.948294
189,"(SET/20 RED RETROSPOT PAPER NAPKINS, POSTAGE, ...",(SET/6 RED SPOTTY PAPER CUPS),0.084184,0.137755,0.081633,0.969697,7.039282,0.070036,28.454082,0.936804
...,...,...,...,...,...,...,...,...,...,...
62,(POSTAGE),(PLASTERS IN TIN STRONGMAN),0.765306,0.081633,0.063776,0.083333,1.020833,0.001302,1.001855,0.086957
46,(POSTAGE),(PACK OF 6 SKULL PAPER CUPS),0.765306,0.063776,0.061224,0.080000,1.254400,0.012417,1.017635,0.864130
137,(POSTAGE),"(LUNCH BAG APPLE DESIGN, LUNCH BAG RED RETROSPOT)",0.765306,0.066327,0.061224,0.080000,1.206154,0.010464,1.014862,0.728261
16,(POSTAGE),(COFFEE MUG APPLES DESIGN),0.765306,0.068878,0.061224,0.080000,1.161481,0.008512,1.012090,0.592391


In [8]:
# Mine the itemsets that occur in at least 2% of the United Kingdom invoices.
frq_items = apriori(
    basket_UK.astype('bool'),
    min_support=0.02,
    use_colnames=True,
)

# Derive rules with lift >= 1, ordered by confidence and then lift (both descending).
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
display(rules)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
166,"(PINK REGENCY TEACUP AND SAUCER, ROSES REGENCY...",(GREEN REGENCY TEACUP AND SAUCER),0.029249,0.050035,0.026410,0.902930,18.046041,0.024947,9.786434,0.973047
164,"(GREEN REGENCY TEACUP AND SAUCER, PINK REGENCY...",(ROSES REGENCY TEACUP AND SAUCER),0.030910,0.051267,0.026410,0.854419,16.666089,0.024826,6.516893,0.969980
27,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.037660,0.050035,0.030910,0.820768,16.403939,0.029026,5.300203,0.975787
172,"(JUMBO STORAGE BAG SUKI, JUMBO BAG PINK POLKADOT)",(JUMBO BAG RED RETROSPOT),0.027053,0.103820,0.021696,0.801980,7.724749,0.018887,4.525711,0.894752
146,(PINK REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.037660,0.051267,0.029249,0.776671,15.149556,0.027319,4.248149,0.970542
...,...,...,...,...,...,...,...,...,...,...
181,(JUMBO BAG RED RETROSPOT),"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO STOR...",0.103820,0.027482,0.020571,0.198142,7.209989,0.017718,1.212832,0.961083
136,(WHITE HANGING HEART T-LIGHT HOLDER),(NATURAL SLATE HEART CHALKBOARD),0.116034,0.065302,0.021964,0.189289,2.898653,0.014387,1.152936,0.740992
160,(WHITE HANGING HEART T-LIGHT HOLDER),(WOODEN PICTURE FRAME WHITE FINISH),0.116034,0.057642,0.021642,0.186519,3.235826,0.014954,1.158427,0.781659
145,(WHITE HANGING HEART T-LIGHT HOLDER),(PARTY BUNTING),0.116034,0.085391,0.020250,0.174515,2.043711,0.010341,1.107966,0.577730


In [9]:
# Mine the itemsets that occur in at least 5% of the Portuguese invoices.
frq_items = apriori(
    basket_Por.astype('bool'),
    min_support=0.05,
    use_colnames=True,
)

# Derive rules with lift >= 1, ordered by confidence and then lift (both descending).
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
display(rules)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1170,(SET 12 COLOUR PENCILS SPACEBOY),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf,1.000000
1171,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET 12 COLOUR PENCILS SPACEBOY),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf,1.000000
1172,(SET OF 4 KNICK KNACK TINS LONDON),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf,1.000000
1173,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET OF 4 KNICK KNACK TINS LONDON),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf,1.000000
1174,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET OF 4 KNICK KNACK TINS POPPIES),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf,1.000000
...,...,...,...,...,...,...,...,...,...,...
1056,(POSTAGE),(REGENCY CAKESTAND 3 TIER),0.517241,0.086207,0.051724,0.1,1.160000,0.007134,1.015326,0.285714
1059,(POSTAGE),(RETROSPOT HEART HOT WATER BOTTLE),0.517241,0.086207,0.051724,0.1,1.160000,0.007134,1.015326,0.285714
1066,(POSTAGE),(SET OF 3 CAKE TINS PANTRY DESIGN),0.517241,0.086207,0.051724,0.1,1.160000,0.007134,1.015326,0.285714
1070,(POSTAGE),(SET OF 36 TEATIME PAPER DOILIES),0.517241,0.086207,0.051724,0.1,1.160000,0.007134,1.015326,0.285714


In [10]:
# Mine the itemsets that occur in at least 4% of the Swedish invoices.
frq_items = apriori(
    basket_Sweden.astype('bool'),
    min_support=0.04,
    use_colnames=True,
)

# Derive rules with lift >= 1, ordered by confidence and then lift (both descending).
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
display(rules)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(PACK OF 72 SKULL CAKE CASES),(12 PENCILS SMALL TUBE SKULL),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
1,(12 PENCILS SMALL TUBE SKULL),(PACK OF 72 SKULL CAKE CASES),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
4,(36 DOILIES DOLLY GIRL),(ASSORTED BOTTLE TOP MAGNETS),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
5,(ASSORTED BOTTLE TOP MAGNETS),(36 DOILIES DOLLY GIRL),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
180,(CHILDRENS CUTLERY DOLLY GIRL),(CHILDRENS CUTLERY CIRCUS PARADE),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
...,...,...,...,...,...,...,...,...,...,...
25486,(POSTAGE),"(WOODEN OWLS LIGHT GARLAND, CHILDRENS CUTLERY ...",0.611111,0.055556,0.055556,0.090909,1.636364,0.021605,1.038889,1.000000
202,(POSTAGE),(CUPCAKE LACE PAPER SET 6),0.611111,0.083333,0.055556,0.090909,1.090909,0.004630,1.008333,0.214286
384,(POSTAGE),(MINI PLAYING CARDS DOLLY GIRL),0.611111,0.083333,0.055556,0.090909,1.090909,0.004630,1.008333,0.214286
482,(POSTAGE),(ROUND SNACK BOXES SET OF4 WOODLAND),0.611111,0.083333,0.055556,0.090909,1.090909,0.004630,1.008333,0.214286
