In [1]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# load data into a pandas dataframe
DATA_LOCATION='Online Retail (1).xlsx';
data = pd.read_excel(DATA_LOCATION) 

# show the top 10 rows
data.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom


In [3]:
# What are the columns of our data?
print(data.columns)
print('---------------------------------------------------------------------\n')
# What are the different regions that the transactions are from? (Countries)
print(data['Country'].unique())

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')
---------------------------------------------------------------------

['United Kingdom' 'France' 'Australia' 'Netherlands' 'Germany' 'Norway'
 'EIRE' 'Switzerland' 'Spain' 'Poland' 'Portugal' 'Italy' 'Belgium'
 'Lithuania' 'Japan' 'Iceland' 'Channel Islands' 'Denmark' 'Cyprus'
 'Sweden' 'Austria' 'Israel' 'Finland' 'Bahrain' 'Greece' 'Hong Kong'
 'Singapore' 'Lebanon' 'United Arab Emirates' 'Saudi Arabia'
 'Czech Republic' 'Canada' 'Unspecified' 'Brazil' 'USA'
 'European Community' 'Malta' 'RSA']


In [4]:
# Stripping extra spaces in the descriptioni 
data['Description']= data['Description'].str.strip();
  
# Dropping the rows without any invoice number 
data = data.dropna(axis=0, subset=['InvoiceNo'])

# Dropping all transactions which were done on credit 
data['InvoiceNo'] = data['InvoiceNo'].astype('str') 
data = data[~data['InvoiceNo'].str.contains('C')] 


In [5]:
# Splitting the data according to the region of transaction
# select data rows by country designated below
# use groupby('InvoiceNo', 'Description')['Quantity']
# sum data
# fillna(0)
# set_index('Invoice Number)

# Transactions done in France 
basket_France =(data[data['Country'] =="France"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))
  
# Transactions done in the United Kingdom 
basket_UK = (data[data['Country'] =="United Kingdom"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))
  
# Transactions done in Portugal 
basket_Por =(data[data['Country'] =="Portugal"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))
  
# Transactions done in Sweden     
basket_Sweden =(data[data['Country'] =="Sweden"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))

In [6]:
# Define a hot encoding function to make the data suitable  
# for the concerned libraries 
# value >= 1 -> 1 and similar for less than 0
def hot_encode(x): 
    if x <= 0:
        return 0
    if x >= 1:
        return 1
  
# Encoding the datasets 
# apply hot_encode function to data from above
basket_encoded_France = basket_France.applymap(hot_encode)
basket_France = basket_encoded_France 
  
basket_encoded_UK = basket_UK.applymap(hot_encode)
basket_UK = basket_encoded_UK 
  
basket_encoded_Por =basket_Por.applymap(hot_encode)
basket_Por = basket_encoded_Por 
  
basket_encoded_Sweden = basket_Sweden.applymap(hot_encode)
basket_Sweden = basket_encoded_Sweden 

## France


In [7]:
# for each of the countries build a model
# view apriori documation
# min_support = 0.05
frq_items_France =apriori(basket_encoded_France, min_support=0.05, use_colnames=True) 
  
# Collecting the inferred rules in a dataframe 
# use association_rules with metric='lift' and mininum threshold = 1
rules = association_rules(frq_items_France, metric="lift", min_threshold=1)

# given the rules sort by confidence & lift, in descending order

rules=rules.sort_values(['confidence','lift'],ascending=['False','False'])

rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
213,(POSTAGE),"(LUNCH BAG SPACEBOY DESIGN, LUNCH BAG APPLE DE...",0.765306,0.063776,0.05102,0.066667,1.045333,0.002213,1.003098
27,(POSTAGE),(CIRCUS PARADE CHILDRENS EGG CUP),0.765306,0.056122,0.05102,0.066667,1.187879,0.00807,1.011297
96,(POSTAGE),(PARTY BUNTING),0.765306,0.056122,0.05102,0.066667,1.187879,0.00807,1.011297
226,(POSTAGE),"(LUNCH BAG WOODLAND, LUNCH BAG RED RETROSPOT)",0.765306,0.056122,0.05102,0.066667,1.187879,0.00807,1.011297
36,(POSTAGE),(JAM MAKING SET PRINTED),0.765306,0.053571,0.05102,0.066667,1.244444,0.010022,1.014031


## United Kingdom

In [8]:
frq_items_UK =apriori(basket_encoded_UK, min_support=0.05, use_colnames=True) 
  
# Collecting the inferred rules in a dataframe 
# use association_rules with metric='lift' and mininum threshold = 1
rules = association_rules(frq_items_UK, metric="lift", min_threshold=1)

# given the rules sort by confidence & lift, in descending order

rules=rules.sort_values(['confidence','lift'],ascending=['False','False'])

rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


## Portugal

In [9]:
frq_items_Por =apriori(basket_encoded_Por, min_support=0.05, use_colnames=True) 
  
# Collecting the inferred rules in a dataframe 
# use association_rules with metric='lift' and mininum threshold = 1
rules = association_rules(frq_items_France, metric="lift", min_threshold=1)

# given the rules sort by confidence & lift, in descending order

rules=rules.sort_values(['confidence','lift'],ascending=['False','False'])

rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
213,(POSTAGE),"(LUNCH BAG SPACEBOY DESIGN, LUNCH BAG APPLE DE...",0.765306,0.063776,0.05102,0.066667,1.045333,0.002213,1.003098
27,(POSTAGE),(CIRCUS PARADE CHILDRENS EGG CUP),0.765306,0.056122,0.05102,0.066667,1.187879,0.00807,1.011297
96,(POSTAGE),(PARTY BUNTING),0.765306,0.056122,0.05102,0.066667,1.187879,0.00807,1.011297
226,(POSTAGE),"(LUNCH BAG WOODLAND, LUNCH BAG RED RETROSPOT)",0.765306,0.056122,0.05102,0.066667,1.187879,0.00807,1.011297
36,(POSTAGE),(JAM MAKING SET PRINTED),0.765306,0.053571,0.05102,0.066667,1.244444,0.010022,1.014031


## Sweden

In [10]:
frq_items_Sweden =apriori(basket_encoded_Sweden, min_support=0.05, use_colnames=True) 
  
# Collecting the inferred rules in a dataframe 
# use association_rules with metric='lift' and mininum threshold = 1
rules = association_rules(frq_items_France, metric="lift", min_threshold=1)

# given the rules sort by confidence & lift, in descending order

rules=rules.sort_values(['confidence','lift'],ascending=['False','False'])

rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
213,(POSTAGE),"(LUNCH BAG SPACEBOY DESIGN, LUNCH BAG APPLE DE...",0.765306,0.063776,0.05102,0.066667,1.045333,0.002213,1.003098
27,(POSTAGE),(CIRCUS PARADE CHILDRENS EGG CUP),0.765306,0.056122,0.05102,0.066667,1.187879,0.00807,1.011297
96,(POSTAGE),(PARTY BUNTING),0.765306,0.056122,0.05102,0.066667,1.187879,0.00807,1.011297
226,(POSTAGE),"(LUNCH BAG WOODLAND, LUNCH BAG RED RETROSPOT)",0.765306,0.056122,0.05102,0.066667,1.187879,0.00807,1.011297
36,(POSTAGE),(JAM MAKING SET PRINTED),0.765306,0.053571,0.05102,0.066667,1.244444,0.010022,1.014031
