# Aim: Program to Implement Apriori Algorithm

## Step 1: Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

## Step2: Loading and Exploring the Data

In [2]:
data = pd.read_excel('online_retail.xlsx')

### Read from 'data'

In [3]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [6]:
data.Country.unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

## Step 3: Clean the data

### Stripping Extra Spaces from Description

In [3]:
data['Description'] = data['Description'].str.strip()

### Drop rows with empty invoice number

In [4]:
data.dropna(axis=0, subset=['InvoiceNo'], inplace=True)

In [5]:
data['InvoiceNo'] = data['InvoiceNo'].astype('str')

### Drop Credit Transactions

In [6]:
data = data[~data['InvoiceNo'].str.contains('C')]

## Step 4: Split Data Regional Transactions

In [8]:
bucket_France = (data[data['Country']=="France"].groupby(['InvoiceNo', 'Description'])
                 ['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))

In [9]:
bucket_UK = (data[data['Country']=="United Kingdom"].groupby(['InvoiceNo', 'Description'])
                 ['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))

In [10]:
bucket_Portugal = (data[data['Country']=="Portugal"].groupby(['InvoiceNo', 'Description'])
                 ['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))

In [11]:
bucket_Italy = (data[data['Country']=="Italy"].groupby(['InvoiceNo', 'Description'])
                 ['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))

In [7]:
bucket_Germany = (data[data['Country']=="Italy"].groupby(['InvoiceNo', 'Description'])
                 ['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))

## Step 5: Hot Encode the data

### Define the hot encode function

In [9]:
def hot_encode(x):
    if(x<=0):
        return 0
    if(x>=1):
        return 1

## Encoding Datasets

In [13]:
b_encoded = bucket_France.applymap(hot_encode)
bucket_France = b_encoded

In [14]:
b_encoded = bucket_UK.applymap(hot_encode)
bucket_UK = b_encoded

In [15]:
b_encoded = bucket_Portugal.applymap(hot_encode)
bucket_Portugal = b_encoded

In [16]:
b_encoded = bucket_Italy.applymap(hot_encode)
bucket_Italy = b_encoded

In [10]:
b_encoded = bucket_Germany.applymap(hot_encode)
bucket_Germany = b_encoded

## Step 6: Build Models and Analyse Results

### France:

In [17]:
frq_items = apriori(bucket_France, min_support=0.05, use_colnames = True)

### Collecting the inferred rules in a dataframe

In [18]:
rules = association_rules(frq_items, metric="lift", min_threshold=1)

In [19]:
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])

In [20]:
print(rules.head())

                                           antecedents  \
45                        (JUMBO BAG WOODLAND ANIMALS)   
260  (PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...   
272  (RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...   
301  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...   
300  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...   

                         consequents  antecedent support  consequent support  \
45                         (POSTAGE)            0.076531            0.765306   
260                        (POSTAGE)            0.051020            0.765306   
272                        (POSTAGE)            0.053571            0.765306   
301  (SET/6 RED SPOTTY PAPER PLATES)            0.102041            0.127551   
300    (SET/6 RED SPOTTY PAPER CUPS)            0.102041            0.137755   

      support  confidence      lift  leverage  conviction  
45   0.076531       1.000  1.306667  0.017961         inf  
260  0.051020       1.000  1.306667  0.011974     

## UK

In [21]:
frq_items = apriori(bucket_UK, min_support=0.04, use_colnames = True)

In [22]:
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

                 antecedents                consequents  antecedent support  \
1  (JUMBO BAG PINK POLKADOT)  (JUMBO BAG RED RETROSPOT)            0.062088   
0  (JUMBO BAG RED RETROSPOT)  (JUMBO BAG PINK POLKADOT)            0.103820   

   consequent support   support  confidence      lift  leverage  conviction  
1            0.103820  0.042053    0.677308  6.523895  0.035607    2.777201  
0            0.062088  0.042053    0.405057  6.523895  0.035607    1.576473  


In [24]:
frq_items = apriori(bucket_Portugal, min_support=0.04, use_colnames = True)

In [25]:
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

                              antecedents                         consequents  \
1170     (SET 12 COLOUR PENCILS SPACEBOY)  (SET 12 COLOUR PENCILS DOLLY GIRL)   
1171   (SET 12 COLOUR PENCILS DOLLY GIRL)    (SET 12 COLOUR PENCILS SPACEBOY)   
1172   (SET 12 COLOUR PENCILS DOLLY GIRL)  (SET OF 4 KNICK KNACK TINS LONDON)   
1173   (SET OF 4 KNICK KNACK TINS LONDON)  (SET 12 COLOUR PENCILS DOLLY GIRL)   
1174  (SET OF 4 KNICK KNACK TINS POPPIES)  (SET 12 COLOUR PENCILS DOLLY GIRL)   

      antecedent support  consequent support   support  confidence       lift  \
1170            0.051724            0.051724  0.051724         1.0  19.333333   
1171            0.051724            0.051724  0.051724         1.0  19.333333   
1172            0.051724            0.051724  0.051724         1.0  19.333333   
1173            0.051724            0.051724  0.051724         1.0  19.333333   
1174            0.051724            0.051724  0.051724         1.0  19.333333   

      leverage  conviction

In [26]:
frq_items = apriori(bucket_Italy, min_support=0.04, use_colnames = True)

In [27]:
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

                           antecedents                        consequents  \
22          (RED LOVE HEART SHAPE CUP)  (BAKING MOULD CHOCOLATE CUPCAKES)   
23   (BAKING MOULD CHOCOLATE CUPCAKES)         (RED LOVE HEART SHAPE CUP)   
54          (HOME BUILDING BLOCK WORD)         (BATH BUILDING BLOCK WORD)   
55          (BATH BUILDING BLOCK WORD)         (HOME BUILDING BLOCK WORD)   
164      (CHRISTMAS CRAFT WHITE FAIRY)   (CHRISTMAS CRAFT LITTLE FRIENDS)   

     antecedent support  consequent support   support  confidence  lift  \
22             0.052632            0.052632  0.052632         1.0  19.0   
23             0.052632            0.052632  0.052632         1.0  19.0   
54             0.052632            0.052632  0.052632         1.0  19.0   
55             0.052632            0.052632  0.052632         1.0  19.0   
164            0.052632            0.052632  0.052632         1.0  19.0   

     leverage  conviction  
22   0.049861         inf  
23   0.049861         inf  
54

In [11]:
frq_items = apriori(bucket_Germany, min_support=0.03, use_colnames = True)

In [12]:
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

                           antecedents                        consequents  \
22   (BAKING MOULD CHOCOLATE CUPCAKES)         (RED LOVE HEART SHAPE CUP)   
23          (RED LOVE HEART SHAPE CUP)  (BAKING MOULD CHOCOLATE CUPCAKES)   
54          (HOME BUILDING BLOCK WORD)         (BATH BUILDING BLOCK WORD)   
55          (BATH BUILDING BLOCK WORD)         (HOME BUILDING BLOCK WORD)   
164   (CHRISTMAS CRAFT LITTLE FRIENDS)      (CHRISTMAS CRAFT WHITE FAIRY)   

     antecedent support  consequent support   support  confidence  lift  \
22             0.052632            0.052632  0.052632         1.0  19.0   
23             0.052632            0.052632  0.052632         1.0  19.0   
54             0.052632            0.052632  0.052632         1.0  19.0   
55             0.052632            0.052632  0.052632         1.0  19.0   
164            0.052632            0.052632  0.052632         1.0  19.0   

     leverage  conviction  
22   0.049861         inf  
23   0.049861         inf  
54