## Importing libraries and reading the file

In [11]:
import pandas as pd
import numpy as np
import re
import mlxtend.frequent_patterns as ml

df = pd.read_csv("online_retail.csv")
df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


## Deleting items with 'InvoiceNo' starting with C and suspicion Description (ex: wrong order, lower case instead of upper case)

In [33]:
df = df[~df.InvoiceNo.str.contains('C')]
dfnew = df[~df.Description.str.contains('[a-z]+|wrongly|bad|wrong|away|fixed|test|sold|adjust|return|sample|damage|fix', regex = True, na=False)]

## Creating a suitable data structure in order to use the methods FP growth and Apriori

### Creating a dataframe grouping by InvoiceNo and Description

In [34]:
orders = dfnew.groupby(['InvoiceNo', 'Description'], as_index=False)
orders = orders.first().drop(columns=['StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'])
orders

Unnamed: 0,InvoiceNo,Description
0,536365,CREAM CUPID HEARTS COAT HANGER
1,536365,GLASS STAR FROSTED T-LIGHT HOLDER
2,536365,KNITTED UNION FLAG HOT WATER BOTTLE
3,536365,RED WOOLLY HOTTIE WHITE HEART.
4,536365,SET 7 BABUSHKA NESTING BOXES
...,...,...
517844,581587,CIRCUS PARADE LUNCH BOX
517845,581587,PACK OF 20 SPACEBOY NAPKINS
517846,581587,PLASTERS IN TIN CIRCUS PARADE
517847,581587,PLASTERS IN TIN STRONGMAN


### Creating a dataframe grouping by Description

In [35]:
items = dfnew.groupby(['Description'], as_index=False)
items = items.first().drop(columns=['InvoiceNo','StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'])
items

Unnamed: 0,Description
0,4 PURPLE FLOCK DINNER CANDLES
1,50'S CHRISTMAS GIFT BAG LARGE
2,DOLLY GIRL BEAKER
3,I LOVE LONDON MINI BACKPACK
4,I LOVE LONDON MINI RUCKSACK
...,...
4003,ZINC T-LIGHT HOLDER STARS SMALL
4004,ZINC TOP 2 DOOR WOODEN SHELF
4005,ZINC WILLIE WINKIE CANDLE STICK
4006,ZINC WIRE KITCHEN ORGANISER


### Creating a dictionary of orders: InvoiceNo as key, all the products of the order as values

In [36]:
orders_dict = {}
for i, row in orders.iterrows():
    if row.InvoiceNo not in orders_dict.keys():
        orders_dict[row.InvoiceNo] = []
    orders_dict[row.InvoiceNo].append(row.Description)
orders_dict

{'536365': ['CREAM CUPID HEARTS COAT HANGER',
  'GLASS STAR FROSTED T-LIGHT HOLDER',
  'KNITTED UNION FLAG HOT WATER BOTTLE',
  'RED WOOLLY HOTTIE WHITE HEART.',
  'SET 7 BABUSHKA NESTING BOXES',
  'WHITE HANGING HEART T-LIGHT HOLDER',
  'WHITE METAL LANTERN'],
 '536366': ['HAND WARMER RED POLKA DOT', 'HAND WARMER UNION JACK'],
 '536367': ['ASSORTED COLOUR BIRD ORNAMENT',
  'BOX OF 6 ASSORTED COLOUR TEASPOONS',
  'BOX OF VINTAGE ALPHABET BLOCKS',
  'BOX OF VINTAGE JIGSAW BLOCKS ',
  'DOORMAT NEW ENGLAND',
  'FELTCRAFT PRINCESS CHARLOTTE DOLL',
  'HOME BUILDING BLOCK WORD',
  'IVORY KNITTED MUG COSY ',
  'LOVE BUILDING BLOCK WORD',
  "POPPY'S PLAYHOUSE BEDROOM ",
  "POPPY'S PLAYHOUSE KITCHEN",
  'RECIPE BOX WITH METAL HEART'],
 '536368': ['BLUE COAT RACK PARIS FASHION',
  'JAM MAKING SET WITH JARS',
  'RED COAT RACK PARIS FASHION',
  'YELLOW COAT RACK PARIS FASHION'],
 '536369': ['BATH BUILDING BLOCK WORD'],
 '536370': [' SET 2 TEA TOWELS I LOVE LONDON ',
  'ALARM CLOCK BAKELIKE GREEN',

### Creating a matrix representing all the orders: the ith row represent an order and the jth coloumn represent a product, if matrix[i][j] is equal to 1, the order contains the jth product. In order to do so a dictionary containing the description of every product is created to speed things up

In [37]:
description_dict = {}
matrix = []

for (i, row) in items.iterrows():
    if row.Description not in description_dict.keys():
        description_dict[row.Description] = i

for key, values in orders_dict.items():
    appoarray = np.zeros(len(items))
    for val in values:
        appoarray[description_dict[val]] = 1
    matrix.append(appoarray.tolist())

### Creating the final dataframe I was looking for

In [38]:
df = pd.DataFrame(data = matrix, columns = items.Description)
df

Description,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,ZINC STAR T-LIGHT HOLDER,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK,ZINC WIRE KITCHEN ORGANISER,ZINC WIRE SWEETHEART LETTER TRAY
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
19909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Applying the FP growth method, it requires the dataframe just created and minsup

In [39]:
fi = ml.fpgrowth(df, 0.02, use_colnames=True)
print(len(fi))
print(fi.to_string())

379
      support                                           itemsets
0    0.113505               (WHITE HANGING HEART T-LIGHT HOLDER)
1    0.023053              (KNITTED UNION FLAG HOT WATER BOTTLE)
2    0.021546                   (RED WOOLLY HOTTIE WHITE HEART.)
3    0.024057                           (HAND WARMER UNION JACK)
4    0.073075                    (ASSORTED COLOUR BIRD ORNAMENT)
5    0.039024                         (HOME BUILDING BLOCK WORD)
6    0.031591                         (LOVE BUILDING BLOCK WORD)
7    0.030134                              (DOORMAT NEW ENGLAND)
8    0.022550                (FELTCRAFT PRINCESS CHARLOTTE DOLL)
9    0.021897                        (POPPY'S PLAYHOUSE KITCHEN)
10   0.021245                       (POPPY'S PLAYHOUSE BEDROOM )
11   0.056903                         (JAM MAKING SET WITH JARS)
12   0.056552                                          (POSTAGE)
13   0.052785                        (ALARM CLOCK BAKELIKE RED )
14   0.049219        

## Looking for association rules using confidence as a metric and requiring at least 85%

In [40]:
rules = ml.association_rules(fi, metric='confidence', min_threshold=0.85)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(PINK REGENCY TEACUP AND SAUCER, ROSES REGENCY...",(GREEN REGENCY TEACUP AND SAUCER),0.030084,0.050977,0.027221,0.904841,17.750046,0.025688,9.973068
1,"(PINK REGENCY TEACUP AND SAUCER, GREEN REGENCY...",(ROSES REGENCY TEACUP AND SAUCER ),0.031791,0.053538,0.027221,0.85624,15.993055,0.025519,6.58363


## Applying the Apriori method, it requires the dataframe just created and minsup

In [41]:
fa = ml.apriori(df, 0.05, use_colnames=True)
print(len(fa))
print(fa.to_string())

33
     support                              itemsets
0   0.052785           (ALARM CLOCK BAKELIKE RED )
1   0.073075       (ASSORTED COLOUR BIRD ORNAMENT)
2   0.050977     (GREEN REGENCY TEACUP AND SAUCER)
3   0.060318               (HEART OF WICKER SMALL)
4   0.058360              (JAM MAKING SET PRINTED)
5   0.056903            (JAM MAKING SET WITH JARS)
6   0.061172             (JUMBO BAG PINK POLKADOT)
7   0.105068             (JUMBO BAG RED RETROSPOT)
8   0.059013   (JUMBO SHOPPER VINTAGE RED PAISLEY)
9   0.059465              (JUMBO STORAGE BAG SUKI)
10  0.063935             (LUNCH BAG  BLACK SKULL.)
11  0.052032              (LUNCH BAG APPLE DESIGN)
12  0.057757                 (LUNCH BAG CARS BLUE)
13  0.054744             (LUNCH BAG PINK POLKADOT)
14  0.078550             (LUNCH BAG RED RETROSPOT)
15  0.058109          (LUNCH BAG SPACEBOY DESIGN )
16  0.054492              (LUNCH BAG SUKI DESIGN )
17  0.051128                  (LUNCH BAG WOODLAND)
18  0.062729     (NATURAL SL

## Which method is faster? In this case Apriori 

In [42]:
import timeit
print(timeit.timeit(lambda: ml.apriori(df, 0.05), number=1))
print(timeit.timeit(lambda: ml.fpgrowth(df, 0.05), number=1))

1.2233536999999615
3.359753400000045


## Usually Fp-tree is faster because the complexity is O(n).
## Apriori can be very fast if no items satisfy the minimum support. For example when your longest itemsets are 2 itemsets, a quite naive version can be fine. Apriori pruning as well as the fp-tree only begin to shine when you go for longer itemsets, which may require choosing a low support parameter.