In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
df = pd.read_csv('../datasets/Online_Retail.csv',index_col='Unnamed: 0')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Country        541909 non-null  object 
 1   CustomerID     406829 non-null  float64
 2   Description    540455 non-null  object 
 3   InvoiceNo      541909 non-null  object 
 4   Quantity       541909 non-null  float64
 5   StockCode      541909 non-null  object 
 6   UnitPrice      541909 non-null  float64
 7   InvoiceDate_1  541909 non-null  object 
dtypes: float64(3), object(5)
memory usage: 37.2+ MB


In [4]:
df.describe(include='all')

Unnamed: 0,Country,CustomerID,Description,InvoiceNo,Quantity,StockCode,UnitPrice,InvoiceDate_1
count,541909,406829.0,540455,541909.0,541909.0,541909,541909.0,541909
unique,38,,4223,25900.0,,4070,,23260
top,United Kingdom,,WHITE HANGING HEART T-LIGHT HOLDER,573585.0,,85123A,,2011-10-31 14:41:00
freq,495478,,2369,1114.0,,2313,,1114
mean,,15287.69057,,,9.55225,,4.611114,
std,,1713.600303,,,218.081158,,96.759853,
min,,12346.0,,,-80995.0,,-11062.06,
25%,,13953.0,,,1.0,,1.25,
50%,,15152.0,,,3.0,,2.08,
75%,,16791.0,,,10.0,,4.13,


In [5]:
df.head()

Unnamed: 0,Country,CustomerID,Description,InvoiceNo,Quantity,StockCode,UnitPrice,InvoiceDate_1
0,United Kingdom,17850.0,WHITE HANGING HEART T-LIGHT HOLDER,536365.0,6.0,85123A,2.55,2010-12-01 08:26:00
1,United Kingdom,17850.0,WHITE METAL LANTERN,536365.0,6.0,71053.0,3.39,2010-12-01 08:26:00
2,United Kingdom,17850.0,CREAM CUPID HEARTS COAT HANGER,536365.0,8.0,84406B,2.75,2010-12-01 08:26:00
3,United Kingdom,17850.0,KNITTED UNION FLAG HOT WATER BOTTLE,536365.0,6.0,84029G,3.39,2010-12-01 08:26:00
4,United Kingdom,17850.0,RED WOOLLY HOTTIE WHITE HEART.,536365.0,6.0,84029E,3.39,2010-12-01 08:26:00


In [6]:
df['Description']=df['Description'].str.strip()
df['Description']=df['Description'].str.lower()
df['Description']

0          white hanging heart t-light holder
1                         white metal lantern
2              cream cupid hearts coat hanger
3         knitted union flag hot water bottle
4              red woolly hottie white heart.
                         ...                 
541904            pack of 20 spaceboy napkins
541905            children's apron dolly girl
541906           childrens cutlery dolly girl
541907        childrens cutlery circus parade
541908           baking set 9 piece retrospot
Name: Description, Length: 541909, dtype: object

In [7]:
df['Description'].nunique()

4194

In [8]:
df['Description'].unique()

array(['white hanging heart t-light holder', 'white metal lantern',
       'cream cupid hearts coat hanger', ..., 'lost',
       'cream hanging heart t-light holder',
       'paper craft , little birdie'], dtype=object)

In [9]:
df['InvoiceNo'].sort_values()

0         536365.0
1         536365.0
2         536365.0
3         536365.0
4         536365.0
            ...   
540449     C581490
541541     C581499
541715     C581568
541716     C581569
541717     C581569
Name: InvoiceNo, Length: 541909, dtype: object

In [10]:
def clean_invoice(invoice):
    invoice = invoice.replace('.','')
    if invoice.isalnum():
        return int(invoice[1:])
    return int(invoice)

In [11]:
df['InvoiceNo']=df['InvoiceNo'].apply(clean_invoice)
df['InvoiceNo'].sort_values()

0         363650
1         363650
2         363650
3         363650
4         363650
           ...  
541896    815870
541895    815870
541907    815870
541900    815870
541908    815870
Name: InvoiceNo, Length: 541909, dtype: int64

In [12]:
df[['Description','InvoiceNo']].groupby(by='InvoiceNo').count().sort_values(by='Description')

Unnamed: 0_level_0,Description
InvoiceNo,Unnamed: 1_level_1
514290,0
486840,0
543520,0
486830,0
773390,0
...,...
584750,705
807290,721
814920,731
812190,749


In [13]:
df['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [14]:
df[['Country','Description','InvoiceNo']].groupby(['Country','InvoiceNo']).count().reset_index()

Unnamed: 0,Country,InvoiceNo,Description
0,Australia,363890,14
1,Australia,376760,8
2,Australia,394190,10
3,Australia,402670,46
4,Australia,402800,2
...,...,...,...
25744,Unspecified,639470,7
25745,Unspecified,640510,16
25746,Unspecified,653030,66
25747,Unspecified,766460,19


In [15]:
df['Country'].value_counts()

United Kingdom          495478
Germany                   9495
France                    8557
EIRE                      8196
Spain                     2533
Netherlands               2371
Belgium                   2069
Switzerland               2002
Portugal                  1519
Australia                 1259
Norway                    1086
Italy                      803
Channel Islands            758
Finland                    695
Cyprus                     622
Sweden                     462
Unspecified                446
Austria                    401
Denmark                    389
Japan                      358
Poland                     341
Israel                     297
USA                        291
Hong Kong                  288
Singapore                  229
Iceland                    182
Canada                     151
Greece                     146
Malta                      127
United Arab Emirates        68
European Community          61
RSA                         58
Lebanon 

# main code

In [17]:
germanydf = df[df.Country=='Germany'].copy()

In [18]:
germanydf

Unnamed: 0,Country,CustomerID,Description,InvoiceNo,Quantity,StockCode,UnitPrice,InvoiceDate_1
1109,Germany,12662.0,set of 6 t-lights santa,365270,6.0,22809.0,2.95,2010-12-01 13:04:00
1110,Germany,12662.0,rotating silver angels t-light hldr,365270,6.0,84347.0,2.55,2010-12-01 13:04:00
1111,Germany,12662.0,multi colour silver t-light holder,365270,12.0,84945.0,0.85,2010-12-01 13:04:00
1112,Germany,12662.0,5 hook hanger magic toadstool,365270,12.0,22242.0,1.65,2010-12-01 13:04:00
1113,Germany,12662.0,3 hook hanger magic garden,365270,12.0,22244.0,1.95,2010-12-01 13:04:00
...,...,...,...,...,...,...,...,...
541801,Germany,12713.0,set of 4 pantry jelly moulds,815780,12.0,22993.0,1.25,2011-12-09 12:16:00
541802,Germany,12713.0,pack of 20 napkins pantry design,815780,12.0,22907.0,0.85,2011-12-09 12:16:00
541803,Germany,12713.0,pack of 20 napkins red apples,815780,12.0,22908.0,0.85,2011-12-09 12:16:00
541804,Germany,12713.0,jingle bell heart antique silver,815780,12.0,23215.0,2.08,2011-12-09 12:16:00


In [25]:
pivotdata = germanydf.groupby(['InvoiceNo','Description'])['Quantity'].sum().unstack()

In [26]:
pivotdata

Description,10 colour spaceboy pen,12 coloured party balloons,12 ivory rose peg place settings,12 message cards with envelopes,12 pencil small tube woodland,12 pencils small tube red retrospot,12 pencils small tube skull,12 pencils tall tube posy,12 pencils tall tube red retrospot,12 pencils tall tube skulls,...,yuletide images gift wrap set,zinc heart t-light holder,zinc star t-light holder,zinc box sign home,zinc folkart sleigh bells,zinc heart lattice t-light holder,zinc metal heart decoration,zinc t-light holder star large,zinc t-light holder stars small,zinc willie winkie candle stick
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
365270,,,,,,,,,,,...,,,,,,,,,,
368400,,,,,,,,,,,...,,,,,,,,,,
368610,,,,,,,,,,,...,,,,,,,,,,
369670,,,,,,,,,,,...,,,,,,,,,,
369830,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812660,,,,,,,,,,,...,,,,,,,,,,
814940,,,,,,,,,,,...,,,,,,,,,,
815700,,,,,,,,,,,...,,,,,,,,,,
815740,,,,,,,,,,,...,,,,,,,,,,


In [27]:
pivotdata.fillna(0,inplace=True)

In [28]:
pivotdata

Description,10 colour spaceboy pen,12 coloured party balloons,12 ivory rose peg place settings,12 message cards with envelopes,12 pencil small tube woodland,12 pencils small tube red retrospot,12 pencils small tube skull,12 pencils tall tube posy,12 pencils tall tube red retrospot,12 pencils tall tube skulls,...,yuletide images gift wrap set,zinc heart t-light holder,zinc star t-light holder,zinc box sign home,zinc folkart sleigh bells,zinc heart lattice t-light holder,zinc metal heart decoration,zinc t-light holder star large,zinc t-light holder stars small,zinc willie winkie candle stick
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
365270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
368400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
368610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
369670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
369830,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
814940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
def hot_encoder(val):
    if val >= 1:
        return 1
    return 0

In [35]:
pivotdata = pivotdata.applymap(hot_encoder)
pivotdata

Description,10 colour spaceboy pen,12 coloured party balloons,12 ivory rose peg place settings,12 message cards with envelopes,12 pencil small tube woodland,12 pencils small tube red retrospot,12 pencils small tube skull,12 pencils tall tube posy,12 pencils tall tube red retrospot,12 pencils tall tube skulls,...,yuletide images gift wrap set,zinc heart t-light holder,zinc star t-light holder,zinc box sign home,zinc folkart sleigh bells,zinc heart lattice t-light holder,zinc metal heart decoration,zinc t-light holder star large,zinc t-light holder stars small,zinc willie winkie candle stick
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
365270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
368400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
368610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
369670,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
369830,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
814940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
815700,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
815740,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
item_freq = apriori(pivotdata,min_support=.05, use_colnames=True)
item_freq.sort_values('support',ascending=False)

Unnamed: 0,support,itemsets
12,0.620232,(postage)
18,0.185738,(round snack boxes set of4 woodland)
37,0.170813,"(round snack boxes set of4 woodland, postage)"
17,0.119403,(round snack boxes set of 4 fruits)
36,0.114428,"(postage, round snack boxes set of 4 fruits)"
11,0.104478,(plasters in tin woodland animals)
16,0.104478,(regency cakestand 3 tier)
41,0.099502,"(round snack boxes set of4 woodland, round sna..."
22,0.096186,(woodland charlotte bag)
43,0.094527,"(round snack boxes set of4 woodland, postage, ..."


In [40]:
ar = association_rules(item_freq,metric='confidence', min_threshold=.1)

In [42]:
ar.sort_values(['lift','confidence'],ascending=[False,False],inplace=True)
ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7,(plasters in tin circus parade),(plasters in tin woodland animals),0.087894,0.104478,0.05141,0.584906,5.598383,0.042227,2.157395
6,(plasters in tin woodland animals),(plasters in tin circus parade),0.104478,0.087894,0.05141,0.492063,5.598383,0.042227,1.795709
38,(round snack boxes set of 4 fruits),"(round snack boxes set of4 woodland, postage)",0.119403,0.170813,0.094527,0.791667,4.634709,0.074132,3.9801
33,"(round snack boxes set of4 woodland, postage)",(round snack boxes set of 4 fruits),0.170813,0.119403,0.094527,0.553398,4.634709,0.074132,1.971772
29,(round snack boxes set of4 woodland),(round snack boxes set of 4 fruits),0.185738,0.119403,0.099502,0.535714,4.486607,0.077325,1.89667
30,(round snack boxes set of 4 fruits),(round snack boxes set of4 woodland),0.119403,0.185738,0.099502,0.833333,4.486607,0.077325,4.885572
35,"(postage, round snack boxes set of 4 fruits)",(round snack boxes set of4 woodland),0.114428,0.185738,0.094527,0.826087,4.447593,0.073274,4.682007
36,(round snack boxes set of4 woodland),"(postage, round snack boxes set of 4 fruits)",0.185738,0.114428,0.094527,0.508929,4.447593,0.073274,1.803347
32,(spaceboy lunch box),(round snack boxes set of4 woodland),0.077944,0.185738,0.053068,0.680851,3.665653,0.038591,2.551354
31,(round snack boxes set of4 woodland),(spaceboy lunch box),0.185738,0.077944,0.053068,0.285714,3.665653,0.038591,1.290879
