## **Apriori and TBAR algorithms**
by Eva Aßmann, Paul Vogler

# Explore Dataset

In [0]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [0]:
#-------------------------------------------------------------------------------------------
#Dataset from  (http://archive.ics.uci.edu/ml/datasets/Online+Retail)
#using code from https://www.edureka.co/blog/apriori-algorithm/
#-------------------------------------------------------------------------------------------
df = pd.read_excel("Online Retail.xlsx")
# df = pd.read_csv("Online Retail.xlsx", sep=";") 
df.head() 

In [0]:
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

* InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.
* StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
* Description: Product (item) name. Nominal.
* Quantity: The quantities of each product (item) per transaction. Numeric.
* InvoiceDate: Invice Date and time. Numeric, the day and time when each transaction was generated.
* UnitPrice: Unit price. Numeric, Product price per unit in sterling.
* CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
* Country: Country name. Nominal, the name of the country where each customer resides.

In [0]:
df.shape

(65534, 8)

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65534 entries, 0 to 65533
Data columns (total 8 columns):
InvoiceNo      65534 non-null object
StockCode      65534 non-null object
Description    65368 non-null object
Quantity       65534 non-null int64
InvoiceDate    65534 non-null object
UnitPrice      65534 non-null object
CustomerID     40218 non-null float64
Country        65534 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 4.0+ MB


In [0]:
df.describe()

Unnamed: 0,Quantity,CustomerID
count,65534.0,40218.0
mean,8.363231,15384.033517
std,413.697637,1766.863499
min,-74215.0,12346.0
25%,1.0,14001.0
50%,2.0,15358.0
75%,8.0,17019.0
max,74215.0,18283.0


## Analyse InvoiceNo values

In [0]:
[elem for elem in df.InvoiceNo if len(elem)<6]

[]

In [0]:
# Special InvoiceNo labels
set([elem[0] for elem in df.InvoiceNo.unique() if elem[0].isalpha() ])

{'C'}

In [0]:
df[df.InvoiceNo.str.startswith("A")]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


## Analyse StockCode values
Values deviating from predefined format (5-digits id)

In [0]:
set([elem for elem in df.StockCode if len(elem)<5])

{'C2', 'D', 'DOT', 'M', 'POST', 'S', 'm'}

In [0]:
df[df.StockCode.str.startswith("B")][df.StockCode!="B"]

  """Entry point for launching an IPython kernel.


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
4406,536779,BANK CHARGES,Bank Charges,1,12/2/10 15:08,15,15823.0,United Kingdom
14435,C537572,BANK CHARGES,Bank Charges,-1,12/7/10 12:00,9538,,United Kingdom
28992,C538680,BANK CHARGES,Bank Charges,-1,12/13/10 17:10,96692,,United Kingdom
62508,541505,BANK CHARGES,Bank Charges,1,1/18/11 15:58,15,15939.0,United Kingdom
64573,C541653,BANK CHARGES,Bank Charges,-1,1/20/11 11:50,105015,,United Kingdom


In [0]:
df[df.StockCode.str.contains("CRUK")]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [0]:
df[df.StockCode.str.contains("C2")]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
1423,536540,C2,CARRIAGE,1,12/1/10 14:05,50,14911.0,EIRE
12119,537368,C2,CARRIAGE,1,12/6/10 12:40,50,14911.0,EIRE
12452,537378,C2,CARRIAGE,1,12/6/10 13:06,50,14911.0,EIRE
19975,537963,C2,CARRIAGE,1,12/9/10 11:30,50,13369.0,United Kingdom
20016,538002,C2,CARRIAGE,1,12/9/10 11:48,50,14932.0,Channel Islands
34369,539337,C2,CARRIAGE,1,12/17/10 10:46,50,,EIRE
34725,539421,C2,CARRIAGE,1,12/17/10 14:21,50,14016.0,EIRE
37644,539473,C2,CARRIAGE,1,12/19/10 14:24,50,14911.0,EIRE
39777,539688,C2,CARRIAGE,1,12/21/10 11:00,150,12678.0,France
42332,539984,C2,CARRIAGE,1,12/23/10 14:58,50,14911.0,EIRE


In [0]:
df[df.StockCode.str.contains("M")]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
918,536500,46000M,POLYESTER FILLER PAD 45x45cm,10,12/1/10 12:35,155,17377.0,United Kingdom
2141,536562,79302M,"ART LIGHTS,FUNK MONKEY",6,12/1/10 15:08,295,13468.0,United Kingdom
2239,536569,M,Manual,1,12/1/10 15:35,125,16274.0,United Kingdom
2250,536569,M,Manual,1,12/1/10 15:35,1895,16274.0,United Kingdom
2423,536591,90214M,"LETTER ""M"" BLING KEY RING",1,12/1/10 16:57,125,14606.0,United Kingdom
...,...,...,...,...,...,...,...,...
64314,541597,46000M,POLYESTER FILLER PAD 45x45cm,10,1/19/11 16:19,155,14031.0,United Kingdom
64569,C541650,M,Manual,-1,1/20/11 11:44,5444,,United Kingdom
64570,C541651,M,Manual,-1,1/20/11 11:48,12838,,United Kingdom
64663,541658,M,Manual,1,1/20/11 12:16,255,15529.0,United Kingdom


In [0]:
df[df.StockCode.str.contains("m")]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
40383,539736,m,Manual,1,12/21/10 15:18,255,,United Kingdom


In [0]:
df[df.StockCode.str.contains("PADS")]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [0]:
df[df.StockCode.str.contains("POST")]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
45,536370,POST,POSTAGE,3,12/1/10 8:45,18,12583.0,France
386,536403,POST,POSTAGE,1,12/1/10 11:27,15,12791.0,Netherlands
1123,536527,POST,POSTAGE,1,12/1/10 13:04,18,12662.0,Germany
5073,536840,POST,POSTAGE,1,12/2/10 18:27,18,12738.0,Germany
5258,536852,POST,POSTAGE,1,12/3/10 9:51,18,12686.0,France
...,...,...,...,...,...,...,...,...
63402,541567,POST,POSTAGE,3,1/19/11 11:51,18,12681.0,France
63437,541569,POST,POSTAGE,3,1/19/11 12:14,40,13520.0,Switzerland
64419,541607,POST,POSTAGE,1,1/20/11 9:53,2943,,United Kingdom
64522,541631,POST,POSTAGE,6,1/20/11 10:48,18,12637.0,France


## Analyse Description
- contains nan
- items are upper case
- unspecified additional info is lowercase (faulty, mixed up, amazon, thrown away-can't sale)
  - not consistently formatted
  - .....amazon and check exists as upper and lower case..?? is there more overlap??

In [0]:
for elem in df.Description.unique():
  if type(elem) == str and elem.replace(" ","").isupper() and len(elem)<10:
    print(elem)

POSTAGE
CARRIAGE
BINGO SET
SPACE OWL
SOMBRERO 
SAMPLES


In [0]:
for elem in df.Description.unique():
  if type(elem) == str and elem.replace(" ","").islower():
    print(elem)

amazon
check
damages
faulty
amazon sales
reverse 21/5/10 adjustment
mouldy, thrown away.
found
counted


In [0]:
df[df.Description=="wet"]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


## Analyse Quantity
- There are negative quantities

In [0]:
df.Quantity.describe()

count    65534.000000
mean         8.363231
std        413.697637
min     -74215.000000
25%          1.000000
50%          2.000000
75%          8.000000
max      74215.000000
Name: Quantity, dtype: float64

In [0]:
df[df.Quantity<0].shape, df[df.Quantity>0].shape, df[df.Quantity==0.0].shape

((1192, 8), (64342, 8), (0, 8))

In [0]:
df[df.Quantity<0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,12/1/10 9:41,275,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,12/1/10 9:49,465,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,12/1/10 10:24,165,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,12/1/10 10:24,029,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,12/1/10 10:24,029,17548.0,United Kingdom
...,...,...,...,...,...,...,...,...
65097,C541693,22636,CHILDS BREAKFAST SET CIRCUS PARADE,-1,1/20/11 17:02,85,14309.0,United Kingdom
65098,C541693,84945,MULTI COLOUR SILVER T-LIGHT HOLDER,-6,1/20/11 17:02,085,14309.0,United Kingdom
65099,C541694,22440,BALLOON WATER BOMB PACK OF 35,-10,1/20/11 17:06,042,17364.0,United Kingdom
65100,C541694,22437,SET OF 9 BLACK SKULL BALLOONS,-10,1/20/11 17:06,085,17364.0,United Kingdom


## Analyse InvoiceDate


In [0]:
#[elem for elem in df.InvoiceDate.unique() if len(elem)<13]

## Analyse UnitPrice
- There are negative unit prices

In [0]:
df.UnitPrice.describe()

count     65534
unique      427
top        1,25
freq       5320
Name: UnitPrice, dtype: object

In [0]:
#df[df.UnitPrice<0].shape, df[df.UnitPrice>0].shape, df[df.UnitPrice==0].shape

In [0]:
#df[df.UnitPrice<0]

In [0]:
df[df.UnitPrice==0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,12/1/2010 11:52,0.0,,United Kingdom
1970,536545,21134,,1,12/1/2010 14:32,0.0,,United Kingdom
1971,536546,22145,,1,12/1/2010 14:33,0.0,,United Kingdom
1972,536547,37509,,1,12/1/2010 14:33,0.0,,United Kingdom
1987,536549,85226A,,1,12/1/2010 14:34,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
536981,581234,72817,,27,12/8/2011 10:33,0.0,,United Kingdom
538504,581406,46000M,POLYESTER FILLER PAD 45x45cm,240,12/8/2011 13:58,0.0,,United Kingdom
538505,581406,46000S,POLYESTER FILLER PAD 40x40cm,300,12/8/2011 13:58,0.0,,United Kingdom
538554,581408,85175,,20,12/8/2011 14:06,0.0,,United Kingdom


## Analyse CustomerID
- Conatins nan entries

In [0]:
[elem for elem in df.CustomerID if type(elem)==str and len(elem)<5]

[]

In [0]:
df[df.CustomerID.isnull()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,12/1/2010 11:52,0.00,,United Kingdom
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,12/1/2010 14:32,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,12/1/2010 14:32,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4,12/1/2010 14:32,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2,12/1/2010 14:32,1.66,,United Kingdom
...,...,...,...,...,...,...,...,...
541536,581498,85099B,JUMBO BAG RED RETROSPOT,5,12/9/2011 10:26,4.13,,United Kingdom
541537,581498,85099C,JUMBO BAG BAROQUE BLACK WHITE,4,12/9/2011 10:26,4.13,,United Kingdom
541538,581498,85150,LADIES & GENTLEMEN METAL SIGN,1,12/9/2011 10:26,4.96,,United Kingdom
541539,581498,85174,S/4 CACTI CANDLES,1,12/9/2011 10:26,10.79,,United Kingdom


## Analyse Country

In [0]:
df.Country.unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

# Data Cleaning
- Missing values in Description field
- Cancelled transactions
- Negative quantities (negative unit prices)
- Missing values in CustomerID


In [0]:
# originally 10624 transactions with negative quantities and 2 transactions with negative unit prices
#df[df.Quantity < 0].shape, df[df.UnitPrice < 0].shape

In [0]:
# originally 9288 cancelled transactions
df[df.InvoiceNo.str.startswith("C")].shape

(1084, 8)

In [0]:
# originally 1454 nans in Description and 135080 nans in CustomerID
for elem in df.columns:
  print(elem)
  print(df[df[elem].isnull()].shape)

InvoiceNo
(0, 8)
StockCode
(0, 8)
Description
(166, 8)
Quantity
(0, 8)
InvoiceDate
(0, 8)
UnitPrice
(0, 8)
CustomerID
(25316, 8)
Country
(0, 8)


In [0]:
# drop nan description rows
df = df[~df.Description.isnull()]
df['Description'] = df['Description'].str.strip()

In [0]:
# drop cancelled transactions
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df[~df.InvoiceNo.str.startswith("C")]
# remaining 474 negative quantities and 133243 nan customer ids

In [0]:
df[df.Quantity<0].Description.unique()

array(['?', 'check', 'damages', 'faulty', 'Dotcom sales',
       'reverse 21/5/10 adjustment', 'mouldy, thrown away.', 'counted',
       'Given away', 'Dotcom'], dtype=object)

In [0]:
# cant do informative association rule analysis with these descriptions, so drop all negative quantities (comprising negative unit prices)
df = df[~df.Quantity<0]
# remaining 132769 nan customer ids

In [0]:
# keep entries with nan customer id, since customer id is not relevant for associatino rules analysis
# insert dummy value instead
df[df.CustomerID.isnull()] = "no_data"

In [0]:
#df.dropna(axis = 0, subset =['InvoiceNo'], inplace = True) 
#df['InvoiceNo'] = df['InvoiceNo'].astype('str')

In [0]:
df.shape

(64266, 8)

In [0]:
df.tail(4)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.1,12680,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,12/9/2011 12:50,4.95,12680,France


# Preprocessing

In [0]:
# Transactions done in France 
basket_france = (df[df['Country'] =="France"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))
basket_france

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,15CM CHRISTMAS GLASS BALL 20 LIGHTS,16 PIECE CUTLERY SET PANTRY DESIGN,18PC WOODEN CUTLERY SET DISPOSABLE,20 DOLLY PEGS RETROSPOT,200 RED + WHITE BENDY STRAWS,3 HOOK HANGER MAGIC GARDEN,3 PIECE SPACEBOY COOKIE CUTTER SET,3 RAFFIA RIBBONS 50'S CHRISTMAS,3 STRIPEY MICE FELTCRAFT,3 TIER CAKE TIN RED AND CREAM,3 TRADITIONAl BISCUIT CUTTERS SET,36 DOILIES DOLLY GIRL,36 DOILIES VINTAGE CHRISTMAS,36 FOIL HEART CAKE CASES,36 FOIL STAR CAKE CASES,36 PENCILS TUBE RED RETROSPOT,36 PENCILS TUBE SKULLS,36 PENCILS TUBE WOODLAND,3D HEARTS HONEYCOMB PAPER GARLAND,3D TRADITIONAL CHRISTMAS STICKERS,3D VINTAGE CHRISTMAS STICKERS,4 IVORY DINNER CANDLES SILVER FLOCK,4 PINK DINNER CANDLE SILVER FLOCK,4 TRADITIONAL SPINNING TOPS,5 HOOK HANGER MAGIC TOADSTOOL,5 HOOK HANGER RED MAGIC TOADSTOOL,50'S CHRISTMAS GIFT BAG LARGE,6 GIFT TAGS 50'S CHRISTMAS,6 GIFT TAGS VINTAGE CHRISTMAS,6 RIBBONS EMPIRE,...,WOODLAND CHARLOTTE BAG,WOODLAND DESIGN COTTON TOTE BAG,WOODLAND LARGE BLUE FELT HEART,WOODLAND LARGE PINK FELT HEART,WOODLAND LARGE RED FELT HEART,WOODLAND MINI BACKPACK,WOODLAND PARTY BAG + STICKER SET,WOODLAND SMALL BLUE FELT HEART,WOODLAND SMALL PINK FELT HEART,WOODLAND SMALL RED FELT HEART,WOODLAND STORAGE BOX LARGE,WOODLAND STORAGE BOX SMALL,WORLD WAR 2 GLIDERS ASSTD DESIGNS,WRAP VINTAGE DOILY,WRAP 50'S CHRISTMAS,WRAP ALPHABET DESIGN,WRAP CAROUSEL,WRAP CHRISTMAS VILLAGE,WRAP CIRCUS PARADE,WRAP DOILEY DESIGN,WRAP DOLLY GIRL,WRAP ENGLISH ROSE,WRAP GINGHAM ROSE,WRAP GREEN PEARS,WRAP I LOVE LONDON,WRAP PAISLEY PARK,WRAP PINK FAIRY CAKES,WRAP POPPIES DESIGN,WRAP RED APPLES,WRAP RED VINTAGE DOILY,WRAP SUKI AND FRIENDS,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
536370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
basket_sets_france = basket_france.applymap(encode_units)
basket_sets_france.drop('POSTAGE', inplace=True, axis=1)
basket_sets_france

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,15CM CHRISTMAS GLASS BALL 20 LIGHTS,16 PIECE CUTLERY SET PANTRY DESIGN,18PC WOODEN CUTLERY SET DISPOSABLE,20 DOLLY PEGS RETROSPOT,200 RED + WHITE BENDY STRAWS,3 HOOK HANGER MAGIC GARDEN,3 PIECE SPACEBOY COOKIE CUTTER SET,3 RAFFIA RIBBONS 50'S CHRISTMAS,3 STRIPEY MICE FELTCRAFT,3 TIER CAKE TIN RED AND CREAM,3 TRADITIONAl BISCUIT CUTTERS SET,36 DOILIES DOLLY GIRL,36 DOILIES VINTAGE CHRISTMAS,36 FOIL HEART CAKE CASES,36 FOIL STAR CAKE CASES,36 PENCILS TUBE RED RETROSPOT,36 PENCILS TUBE SKULLS,36 PENCILS TUBE WOODLAND,3D HEARTS HONEYCOMB PAPER GARLAND,3D TRADITIONAL CHRISTMAS STICKERS,3D VINTAGE CHRISTMAS STICKERS,4 IVORY DINNER CANDLES SILVER FLOCK,4 PINK DINNER CANDLE SILVER FLOCK,4 TRADITIONAL SPINNING TOPS,5 HOOK HANGER MAGIC TOADSTOOL,5 HOOK HANGER RED MAGIC TOADSTOOL,50'S CHRISTMAS GIFT BAG LARGE,6 GIFT TAGS 50'S CHRISTMAS,6 GIFT TAGS VINTAGE CHRISTMAS,6 RIBBONS EMPIRE,...,WOODLAND CHARLOTTE BAG,WOODLAND DESIGN COTTON TOTE BAG,WOODLAND LARGE BLUE FELT HEART,WOODLAND LARGE PINK FELT HEART,WOODLAND LARGE RED FELT HEART,WOODLAND MINI BACKPACK,WOODLAND PARTY BAG + STICKER SET,WOODLAND SMALL BLUE FELT HEART,WOODLAND SMALL PINK FELT HEART,WOODLAND SMALL RED FELT HEART,WOODLAND STORAGE BOX LARGE,WOODLAND STORAGE BOX SMALL,WORLD WAR 2 GLIDERS ASSTD DESIGNS,WRAP VINTAGE DOILY,WRAP 50'S CHRISTMAS,WRAP ALPHABET DESIGN,WRAP CAROUSEL,WRAP CHRISTMAS VILLAGE,WRAP CIRCUS PARADE,WRAP DOILEY DESIGN,WRAP DOLLY GIRL,WRAP ENGLISH ROSE,WRAP GINGHAM ROSE,WRAP GREEN PEARS,WRAP I LOVE LONDON,WRAP PAISLEY PARK,WRAP PINK FAIRY CAKES,WRAP POPPIES DESIGN,WRAP RED APPLES,WRAP RED VINTAGE DOILY,WRAP SUKI AND FRIENDS,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
536370,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
536852,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
536974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
537065,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
537463,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580986,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
581001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
581171,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
581279,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Apriori Algorithm

In [0]:
# using the mlxtend library http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
from mlxtend.frequent_patterns import apriori, association_rules 

In [0]:
import time
start = time.time()
frequent_itemsets = apriori(basket_sets_france, min_support=0.07, use_colnames=True)
end = time.time()
print("Time Taken in seconds is:")
print(end-start)
frequent_itemsets

Time Taken in seconds is:
0.045038700103759766


Unnamed: 0,support,itemsets
0,0.071979,(4 TRADITIONAL SPINNING TOPS)
1,0.097686,(ALARM CLOCK BAKELIKE GREEN)
2,0.102828,(ALARM CLOCK BAKELIKE PINK)
3,0.095116,(ALARM CLOCK BAKELIKE RED)
4,0.077121,(BAKING SET 9 PIECE RETROSPOT)
5,0.071979,(CHILDRENS CUTLERY DOLLY GIRL)
6,0.100257,(DOLLY GIRL LUNCH BOX)
7,0.097686,(JUMBO BAG RED RETROSPOT)
8,0.077121,(JUMBO BAG WOODLAND ANIMALS)
9,0.125964,(LUNCH BAG APPLE DESIGN)


In [0]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(rules.shape)
rules.head(10)

(26, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE GREEN),0.102828,0.097686,0.07455,0.725,7.421711,0.064505,3.28114
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE PINK),0.097686,0.102828,0.07455,0.763158,7.421711,0.064505,3.788061
2,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.095116,0.097686,0.079692,0.837838,8.576814,0.0704,5.564267
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.097686,0.095116,0.079692,0.815789,8.576814,0.0704,4.912229
4,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE RED),0.102828,0.095116,0.07455,0.725,7.622297,0.06477,3.290488
5,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE PINK),0.095116,0.102828,0.07455,0.783784,7.622297,0.06477,4.149422
6,(SPACEBOY LUNCH BOX),(DOLLY GIRL LUNCH BOX),0.125964,0.100257,0.071979,0.571429,5.699634,0.059351,2.0994
7,(DOLLY GIRL LUNCH BOX),(SPACEBOY LUNCH BOX),0.100257,0.125964,0.071979,0.717949,5.699634,0.059351,3.098855
8,(PLASTERS IN TIN CIRCUS PARADE),(PLASTERS IN TIN SPACEBOY),0.169666,0.136247,0.089974,0.530303,3.892224,0.066858,1.838958
9,(PLASTERS IN TIN SPACEBOY),(PLASTERS IN TIN CIRCUS PARADE),0.136247,0.169666,0.089974,0.660377,3.892224,0.066858,2.444873


**Filter the Dataframe for a high lift and high confidence**

In [0]:
rules[ (rules['lift'] >= 6) & (rules['confidence'] >= 0.8) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.095116,0.097686,0.079692,0.837838,8.576814,0.0704,5.564267
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.097686,0.095116,0.079692,0.815789,8.576814,0.0704,4.912229
18,(SET/6 RED SPOTTY PAPER CUPS),(SET/6 RED SPOTTY PAPER PLATES),0.138817,0.128535,0.123393,0.888889,6.915556,0.10555,7.843188
19,(SET/6 RED SPOTTY PAPER PLATES),(SET/6 RED SPOTTY PAPER CUPS),0.128535,0.138817,0.123393,0.96,6.915556,0.10555,21.529563
20,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102828,0.128535,0.100257,0.975,7.5855,0.08704,34.858612
21,"(SET/6 RED SPOTTY PAPER CUPS, SET/6 RED SPOTTY...",(SET/20 RED RETROSPOT PAPER NAPKINS),0.123393,0.133676,0.100257,0.8125,6.078125,0.083762,4.620394
22,"(SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...",(SET/6 RED SPOTTY PAPER CUPS),0.102828,0.138817,0.100257,0.975,7.023611,0.085983,34.447301


# TBAR Algorithm

**Transform the basket_sets_france so that for each row in the transaction table all columns with 0 would be omitted and for all columns with value 1 the column number would be in that row to fit the TBAR algorithm input:**

In [0]:
rowlist = []
for row in range(basket_sets_france.shape[0]):
  collist = []
  for col in range(basket_sets_france.shape[1]):
    if basket_sets_france.iloc[row][col] == 1:
        collist += [int(col)]
  if len(collist) > 0:
    rowlist += [collist]

In [0]:
import csv

with open("transformed.txt", "w", newline="") as f:
    writer = csv.writer(f, delimiter=' ')
    writer.writerows(rowlist)

code from https://github.com/AVINASH793/Apriori-Algorithm-using-Hashtree/blob/master/Apriori_Hashtree.py


In [0]:
import itertools
import time

#take input of file name and minimum support count
print("Enter the filename:")
filename = input()
print("Enter the minimum support count:")
min_support = int(input())

#read data from txt file
with open(filename) as f:
    content = f.readlines()

content = [x.strip() for x in content]

Transaction = []                  #to store transaction
Frequent_items_value = {}         #to store all frequent item sets

#to fill values in transaction from txt file
for i in range(0,len(content)):
    Transaction.append(content[i].split())

#function to get frequent one itemset
def frequent_one_item(Transaction,min_support):
    candidate1 = {}

    for i in range(0,len(Transaction)):
        for j in range(0,len(Transaction[i])):
            if Transaction[i][j] not in candidate1:
                candidate1[Transaction[i][j]] = 1
            else:
                candidate1[Transaction[i][j]] += 1

    frequentitem1 = []                      #to get frequent 1 itemsets with minimum support count
    for value in candidate1:
        if candidate1[value] >= min_support:
            frequentitem1 = frequentitem1 + [[value]]
            Frequent_items_value[tuple(value)] = candidate1[value]

    return frequentitem1

values = frequent_one_item(Transaction,min_support)
#print(values)
#print(Frequent_items_value)


# to remove infrequent 1 itemsets from transaction
Transaction1 = []
for i in range(0,len(Transaction)):
    list_val = []
    for j in range(0,len(Transaction[i])):
        if [Transaction[i][j]] in values:
            list_val.append(Transaction[i][j])
    Transaction1.append(list_val)


#class of Hash node
class Hash_node:
    def __init__(self):
        self.children = {}           #pointer to its children
        self.Leaf_status = True      #to know the status whether current node is leaf or not
        self.bucket = {}             #contains itemsets in bucket

#class of constructing and getting hashtree
class HashTree:
    # class constructor
    def __init__(self, max_leaf_count, max_child_count):
        self.root = Hash_node()
        self.max_leaf_count = max_leaf_count
        self.max_child_count = max_child_count
        self.frequent_itemsets = []

    # function to recursive insertion to make hashtree
    def recursively_insert(self, node, itemset, index, count):
        if index == len(itemset):
            if itemset in node.bucket:
                node.bucket[itemset] += count
            else:
                node.bucket[itemset] = count
            return

        if node.Leaf_status:                             #if node is leaf
            if itemset in node.bucket:
                node.bucket[itemset] += count
            else:
                node.bucket[itemset] = count
            if len(node.bucket) == self.max_leaf_count:  #if bucket capacity increases
                for old_itemset, old_count in node.bucket.items():

                    hash_key = self.hash_function(old_itemset[index])  #do hashing on next index
                    if hash_key not in node.children:
                        node.children[hash_key] = Hash_node()
                    self.recursively_insert(node.children[hash_key], old_itemset, index + 1, old_count)
                #since no more requirement of this bucket
                del node.bucket
                node.Leaf_status = False
        else:                                            #if node is not leaf
            hash_key = self.hash_function(itemset[index])
            if hash_key not in node.children:
                node.children[hash_key] = Hash_node()
            self.recursively_insert(node.children[hash_key], itemset, index + 1, count)

    def insert(self, itemset):
        itemset = tuple(itemset)
        self.recursively_insert(self.root, itemset, 0, 0)

    # to add support to candidate itemsets. Transverse the Tree and find the bucket in which this itemset is present.
    def add_support(self, itemset):
        Transverse_HNode = self.root
        itemset = tuple(itemset)
        index = 0
        while True:
            if Transverse_HNode.Leaf_status:
                if itemset in Transverse_HNode.bucket:    #found the itemset in this bucket
                    Transverse_HNode.bucket[itemset] += 1 #increment the count of this itemset.
                break
            hash_key = self.hash_function(itemset[index])
            if hash_key in Transverse_HNode.children:
                Transverse_HNode = Transverse_HNode.children[hash_key]
            else:
                break
            index += 1

    # to transverse the hashtree to get frequent itemsets with minimum support count
    def get_frequent_itemsets(self, node, support_count,frequent_itemsets):
        if node.Leaf_status:
            for key, value in node.bucket.items():
                if value >= support_count:                       #if it satisfies the condition
                    frequent_itemsets.append(list(key))          #then add it to frequent itemsets.
                    Frequent_items_value[key] = value
            return

        for child in node.children.values():
            self.get_frequent_itemsets(child, support_count,frequent_itemsets)

    # hash function for making HashTree
    def hash_function(self, val):
        return int(val) % self.max_child_count

#To generate hash tree from candidate itemsets
def generate_hash_tree(candidate_itemsets, max_leaf_count, max_child_count):
    htree = HashTree(max_child_count, max_leaf_count)             #create instance of HashTree
    for itemset in candidate_itemsets:
        htree.insert(itemset)                                     #to insert itemset into Hashtree
    return htree

#to generate subsets of itemsets of size k
def generate_k_subsets(dataset, length):
    subsets = []
    for itemset in dataset:
        subsets.extend(map(list, itertools.combinations(itemset, length)))
    return subsets

def subset_generation(ck_data,l):
    return map(list,set(itertools.combinations(ck_data,l)))

#apriori generate function to generate ck
def apriori_generate(dataset,k):
    ck = []
    #join step
    lenlk = len(dataset)
    for i in range(lenlk):
        for j in range(i+1,lenlk):
            L1 = list(dataset[i])[:k - 2]
            L2 = list(dataset[j])[:k - 2]
            if L1 == L2:
                ck.append(sorted(list(set(dataset[i]) | set(dataset[j]))))

    #prune step
    final_ck = []
    for candidate in ck:
        all_subsets = list(subset_generation(set(candidate), k - 1))
        found = True
        for i in range(len(all_subsets)):
            value = list(sorted(all_subsets[i]))
            if value not in dataset:
                found = False
        if found == True:
            final_ck.append(candidate)

    return ck,final_ck

def generateL(ck,min_support):
    support_ck = {}
    for val in Transaction1:
        for val1 in ck:
            value = set(val)
            value1 = set(val1)

            if value1.issubset(value):
                if tuple(val1) not in support_ck:
                    support_ck[tuple(val1)] = 1
                else:
                    support_ck[tuple(val1)] += 1
    frequent_item = []
    for item_set in support_ck:
        if support_ck[item_set] >= min_support:
            frequent_item.append(sorted(list(item_set)))
            Frequent_items_value[item_set] = support_ck[item_set]

    return frequent_item

# main apriori algorithm function
def apriori(L1,min_support):
    k = 2;
    L = []
    L.append(0)
    L.append(L1)
    print("enter max_leaf_count")              #maximum number of items in bucket i.e. bucket capacity of each node
    max_leaf_count = int(input())
    print("enter max_child_count")             #maximum number of child you want for a node
    max_child_count = int(input())

    start = time.time()
    while(len(L[k-1])>0):
        ck,final_ck = apriori_generate(L[k-1],k)                 #to generate candidate itemsets
        #print("C%d" %(k))
        #print(final_ck)
        h_tree = generate_hash_tree(ck,max_leaf_count,max_child_count)       #to generate hashtree
        if (k > 2):
            while(len(L[k-1])>0):
                l = generateL(final_ck, min_support)
                L.append(l)
                #print("Frequent %d item" % (k))
                #print(l)
                k = k + 1
                ck, final_ck = apriori_generate(L[k - 1], k)
                #print("C%d" % (k))
                #print(final_ck)
            break
        k_subsets = generate_k_subsets(Transaction1,k)                  #to generate subsets of each transaction
        for subset in k_subsets:
            h_tree.add_support(subset)                                  #to add support count to itemsets in hashtree
        lk = []
        h_tree.get_frequent_itemsets(h_tree.root,min_support,lk)                  #to get frequent itemsets
        #print("Frequent %d item" %(k))
        #print(lk)
        L.append(lk)
        k = k + 1
    end = time.time()
    return L,(end-start)


L_value,time_taken = apriori(values,min_support)
print("Time Taken is:")
print(time_taken)
#print("final L_value")
#print(L_value)
print("All frequent itemsets with their support count:")
print(Frequent_items_value)

Enter the filename:
transformed.txt
Enter the minimum support count:
28
enter max_leaf_count
1
enter max_child_count
2
Time Taken is:
0.02284526824951172
All frequent itemsets with their support count:
{('5', '9'): 38, ('6', '2'): 40, ('6', '3'): 37, ('7', '4', '6'): 41, ('1', '0', '4', '2'): 70, ('1', '1', '0', '5'): 62, ('1', '3', '0', '9'): 49, ('6', '9', '2'): 60, ('6', '9', '9'): 46, ('7', '0', '1'): 56, ('1', '0', '1', '6'): 54, ('1', '3', '5', '4'): 37, ('9', '4', '0'): 53, ('9', '4', '3'): 67, ('1', '0', '4', '8'): 49, ('1', '2', '3', '0'): 30, ('1', '3', '3', '2'): 48, ('8', '3', '7'): 40, ('1', '5', '0', '2'): 29, ('6', '9', '4'): 47, ('3', '6', '1'): 39, ('1', '2', '5', '1'): 54, ('1', '2', '5', '2'): 50, ('3', '3'): 28, ('9', '3', '8'): 66, ('6', '8', '9'): 33, ('9', '4', '1'): 32, ('1', '2', '3', '2'): 52, ('1', '3', '0', '4'): 28, ('8', '5', '2'): 32, ('1', '1', '0', '3'): 42, ('1', '0', '1'): 30, ('6', '2', '2'): 38, ('1', '0', '1', '2'): 38, ('1', '0', '2', '0'): 28, ('

**A support of 28 corresponds to the support of 0.07% for apriori**<br>
The number of frequent itemsets is the same

In [0]:
len(Frequent_items_value.keys())

50

But the exact frequent item sets differ

In [0]:
list(Frequent_items_value.keys())

[('5', '9'),
 ('6', '2'),
 ('6', '3'),
 ('7', '4', '6'),
 ('1', '0', '4', '2'),
 ('1', '1', '0', '5'),
 ('1', '3', '0', '9'),
 ('6', '9', '2'),
 ('6', '9', '9'),
 ('7', '0', '1'),
 ('1', '0', '1', '6'),
 ('1', '3', '5', '4'),
 ('9', '4', '0'),
 ('9', '4', '3'),
 ('1', '0', '4', '8'),
 ('1', '2', '3', '0'),
 ('1', '3', '3', '2'),
 ('8', '3', '7'),
 ('1', '5', '0', '2'),
 ('6', '9', '4'),
 ('3', '6', '1'),
 ('1', '2', '5', '1'),
 ('1', '2', '5', '2'),
 ('3', '3'),
 ('9', '3', '8'),
 ('6', '8', '9'),
 ('9', '4', '1'),
 ('1', '2', '3', '2'),
 ('1', '3', '0', '4'),
 ('8', '5', '2'),
 ('1', '1', '0', '3'),
 ('1', '0', '1'),
 ('6', '2', '2'),
 ('1', '0', '1', '2'),
 ('1', '0', '2', '0'),
 ('1', '0', '7', '3'),
 ('6', '3', '0'),
 ('6', '8', '6'),
 ('2', '5', '1'),
 ('9', '6', '8'),
 ('59', '62'),
 ('59', '63'),
 ('62', '63'),
 ('940', '943'),
 ('938', '940'),
 ('938', '943'),
 ('1251', '1252'),
 ('1232', '1251'),
 ('1232', '1252'),
 ('1232', '1251', '1252')]

**Add Titles back to the column positions**

In [0]:
frequent_items_cols = [tuple([basket_sets_france.columns[int(y)] for y in list(x)]) for x in list(Frequent_items_value.keys())]

In [0]:
frequent_items_sup = [(Frequent_items_value.get(x)/392.0) for x in list(Frequent_items_value.keys())]

In [0]:
d = {'support':frequent_items_sup,'itemsets':frequent_items_cols}
frequent_itemsets2 = pd.DataFrame(d)
frequent_itemsets2

Unnamed: 0,support,itemsets
0,0.096939,"(12 PENCILS SMALL TUBE RED RETROSPOT, 12 PENCI..."
1,0.102041,"(12 PENCILS SMALL TUBE SKULL, 12 EGG HOUSE PAI..."
2,0.094388,"(12 PENCILS SMALL TUBE SKULL, 12 MESSAGE CARDS..."
3,0.104592,"(12 PENCILS TALL TUBE POSY, 12 PENCIL SMALL TU..."
4,0.178571,"(12 COLOURED PARTY BALLOONS, 10 COLOUR SPACEBO..."
5,0.158163,"(12 COLOURED PARTY BALLOONS, 12 COLOURED PARTY..."
6,0.125,"(12 COLOURED PARTY BALLOONS, 12 MESSAGE CARDS ..."
7,0.153061,"(12 PENCILS SMALL TUBE SKULL, 12 PENCILS TALL ..."
8,0.117347,"(12 PENCILS SMALL TUBE SKULL, 12 PENCILS TALL ..."
9,0.142857,"(12 PENCILS TALL TUBE POSY, 10 COLOUR SPACEBOY..."


In [0]:
rules = association_rules(frequent_itemsets2, metric="lift", min_threshold=1, support_only=True)
print(rules.shape)
rules.head(10)

(0, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
