In [1]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
raw_df = pd.read_csv("market_basket/market_basket_1.csv", sep=";")

  raw_df = pd.read_csv("market_basket/market_basket_1.csv", sep=";")


In [3]:
raw_df.head(2)

Unnamed: 0,BillNo,Itemname,Quantity,Date,Price,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,01.12.2010 08:26,255,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,01.12.2010 08:26,339,17850.0,United Kingdom


In [4]:
# From the above, we see the relevant columns for forming market basket analysis are:
# 1. BillNo
# 2. Itemname
# 3. Date
# 4. Country

In [5]:
# Let us drop unwanted columns
raw_df.drop(columns=["Quantity", "Price", "CustomerID"], axis=1, inplace=True)

In [6]:
country_names = raw_df.Country.unique().tolist()

In [7]:
# For demo, let us select UK

In [8]:
uk_df = raw_df[raw_df["Country"] == "United Kingdom"].copy()

In [9]:
uk_df.head(2)

Unnamed: 0,BillNo,Itemname,Date,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,01.12.2010 08:26,United Kingdom
1,536365,WHITE METAL LANTERN,01.12.2010 08:26,United Kingdom


In [10]:
uk_df.insert(0,"Transaction",uk_df["BillNo"].astype("str") + " | " + uk_df["Date"])

In [11]:
# Remove redundant cols
uk_df.drop(["BillNo", "Date", "Country"], axis=1, inplace=True)

In [12]:
uk_df.head(2)

Unnamed: 0,Transaction,Itemname
0,536365 | 01.12.2010 08:26,WHITE HANGING HEART T-LIGHT HOLDER
1,536365 | 01.12.2010 08:26,WHITE METAL LANTERN


In [13]:
uk_df.Transaction = uk_df.Transaction.astype("str")
uk_df.Itemname = uk_df.Itemname.astype("str")

In [14]:
unique_transactions = uk_df.Transaction.unique().tolist()

In [15]:
unique_transactions[0]

'536365 | 01.12.2010 08:26'

In [16]:
transaction_list = []

In [17]:
for transaction in unique_transactions:
    items_in_transaction = uk_df[uk_df["Transaction"] == transaction]["Itemname"].tolist()
    transaction_list.append(items_in_transaction)

KeyboardInterrupt: 

In [18]:
transaction_list

[['WHITE HANGING HEART T-LIGHT HOLDER',
  'WHITE METAL LANTERN',
  'CREAM CUPID HEARTS COAT HANGER',
  'KNITTED UNION FLAG HOT WATER BOTTLE',
  'RED WOOLLY HOTTIE WHITE HEART.',
  'SET 7 BABUSHKA NESTING BOXES',
  'GLASS STAR FROSTED T-LIGHT HOLDER'],
 ['HAND WARMER UNION JACK', 'HAND WARMER RED POLKA DOT'],
 ['ASSORTED COLOUR BIRD ORNAMENT',
  "POPPY'S PLAYHOUSE BEDROOM",
  "POPPY'S PLAYHOUSE KITCHEN",
  'FELTCRAFT PRINCESS CHARLOTTE DOLL',
  'IVORY KNITTED MUG COSY',
  'BOX OF 6 ASSORTED COLOUR TEASPOONS',
  'BOX OF VINTAGE JIGSAW BLOCKS',
  'BOX OF VINTAGE ALPHABET BLOCKS',
  'HOME BUILDING BLOCK WORD',
  'LOVE BUILDING BLOCK WORD',
  'RECIPE BOX WITH METAL HEART',
  'DOORMAT NEW ENGLAND'],
 ['JAM MAKING SET WITH JARS',
  'RED COAT RACK PARIS FASHION',
  'YELLOW COAT RACK PARIS FASHION',
  'BLUE COAT RACK PARIS FASHION'],
 ['BATH BUILDING BLOCK WORD'],
 ["PAPER CHAIN KIT 50'S CHRISTMAS"],
 ['HAND WARMER RED POLKA DOT', 'HAND WARMER UNION JACK'],
 ['WHITE HANGING HEART T-LIGHT HOLDER

In [19]:
encode = TransactionEncoder()
encoded_array = encode.fit_transform(transaction_list)

In [20]:
encoded_array

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [21]:
encode.columns_

['*Boombox Ipod Classic',
 '*USB Office Mirror Ball',
 '10 COLOUR SPACEBOY PEN',
 '12 COLOURED PARTY BALLOONS',
 '12 DAISY PEGS IN WOOD BOX',
 '12 EGG HOUSE PAINTED WOOD',
 '12 IVORY ROSE PEG PLACE SETTINGS',
 '12 MESSAGE CARDS WITH ENVELOPES',
 '12 PENCIL SMALL TUBE WOODLAND',
 '12 PENCILS SMALL TUBE RED RETROSPOT',
 '12 PENCILS SMALL TUBE SKULL',
 '12 PENCILS TALL TUBE POSY',
 '12 PENCILS TALL TUBE RED RETROSPOT',
 '12 PENCILS TALL TUBE SKULLS',
 '12 PENCILS TALL TUBE WOODLAND',
 '12 PINK HEN+CHICKS IN BASKET',
 '12 PINK ROSE PEG PLACE SETTINGS',
 '12 RED ROSE PEG PLACE SETTINGS',
 '15 PINK FLUFFY CHICKS IN BOX',
 '15CM CHRISTMAS GLASS BALL 20 LIGHTS',
 '2 DAISIES HAIR COMB',
 '2 PICTURE BOOK EGGS EASTER BUNNY',
 '2 PICTURE BOOK EGGS EASTER CHICKS',
 '2 PICTURE BOOK EGGS EASTER DUCKS',
 '20 DOLLY PEGS RETROSPOT',
 '200 BENDY SKULL STRAWS',
 '200 RED + WHITE BENDY STRAWS',
 '3 GARDENIA MORRIS BOXED CANDLES',
 '3 HEARTS HANGING DECORATION RUSTIC',
 '3 HOOK HANGER MAGIC GARDEN',
 '3 HOO

In [22]:
df = pd.DataFrame(data=encoded_array, columns=encode.columns_)

In [23]:
df

Unnamed: 0,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,...,showroom,throw away,thrown away,thrown away-can't sell,thrown away-can't sell.,wrong barcode,wrong barcode (22467),wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5012,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5013,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5014,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5015,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [24]:
model = apriori(df, min_support=0.03, max_len=5, use_colnames=True)

In [25]:
rule = association_rules(model, metric='lift', min_threshold=1)

In [27]:
rule.sort_values(by="lift", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.05302,0.054614,0.040462,0.763158,13.973588,0.037567,3.991628
4,(GREEN REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.054614,0.05302,0.040462,0.740876,13.973588,0.037567,3.654544
1,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.047638,0.050229,0.032888,0.690377,13.744521,0.030495,3.067503
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.050229,0.047638,0.032888,0.654762,13.744521,0.030495,2.758566
26,(WOODEN PICTURE FRAME WHITE FINISH),(WOODEN FRAME ANTIQUE WHITE),0.05601,0.063384,0.034283,0.6121,9.656931,0.030733,2.414578
27,(WOODEN FRAME ANTIQUE WHITE),(WOODEN PICTURE FRAME WHITE FINISH),0.063384,0.05601,0.034283,0.540881,9.656931,0.030733,2.056089
6,(HEART OF WICKER SMALL),(HEART OF WICKER LARGE),0.086905,0.066773,0.041858,0.481651,7.213269,0.036055,1.800385
7,(HEART OF WICKER LARGE),(HEART OF WICKER SMALL),0.066773,0.086905,0.041858,0.626866,7.213269,0.036055,2.447096
15,(JUMBO BAG RED RETROSPOT),(JUMBO BAG BAROQUE BLACK WHITE),0.093482,0.051027,0.034084,0.364606,7.145414,0.029314,1.493519
14,(JUMBO BAG BAROQUE BLACK WHITE),(JUMBO BAG RED RETROSPOT),0.051027,0.093482,0.034084,0.667969,7.145414,0.029314,2.730218
