In [1]:
# Langkah 1 : Mengimpor library yang diperlukan
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules #import fungsi apriori dan asosiasi rules

In [2]:
# Step 2: Load and explore the data
# Read the workbook; sheet_name=0 selects the first worksheet (the pandas default).
data = pd.read_excel("online_retail_II.xlsx", sheet_name=0)
# Preview the first five rows.
data.head(5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [3]:
# Inspect the column names of the loaded data
data.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

In [4]:
# List the distinct countries in which transactions were recorded
data["Country"].unique()

array(['United Kingdom', 'France', 'USA', 'Belgium', 'Australia', 'EIRE',
       'Germany', 'Portugal', 'Japan', 'Denmark', 'Nigeria',
       'Netherlands', 'Poland', 'Spain', 'Channel Islands', 'Italy',
       'Cyprus', 'Greece', 'Norway', 'Austria', 'Sweden',
       'United Arab Emirates', 'Finland', 'Switzerland', 'Unspecified',
       'Malta', 'Bahrain', 'RSA', 'Bermuda', 'Hong Kong', 'Singapore',
       'Thailand', 'Israel', 'Lithuania', 'West Indies', 'Lebanon',
       'Korea', 'Brazil', 'Canada', 'Iceland'], dtype=object)

In [5]:
# Step 3: Clean the data
# The whole clean-up is expressed as one chain that produces a new frame:
#   1. strip leading/trailing whitespace from the product descriptions
#   2. drop rows that have no invoice number
#   3. cast the invoice number to string so it can be pattern-matched
#   4. drop every invoice whose number contains 'C' (credit transactions)
data = (
    data
    .assign(Description=data['Description'].str.strip())
    .dropna(axis=0, subset=['Invoice'])
    .assign(Invoice=lambda frame: frame['Invoice'].astype('str'))
    .loc[lambda frame: ~frame['Invoice'].str.contains('C')]
)

In [6]:
# Step 4: Split the data by transaction country
# Four sample countries are selected and one basket matrix is built for each.
def build_basket(transactions, country):
    """Return the invoice-by-product quantity matrix for one country.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Frame with 'Country', 'Invoice', 'Description' and 'Quantity' columns.
    country : str
        Value of the 'Country' column to select.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Invoice', one column per 'Description'; each cell holds
        the summed 'Quantity' for that invoice/product pair, 0 where the
        product does not appear on the invoice.
    """
    return (transactions[transactions['Country'] == country]
            .groupby(['Invoice', 'Description'])['Quantity']
            .sum()
            # Pivot descriptions into columns; 'Invoice' stays as the index,
            # so no reset_index()/set_index() round trip is needed.
            .unstack()
            .fillna(0))


# Transactions made in Belgium
basket_Belgium = build_basket(data, "Belgium")

# Transactions made in EIRE
basket_EIRE = build_basket(data, "EIRE")

# Transactions made in Germany
basket_Germany = build_basket(data, "Germany")

# Transactions made in France
basket_France = build_basket(data, "France")

# Example: display the Belgium basket
basket_Belgium

Description,10 COLOUR SPACEBOY PEN,12 EGG HOUSE PAINTED WOOD,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE RED SPOTTY,12 PENCILS TALL TUBE RED SPOTTY,12 PENCILS TALL TUBE SKULLS,12 PENCILS TALL TUBE WOODLAND,36 DOILIES DOLLY GIRL,36 DOILIES SPACEBOY DESIGN,36 FOIL HEART CAKE CASES,...,WOODLAND PARTY BAG + STICKER SET,WOODLAND STORAGE BOX LARGE,WOODLAND STORAGE BOX SMALL,WOODLAND WATER TRANSFER TATTOOS,WRAP ENGLISH ROSE,WRAP GREEN PEARS,WRAP RED APPLES,WRAP SUKI AND FRIENDS,WRAP WEDDING DAY,YULETIDE IMAGES S/6 PAPER BOXES
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
491169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
493899,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
494511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
494513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
495719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497210,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Step 5: Encode the data ("hot encoding")
# The basket quantities are reduced to 1/0 flags (1 = product appears on the
# invoice, 0 = it does not) before frequent itemsets are mined.
def hot_encode(x):
    """Map a summed quantity to a 0/1 purchase flag.

    Parameters
    ----------
    x : int or float
        Summed quantity of one product on one invoice.

    Returns
    -------
    int
        1 when ``x >= 1`` (purchased), otherwise 0. Values that are zero,
        negative, strictly between 0 and 1, or NaN all map to 0, so the
        function never falls through and returns ``None``.
    """
    # Any comparison with NaN is False, so NaN also lands in the 0 branch.
    return 1 if x >= 1 else 0

In [8]:
# Encode the basket of each country to 0/1 flags.
# hot_encode is applied element-wise through Series.map on every column;
# DataFrame.applymap is avoided because it is deprecated since pandas 2.1
# and emits a FutureWarning there.
basket_Belgium = basket_Belgium.apply(lambda column: column.map(hot_encode))

basket_EIRE = basket_EIRE.apply(lambda column: column.map(hot_encode))

basket_Germany = basket_Germany.apply(lambda column: column.map(hot_encode))

basket_France = basket_France.apply(lambda column: column.map(hot_encode))

# The original cell left the last encoded frame (France) bound to this name;
# keep it defined in case anything else refers to it.
basket_encoded = basket_France

# Example: display the encoded Belgium basket
basket_Belgium

Description,10 COLOUR SPACEBOY PEN,12 EGG HOUSE PAINTED WOOD,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE RED SPOTTY,12 PENCILS TALL TUBE RED SPOTTY,12 PENCILS TALL TUBE SKULLS,12 PENCILS TALL TUBE WOODLAND,36 DOILIES DOLLY GIRL,36 DOILIES SPACEBOY DESIGN,36 FOIL HEART CAKE CASES,...,WOODLAND PARTY BAG + STICKER SET,WOODLAND STORAGE BOX LARGE,WOODLAND STORAGE BOX SMALL,WOODLAND WATER TRANSFER TATTOOS,WRAP ENGLISH ROSE,WRAP GREEN PEARS,WRAP RED APPLES,WRAP SUKI AND FRIENDS,WRAP WEDDING DAY,YULETIDE IMAGES S/6 PAPER BOXES
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489447,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
490397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
491169,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
493899,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
494511,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
494513,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
495719,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497947,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497952,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Step 6: Build the model and analyse the results
# 1) Belgium
# Mine the frequent itemsets (support of at least 5% of invoices).
frq_items = apriori(basket_Belgium, min_support=0.05, use_colnames=True)

# Derive the association rules with lift >= 1 and order them so that the
# strongest rules (highest confidence, then highest lift) come first.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)

# Show the five strongest rules.
rules.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
12,(ALARM CLOCK BAKELIKE ORANGE),(ALARM CLOCK BAKELIKE RED),0.057692,0.057692,0.057692,1.0,17.333333,0.054364,inf
13,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE ORANGE),0.057692,0.057692,0.057692,1.0,17.333333,0.054364,inf
166,(JUMBO BAG PINK VINTAGE PAISLEY),(JUMBO BAG SCANDINAVIAN PAISLEY),0.057692,0.057692,0.057692,1.0,17.333333,0.054364,inf
167,(JUMBO BAG SCANDINAVIAN PAISLEY),(JUMBO BAG PINK VINTAGE PAISLEY),0.057692,0.057692,0.057692,1.0,17.333333,0.054364,inf
424,(POPPY'S PLAYHOUSE KITCHEN),(POPPY'S PLAYHOUSE BEDROOM),0.057692,0.057692,0.057692,1.0,17.333333,0.054364,inf


In [10]:
# Step 6: Build the model and analyse the results
# 2) EIRE
# Mine the frequent itemsets (support of at least 5% of invoices).
frq_items = apriori(basket_EIRE, min_support=0.05, use_colnames=True)

# Derive the association rules with lift >= 1 and order them so that the
# strongest rules (highest confidence, then highest lift) come first.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)

# Show the ten strongest rules.
rules.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
14,(LARGE POPCORN HOLDER),(SMALL POPCORN HOLDER),0.071839,0.071839,0.057471,0.8,11.136,0.05231,4.640805
15,(SMALL POPCORN HOLDER),(LARGE POPCORN HOLDER),0.071839,0.071839,0.057471,0.8,11.136,0.05231,4.640805
12,(EDWARDIAN PARASOL BLACK),(EDWARDIAN PARASOL NATURAL),0.068966,0.077586,0.051724,0.75,9.666667,0.046373,3.689655
2,(PACK OF 60 DINOSAUR CAKE CASES),(60 TEATIME FAIRY CAKE CASES),0.074713,0.166667,0.054598,0.730769,4.384615,0.042146,3.095238
13,(EDWARDIAN PARASOL NATURAL),(EDWARDIAN PARASOL BLACK),0.077586,0.068966,0.051724,0.666667,9.666667,0.046373,2.793103
20,(SET/5 RED SPOTTY LID GLASS BOWLS),(PACK OF 72 RETRO SPOT CAKE CASES),0.083333,0.140805,0.054598,0.655172,4.653061,0.042864,2.491667
0,(72 SWEETHEART FAIRY CAKE CASES),(60 TEATIME FAIRY CAKE CASES),0.083333,0.166667,0.054598,0.655172,3.931034,0.040709,2.416667
6,(PACK OF 72 RETRO SPOT CAKE CASES),(60 TEATIME FAIRY CAKE CASES),0.140805,0.166667,0.08908,0.632653,3.795918,0.065613,2.268519
10,(SET/5 RED SPOTTY LID GLASS BOWLS),(60 TEATIME FAIRY CAKE CASES),0.083333,0.166667,0.051724,0.62069,3.724138,0.037835,2.19697
4,(PACK OF 60 PINK PAISLEY CAKE CASES),(60 TEATIME FAIRY CAKE CASES),0.132184,0.166667,0.077586,0.586957,3.521739,0.055556,2.017544


In [11]:
# Step 6: Build the model and analyse the results
# 3) Germany
# Mine the frequent itemsets (support of at least 5% of invoices).
frq_items = apriori(basket_Germany, min_support=0.05, use_colnames=True)

# Derive the association rules with lift >= 1 and order them so that the
# strongest rules (highest confidence, then highest lift) come first.
# NOTE(review): in the recorded output every top rule has POSTAGE as its
# consequent; consider whether POSTAGE should be excluded from the basket.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)

# Show the three strongest rules.
rules.head(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
103,(WOODLAND WATER TRANSFER TATTOOS),(POSTAGE),0.060519,0.78098,0.057637,0.952381,1.219469,0.010373,4.599424
163,"(RED SPOTTY CHARLOTTE BAG, WOODLAND CHARLOTTE ...",(POSTAGE),0.054755,0.78098,0.051873,0.947368,1.213051,0.009111,4.161383
169,"(RED TOADSTOOL LED NIGHT LIGHT, ROUND SNACK BO...",(POSTAGE),0.054755,0.78098,0.051873,0.947368,1.213051,0.009111,4.161383


In [12]:
# Step 6: Build the model and analyse the results
# 4) France
# Mine the frequent itemsets (support of at least 5% of invoices).
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)

# Derive the association rules with lift >= 1 and order them so that the
# strongest rules (highest confidence, then highest lift) come first.
# NOTE(review): in the recorded output several top rules have POSTAGE as
# their consequent; consider whether POSTAGE should be excluded from the basket.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)

# Show the six strongest rules.
rules.head(6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
414,"(PLASTERS IN TIN SPACEBOY, PLASTERS IN TIN CIR...",(PLASTERS IN TIN WOODLAND ANIMALS),0.058091,0.136929,0.058091,1.0,7.30303,0.050137,inf
302,"(PLASTERS IN TIN STRONGMAN, PLASTERS IN TIN SP...",(PLASTERS IN TIN CIRCUS PARADE),0.053942,0.161826,0.053942,1.0,6.179487,0.045213,inf
27,(COFFEE MUG APPLES DESIGN),(POSTAGE),0.058091,0.73029,0.058091,1.0,1.369318,0.015668,inf
99,(PINK SPOTTY CUP),(POSTAGE),0.06639,0.73029,0.06639,1.0,1.369318,0.017906,inf
131,(RED RETROSPOT CAKE STAND),(POSTAGE),0.053942,0.73029,0.053942,1.0,1.369318,0.014549,inf
151,(SET OF 9 HEART SHAPED BALLOONS),(POSTAGE),0.053942,0.73029,0.053942,1.0,1.369318,0.014549,inf
