In [2]:
%%capture
! pip install mlxtend

In [8]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load the data from the UCI Machine Learning Repository
url = "./data/Online_Retail.xlsx"
online_retail_data = pd.read_excel(url)

In [9]:
online_retail_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [10]:
# Data preprocessing
online_retail_data['Description'] = online_retail_data['Description'].str.strip()
online_retail_data = online_retail_data.dropna(axis=0, subset=['InvoiceNo'])
online_retail_data['InvoiceNo'] = online_retail_data['InvoiceNo'].astype('str')
online_retail_data = online_retail_data[~online_retail_data['InvoiceNo'].str.contains('C')] # 취소

In [11]:
online_retail_data.sample(20)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
71875,542216,22509,SEWING BOX RETROSPOT DESIGN,2,2011-01-26 12:29:00,16.95,14911.0,EIRE
279662,561356,21155,RED RETROSPOT PEG BAG,6,2011-07-26 14:56:00,2.55,14273.0,United Kingdom
516428,579900,22457,NATURAL SLATE HEART CHALKBOARD,6,2011-12-01 08:34:00,2.95,15951.0,United Kingdom
494180,578262,22732,3D VINTAGE CHRISTMAS STICKERS,2,2011-11-23 13:27:00,1.25,18283.0,United Kingdom
166000,550836,21411,GINGHAM HEART DOORSTOP RED,3,2011-04-21 10:55:00,4.25,14759.0,United Kingdom
362904,568528,23199,JUMBO BAG APPLES,10,2011-09-27 13:32:00,2.08,13979.0,United Kingdom
71228,542112,84988,SET OF 72 PINK HEART PAPER DOILIES,2,2011-01-25 13:55:00,1.45,13168.0,United Kingdom
46628,540355,84997b,RED 3 PIECE RETROSPOT CUTLERY SET,1,2011-01-06 15:11:00,8.47,,United Kingdom
274378,560906,22666,RECIPE BOX PANTRY YELLOW DESIGN,6,2011-07-21 17:55:00,2.95,15827.0,United Kingdom
462626,575985,23404,HOME SWEET HOME BLACKBOARD,6,2011-11-13 14:09:00,4.95,17841.0,United Kingdom


In [12]:
# 온라인 리테일 데이터를 바구니 형식으로 변환합니다.
basket = (online_retail_data[online_retail_data['Country'] == "United Kingdom"]  # 영국 데이터만 선택합니다.
          .groupby(['InvoiceNo', 'Description'])['Quantity']  # InvoiceNo와 Description으로 그룹화하고 Quantity의 합을 계산합니다.
          .sum().unstack().reset_index().fillna(0)  # unstack을 통해 데이터를 피벗하고, NaN 값을 0으로 채웁니다.
          .set_index('InvoiceNo'))  # InvoiceNo를 인덱스로 설정합니다.

# 상품의 판매량을 0 또는 1로 변환합니다. (0: 바구니에 포함되지 않음, 1: 바구니에 포함됨)
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

# 모든 데이터에 encode_units 함수를 적용하여 바구니 형식으로 변환합니다.
basket_sets = basket.applymap(encode_units)

# Apriori 알고리즘을 사용하여 빈발 항목집합을 찾습니다.
frequent_itemsets = apriori(basket_sets, min_support=0.03, use_colnames=True)

# 연관 규칙을 생성합니다.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

  basket_sets = basket.applymap(encode_units)


In [13]:
basket.shape

(18667, 4175)

In [14]:
basket.head()

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.046928,0.049821,0.03016,0.642694,12.900183,0.027822,2.659288,0.967903
1,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.049821,0.046928,0.03016,0.605376,12.900183,0.027822,2.415142,0.97085
2,(GREEN REGENCY TEACUP AND SAUCER),(PINK REGENCY TEACUP AND SAUCER),0.050035,0.03766,0.03091,0.617773,16.403939,0.029026,2.517719,0.988498
3,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.03766,0.050035,0.03091,0.820768,16.403939,0.029026,5.300203,0.975787
4,(GREEN REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.050035,0.051267,0.037553,0.750535,14.639752,0.034988,3.803076,0.980765
5,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.051267,0.050035,0.037553,0.732497,14.639752,0.034988,3.551237,0.982039
6,(JUMBO BAG RED RETROSPOT),(JUMBO BAG BAROQUE BLACK WHITE),0.10382,0.048749,0.030535,0.294118,6.03329,0.025474,1.347605,0.930898
7,(JUMBO BAG BAROQUE BLACK WHITE),(JUMBO BAG RED RETROSPOT),0.048749,0.10382,0.030535,0.626374,6.03329,0.025474,2.398601,0.877006
8,(JUMBO BAG PINK POLKADOT),(JUMBO BAG RED RETROSPOT),0.062088,0.10382,0.042053,0.677308,6.523895,0.035607,2.777201,0.902769
9,(JUMBO BAG RED RETROSPOT),(JUMBO BAG PINK POLKADOT),0.10382,0.062088,0.042053,0.405057,6.523895,0.035607,1.576473,0.944807


In [16]:
# Filter rules by a minimum lift and confidence
filtered_rules = rules[(rules['lift'] >= 3.0) & (rules['confidence'] >= 0.5)]

In [17]:
# Print the association rules
print(filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

                            antecedents                        consequents  \
0          (ALARM CLOCK BAKELIKE GREEN)         (ALARM CLOCK BAKELIKE RED)   
1            (ALARM CLOCK BAKELIKE RED)       (ALARM CLOCK BAKELIKE GREEN)   
2     (GREEN REGENCY TEACUP AND SAUCER)   (PINK REGENCY TEACUP AND SAUCER)   
3      (PINK REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
4     (GREEN REGENCY TEACUP AND SAUCER)  (ROSES REGENCY TEACUP AND SAUCER)   
5     (ROSES REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
7      (JUMBO  BAG BAROQUE BLACK WHITE)          (JUMBO BAG RED RETROSPOT)   
8             (JUMBO BAG PINK POLKADOT)          (JUMBO BAG RED RETROSPOT)   
10  (JUMBO SHOPPER VINTAGE RED PAISLEY)          (JUMBO BAG RED RETROSPOT)   
13             (JUMBO STORAGE BAG SUKI)          (JUMBO BAG RED RETROSPOT)   

     support  confidence       lift  
0   0.030160    0.642694  12.900183  
1   0.030160    0.605376  12.900183  
2   0.030910    0.617773  1