In [1]:
# 연관규칙 실습용 필요한 패키지 설치
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import pandas as pd
import numpy as np

- 추천 진행할 때 동시구매 빈도, 조건부 확률 개념으로 수치를 계산해서 추천할 때 제공하는 경우가 있다.
- cos 유사도 같은 개념이 아닌 신뢰지도, 지지도, 향상도 개념으로 계산하여서 추천 수치를 뽑는 방식
- Assoiation rule은 추천시스템 중에 대표적인 알고리즘 중 하나
- 협업 필터링, 콘텐츠 기반 필터링, 딥러닝 등 방법이 있고 다양하다.
## 대표적인 연관규칙
- apriori - 이 알고리즘을 사용할 예정
- FP-Growth
- DHP

- 연관규칙 > A제품을 구매한 사람이 있으면, B제품도 구매할 확률이 높다
- Aporiori - 아이템들의 조합에 대한 경우릐 수를 가지고 최소화 하여서 계산

- 1: A,B
- 2: A,B,c
- 3: A,B,E

- 시퀀스 개념까지 생각하면 비행기 티켓을 구매한 사람 > 여행을 가는 것, 렌트카, 숙소 부분에서 진행하는게 구매 확률이 높다
- 콜드 스타트 문제 : 처음 온 고객에 대한 데이터가 없다? 어떤걸 추천해?

### Apriori
- 지지도(Support) : 고객이 A와 B를 동시에 거래하는 것이 발생할 확률
- 신뢰도(Confidence) - A구매한 주문 B가 포함될 확률
- 향상도(Lift) : A,B가 독립적으로 판매되는 것 대비 함께 판매되는 비율

In [2]:
df = pd.read_csv('data/market_basket.csv')

In [3]:
df

Unnamed: 0,cust_cd,std_dt,prdct_cd,prdct_nm
0,C617077280704,2021-06-19,A10001,tropical fruit
1,C617077280704,2021-06-19,A10002,whole milk
2,C617077280704,2021-06-19,A10003,pip fruit
3,C617077280704,2021-06-19,A10004,other vegetables
4,C617077280704,2021-06-19,A10005,cream
...,...,...,...,...
188764,C672598402422,2021-08-24,A10008,beef
188765,C672598402422,2021-08-24,A10050,bottled beer
188766,C672598402422,2021-08-24,A10001,tropical fruit
188767,C672598402422,2021-08-24,A10026,berries


### 전처리

- cust_cd : A,B,C,D,E 구매에 대한 데이터셋을 전처리 해야한다.

In [4]:
# 학습하기 위한 데이터셋 만들기
item_set = df.drop_duplicates(['cust_cd','std_dt','prdct_nm']).groupby(['cust_cd','std_dt'])['prdct_nm'].apply(list)
item_set

cust_cd        std_dt    
C617062396815  2021-05-01    [beef, herbs, tropical fruit, whole milk, chic...
C617062745502  2021-04-03    [sugar, packaged fruit/vegetables, sausage, sp...
C617062930654  2021-09-14    [berries, tropical fruit, fruit/vegetable juic...
C617063205957  2021-12-29    [yogurt, beef, cream, herbs, chicken, bottled ...
C617063351260  2021-03-27    [berries, beef, yogurt, specialty bar, bottled...
                                                   ...                        
C672608631462  2021-11-06             [cream, turkey, dessert, tropical fruit]
C672608824442  2021-11-07                [pastry, detergent, pip fruit, pasta]
C672612583886  2021-12-11    [dog food, sausage, chocolate, pasta, fruit/ve...
C672613540098  2021-12-14    [tropical fruit, processed cheese, hamburger m...
C672615991967  2021-12-26    [butter milk, brown bread, chicken, frozen pot...
Name: prdct_nm, Length: 27219, dtype: object

In [5]:
item_set_df = pd.DataFrame(item_set).reset_index().drop(['cust_cd','std_dt'],axis = 'columns')

In [6]:
item_set_df

Unnamed: 0,prdct_nm
0,"[beef, herbs, tropical fruit, whole milk, chic..."
1,"[sugar, packaged fruit/vegetables, sausage, sp..."
2,"[berries, tropical fruit, fruit/vegetable juic..."
3,"[yogurt, beef, cream, herbs, chicken, bottled ..."
4,"[berries, beef, yogurt, specialty bar, bottled..."
...,...
27214,"[cream, turkey, dessert, tropical fruit]"
27215,"[pastry, detergent, pip fruit, pasta]"
27216,"[dog food, sausage, chocolate, pasta, fruit/ve..."
27217,"[tropical fruit, processed cheese, hamburger m..."


In [7]:
it_set = item_set_df.squeeze()

In [8]:
it_set

0        [beef, herbs, tropical fruit, whole milk, chic...
1        [sugar, packaged fruit/vegetables, sausage, sp...
2        [berries, tropical fruit, fruit/vegetable juic...
3        [yogurt, beef, cream, herbs, chicken, bottled ...
4        [berries, beef, yogurt, specialty bar, bottled...
                               ...                        
27214             [cream, turkey, dessert, tropical fruit]
27215                [pastry, detergent, pip fruit, pasta]
27216    [dog food, sausage, chocolate, pasta, fruit/ve...
27217    [tropical fruit, processed cheese, hamburger m...
27218    [butter milk, brown bread, chicken, frozen pot...
Name: prdct_nm, Length: 27219, dtype: object

In [9]:
# for i in it_set:
#     print(i)

- apriori 모델을 학습시키기 위한 장바구니 더미변수를 만들어야 한다. (축전환)

In [10]:
encoder = TransactionEncoder()

In [11]:
encoder_T = encoder.fit(it_set).transform(it_set)

In [12]:
pd.DataFrame(encoder_T, columns = encoder.columns_)

Unnamed: 0,beef,berries,beverages,bottled beer,bottled water,brown bread,butter,butter milk,canned beer,chicken,...,sparkling wine,specialty bar,specialty chocolate,sugar,syrup,tropical fruit,turkey,white wine,whole milk,yogurt
0,True,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,False,False,False
2,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
3,True,False,False,False,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
4,True,True,False,False,True,False,False,False,False,True,...,False,True,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27214,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
27215,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
27216,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
27217,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False


In [13]:
it_matrix = pd.DataFrame(encoder_T, columns = encoder.columns_)*1
it_matrix # 끝

Unnamed: 0,beef,berries,beverages,bottled beer,bottled water,brown bread,butter,butter milk,canned beer,chicken,...,sparkling wine,specialty bar,specialty chocolate,sugar,syrup,tropical fruit,turkey,white wine,whole milk,yogurt
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,1,1,0,0,1,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27214,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
27215,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27217,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


### Apriori 모델 적용

In [None]:
it_apr = apriori(it_matrix, min_support = 0.05, use_colnames = True) # 최소지지도

In [15]:
it_apr

Unnamed: 0,support,itemsets
0,0.166612,(beef)
1,0.105074,(berries)
2,0.017010,(beverages)
3,0.025754,(bottled beer)
4,0.095191,(bottled water)
...,...,...
6996,0.001727,"(bottled water, ham, cream, pastry, chicken, t..."
6997,0.001176,"(bottled water, yogurt, cream, pastry, chicken..."
6998,0.001433,"(bottled water, ham, yogurt, cream, pastry, tr..."
6999,0.001249,"(bottled water, ham, cream, pastry, beef, chic..."


In [17]:
association_rules(it_apr, metric = 'lift', min_threshold = 3) # lift는 1을 넘어야 함

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(other vegetables),(sugar),0.022264,0.025901,0.002315,0.103960,4.013756,0.001738,1.087116,0.767954
1,(sugar),(other vegetables),0.025901,0.022264,0.002315,0.089362,4.013756,0.001738,1.073682,0.770822
2,"(beef, berries)",(bottled water),0.018149,0.095191,0.007017,0.386640,4.061731,0.005290,1.475167,0.767733
3,(bottled water),"(beef, berries)",0.095191,0.018149,0.007017,0.073717,4.061731,0.005290,1.059990,0.833103
4,"(bottled water, bottled beer)",(beef),0.001763,0.166612,0.001029,0.583333,3.501158,0.000735,2.000132,0.715642
...,...,...,...,...,...,...,...,...,...,...
8757,"(tropical fruit, beef)","(bottled water, ham, yogurt, cream, pastry)",0.093868,0.002204,0.001212,0.012916,5.859276,0.001005,1.010852,0.915243
8758,(bottled water),"(ham, yogurt, cream, beef, pastry, tropical fr...",0.095191,0.001727,0.001212,0.012736,7.375999,0.001048,1.011152,0.955367
8759,(ham),"(bottled water, yogurt, cream, pastry, beef, t...",0.085712,0.002719,0.001212,0.014145,5.202830,0.000979,1.011590,0.883526
8760,(yogurt),"(bottled water, ham, beef, pastry, cream, trop...",0.076344,0.003417,0.001212,0.015881,4.647909,0.000952,1.012665,0.849720


### Auto-ML
- 머신러닝 자동 학습하는 모델링 패키지
- 알아서 학습하고, 최적의 하이퍼파라미터 파이프라인을 공유한다.

- 분류관련 tpot 모델 진행, 휘귀 모델도 있다.

In [20]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

ModuleNotFoundError: No module named 'tpot'

In [22]:
!pip install tpot

Collecting tpot
  Obtaining dependency information for tpot from https://files.pythonhosted.org/packages/c0/9d/a5a5422e00c034e16c8ad2e0361cf26578a8fcbbc45c6d605d2decf72bc7/TPOT-0.12.2-py3-none-any.whl.metadata
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting scikit-learn>=1.4.1 (from tpot)
  Obtaining dependency information for scikit-learn>=1.4.1 from https://files.pythonhosted.org/packages/ce/41/5d882544fddba1ae476c240cdf4499743ea84bfeb54456d472966dc86355/scikit_learn-1.4.1.post1-cp311-cp311-macosx_10_9_x86_64.whl.metadata
  Downloading scikit_learn-1.4.1.post1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting update-checker>=0.16 (from tpot)
  Obtaining dependency information for upd