In [1]:
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import joblib

In [2]:
pd.set_option('display.max_rows', 2000)

In [3]:
df = pd.read_csv('../data/b2c_products_500_transactions_50k.csv')

df = df.astype(bool) # convert to boolean provides better performance when generating frequent itemsets
df[:10]

Unnamed: 0,GGB-FD0TVBDI,BC-KH1DNMMJ,HKHD-ZBIKCIDK,HS-FN9XUY41,F-T-R5XFF0T0,HKC-2PCN9T84,F-F-AZZ81SDW,PSD-753LC58D,BPF-5PHT0HL9,HKB-AZCCBLSK,...,HFA-1N2OJGFH,HPC-TGBIBF57,SOY&-L85JE6K9,ESH-8JWVHPLH,SOTS-GG5GR0XA,HKS&-S03VNA31,F-B-T34DF163,HKC-D5OAIINM,HKS&-8KCMM7DB,BPM-JRNJ6225
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [4]:
frequent_itemsets = apriori(df, min_support=0.22, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.22680,(HKB-AZCCBLSK)
1,0.22386,(BC&-DPPQ0Y9D)
2,0.22536,(BPM-FKHMY4SJ)
3,0.22568,(ACC-N2ATQYYV)
4,0.22328,(BPF-2VMCQBNK)
...,...,...
8210,0.22052,"(AO&-G70S1F53, ACC-BYGHGK5B, ACC-2SZWYMX9, ACC..."
8211,0.22156,"(HKHD-C68AU9DK, HKS&-S03VNA31, HKHD-7I8SKCH8, ..."
8212,0.22052,"(AO&-G70S1F53, ACC-BYGHGK5B, ACC-2SZWYMX9, ACC..."
8213,0.22156,"(HKHD-C68AU9DK, HKS&-S03VNA31, HKHD-7I8SKCH8, ..."


In [5]:
# confidence should be > 0.3 to be useful
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.99)
# lift should be > 1.0 to be useful
# rules = association_rules(frequent_itemsets, metric="lift", min_threshold=4.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(AIA-JM4T8BP6),(ACC-N2ATQYYV),0.22280,0.22568,0.22064,0.990305,4.388095,0.170358,79.869674,0.993452
1,(AIA-JM4T8BP6),(ACC-N6Q7NC85),0.22280,0.22408,0.22058,0.990036,4.418225,0.170655,77.871611,0.995451
2,(AIA-JM4T8BP6),(AIA-BIK2U1RS),0.22280,0.22646,0.22064,0.990305,4.372981,0.170185,79.789219,0.992438
3,(AIA-JM4T8BP6),(AO&-G70S1F53),0.22280,0.22984,0.22058,0.990036,4.307500,0.169372,77.293535,0.987966
4,(AIA-JM4T8BP6),(ACC-PLO1URTT),0.22280,0.22980,0.22074,0.990754,4.311375,0.169541,83.301243,0.988234
...,...,...,...,...,...,...,...,...,...,...
997372,"(AEA-BMAE38SR, AO&-AZV46ESM)","(AO&-G70S1F53, ACC-BYGHGK5B, ACC-2SZWYMX9, ACC...",0.22058,0.22052,0.22052,0.999728,4.533503,0.171878,2865.628307,1.000000
997373,"(AEA-BMAE38SR, AIA-JM4T8BP6)","(AO&-G70S1F53, ACC-BYGHGK5B, ACC-2SZWYMX9, ACC...",0.22054,0.22052,0.22052,0.999909,4.534325,0.171887,8595.325960,1.000000
997374,"(ACC-PRAT0861, AO&-AZV46ESM)","(AO&-G70S1F53, ACC-BYGHGK5B, ACC-2SZWYMX9, ACC...",0.22060,0.22052,0.22052,0.999637,4.533092,0.171873,2149.416100,1.000000
997375,"(ACC-PRAT0861, AIA-JM4T8BP6)","(AO&-G70S1F53, ACC-BYGHGK5B, ACC-2SZWYMX9, ACC...",0.22054,0.22052,0.22052,0.999909,4.534325,0.171887,8595.325960,1.000000


In [6]:
# persist the rules to a file in joblib format
joblib.dump(rules, 'b2c_products_500_transactions_50k.joblib')

['b2c_products_500_transactions_50k.joblib']

In [7]:
# load the rules from the file in joblib format
loaded_rules = joblib.load('b2c_products_500_transactions_50k.joblib')
loaded_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(AIA-JM4T8BP6),(ACC-N2ATQYYV),0.22280,0.22568,0.22064,0.990305,4.388095,0.170358,79.869674,0.993452
1,(AIA-JM4T8BP6),(ACC-N6Q7NC85),0.22280,0.22408,0.22058,0.990036,4.418225,0.170655,77.871611,0.995451
2,(AIA-JM4T8BP6),(AIA-BIK2U1RS),0.22280,0.22646,0.22064,0.990305,4.372981,0.170185,79.789219,0.992438
3,(AIA-JM4T8BP6),(AO&-G70S1F53),0.22280,0.22984,0.22058,0.990036,4.307500,0.169372,77.293535,0.987966
4,(AIA-JM4T8BP6),(ACC-PLO1URTT),0.22280,0.22980,0.22074,0.990754,4.311375,0.169541,83.301243,0.988234
...,...,...,...,...,...,...,...,...,...,...
997372,"(AEA-BMAE38SR, AO&-AZV46ESM)","(AO&-G70S1F53, AIA-BIK2U1RS, ACC-PLO1URTT, ACC...",0.22058,0.22052,0.22052,0.999728,4.533503,0.171878,2865.628307,1.000000
997373,"(AEA-BMAE38SR, AIA-JM4T8BP6)","(AO&-G70S1F53, AIA-BIK2U1RS, ACC-PLO1URTT, ACC...",0.22054,0.22052,0.22052,0.999909,4.534325,0.171887,8595.325960,1.000000
997374,"(ACC-PRAT0861, AO&-AZV46ESM)","(AO&-G70S1F53, AIA-BIK2U1RS, ACC-PLO1URTT, ACC...",0.22060,0.22052,0.22052,0.999637,4.533092,0.171873,2149.416100,1.000000
997375,"(ACC-PRAT0861, AIA-JM4T8BP6)","(AO&-G70S1F53, AIA-BIK2U1RS, ACC-PLO1URTT, ACC...",0.22054,0.22052,0.22052,0.999909,4.534325,0.171887,8595.325960,1.000000


In [8]:
# use the loaded_rules to extract recommendations
def get_recommendations(loaded_rules, items, metric='confidence', top_n=5):
    recommendations = set()
    for item in items:
        # Find rules where the item is in the antecedents
        matched_rules = loaded_rules[loaded_rules['antecedents'].apply(lambda x: item in x)]
        # Sort by the specified metric and get the top N
        top_rules = matched_rules.sort_values(by=metric, ascending=False).head(top_n)
        for _, row in top_rules.iterrows():
            recommendations.update(row['consequents'])
    # Remove items that are already in the input list
    recommendations.difference_update(items)
    return list(recommendations)[:top_n]

In [9]:
get_recommendations(loaded_rules, ['AIA-JM4T8BP6'], metric='lift', top_n=5)

['AO&-G70S1F53',
 'AIA-BIK2U1RS',
 'ACC-PLO1URTT',
 'ACC-BYGHGK5B',
 'ACC-2SZWYMX9']