In [1]:
# Implementing Apriori in Python 
# Dependencies
import numpy as np 
import pandas as pd
import apyori
from mlxtend.frequent_patterns import apriori, association_rules 
from mlxtend.preprocessing import TransactionEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Retail data from https://www.kaggle.com/datasets/shedai/retail-data-set
data = pd.read_csv('file_out.csv') 
data

Unnamed: 0.1,Unnamed: 0,DocumentID,Date,SKU,Price,Discount,Customer,Quantity
0,0,716,2019-09-23,1039,381.780000,67.372540,1,1.0
1,1,716,2019-09-23,853,593.220000,0.000340,1,1.0
2,2,716,2019-09-23,862,423.730000,-0.001190,1,1.0
3,3,716,2019-09-23,868,201.700000,35.588140,1,1.0
4,4,716,2019-09-23,2313,345.760000,61.019660,1,1.0
...,...,...,...,...,...,...,...,...
33351,33351,13315,2022-11-09,1849,5957.627119,1072.372881,239,4.0
33352,33352,13316,2022-11-09,30,1271.186441,228.813559,477,2.0
33353,33353,13317,2022-11-09,2066,1355.932203,244.067797,370,1.0
33354,33354,13318,2022-11-09,1586,2334.745763,420.254237,356,2.0


### Analyse: 
#### 1. Sum of discount for each customer on specific date

In [33]:
data = pd.read_csv('file_out.csv') 
data = data.drop_duplicates()

# print the sum of discount for each customer on specific date
unique_dates = list(data.drop_duplicates()["Date"].drop_duplicates())

for i in range(len(unique_dates)):
    sumdiscounts = data[data['Date']==unique_dates[i]].groupby(['Customer'])['Discount'].sum()

    print(f"""
        The sum of discount for each customer on {unique_dates[i]} is: 
        {sumdiscounts}
        """)


        The sum of discount for each customer on 2019-09-23 is: 
        Customer
1       401.260170
68     1466.511865
75      298.983051
80      298.214237
99     2377.525423
156     256.471017
159      68.009492
170      92.876949
216     133.322034
273     161.694915
284      43.474576
308     256.271186
313     148.649492
322     128.135593
332     858.813559
360     503.389831
363    1321.566102
383     389.440678
396     536.949152
407     506.440678
408      81.838983
443     884.745763
488     249.687458
499    1205.084746
506      86.250508
512       0.000000
558     348.315254
574     219.661017
575     228.813559
595     605.837288
596     316.879322
604     196.779661
Name: Discount, dtype: float64
        

        The sum of discount for each customer on 2019-04-13 is: 
        Customer
18      442.372881
66      417.078305
68      383.186441
99     1650.432202
105      62.542373
131     340.474576
156      70.169492
189     412.322034
224     231.864406
266     274.576

#### 2. Associate Rule Mining

In [192]:
# Association Rule Mining
# Using Apyori.Apriori
def arm(datasetpath):
    df = pd.read_csv(datasetpath, names = ['products'], sep = ',')
    # data: transactions
    data = list(df["products"].apply(lambda x:x.split(",")))
    arules = list(apyori.apriori(data, min_support=0.2, min_confidence=0.1))
    for ar in arules:
        print(f'Antecedent: {list(ar[2][0][0])} --> Conseqent: {list(ar[2][0][1])}')
        print(f'Support: {ar[1]}')
        print(f'Confidence: {ar[2][0][2]}')
        print(f'Lift: {ar[2][0][3]}')
        print('---------------')

arm('GroceryStoreDataSet.csv')

Antecedent: [] --> Conseqent: ['BISCUIT']
Support: 0.35
Confidence: 0.35
Lift: 1.0
---------------
Antecedent: [] --> Conseqent: ['BOURNVITA']
Support: 0.2
Confidence: 0.2
Lift: 1.0
---------------
Antecedent: [] --> Conseqent: ['BREAD']
Support: 0.65
Confidence: 0.65
Lift: 1.0
---------------
Antecedent: [] --> Conseqent: ['COFFEE']
Support: 0.4
Confidence: 0.4
Lift: 1.0
---------------
Antecedent: [] --> Conseqent: ['CORNFLAKES']
Support: 0.3
Confidence: 0.3
Lift: 1.0
---------------
Antecedent: [] --> Conseqent: ['MAGGI']
Support: 0.25
Confidence: 0.25
Lift: 1.0
---------------
Antecedent: [] --> Conseqent: ['MILK']
Support: 0.25
Confidence: 0.25
Lift: 1.0
---------------
Antecedent: [] --> Conseqent: ['SUGER']
Support: 0.3
Confidence: 0.3
Lift: 1.0
---------------
Antecedent: [] --> Conseqent: ['TEA']
Support: 0.35
Confidence: 0.35
Lift: 1.0
---------------
Antecedent: [] --> Conseqent: ['BREAD', 'BISCUIT']
Support: 0.2
Confidence: 0.2
Lift: 1.0
---------------
Antecedent: [] --> C

In [197]:
# Using Apriori mlxtend
def arm(datasetpath):
    df = pd.read_csv(datasetpath, names = ['products'], sep = ',')
    # data: transactions
    data = list(df["products"].apply(lambda x:x.split(",")))
    a = TransactionEncoder()
    a_data = a.fit(data).transform(data)
    df = pd.DataFrame(a_data,columns=a.columns_)
    df = apriori(df.replace(False,0), min_support = 0.2, use_colnames = True, verbose = 1)
    arules = association_rules(df, min_threshold = 0.2)
    
    for i in range(len(arules)):
        row = list(df_ar.iloc[i,:])
        print(f'Antecedent: {str(row[0])[12:-3]} --> Conseqent: {str(row[1])[12:-3]}')
        print(f'Support: {row[4]:.2f}')
        print(f'Confidence: {row[5]:.2f}')
        print(f'Lift: {row[6]:.2f}')
        print('---------------')

arm('GroceryStoreDataSet.csv')

Processing 42 combinations | Sampling itemset size 3
Antecedent: BREAD --> Conseqent: BISCUIT
Support: 0.20
Confidence: 0.31
Lift: 0.88
---------------
Antecedent: BISCUIT --> Conseqent: BREAD
Support: 0.20
Confidence: 0.57
Lift: 0.88
---------------
Antecedent: BREAD --> Conseqent: MILK
Support: 0.20
Confidence: 0.31
Lift: 1.23
---------------
Antecedent: MILK --> Conseqent: BREAD
Support: 0.20
Confidence: 0.80
Lift: 1.23
---------------
Antecedent: BREAD --> Conseqent: SUGER
Support: 0.20
Confidence: 0.31
Lift: 1.03
---------------
Antecedent: SUGER --> Conseqent: BREAD
Support: 0.20
Confidence: 0.67
Lift: 1.03
---------------
Antecedent: BREAD --> Conseqent: TEA
Support: 0.20
Confidence: 0.31
Lift: 0.88
---------------
Antecedent: TEA --> Conseqent: BREAD
Support: 0.20
Confidence: 0.57
Lift: 0.88
---------------
Antecedent: CORNFLAKES --> Conseqent: COFFEE
Support: 0.20
Confidence: 0.67
Lift: 1.67
---------------
Antecedent: COFFEE --> Conseqent: CORNFLAKES
Support: 0.20
Confidence: