In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import fpmax
from pyECLAT import ECLAT
import time
from sklearn.neighbors import NearestNeighbors
# from association import eclat

In [2]:
data = pd.read_csv(r" /activity_store.csv")
data.head()

Unnamed: 0,Week_strt,PURCH_DATE,CUST_id,PROD_ID
0,17FEB22,17FEB22,58222157597,4138316742
1,17FEB22,17FEB22,58222157597,5200039685
2,17FEB22,17FEB22,58222157597,7214008486
3,17FEB22,17FEB22,58222157597,4138316742
4,17FEB22,17FEB22,58222157597,5200039685


# Creating Baskets for each Customer and Day

In [3]:
basket = (data.groupby(['CUST_id', 'PURCH_DATE', 'PROD_ID'])['PROD_ID']
          .count().unstack().reset_index().fillna(0).sort_values('PURCH_DATE'))
basket.head()

PROD_ID,CUST_id,PURCH_DATE,7670,8458,9071,9074,9082,9090,9239,10192,...,980000007820,980000007861,980000007871,980000007911,980000007971,980000008000,980000008021,980000008030,980000008051,980000008071
7777,58221289139,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2062,58194261193,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8115,58221801262,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11611,59140537392,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2067,58211138665,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Hence, we obtained baskets for each day and user pairs. Number of rows represents the number of baskets, number of columns represents the unique products.**

**To obtain better model, I'm applying a preprocessing step for removing baskets includes 1 or less products bought daily for each users. Due to we are trying to find relationships between the products, the baskets that include only one unique product will not help.**

In [4]:
basket['num_of_unique_products'] = np.count_nonzero(basket.iloc[:, 2:basket.shape[1]], axis=1)

In [5]:
basket.head()

PROD_ID,CUST_id,PURCH_DATE,7670,8458,9071,9074,9082,9090,9239,10192,...,980000007861,980000007871,980000007911,980000007971,980000008000,980000008021,980000008030,980000008051,980000008071,num_of_unique_products
7777,58221289139,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
2062,58194261193,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
8115,58221801262,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
11611,59140537392,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11
2067,58211138665,17FEB22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11


In [6]:
basket['num_of_unique_products'].value_counts()

4      900
2      862
3      857
5      790
6      740
      ... 
106      1
90       1
82       1
137      1
84       1
Name: num_of_unique_products, Length: 101, dtype: int64

**As you can see, 900 of the baskets include 4 unique products. But there are some baskets include only one unique product. This baskets would not help us to create a better model.**

In [7]:
# There are 643 transactions that only have one unique product
basket[basket['num_of_unique_products'] == 1].shape[0]

634

In [8]:
# Drop 1 or less number of products bought daily for each users
basket = basket.drop(basket[basket['num_of_unique_products'] <= 1].index)
basket.num_of_unique_products.min()

2

**There are no 1 unique product for each basket now.**

In [9]:
# Dropping this column because we already use it, and do not need anymore
basket.drop(['num_of_unique_products'], axis=1, inplace=True)

In [10]:
# Encoding for further models
def encode_units(k):
    if k <= 0:
        return 0
    if k >= 1:
        return 1

basket_sets = basket.iloc[:, 2:basket.shape[1]].applymap(encode_units)

In [11]:
basket_sets.head()

PROD_ID,7670,8458,9071,9074,9082,9090,9239,10192,10709,10723,...,980000007820,980000007861,980000007871,980000007911,980000007971,980000008000,980000008021,980000008030,980000008051,980000008071
7777,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2062,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8115,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2067,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Apriori Algorithm

In [21]:
start_apriori = time.time()

frequent_itemsets_apriori = apriori(basket_sets, min_support=0.005, use_colnames=True)
frequent_itemsets_apriori

Unnamed: 0,support,itemsets
0,0.066037,(10751)
1,0.005840,(10790)
2,0.023286,(10952)
3,0.005191,(11104)
4,0.232860,(11680)
...,...,...
453,0.005912,"(11680, 17514, 10751)"
454,0.005046,"(11680, 11738, 20284807669)"
455,0.006056,"(11680, 12333, 11757)"
456,0.005984,"(11680, 17514, 17502)"


In [22]:
rules_apriori = association_rules(frequent_itemsets_apriori, metric="lift", min_threshold=2)
rules_apriori

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(11722),(10751),0.047149,0.066037,0.009012,0.191131,2.894307,0.005898,1.154654
1,(10751),(11722),0.066037,0.047149,0.009012,0.136463,2.894307,0.005898,1.103428
2,(11731),(10751),0.040732,0.066037,0.007714,0.189381,2.867792,0.005024,1.152160
3,(10751),(11731),0.066037,0.040732,0.007714,0.116812,2.867792,0.005024,1.086142
4,(11734),(10751),0.035758,0.066037,0.006344,0.177419,2.686664,0.003983,1.135406
...,...,...,...,...,...,...,...,...,...
193,"(11738, 20284807669)",(4119012780),0.019753,0.025737,0.006128,0.310219,12.053354,0.005619,1.412423
194,"(4119012780, 20284807669)",(11738),0.008291,0.047581,0.006128,0.739130,15.534058,0.005733,3.650938
195,(11738),"(4119012780, 20284807669)",0.047581,0.008291,0.006128,0.128788,15.534058,0.005733,1.138310
196,(4119012780),"(11738, 20284807669)",0.025737,0.019753,0.006128,0.238095,12.053354,0.005619,1.286574


In [23]:
rules_apriori = rules_apriori[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
rules_apriori.sort_values('lift', ascending=False)

# took approx. 28 seconds to run

Unnamed: 0,antecedents,consequents,support,confidence,lift
195,(11738),"(4119012780, 20284807669)",0.006128,0.128788,15.534058
194,"(4119012780, 20284807669)",(11738),0.006128,0.739130,15.534058
197,(20284807669),"(11738, 4119012780)",0.006128,0.149123,14.670088
192,"(11738, 4119012780)",(20284807669),0.006128,0.602837,14.670088
193,"(11738, 20284807669)",(4119012780),0.006128,0.310219,12.053354
...,...,...,...,...,...
57,(3338374070),(11680),0.006416,0.468421,2.011600
60,(11680),(4119011447),0.005335,0.022910,2.011314
61,(4119011447),(11680),0.005335,0.468354,2.011314
43,(11680),(11904),0.005119,0.021981,2.005950


In [24]:
# Top 10 Relationships for Apriori
rules_apriori.style.set_caption("Apriori")
rules_apriori.sort_values('lift', ascending=False).head(10)

end_apriori = time.time()
print("Execution Time for Apriori (s): ", end_apriori - start_apriori)

Execution Time for Apriori (s):  36.66752338409424


# ---------------------------------------------------------------------------------------------------------------

# FP-Growth Algorithm

In [25]:
start_fp = time.time()

frequent_itemsets_fpgrowth = fpgrowth(basket_sets, min_support=0.005, use_colnames=True)
frequent_itemsets_fpgrowth

Unnamed: 0,support,itemsets
0,0.232860,(11680)
1,0.066037,(10751)
2,0.055656,(11757)
3,0.031577,(20820007669)
4,0.029342,(11736)
...,...,...
453,0.006272,"(11680, 71514158018)"
454,0.006056,"(11680, 11686)"
455,0.009660,"(11680, 11717)"
456,0.005984,"(11722, 11717)"


In [26]:
rules_fpgrowth = association_rules(frequent_itemsets_fpgrowth, metric="lift", min_threshold=2).sort_values('lift', ascending=False)

In [27]:
rules_fpgrowth = rules_fpgrowth[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
rules_fpgrowth.sort_values('lift', ascending=False)

# took approx. 11 seconds to run

Unnamed: 0,antecedents,consequents,support,confidence,lift
105,(11738),"(4119012780, 20284807669)",0.006128,0.128788,15.534058
104,"(4119012780, 20284807669)",(11738),0.006128,0.739130,15.534058
102,"(11738, 4119012780)",(20284807669),0.006128,0.602837,14.670088
107,(20284807669),"(11738, 4119012780)",0.006128,0.149123,14.670088
103,"(11738, 20284807669)",(4119012780),0.006128,0.310219,12.053354
...,...,...,...,...,...
191,(3338374070),(11680),0.006416,0.468421,2.011600
144,(11680),(4119011447),0.005335,0.022910,2.011314
145,(4119011447),(11680),0.005335,0.468354,2.011314
166,(11904),(11680),0.005119,0.467105,2.005950


In [28]:
# Top 10 Relationships for FP-Growth
rules_fpgrowth.sort_values('lift', ascending=False).head(10)

end_fp = time.time()
print("Execution Time for FP Growth (s): ", end_fp - start_fp)

Execution Time for FP Growth (s):  20.761564254760742


# ---------------------------------------------------------------------------------------------------------------

# Cosine Similarity Model

In [29]:
basket_sets

PROD_ID,7670,8458,9071,9074,9082,9090,9239,10192,10709,10723,...,980000007820,980000007861,980000007871,980000007911,980000007971,980000008000,980000008021,980000008030,980000008051,980000008071
7777,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2062,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8115,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2067,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9022,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12504,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12506,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
start_cosine = time.time()


knn = NearestNeighbors(n_neighbors=11, metric='cosine', algorithm='brute', n_jobs=-1)
knn.fit(basket_sets.values.T)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=11)

In [31]:
basket_sets.columns

Index([        7670,         8458,         9071,         9074,         9082,
               9090,         9239,        10192,        10709,        10723,
       ...
       980000007820, 980000007861, 980000007871, 980000007911, 980000007971,
       980000008000, 980000008021, 980000008030, 980000008051, 980000008071],
      dtype='object', name='PROD_ID', length=27115)

In [32]:
number_of_recommendations = 6  # If you change this number you will obtain different number of recommendations (you can pick minimum 2)
find_similar_based_on_this_product = 8458  # This variable represents the which product do you want to find fimilar product with

recommendation_result = list(knn.kneighbors([basket_sets[find_similar_based_on_this_product].values], number_of_recommendations))

In [33]:
A = pd.DataFrame(np.vstack((recommendation_result[1], recommendation_result[0])),
                 index=['Product', 'Cosine_Similarity (degree)']).T.drop([0]).reset_index(drop=True)

In [34]:
# Smaller degree represents more likely to similar product with our input product (Cos0 degree = 1 (the max value for the cosine func) so these two products are identical)
A

Unnamed: 0,Product,Cosine_Similarity (degree)
0,17735.0,0.823223
1,2521.0,0.823223
2,25087.0,0.823223
3,15722.0,0.823223
4,17734.0,0.823223


In [35]:
end_cosine = time.time()
print("Execution Time for Cosine Model (s): ", end_cosine - start_cosine)

Execution Time for Cosine Model (s):  11.181578874588013
