<a href="https://colab.research.google.com/github/poojashreeNS/Association-Pattern-Mining/blob/main/Apriori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
import numpy as np

In [23]:
df = pd.read_csv('/content/GroceryStoreDataSet.csv')
df.head(30)

Unnamed: 0,id,Items
0,1,"MILK,BREAD,BISCUIT"
1,2,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,3,"BREAD,TEA,BOURNVITA"
3,4,"JAM,MAGGI,BREAD,MILK"
4,5,"MAGGI,TEA,BISCUIT"
5,6,"BREAD,TEA,BOURNVITA"
6,7,"MAGGI,TEA,CORNFLAKES"
7,8,"MAGGI,BREAD,TEA,BISCUIT"
8,9,"JAM,MAGGI,BREAD,TEA"
9,10,"BREAD,MILK"


In [24]:
df.shape

(20, 2)

In [25]:
def location_preprocessing(text):

  """
  Function to Preprocess the features having location names.
  """
  text = text.split(",")          # Obtain individual Items

  lst_text = [x.lower() for x in text]    # Lowercase names

  text = " ".join(lst_text)               # Convert to string from list

  return text

df['Items'] = df['Items'].apply(lambda x: location_preprocessing(x))

print(df['Items'])

0                 milk bread biscuit
1      bread milk biscuit cornflakes
2                bread tea bournvita
3               jam maggi bread milk
4                  maggi tea biscuit
5                bread tea bournvita
6               maggi tea cornflakes
7            maggi bread tea biscuit
8                jam maggi bread tea
9                         bread milk
10    coffee cock biscuit cornflakes
11    coffee cock biscuit cornflakes
12            coffee suger bournvita
13                 bread coffee cock
14               bread suger biscuit
15           coffee suger cornflakes
16             bread suger bournvita
17                bread coffee suger
18                bread coffee suger
19        tea milk coffee cornflakes
Name: Items, dtype: object


In [26]:
df.Items = df.Items.str.split(" ")
df_out = pd.concat(
    [
        df.explode("Items")
        .pivot_table(index="id", columns="Items", aggfunc="size", fill_value=0)
    ],
    axis=1,
)


In [27]:
pd.set_option("display.max_rows", 20, "display.max_columns", 20)

In [28]:
print(df_out)

Items  biscuit  bournvita  bread  cock  coffee  cornflakes  jam  maggi  milk  \
id                                                                             
1            1          0      1     0       0           0    0      0     1   
2            1          0      1     0       0           1    0      0     1   
3            0          1      1     0       0           0    0      0     0   
4            0          0      1     0       0           0    1      1     1   
5            1          0      0     0       0           0    0      1     0   
6            0          1      1     0       0           0    0      0     0   
7            0          0      0     0       0           1    0      1     0   
8            1          0      1     0       0           0    0      1     0   
9            0          0      1     0       0           0    1      1     0   
10           0          0      1     0       0           0    0      0     1   
11           1          0      0     1  

In [29]:
df_out.sum()

Items
biscuit        7
bournvita      4
bread         13
cock           3
coffee         8
cornflakes     6
jam            2
maggi          5
milk           5
suger          6
tea            7
dtype: int64

# **FROM SCRATCH WITHOUT APRIORI:** mlxtend package helps us to analyze association rules problems.

***Finding support using formula***

In [30]:
# Product Frequency / Total Sales
first = pd.DataFrame(df_out.sum() / df_out.shape[0], columns = ["Support"]).sort_values("Support", ascending = False)
first

Unnamed: 0_level_0,Support
Items,Unnamed: 1_level_1
bread,0.65
coffee,0.4
biscuit,0.35
tea,0.35
cornflakes,0.3
suger,0.3
maggi,0.25
milk,0.25
bournvita,0.2
cock,0.15


***Eliminating values less than minimum support***

In [31]:
# Elimination by Support Value
first[first.Support >= 0.15]

Unnamed: 0_level_0,Support
Items,Unnamed: 1_level_1
bread,0.65
coffee,0.4
biscuit,0.35
tea,0.35
cornflakes,0.3
suger,0.3
maggi,0.25
milk,0.25
bournvita,0.2
cock,0.15


***Second Iteration: Find support values for pair product combinations.***

In [32]:
from itertools import combinations

second = list(combinations(first.index, 2))
second = [list(i) for i in second]
second

[['bread', 'coffee'],
 ['bread', 'biscuit'],
 ['bread', 'tea'],
 ['bread', 'cornflakes'],
 ['bread', 'suger'],
 ['bread', 'maggi'],
 ['bread', 'milk'],
 ['bread', 'bournvita'],
 ['bread', 'cock'],
 ['bread', 'jam'],
 ['coffee', 'biscuit'],
 ['coffee', 'tea'],
 ['coffee', 'cornflakes'],
 ['coffee', 'suger'],
 ['coffee', 'maggi'],
 ['coffee', 'milk'],
 ['coffee', 'bournvita'],
 ['coffee', 'cock'],
 ['coffee', 'jam'],
 ['biscuit', 'tea'],
 ['biscuit', 'cornflakes'],
 ['biscuit', 'suger'],
 ['biscuit', 'maggi'],
 ['biscuit', 'milk'],
 ['biscuit', 'bournvita'],
 ['biscuit', 'cock'],
 ['biscuit', 'jam'],
 ['tea', 'cornflakes'],
 ['tea', 'suger'],
 ['tea', 'maggi'],
 ['tea', 'milk'],
 ['tea', 'bournvita'],
 ['tea', 'cock'],
 ['tea', 'jam'],
 ['cornflakes', 'suger'],
 ['cornflakes', 'maggi'],
 ['cornflakes', 'milk'],
 ['cornflakes', 'bournvita'],
 ['cornflakes', 'cock'],
 ['cornflakes', 'jam'],
 ['suger', 'maggi'],
 ['suger', 'milk'],
 ['suger', 'bournvita'],
 ['suger', 'cock'],
 ['suger', 'ja

***Find support values with 2 combinations***

In [33]:
# Finding support values
value = []
for i in range(0, len(second)):
    temp = df_out.T.loc[second[i]].sum() 
    temp = len(temp[temp == df_out.T.loc[second[i]].shape[0]]) / df_out.shape[0]
    value.append(temp)
# Create a data frame            
secondIteration = pd.DataFrame(value, columns = ["Support"])
secondIteration["index"] = [tuple(i) for i in second]
secondIteration['length'] = secondIteration['index'].apply(lambda x:len(x))
secondIteration = secondIteration.set_index("index").sort_values("Support", ascending = False)
# Elimination by Support Value
secondIteration = secondIteration[secondIteration.Support > 0.1]
secondIteration

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(tea, maggi)",0.2,2
"(bread, tea)",0.2,2
"(bread, suger)",0.2,2
"(bread, milk)",0.2,2
"(bread, biscuit)",0.2,2
"(coffee, cornflakes)",0.2,2
"(coffee, suger)",0.2,2
"(bread, coffee)",0.15,2
"(bread, maggi)",0.15,2
"(bread, bournvita)",0.15,2


***Finding support for given number of combinations***

In [34]:
def ar_iterations(data, num_iter = 1, support_value = 0.1, iterationIndex = None):
    
    # Next Iterations
    def ar_calculation(iterationIndex = iterationIndex): 
        # Calculation of support value
        value = []
        for i in range(0, len(iterationIndex)):
            result = data.T.loc[iterationIndex[i]].sum() 
            result = len(result[result == data.T.loc[iterationIndex[i]].shape[0]]) / data.shape[0]
            value.append(result)
        # Bind results
        result = pd.DataFrame(value, columns = ["Support"])
        result["index"] = [tuple(i) for i in iterationIndex]
        result['length'] = result['index'].apply(lambda x:len(x))
        result = result.set_index("index").sort_values("Support", ascending = False)
        # Elimination by Support Value
        result = result[result.Support > support_value]
        return result    
    
    # First Iteration
    first = pd.DataFrame(df_out.T.sum(axis = 1) / df_out.shape[0], columns = ["Support"]).sort_values("Support", ascending = False)
    first = first[first.Support > support_value]
    first["length"] = 1
    
    if num_iter == 1:
        res = first.copy()
        
    # Second Iteration
    elif num_iter == 2:
        
        second = list(combinations(first.index, 2))
        second = [list(i) for i in second]
        res = ar_calculation(second)
        
    # All Iterations > 2
    else:
        nth = list(combinations(set(list(chain(*iterationIndex))), num_iter))
        nth = [list(i) for i in nth]
        res = ar_calculation(nth)
    
    return res

***Combination 1***

In [35]:
iteration1 = ar_iterations(df_out, num_iter=1, support_value=0.1)
iteration1

Unnamed: 0_level_0,Support,length
Items,Unnamed: 1_level_1,Unnamed: 2_level_1
bread,0.65,1
coffee,0.4,1
biscuit,0.35,1
tea,0.35,1
cornflakes,0.3,1
suger,0.3,1
maggi,0.25,1
milk,0.25,1
bournvita,0.2,1
cock,0.15,1


***2 Item combination***

In [36]:
iteration2 = ar_iterations(df_out, num_iter=2, support_value=0.1)
iteration2

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(coffee, cornflakes)",0.2,2
"(bread, tea)",0.2,2
"(tea, maggi)",0.2,2
"(bread, suger)",0.2,2
"(bread, biscuit)",0.2,2
"(bread, milk)",0.2,2
"(coffee, suger)",0.2,2
"(bread, coffee)",0.15,2
"(coffee, cock)",0.15,2
"(biscuit, cornflakes)",0.15,2


***3 Item combination***

In [37]:
from itertools import chain

iteration3 = ar_iterations(df_out, num_iter=3, support_value=0.01,
              iterationIndex=iteration2.index)
iteration3

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(cock, biscuit, cornflakes)",0.10,3
"(bread, biscuit, milk)",0.10,3
"(coffee, cock, cornflakes)",0.10,3
"(coffee, cock, biscuit)",0.10,3
"(maggi, biscuit, tea)",0.10,3
...,...,...
"(bread, biscuit, tea)",0.05,3
"(bread, cornflakes, milk)",0.05,3
"(coffee, bread, cock)",0.05,3
"(biscuit, cornflakes, milk)",0.05,3


***4 Items combination***

In [38]:
iteration4 = ar_iterations(df_out, num_iter=4, support_value=0.01,
              iterationIndex=iteration3.index)
iteration4

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(coffee, cock, biscuit, cornflakes)",0.1,4
"(tea, coffee, milk, cornflakes)",0.05,4
"(tea, bread, maggi, biscuit)",0.05,4
"(bread, biscuit, milk, cornflakes)",0.05,4


# **Using Apriori Association Rules**

In [39]:
# pip install pycaret

In [40]:
from mlxtend.frequent_patterns import apriori, association_rules
# Apriori
freq_items = apriori(df_out, min_support = 0.1, use_colnames = True, verbose = 1)
freq_items.sort_values("support", ascending = False)

Processing 110 combinations | Sampling itemset size 2Processing 234 combinations | Sampling itemset size 3Processing 108 combinations | Sampling itemset size 4


Unnamed: 0,support,itemsets
2,0.65,(bread)
4,0.40,(coffee)
0,0.35,(biscuit)
10,0.35,(tea)
5,0.30,(cornflakes)
...,...,...
15,0.10,"(biscuit, maggi)"
16,0.10,"(biscuit, milk)"
17,0.10,"(biscuit, tea)"
22,0.10,"(jam, bread)"


In [41]:
# Association Rules & Info
df_ar = association_rules(freq_items, metric = "confidence", min_threshold = 0.5)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(biscuit),(bread),0.35,0.65,0.20,0.571429,0.879121,-0.0275,0.816667
1,(cock),(biscuit),0.15,0.35,0.10,0.666667,1.904762,0.0475,1.950000
2,(cornflakes),(biscuit),0.30,0.35,0.15,0.500000,1.428571,0.0450,1.300000
3,(bournvita),(bread),0.20,0.65,0.15,0.750000,1.153846,0.0200,1.400000
4,(bournvita),(suger),0.20,0.30,0.10,0.500000,1.666667,0.0400,1.400000
...,...,...,...,...,...,...,...,...,...
61,"(biscuit, cock)","(coffee, cornflakes)",0.10,0.20,0.10,1.000000,5.000000,0.0800,inf
62,"(coffee, cornflakes)","(biscuit, cock)",0.20,0.10,0.10,0.500000,5.000000,0.0800,1.800000
63,"(coffee, cock)","(biscuit, cornflakes)",0.15,0.15,0.10,0.666667,4.444444,0.0775,2.550000
64,"(cock, cornflakes)","(biscuit, coffee)",0.10,0.10,0.10,1.000000,10.000000,0.0900,inf


In [42]:
df_ar[(df_ar.support > 0.15) & (df_ar.confidence > 0.5)].sort_values("confidence", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8,(milk),(bread),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75
18,(maggi),(tea),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25
9,(suger),(bread),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05
14,(cornflakes),(coffee),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
16,(suger),(coffee),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
0,(biscuit),(bread),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667
10,(tea),(bread),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667
19,(tea),(maggi),0.35,0.25,0.2,0.571429,2.285714,0.1125,1.75
