# Apriori Algorithm

This code is just to give you an intuition of how the Apriori algorithm works.

We will run it for only two iterations: the first counts individual items and the second counts pairs, so the largest frequent itemsets we find are pairs.

In [None]:
from itertools import combinations
import pandas as pd 
from collections import defaultdict
from operator import itemgetter




*   Upload your dataset to your Google Drive
*   Change the `path` variable below to the location of your dataset





In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the groceries dataset inside the mounted Drive; edit this to
# match where you uploaded the file.
path = "/content/drive/My Drive/Data/Apriori/groceries.csv"

In [None]:
# Load the file into a one-column dataframe named "Items"; header=None tells
# pandas the file has no header row.
# NOTE(review): sep=";" is presumably chosen so that the commas inside a line
# are NOT treated as field separators and each whole line lands in one cell --
# this assumes the file contains no ';' characters; confirm against the data.
groceries_file = pd.read_csv(path,names=["Items"],header = None , sep =";")
groceries_file.head()

Unnamed: 0,Items
0,"citrus fruit,semi-finished bread,margarine,rea..."
1,"tropical fruit,yogurt,coffee"
2,whole milk
3,"pip fruit,yogurt,cream cheese,meat spreads"
4,"other vegetables,whole milk,condensed milk,lon..."


Here, we convert our dataframe to a list

In [None]:
# Pull the "Items" column out of the dataframe as a plain Python list with
# one entry per row. `data` is a separate copy of that list, so changing one
# list later does not affect the other.
sentences = list(groceries_file['Items'])
data = list(sentences)


In [None]:
def update_pair_counts(pair_counts, itemset):
    """
    Tally every pair of items drawn from ``itemset``.

    Each two-item combination is counted in both directions, i.e. both
    ``pair_counts[(a, b)]`` and ``pair_counts[(b, a)]`` are incremented.
    ``pair_counts`` is modified in place and must provide a default for
    missing keys (e.g. ``defaultdict(int)``).
    """
    for first, second in combinations(itemset, 2):
        # Record the pair as written, then its mirror image.
        for ordered_pair in ((first, second), (second, first)):
            pair_counts[ordered_pair] += 1
 

In [None]:
def update_item_counts(item_counts, itemset):
    """
    Tally each item that appears in ``itemset``.

    ``item_counts`` is modified in place: the entry for every item
    encountered while iterating ``itemset`` grows by one. It must provide
    a default for missing keys (e.g. ``defaultdict(int)``).
    """
    for member in itemset:
        item_counts[member] = item_counts[member] + 1

In [None]:
def filter_rules_by_conf(pair_counts, item_counts, threshold, min):
    """
    Keep the rules ``a => b`` that are confident and well supported.

    The confidence of ``a => b`` is ``pair_counts[(a, b)] / item_counts[a]``.
    A pair survives only if that confidence is at least ``threshold`` and
    both ``item_counts[a]`` and ``item_counts[b]`` are at least ``min``.

    Returns a dict mapping each surviving ``(a, b)`` pair to its confidence.
    """
    kept = {}
    for antecedent, consequent in pair_counts:
        rule = (antecedent, consequent)
        confidence = pair_counts[rule] / item_counts[antecedent]
        is_confident = confidence >= threshold
        # The item-count checks only run for pairs that are confident enough.
        if (
            is_confident
            and item_counts[antecedent] >= min
            and item_counts[consequent] >= min
        ):
            kept[rule] = confidence
    return kept

In [None]:
def print_rules(rules):
    """
    Format and print each rule, one per line.

    ``rules`` may be either:

    * a dict (or dict subclass such as ``defaultdict``) mapping ``(a, b)``
      pairs to confidence values -- rules are printed as
      ``conf(a => b) = 0.123`` in order of decreasing confidence; or
    * any other iterable of ``(a, b)`` pairs -- each is printed as
      ``a => b`` in iteration order.
    """
    if isinstance(rules, dict):
        ordered_rules = sorted(rules.items(), key=itemgetter(1), reverse=True)
    else:
        ordered_rules = [((a, b), None) for a, b in rules]

    for (a, b), conf_ab in ordered_rules:
        text = "{} => {}".format(a, b)
        # Test against None rather than truthiness so that a confidence of
        # exactly 0 is still reported.
        if conf_ab is not None:
            text = "conf(" + text + ") = {:.3f}".format(conf_ab)
        # Print unconditionally: pairs supplied without a confidence value
        # must still be shown.
        print(text)
       

In [None]:
#Confidence threshold
THRESHOLD = 0.5
MIN_COUNT = 10

In [None]:
def find_assoc_rules(data, threshold, min):
    """
    Find pair rules ``a => b`` over a collection of itemsets.

    Every itemset in ``data`` contributes to a tally of single items and a
    tally of item pairs. The tallies are then passed, together with
    ``threshold`` and ``min``, to ``filter_rules_by_conf``, whose result is
    returned unchanged.
    """
    pair_tally = defaultdict(int)
    item_tally = defaultdict(int)

    for transaction in data:
        update_pair_counts(pair_tally, transaction)
        update_item_counts(item_tally, transaction)

    return filter_rules_by_conf(pair_tally, item_tally, threshold, min)

Finally, we run the Apriori algorithm. We build a list of transactions by splitting each entry of the `data` list we created earlier on commas. This transaction list is then passed to `find_assoc_rules()`, which returns a dictionary mapping each valid rule to its confidence.

In [None]:
# Split each raw line on commas to get its items. Storing them as a set
# means an item repeated within one line is counted only once for that
# transaction.
transactions = [set(line.split(',')) for line in data]

print("\nTotal transactions in dataset:",len(transactions))

# Mine the pair rules that satisfy the confidence and minimum-count settings.
final_rules = find_assoc_rules(transactions,THRESHOLD,MIN_COUNT)



Total transactions in dataset: 9835


We call print_rules() to print our valid rules in **conf(A => B) = c** format. 

This confidence value means that, among the transactions in which a customer bought A, a fraction c also contain B. For example, c = 0.733 means that 73.3% of the customers who bought A bought B as well.

In [None]:
# Report how many rules were kept, then print them.
# NOTE: the filter keeps rules whose confidence is >= THRESHOLD, so rules
# sitting exactly at the threshold are included in this count.
print("Found {} rules whose confidence exceeds {}.\n".format(len(final_rules), THRESHOLD))

print_rules(final_rules)


Found 19 rules whose confidence exceeds 0.5.

conf(honey => whole milk) = 0.733
conf(frozen fruits => other vegetables) = 0.667
conf(cereals => whole milk) = 0.643
conf(rice => whole milk) = 0.613
conf(rubbing alcohol => whole milk) = 0.600
conf(cocoa drinks => whole milk) = 0.591
conf(pudding powder => whole milk) = 0.565
conf(jam => whole milk) = 0.547
conf(cream => other vegetables) = 0.538
conf(cream => sausage) = 0.538
conf(baking powder => whole milk) = 0.523
conf(tidbits => rolls/buns) = 0.522
conf(rice => other vegetables) = 0.520
conf(cooking chocolate => whole milk) = 0.520
conf(specialty cheese => other vegetables) = 0.500
conf(rubbing alcohol => citrus fruit) = 0.500
conf(rubbing alcohol => butter) = 0.500
conf(ready soups => rolls/buns) = 0.500
conf(frozen fruits => whipped/sour cream) = 0.500
