In [4]:
import itertools
from collections import defaultdict


In [5]:
# Load dataset: each line contains space-separated item IDs
def load_transactions(path):
    """Read a transaction file and return its baskets.

    Every line of the file is stripped and split on whitespace. The resulting
    tokens are kept as strings and stored as one ``frozenset`` per line, so
    repeated tokens on a line collapse into a single member. Lines that yield
    no tokens (blank or whitespace-only) are skipped.

    Args:
        path: Location of the text file to read.

    Returns:
        A list of frozensets of item-ID strings, in file order.
    """
    with open(path, "r") as handle:
        # Lazily tokenize each line; empty token lists are dropped below.
        token_lists = (raw.strip().split() for raw in handle)
        return [frozenset(tokens) for tokens in token_lists if tokens]

# Parse the transaction file into a list of frozensets (one per non-empty line).
transactions = load_transactions("dataset.dat")
print(f"Loaded {len(transactions)} transactions.")
# Bare trailing expression: the notebook renders the first five transactions.
transactions[:5]  # preview


Loaded 100000 transactions.


[frozenset({'164',
            '240',
            '25',
            '274',
            '328',
            '368',
            '448',
            '52',
            '538',
            '561',
            '630',
            '687',
            '730',
            '775',
            '825',
            '834'}),
 frozenset({'120',
            '124',
            '205',
            '39',
            '401',
            '581',
            '704',
            '814',
            '825',
            '834'}),
 frozenset({'249', '35', '674', '712', '733', '759', '854', '950'}),
 frozenset({'39',
            '422',
            '449',
            '704',
            '825',
            '857',
            '895',
            '937',
            '954',
            '964'}),
 frozenset({'15',
            '229',
            '262',
            '283',
            '294',
            '352',
            '381',
            '708',
            '738',
            '766',
            '853',
            '883',
            '966',

In [6]:
def count_support(candidates, transactions):
    """Count how many transactions contain each candidate itemset.

    Args:
        candidates: Re-iterable collection of itemsets exposing ``issubset``
            (e.g. frozensets). It is traversed once per transaction.
        transactions: Iterable of transactions to scan.

    Returns:
        A ``defaultdict(int)`` mapping candidate -> number of transactions
        that contain it. Candidates that never occur get no entry.
    """
    tally = defaultdict(int)
    for basket in transactions:
        # Candidates fully contained in the current transaction.
        contained = (itemset for itemset in candidates if itemset.issubset(basket))
        for itemset in contained:
            tally[itemset] += 1
    return tally


In [None]:
def generate_L1(transactions, min_support):
    """Build the frequent 1-itemsets.

    Args:
        transactions: Iterable of transactions, each an iterable of items.
        min_support: Minimum occurrence count (an absolute count, compared
            directly against the per-item tally) for an item to be kept.

    Returns:
        A dict mapping ``frozenset([item])`` -> occurrence count for every
        item whose count is at least ``min_support``, in first-seen order.
    """
    tallies = defaultdict(int)
    # Walk every item of every transaction in order.
    for item in itertools.chain.from_iterable(transactions):
        tallies[item] += 1

    return {
        frozenset([item]): occurrences
        for item, occurrences in tallies.items()
        if occurrences >= min_support
    }

# Example threshold; tune as needed.
# This is an absolute transaction count, not a fraction: generate_L1 compares
# it directly against each item's occurrence count.
min_support = 1000

L1 = generate_L1(transactions, min_support)
print("Frequent 1-itemsets:", len(L1))
# Bare trailing expression: the notebook renders the itemset -> count mapping.
L1


Frequent 1-itemsets: 155


{frozenset({'368'}): 7828,
 frozenset({'274'}): 2628,
 frozenset({'561'}): 2783,
 frozenset({'538'}): 3982,
 frozenset({'775'}): 3771,
 frozenset({'825'}): 3085,
 frozenset({'205'}): 3605,
 frozenset({'401'}): 3667,
 frozenset({'581'}): 2943,
 frozenset({'120'}): 4973,
 frozenset({'39'}): 4258,
 frozenset({'854'}): 2847,
 frozenset({'674'}): 2527,
 frozenset({'895'}): 3385,
 frozenset({'937'}): 4681,
 frozenset({'766'}): 6265,
 frozenset({'738'}): 2129,
 frozenset({'229'}): 2281,
 frozenset({'883'}): 4902,
 frozenset({'381'}): 2959,
 frozenset({'966'}): 3921,
 frozenset({'283'}): 4082,
 frozenset({'620'}): 2100,
 frozenset({'798'}): 3103,
 frozenset({'569'}): 2835,
 frozenset({'782'}): 2767,
 frozenset({'529'}): 7057,
 frozenset({'682'}): 4132,
 frozenset({'350'}): 3069,
 frozenset({'947'}): 3690,
 frozenset({'970'}): 2086,
 frozenset({'809'}): 2163,
 frozenset({'390'}): 2685,
 frozenset({'280'}): 2108,
 frozenset({'279'}): 3014,
 frozenset({'675'}): 2976,
 frozenset({'192'}): 2004,
 f

In [17]:
def generate_candidates(prev_frequent_itemsets, k):
    """Generate the candidate k-itemsets Ck from L(k-1) via self-join and pruning.

    Two frequent (k-1)-itemsets are joined when their first k-2 items (in
    sorted order) are equal. A joined itemset is kept only if every one of
    its (k-1)-subsets is itself in ``prev_frequent_itemsets`` (Apriori prune).

    Each itemset is sorted exactly once and bucketed by its (k-2)-prefix, so
    only itemsets that can actually join are paired. This avoids re-sorting
    the same itemsets inside a quadratic loop and comparing pairs whose
    prefixes differ.

    Args:
        prev_frequent_itemsets: Collection of the frequent (k-1)-itemsets,
            typically a dict keyed by frozenset. Only iteration and
            membership tests are used.
        k: Size of the candidates to generate.

    Returns:
        A set of frozensets, each of size ``k``.
    """
    prefix_len = k - 2

    # Bucket itemsets by their sorted (k-2)-prefix; for k == 2 the prefix is
    # empty, so all singletons land in one bucket.
    buckets = defaultdict(list)
    for itemset in prev_frequent_itemsets:
        ordered = sorted(itemset)
        buckets[tuple(ordered[:prefix_len])].append(itemset)

    candidates = set()
    for joinable in buckets.values():
        # Self-join: every unordered pair sharing the same prefix.
        for left, right in itertools.combinations(joinable, 2):
            new_candidate = frozenset(left).union(right)
            if len(new_candidate) != k:
                continue

            # Apriori prune: all (k-1)-subsets must be frequent.
            if all(
                frozenset(subset) in prev_frequent_itemsets
                for subset in itertools.combinations(new_candidate, k - 1)
            ):
                candidates.add(new_candidate)

    return candidates


In [18]:
def apriori(transactions, min_support):
    """Run the level-wise Apriori search for frequent itemsets.

    Starts from the frequent 1-itemsets and, for k = 2, 3, ..., generates
    candidates from the previous level, counts their support, and keeps those
    meeting ``min_support``. The search stops as soon as a level produces no
    candidates or no frequent itemsets. A progress line is printed for each k.

    Args:
        transactions: The transactions to mine; they are scanned once per level.
        min_support: Minimum support count for an itemset to be kept.

    Returns:
        A list of dicts (itemset -> support count). Entry 0 holds the
        1-itemsets, entry 1 the 2-itemsets, and so on. Entry 0 is always
        present, even when it is empty.
    """
    # Level 1 seeds the search.
    current_level = generate_L1(transactions, min_support)
    levels = [current_level]

    for k in itertools.count(2):
        print(f"Generating candidates for k = {k}")

        pool = generate_candidates(current_level, k)
        if not pool:
            break

        counted = count_support(pool, transactions)

        # Keep only candidates that reach the support threshold.
        current_level = {
            itemset: count
            for itemset, count in counted.items()
            if count >= min_support
        }
        if not current_level:
            break

        levels.append(current_level)

    return levels

# Mine all frequent itemsets; the result is a list with one dict per itemset size.
frequent_itemsets = apriori(transactions, min_support)


Generating candidates for k = 2


In [19]:
# Total number of frequent itemsets across every level.
total = sum(len(level) for level in frequent_itemsets)

print(f"Total frequent itemsets: {total}\n")

# Levels are numbered from 1 so that "Level i" holds the itemsets of size i.
for i, Lk in enumerate(frequent_itemsets, start=1):
    print(f"Level {i} — {len(Lk)} itemsets")
    for itemset, support in Lk.items():
        # Convert to a plain set for a more compact printout than frozenset(...).
        print(f"  {set(itemset)}  → support {support}")
    print()


Total frequent itemsets: 155

Level 1 — 155 itemsets
  {'368'}  → support 7828
  {'274'}  → support 2628
  {'561'}  → support 2783
  {'538'}  → support 3982
  {'775'}  → support 3771
  {'825'}  → support 3085
  {'205'}  → support 3605
  {'401'}  → support 3667
  {'581'}  → support 2943
  {'120'}  → support 4973
  {'39'}  → support 4258
  {'854'}  → support 2847
  {'674'}  → support 2527
  {'895'}  → support 3385
  {'937'}  → support 4681
  {'766'}  → support 6265
  {'738'}  → support 2129
  {'229'}  → support 2281
  {'883'}  → support 4902
  {'381'}  → support 2959
  {'966'}  → support 3921
  {'283'}  → support 4082
  {'620'}  → support 2100
  {'798'}  → support 3103
  {'569'}  → support 2835
  {'782'}  → support 2767
  {'529'}  → support 7057
  {'682'}  → support 4132
  {'350'}  → support 3069
  {'947'}  → support 3690
  {'970'}  → support 2086
  {'809'}  → support 2163
  {'390'}  → support 2685
  {'280'}  → support 2108
  {'279'}  → support 3014
  {'675'}  → support 2976
  {'192'}  →

In [11]:
# Generate association rules from frequent itemsets
def generate_association_rules(frequent_itemsets, min_confidence):
    """Derive association rules X -> Y from the frequent itemsets.

    For every frequent itemset of size >= 2, each non-empty proper subset X
    is tried as an antecedent with Y = itemset - X. The rule's confidence is
    support(itemset) / support(X). Antecedents absent from the frequent
    itemsets are skipped.

    Args:
        frequent_itemsets: Iterable of per-level mappings, itemset -> support
            count.
        min_confidence: Minimum confidence for a rule to be reported.

    Returns:
        A list of dicts with keys ``"X"``, ``"Y"``, ``"support"`` (support
        count of the full itemset) and ``"confidence"``.
    """
    # Merge all levels into a single lookup: itemset -> support.
    support_of = {}
    for level in frequent_itemsets:
        support_of.update(level)

    rules = []
    for itemset, itemset_support in support_of.items():
        size = len(itemset)
        if size < 2:
            continue  # a 1-itemset cannot be split into a rule

        members = list(itemset)
        # Every non-empty proper subset, smallest subsets first.
        antecedents = itertools.chain.from_iterable(
            itertools.combinations(members, r) for r in range(1, size)
        )
        for combo in antecedents:
            antecedent = frozenset(combo)
            if antecedent not in support_of:
                continue

            confidence = itemset_support / support_of[antecedent]
            if confidence >= min_confidence:
                rules.append({
                    "X": antecedent,
                    "Y": itemset - antecedent,
                    "support": itemset_support,
                    "confidence": confidence
                })
    return rules


In [15]:
# Minimum confidence (a ratio compared against support(X ∪ Y) / support(X)).
min_confidence = 0.3

rules = generate_association_rules(frequent_itemsets, min_confidence)

print(f"Generated {len(rules)} association rules.")

# Highest confidence first; ties are broken by higher support.
rules_sorted = sorted(
    rules,
    key=lambda r: (r['confidence'], r['support']),
    reverse=True
)

# Show at most the top 20 rules.
for rule in rules_sorted[:20]:
    print(
        f"{set(rule['X'])} -> {set(rule['Y'])} "
        f"(conf={rule['confidence']:.3f}, support={rule['support']})"
    )



Generated 13 association rules.
{'825', '704'} -> {'39'} (conf=0.939, support=1035)
{'39', '704'} -> {'825'} (conf=0.935, support=1035)
{'825', '39'} -> {'704'} (conf=0.872, support=1035)
{'704'} -> {'39'} (conf=0.617, support=1107)
{'704'} -> {'825'} (conf=0.614, support=1102)
{'227'} -> {'390'} (conf=0.577, support=1049)
{'704'} -> {'825', '39'} (conf=0.577, support=1035)
{'390'} -> {'227'} (conf=0.391, support=1049)
{'390'} -> {'722'} (conf=0.388, support=1042)
{'346'} -> {'217'} (conf=0.385, support=1336)
{'825'} -> {'39'} (conf=0.385, support=1187)
{'825'} -> {'704'} (conf=0.357, support=1102)
{'825'} -> {'39', '704'} (conf=0.335, support=1035)


## ** Report 

## ** Instructions

*put some instructions about the dataset like last time*

## ** Introduction

In this seminar, we implemented the A-Priori algorithm to identify frequent itemsets, i.e. itemsets whose support in a set of transactions meets a given threshold. The support of an itemset is simply the number of transactions in the dataset that contain it. Once those itemsets have been identified, we generate association rules from them and determine which rules have high confidence. This process has applications in many domains: vendors can determine whether certain purchases incline a customer to also include other items in their purchase, and in pharmaceutical settings, side effects that accompany certain combinations of prescriptions can be identified.

## ** Identifying Frequent Itemsets

In order for an itemset to be frequent, it must appear in at least a certain threshold fraction of the transactions in the dataset. However, there are many itemsets to check, especially once you get to longer itemsets. Even under the generous assumption of a transaction set with only 1000 unique items, there are almost 500,000 unique pairs, and moving to itemsets of length 3 gives over 150 million unique triplets. Given this combinatorial growth in the number of possible itemsets, naively counting how many transactions contain each itemset quickly becomes infeasible. Because of this, we need a different method that identifies a much smaller set of "likely" candidate itemsets, which we can then check to find the frequent ones.

This is where the A-Priori algorithm comes into play. It is able to filter out many possible itemsets as not being frequent through the principle that an itemset can only be frequent if every one of its subsets is also frequent. For any itemset, adding another element can only lower the fraction of transactions that it appears in, and so once an itemset becomes infrequent, any extension of that base itemset will also remain infrequent. A-Priori utilizes this property: when building the itemsets of length k that could be frequent, it only builds these candidate itemsets by combining the itemsets of length k-1 that were frequent with frequent itemsets of length 1. This results in far fewer itemsets at each length that need to be checked. For instance, instead of checking all possible itemsets of length 3, it can build a smaller candidate pool by only checking the length-3 itemsets that result from a frequent length-2 itemset and a frequent singleton (length-1 itemset).

We first generate the set L1, which is the set of all singleton itemsets that meet the support threshold. From L1, we can then create the candidate list C2 of itemsets of length 2 built from the frequent itemsets in L1. Once C2 is created, its itemsets are checked against the support requirement to produce L2, and the process then repeats iteratively for C3, C4, and so on, continuously building new candidate itemsets from the previous L(k-1). Doing this allows us to identify frequent itemsets of larger sizes, whereas trying to check all possible itemsets proves infeasible almost immediately.

From testing on our dataset, at a supportability of .6 we notice it can take around 5-7 minutes to identify all frequent itemsets: despite pruning out infeasible candidates that never need to be considered, there are still many candidate itemsets that can be created from our previous levels of frequent itemsets. On our dataset, the vast majority of the time is spent on itemsets of length 2, as with our support of .6 we have 375 length-1 itemsets identified, which comes out to around 70,000 different length-2 itemsets to check. After this, however, our implementation is very quick at checking candidate itemsets of higher lengths. This is because the k-size candidate sets are built from the frequent itemsets of the previous, smaller length, which keeps the candidate sets shrinking as the itemset size increases. As a result, this algorithm is applicable even on datasets that have larger frequent itemsets, as each successive step has to go through fewer possible candidates.

The support threshold is also a large factor in how long the A-Priori algorithm takes, as setting a higher support threshold results in fewer length-1 itemsets being identified as frequent, and so the algorithm has fewer candidates to construct at the C2 stage. Increasing our support threshold from 1000 to 2000 reduces the initially identified frequent 1-itemsets to only 155, which decreases our C2 size drastically, and as a result all frequent itemsets get identified in around 1 minute. Increasing the support threshold does impact the later association rule generation, though, as fewer identified frequent itemsets allow fewer rules to be created and checked. When raising our support threshold to 2000, for instance, we no longer get any frequent itemsets of length > 1, and so we cannot form any rules that meet our set support threshold. As a result, the selected support threshold should depend on several factors: how large the dataset is and how feasible it is to run the algorithm given the initial size of your C2 set, what level of support is considered significant in the context, and whether having rules with high confidence is paramount, rather than those rules having high support.

## ** Generating Association Rules

From the frequent itemsets, we can then generate association rules. For every frequent itemset I of length greater than 1, we look at every non-empty proper subset P of I, generate the rule P -> I \ P, and check whether its confidence meets our set confidence level. We guarantee that the association rules meet the support criteria because we generate all rules from frequent itemsets: for a generated rule X -> Y, X ∪ Y is some frequent itemset, which by definition meets our support requirement.

From testing on our dataset, we notice that the confidence level has a very high impact on which association rules meet the confidence criteria. When testing with a supportability of .60 for identifying frequent itemsets, we were able to get 9 itemsets of length 2 and 1 itemset of length 3, which gives us 24 total possible association rules. As a result, it does not take any considerable time to generate association rules given the low number of total possible rules; this is a result of having successfully pruned out infrequent itemsets beforehand, allowing us to build only association rules that have a support above our accepted threshold. From testing with different confidence thresholds, the percentage of rules that met the confidence criteria varied heavily with the threshold:

| Confidence Threshold | # Of Generated Rules | Fraction of Rules Meeting Confidence   |
|----------------------|----------------------|----------------------------------------|
|         0.9          |           2          |                 0.083                  |
|         0.8          |           3          |                 0.125                  |
|         0.6          |           5          |                 0.208                  |
|         0.5          |           7          |                 0.292                  |
|         0.3          |           13         |                 0.542                  |

From this we can see that the percentage of rules that meet the criterion steadily increases as the confidence threshold decreases, which shows that the confidence of a rule is not weighted heavily towards being very high or very low, and is more evenly distributed from 0 to 1. Because confidence is not skewed towards very high or very low values, moderately low confidence readings of .3-.6 may not be sufficient to act on, as they are quite normal in our distribution of confidence levels and may not suggest any consistent pattern or trend. The acceptable confidence level also depends on the environment. For something like retail, where it may not be too costly to put certain items close together and more liberties can be taken to increase sales, lower confidence and less consistency in a pattern might be enough to still group certain items together in a section of the store. However, for other applications where the trusted rules need to be very consistent, higher confidence values would need to be chosen to ensure loose rules are not followed as a pattern.

## ** Conclusion

This was an insightful experiment into how frequent itemsets can be reasonably obtained from a dataset, and how those itemsets can be used to generate association rules that form observations about our data. A simple analysis of the number of possible itemsets quickly shows that it is imperative to find algorithms that progressively rule out more and more possibilities as infrequent from the beginning. Implementing A-Priori showed us a method of doing that, and helped with understanding the underlying set logic behind it. Generating association rules also helped us to visualize how real-world observations can be made after identifying frequent itemsets, and how those observations innately have high support because the rules are formed from identified frequent itemsets.

