Exp-08 (Apriori Algorithm)

In [None]:
import pandas as pd
from itertools import combinations

# Load CSV file and preprocess the data
def load_data(file_path):
    """Read a CSV file and return one list of items per row.

    The first column is skipped (presumably a transaction identifier --
    confirm against the data file). Every remaining cell that is not
    missing is kept, in column order.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A list with one inner list per CSV row holding that row's
        non-missing item values.
    """
    frame = pd.read_csv(file_path)
    row_count, column_count = frame.shape
    item_columns = range(1, column_count)  # column 0 is not an item column

    baskets = []
    for row in range(row_count):
        cells = (frame.iloc[row, col] for col in item_columns)
        # Missing cells (NaN) mark unused item slots in shorter rows.
        baskets.append([cell for cell in cells if pd.notna(cell)])
    return baskets

# Calculate the support of itemsets
def calculate_support(transactions, itemsets):
    """Compute the relative support of each itemset.

    The support of an itemset is the fraction of transactions that contain
    every one of its items.

    Args:
        transactions: Iterable of transactions, each an iterable of
            hashable items.
        itemsets: Iterable of itemsets, each an iterable of hashable items.

    Returns:
        A dict mapping ``tuple(itemset)`` to its support as a float in
        ``[0, 1]``. When there are no transactions every support is ``0.0``.
    """
    # Convert each transaction to a set once, not once per itemset.
    transaction_sets = [set(transaction) for transaction in transactions]
    total = len(transaction_sets)

    support = {}
    for itemset in itemsets:
        key = tuple(itemset)
        required = set(key)
        matches = sum(1 for items in transaction_sets if required.issubset(items))
        # Guard against division by zero when there are no transactions.
        support[key] = matches / total if total else 0.0
    return support

# Prune itemsets that do not meet the minimum support
def prune_itemsets(support, min_support):
    """Keep only the itemsets whose support reaches the threshold.

    Args:
        support: Dict mapping itemsets to their support values.
        min_support: Smallest support value that is still kept (inclusive).

    Returns:
        A new dict with the entries of ``support`` whose value is at least
        ``min_support``, in their original order.
    """
    kept = {}
    for candidate, value in support.items():
        if value >= min_support:
            kept[candidate] = value
    return kept

# Generate candidate itemsets of size k+1 from frequent itemsets of size k
def generate_candidates(frequent_itemsets, k):
    """Build candidate itemsets of size ``k + 1`` by joining frequent ones.

    Every unordered pair of keys in ``frequent_itemsets`` is merged; a
    merged set is kept only when it contains exactly ``k + 1`` items.

    Args:
        frequent_itemsets: Dict whose keys are itemsets (iterables of items).
        k: Size of the itemsets being joined.

    Returns:
        A set of candidates, each a tuple of items in sorted order.
    """
    target_size = k + 1
    pairs = combinations(list(frequent_itemsets.keys()), 2)
    merged_sets = (set(left) | set(right) for left, right in pairs)
    # Sorting gives each candidate one canonical tuple so duplicates collapse.
    return {
        tuple(sorted(merged))
        for merged in merged_sets
        if len(merged) == target_size
    }

# Calculate confidence for association rules
def calculate_confidence(frequent_itemsets, transactions, min_confidence):
    """Derive association rules that reach a minimum confidence.

    For every itemset with more than one item, each rule
    ``antecedent -> consequent`` is evaluated where the antecedent is the
    itemset with one item removed and the consequent is that removed item.
    Confidence is ``support(itemset) / support(antecedent)``; antecedent
    support is counted directly from ``transactions``.

    Args:
        frequent_itemsets: Dict mapping itemset tuples to their support.
        transactions: Iterable of transactions, each an iterable of
            hashable items.
        min_confidence: Inclusive threshold as a fraction (e.g. ``0.7``),
            not a percentage.

    Returns:
        A list of ``(antecedent, consequent, confidence_percentage)``
        tuples, where the confidence is scaled to 0-100. With no
        transactions every antecedent support is treated as zero, so the
        confidence is 0.
    """
    # Convert each transaction to a set once, not once per antecedent.
    transaction_sets = [set(transaction) for transaction in transactions]
    total = len(transaction_sets)

    rules = []
    for itemset, itemset_support in frequent_itemsets.items():
        if len(itemset) < 2:
            continue  # a rule needs both an antecedent and a consequent

        for subset in combinations(itemset, len(itemset) - 1):
            antecedent = set(subset)
            remaining = tuple(set(itemset) - antecedent)
            matches = sum(
                1 for items in transaction_sets if antecedent.issubset(items)
            )
            # Guard against division by zero when there are no transactions.
            subset_support = matches / total if total else 0.0

            confidence = itemset_support / subset_support if subset_support > 0 else 0

            if confidence >= min_confidence:
                # Rules report confidence as a percentage.
                rules.append((subset, remaining, confidence * 100))

    return rules

# Apriori algorithm
def apriori(transactions, min_support, min_confidence):
    """Mine frequent itemsets and association rules with Apriori.

    Starting from single items, itemsets that reach ``min_support`` are
    joined level by level into larger candidates until a level yields no
    frequent itemsets. Rules are then derived from all frequent itemsets.

    Args:
        transactions: Sequence of transactions, each an iterable of
            hashable, mutually orderable items.
        min_support: Inclusive support threshold as a fraction.
        min_confidence: Inclusive confidence threshold as a fraction.

    Returns:
        A tuple ``(all_frequent_itemsets, rules)``: a dict mapping itemset
        tuples to their support, and the list of rules produced by
        ``calculate_confidence``.
    """
    # De-duplicate items in first-seen order. Iterating a set of strings
    # can yield a different order on each run, which would reshuffle the
    # reported itemsets and rules.
    unique_items = dict.fromkeys(
        item for transaction in transactions for item in transaction
    )
    itemsets = [(item,) for item in unique_items]

    # Level 1: support of single items, filtered by the threshold.
    support = calculate_support(transactions, itemsets)
    frequent_itemsets = prune_itemsets(support, min_support)

    all_frequent_itemsets = frequent_itemsets.copy()
    k = 1

    while frequent_itemsets:
        # Candidates arrive as a set; sort them so each level is processed
        # and reported in a reproducible order.
        candidates = sorted(generate_candidates(frequent_itemsets, k))

        support = calculate_support(transactions, candidates)
        frequent_itemsets = prune_itemsets(support, min_support)

        # An empty level leaves the accumulated result unchanged and ends
        # the loop.
        all_frequent_itemsets.update(frequent_itemsets)

        k += 1

    rules = calculate_confidence(all_frequent_itemsets, transactions, min_confidence)

    return all_frequent_itemsets, rules

# Main
if __name__ == '__main__':
    # Run configuration.
    file_path = 'aprior.csv'  # Ensure this matches your actual file path
    min_support = 0.3
    min_confidence_percentage = 70  # threshold expressed as a percentage
    # apriori() expects the confidence threshold as a fraction.
    min_confidence = min_confidence_percentage / 100.0

    # Load the transactions and mine them.
    transactions = load_data(file_path)
    frequent_itemsets, rules = apriori(transactions, min_support, min_confidence)

    # Report every frequent itemset with its support.
    print("Frequent Itemsets:")
    for itemset in frequent_itemsets:
        support = frequent_itemsets[itemset]
        print(f"{itemset}: {support:.2f}")

    # Report every rule; the confidence is already a percentage.
    print("\nAssociation Rules:")
    for antecedent, consequent, confidence in rules:
        print(f"{antecedent} -> {consequent}: Confidence = {confidence:.2f}%")
