<a href="https://colab.research.google.com/github/shrezes/BasketAnalysis/blob/main/BasketAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install mlxtend




In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
def load_data(filepath):
    """Read a transactions file into a list of item lists.

    Each non-blank line of the file is one transaction; its items are the
    whitespace-separated tokens on that line, kept as strings.

    Args:
        filepath: Path of the text file to read.

    Returns:
        list[list[str]]: One list of item tokens per non-blank line, in file
        order.

    Raises:
        OSError: If the file cannot be opened.
    """
    transactions = []
    with open(filepath, 'r') as file:
        for line in file:
            # split() with no argument already ignores leading/trailing
            # whitespace, including the newline.
            items = line.split()
            # Skip blank lines: an empty basket carries no items but would
            # still count as a transaction and deflate every support value.
            if items:
                transactions.append(items)
    return transactions

# Read the sales transactions from 'Sales1998.txt' in the working directory
# (upload the file to Colab first). `transactions` is a list holding, for each
# line read, the list of whitespace-separated item tokens found on that line.
transactions = load_data('Sales1998.txt')

# Create a one-hot encoded dataframe
def create_one_hot(transactions):
    """Build a boolean one-hot DataFrame from a collection of transactions.

    Args:
        transactions: Iterable of transactions, each an iterable of hashable,
            mutually sortable items (e.g. item-ID strings).

    Returns:
        pd.DataFrame: One row per transaction (in input order) and one bool
        column per distinct item. Columns are sorted so the layout is the
        same on every run; a cell is True when the item occurs in that
        transaction.
    """
    # Sets give O(1) membership tests instead of scanning a list per cell.
    baskets = [set(transaction) for transaction in transactions]
    # Sorting removes the dependence on set iteration order, which can differ
    # between interpreter runs for string items.
    columns = sorted(set().union(*baskets))
    one_hot = [[item in basket for item in columns] for basket in baskets]
    return pd.DataFrame(one_hot, columns=columns, dtype=bool)

# One-hot encode the baskets: one row per transaction, one column per item.
df = create_one_hot(transactions)

# Mine frequent itemsets with Apriori, keeping the item labels as column names
# and using 0.004 as the minimum support.
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.004)

if not frequent_itemsets.empty:
    # Derive association rules, filtered on confidence with a 0.1 threshold.
    rules = association_rules(frequent_itemsets, min_threshold=0.1, metric="confidence")

    # Report the itemsets first, then the rules.
    print("Frequent Itemsets:", frequent_itemsets, sep="\n")
    print("\nAssociation Rules:", rules, sep="\n")
else:
    # Nothing reached the support threshold, so there is nothing to derive rules from.
    print("No frequent itemsets found. Try lowering the 'min_support' value.")


Frequent Itemsets:
    support itemsets
0  0.004021    (319)
1  0.004050    (846)
2  0.004050    (865)
3  0.004109   (1352)
4  0.004021   (1297)
5  0.004197    (277)
6  0.004021    (827)

Association Rules:
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []


In [3]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
def load_data(filepath):
    """Read a transactions file into a list of item lists.

    Each non-blank line of the file is one transaction; its items are the
    whitespace-separated tokens on that line, kept as strings.

    Args:
        filepath: Path of the text file to read.

    Returns:
        list[list[str]]: One list of item tokens per non-blank line, in file
        order.

    Raises:
        OSError: If the file cannot be opened.
    """
    transactions = []
    with open(filepath, 'r') as file:
        for line in file:
            # split() with no argument already ignores leading/trailing
            # whitespace, including the newline.
            items = line.split()
            # Skip blank lines: an empty basket carries no items but would
            # still count as a transaction and deflate every support value.
            if items:
                transactions.append(items)
    return transactions

# Load the product list
def load_product_names(filepath):
    """Read a product list file into a product-ID -> product-name mapping.

    A usable line consists of a product ID, a run of whitespace, and the
    product name (which may itself contain spaces). Lines that do not contain
    both parts, including blank lines, are ignored. If an ID appears more than
    once, the last occurrence wins.

    Args:
        filepath: Path of the text file to read.

    Returns:
        dict[str, str]: Mapping from product ID to product name, both exactly
        as they appear in the file (apart from surrounding whitespace).

    Raises:
        OSError: If the file cannot be opened.
    """
    product_names = {}
    with open(filepath, 'r') as file:
        for line in file:
            # Split once on any run of whitespace (not just a single space) so
            # tab- or multi-space-separated files parse too, and the name does
            # not inherit leading separator characters. maxsplit=1 keeps
            # multi-word names intact.
            parts = line.strip().split(None, 1)
            if len(parts) == 2:
                product_id, product_name = parts
                product_names[product_id] = product_name
    return product_names

# Read both input files from the working directory (upload 'Sales1998.txt' and
# 'productList.txt' to Colab first).
# `transactions` is a list of item-token lists read from the sales file;
# `product_names` maps a product ID string to its product name string.
transactions = load_data('Sales1998.txt')
product_names = load_product_names('productList.txt')

# Create a one-hot encoded dataframe
def create_one_hot(transactions, product_names):
    """Build a boolean one-hot DataFrame keyed by product name.

    Item IDs that have no entry in ``product_names`` are ignored. Several IDs
    that map to the same name share a single column.

    Args:
        transactions: Iterable of transactions, each an iterable of item IDs.
        product_names: Mapping from item ID to product name.

    Returns:
        pd.DataFrame: One row per transaction (in input order) and one bool
        column per product name that occurs in at least one transaction.
        Columns are sorted so the layout is the same on every run; a cell is
        True when the transaction contains an ID mapped to that name.
    """
    # Translate each transaction to the set of names it contains; a set makes
    # the per-cell membership test O(1) instead of a list scan.
    baskets = [
        {product_names[item_id] for item_id in transaction if item_id in product_names}
        for transaction in transactions
    ]
    # Sorting removes the dependence on set iteration order, which can differ
    # between interpreter runs for string names.
    columns = sorted(set().union(*baskets))
    one_hot = [[name in basket for name in columns] for basket in baskets]
    return pd.DataFrame(one_hot, columns=columns, dtype=bool)

# One-hot encode the baskets. The columns are product names (not IDs), so the
# itemsets mined below already contain product names.
df = create_one_hot(transactions, product_names)

# Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df, min_support=0.004, use_colnames=True)

# Check if frequent itemsets is empty
if frequent_itemsets.empty:
    print("No frequent itemsets found. Try lowering the 'min_support' value.")
else:
    # Generate association rules
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)

    def get_product_names_from_itemset(itemset, product_names):
        """Return the members of an itemset as a sorted list for display.

        No ID-to-name lookup is performed: the one-hot columns are already
        product names, so the itemset members are the names themselves.

        Args:
            itemset: Collection of product names (one itemset, antecedent or
                consequent).
            product_names: Unused; kept so existing calls keep working.

        Returns:
            list: The itemset's members in sorted order, so the printed
            order does not depend on set iteration order.
        """
        return sorted(itemset)

    # Add a list-valued, display-friendly copy of each itemset
    frequent_itemsets['itemsets_names'] = frequent_itemsets['itemsets'].apply(lambda x: get_product_names_from_itemset(x, product_names))

    # Do the same for both sides of every rule
    rules['antecedents_names'] = rules['antecedents'].apply(lambda x: get_product_names_from_itemset(x, product_names))
    rules['consequents_names'] = rules['consequents'].apply(lambda x: get_product_names_from_itemset(x, product_names))

    # Display the results
    print("Frequent Itemsets:")
    print(frequent_itemsets[['support', 'itemsets_names']]) # Display support and item names

    print("\nAssociation Rules:")
    print(rules[['antecedents_names', 'consequents_names', 'support', 'confidence', 'lift']]) # Display product names in rules


Frequent Itemsets:
    support                itemsets_names
0  0.004021          ["Ebony Mixed Nuts"]
1  0.004050  ["Nationeel Fudge Brownies"]
2  0.004021      ["Booker String Cheese"]
3  0.004197     ["Great English Muffins"]
4  0.004109      ["Carrington Ice Cream"]
5  0.004021    ["Excellent Orange Juice"]
6  0.004050    ["Nationeel Dried Apples"]

Association Rules:
Empty DataFrame
Columns: [antecedents_names, consequents_names, support, confidence, lift]
Index: []
