In [15]:
# Install required packages
%pip install pandas numpy apyori openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from apyori import apriori


In [17]:
# Load the market basket dataset
df = pd.read_excel('data/online_retail_II.xlsx')

In [18]:
# Display the first few rows
print("Dataset preview:")
print(df.head())

Dataset preview:
  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

          InvoiceDate  Price  Customer ID         Country  
0 2009-12-01 07:45:00   6.95      13085.0  United Kingdom  
1 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
2 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
3 2009-12-01 07:45:00   2.10      13085.0  United Kingdom  
4 2009-12-01 07:45:00   1.25      13085.0  United Kingdom  


In [19]:
# Data Cleaning Step
# Replace missing values with a placeholder (0 in this case)
df.fillna(0, inplace=True)

In [20]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [21]:
# Verify column names
print("Columns in the DataFrame:", df.columns)

# Drop rows with missing values
df = df.dropna(subset=['Invoice', 'StockCode', 'Description'])  # Use the correct column names

# Remove credit transactions (those starting with 'C')
df = df[~df['Invoice'].astype(str).str.startswith('C')]

# Group by Invoice and aggregate items into a list
transactions = df.groupby('Invoice')['Description'].apply(list).values

# Convert transactions to a list of lists and remove duplicates
transaction_list = [list(set(transaction)) for transaction in transactions]
print("Total transactions:", len(transaction_list))

Columns in the DataFrame: Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')
Total transactions: 24224


In [22]:
# Apply the Apriori algorithm
rules = apriori(transaction_list, min_support=0.01, min_confidence=0.1, min_lift=1.0)

In [28]:
# Convert the results to a list of dictionaries
results = list(rules)
results_list = []
for rule in results:
    items = list(rule.items)
    support = rule.support
    confidence = rule.ordered_statistics[0].confidence
    lift = rule.ordered_statistics[0].lift
    results_list.append({'itemsets': items, 'support': support, 'confidence': confidence, 'lift': lift})


In [25]:
# Convert the list of dictionaries to a dataframe
results_df = pd.DataFrame(results_list)

In [27]:
# Showing top 10 items, based on lift. Sorting in desc order
df_final = results_df.sort_values('lift', ascending=False).head(10)
print(df_final)

KeyError: 'lift'

In [10]:
# Creating a helper function to extract metrics
def extract_metrics(rule):
    antecedent = list(rule[2][0])
    consequent = list(rule[2][1])
    support = rule[1]
    confidence = rule[2][2]
    lift = rule[2][3]
    return antecedent, consequent, support, confidence, lift

In [None]:
# Ensure results_as_df is not empty before applying the function
if not results_as_df.empty:
	# Apply the function to extract metrics from rules
	metrics = results_as_df.apply(extract_metrics, axis=1, result_type='expand')
	metrics.columns = ['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift']
else:
	print("results_as_df is empty. No metrics to extract.")

In [None]:
# Showing top 10 items, based on lift. Sorting in desc order
df_final = metrics.sort_values('lift', ascending=False).head(10)
print("Top 10 items based on lift:")
print(df_final)

In [None]:
## verifying - by printing the 0th transaction
transactions[0]

In [114]:
# Convert transactions to a list of lists, remove duplicates, and ensure all items are strings
transaction_list = [[str(item) for item in set(transaction)] for transaction in transactions]

In [None]:
# Filter necessary columns (assuming 'Description' holds item names)
# Update based on actual column names in your dataset
if 'Description' in df.columns:
    filtered_df = df[['Description']]
    print("Filtered 'Description' column successfully.")
else:
    raise ValueError("Column 'Description' not found in dataset. Update column filtering logic.")


In [116]:
# Data Preprocessing Step
# Convert data into a list format for Apriori
transactions = []
for i in range(0, len(filtered_df)):
    transaction = [
        str(filtered_df.values[i, j]).strip()
        for j in range(filtered_df.shape[1])
        if str(filtered_df.values[i, j]) != '0'
    ]
    if transaction:  # Add transaction only if it's not empty
        transactions.append(transaction)

In [None]:
# Debugging: Verify transactions
print("Number of transactions:", len(transactions))
if len(transactions) > 0:
    print("Sample transactions:", transactions[:5])
else:
    raise ValueError("No transactions generated. Check dataset and preprocessing steps.")

In [None]:
# Set minimum support, confidence, and lift thresholds
min_support = 0.02
min_confidence = 0.3
min_lift = 1.0

# Run the Apriori algorithm
rules = apriori(transactions, min_support=min_support, 
                min_confidence=min_confidence, min_lift=min_lift)

# Convert results to a list for readability
results = list(rules)
print("Total rules generated:", len(results))

In [98]:
# Call Apriori with initial parameters
rules = apriori(transactions, min_support=0.003, min_confidance=0.2, min_lift=3, min_length=2)
# rules = apriori(
#     transactions,
#     min_support=0.0001,  # Minimum support 0.01%
#     min_confidence=0.05, # Minimum confidence 5%
#     min_lift=1.0,        # Minimum lift of 1
#     min_length=1         # Allow single-item rules
# )


In [None]:
# Convert rules to a list
Results = list(rules)
Results

In [None]:
# Debugging: Check for rules
if not Results:
    print("No rules generated. Try further lowering Apriori parameters or inspect the dataset.")
    # Analyze item frequency
    from collections import Counter

    item_counts = Counter(item for transaction in transactions for item in transaction)
    print("Top 10 items by frequency:")
    print(item_counts.most_common(10))

    # Analyze dataset size
    print("Number of transactions:", len(transactions))
else:
    # Display raw rules
    print("\nGenerated Rules:")
    for rule in Results:
        print(rule)

    # Process results into a DataFrame
    results_list = []
    for result in Results:
        support = result.support
        ordered_statistics = result.ordered_statistics

        for stat in ordered_statistics:
            # Extract rule details
            items_base = ', '.join(stat.items_base)
            items_add = ', '.join(stat.items_add)
            confidence = stat.confidence
            lift = stat.lift

            results_list.append({
                'Base Items': items_base,
                'Add Items': items_add,
                'Support': support,
                'Confidence': confidence,
                'Lift': lift
            })

    # Create DataFrame from the results
    df_results = pd.DataFrame(results_list)

    # Display the first few rows of the results
    print("\nAssociation Rules:")
    print(df_results.head())

    # Ensure df_results is not empty
    if not df_results.empty:
        # Save support values in a separate DataFrame
        support_df = df_results[['Support']]
        print("\nSupport Values:")
        print(support_df.head())
    else:
        print("df_results is empty. No support values to display.")


In [None]:
# Check if any rules were generated
if not Results:
    print("No rules generated. Try adjusting Apriori parameters.")
else:
    # Display the raw results
    print("\nGenerated Rules:")
    for rule in Results:
        print(rule)

In [47]:
# Convert results into a DataFrame for further analysis
results_list = []
for result in Results:
    # Extract base statistics
    support = result.support
    ordered_statistics = result.ordered_statistics

    for stat in ordered_statistics:
        # Extracting specific details of rules
        items_base = ', '.join(stat.items_base)
        items_add = ', '.join(stat.items_add)
        confidence = stat.confidence
        lift = stat.lift

        # Append to the results list
        results_list.append({
            'Base Items': items_base,
            'Add Items': items_add,
            'Support': support,
            'Confidence': confidence,
            'Lift': lift
        })

In [48]:
# Create a DataFrame from the results
df_results = pd.DataFrame(results_list)

In [None]:
# Display the first few rows of the results
print("\nAssociation Rules:")
print(df_results.head())

In [None]:
# Ensure df_results is not empty
if not df_results.empty:
	# Save support values in a separate DataFrame for later use
	support_df = df_results[['Support']]
	print("\nSupport Values:")
	print(support_df.head())
else:
	print("df_results is empty. No support values to display.")