In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from apyori import apriori

In [2]:
# Load the market basket dataset
df = pd.read_excel('data/online_retail_II.xlsx')

In [3]:
# Display the first few rows
print("Dataset preview:")
print(df.head())

Dataset preview:
  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

          InvoiceDate  Price  Customer ID         Country  
0 2009-12-01 07:45:00   6.95      13085.0  United Kingdom  
1 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
2 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
3 2009-12-01 07:45:00   2.10      13085.0  United Kingdom  
4 2009-12-01 07:45:00   1.25      13085.0  United Kingdom  


In [4]:
# Data Cleaning Step
# Replace missing values with a placeholder (0 in this case)
df.fillna(0, inplace=True)
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [5]:
# Verify column names
print("Columns in the DataFrame:", df.columns)

# Drop rows with missing values
df = df.dropna(subset=['Invoice', 'StockCode', 'Description'])  # Use the correct column names

# Remove credit transactions (those starting with 'C')
df = df[~df['Invoice'].astype(str).str.startswith('C')]

# Group by Invoice and aggregate items into a list
transactions = df.groupby('Invoice')['Description'].apply(list).values

# Convert transactions to a list of lists and remove duplicates
transaction_list = [list(set(transaction)) for transaction in transactions]
print("Total transactions:", len(transaction_list))

Columns in the DataFrame: Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')
Total transactions: 24224


In [6]:
## verifying - by printing the 0th transaction
transactions[0]

['15CM CHRISTMAS GLASS BALL 20 LIGHTS',
 'PINK CHERRY LIGHTS',
 ' WHITE CHERRY LIGHTS',
 'RECORD FRAME 7" SINGLE SIZE ',
 'STRAWBERRY CERAMIC TRINKET BOX',
 'PINK DOUGHNUT TRINKET POT ',
 'SAVE THE PLANET MUG',
 'FANCY FONT HOME SWEET HOME DOORMAT']

In [7]:
## verifying - by printing the 1st transaction
transactions[1]

['CAT BOWL ',
 'DOG BOWL , CHASING BALL DESIGN',
 'HEART MEASURING SPOONS LARGE',
 'LUNCHBOX WITH CUTLERY FAIRY CAKES ']

In [28]:
# Call apriori function which requires minimum support, confidance and lift, min length is combination of item default is 2".
rules = apriori(transactions, min_support=0.02, min_confidance=0.3, min_lift=1.0, min_length=2)

## min_support = 0.003 -> means selecting items with min support of 0.3%
## min_confidance = 0.2 -> means min confidance of 20% 
## min_lift = 3  
## min_length = 2 -> means no. of items in the transaction should be 2

In [29]:
#it generates a set of rules in a generator file...
rules

<generator object apriori at 0x00000145E9742650>

In [32]:
# all rules need to be converted in a list..
Results = list(rules)
Results

[]