In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from apyori import apriori

In [2]:
# Load the market basket dataset
df = pd.read_excel('data/online_retail_II.xlsx')

In [3]:
# Display the first few rows
print("Dataset preview:")
print(df.head())

Dataset preview:
  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

          InvoiceDate  Price  Customer ID         Country  
0 2009-12-01 07:45:00   6.95      13085.0  United Kingdom  
1 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
2 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
3 2009-12-01 07:45:00   2.10      13085.0  United Kingdom  
4 2009-12-01 07:45:00   1.25      13085.0  United Kingdom  


In [None]:
# Data Cleaning Step
# Replace missing values with a placeholder (0 in this case)
df.fillna(0, inplace=True)
df.head()

In [4]:
# Verify column names
print("Columns in the DataFrame:", df.columns)

# Drop rows with missing values
df = df.dropna(subset=['Invoice', 'StockCode', 'Description'])  # Use the correct column names

# Remove credit transactions (those starting with 'C')
df = df[~df['Invoice'].astype(str).str.startswith('C')]

# Group by Invoice and aggregate items into a list
transactions = df.groupby('Invoice')['Description'].apply(list).values

# Convert transactions to a list of lists and remove duplicates
transaction_list = [list(set(transaction)) for transaction in transactions]
print("Total transactions:", len(transaction_list))

Columns in the DataFrame: Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')
Total transactions: 21296


In [5]:
## verifying - by printing the 0th transaction
transactions[0]

['15CM CHRISTMAS GLASS BALL 20 LIGHTS',
 'PINK CHERRY LIGHTS',
 ' WHITE CHERRY LIGHTS',
 'RECORD FRAME 7" SINGLE SIZE ',
 'STRAWBERRY CERAMIC TRINKET BOX',
 'PINK DOUGHNUT TRINKET POT ',
 'SAVE THE PLANET MUG',
 'FANCY FONT HOME SWEET HOME DOORMAT']

In [6]:
## verifying - by printing the 1st transaction
transactions[1]

['CAT BOWL ',
 'DOG BOWL , CHASING BALL DESIGN',
 'HEART MEASURING SPOONS LARGE',
 'LUNCHBOX WITH CUTLERY FAIRY CAKES ']

In [7]:
# Convert transactions to a list of lists, remove duplicates, and ensure all items are strings
transaction_list = [[str(item) for item in set(transaction)] for transaction in transactions]

In [8]:
# Call apriori function which requires minimum support, confidance and lift, min length is combination of item default is 2".
rules = apriori(transaction_list, min_support=0.02, min_confidance=0.3, min_lift=1.0, min_length=2)

## min_support = 0.003 -> means selecting items with min support of 0.3%
## min_confidance = 0.2 -> means min confidance of 20% 
## min_lift = 3  
## min_length = 2 -> means no. of items in the transaction should be 2

In [9]:
#it generates a set of rules in a generator file...
rules

<generator object apriori at 0x00000221FFB48970>

In [10]:
# all rules need to be converted in a list..
Results = list(rules)
Results

[RelationRecord(items=frozenset({'12 PENCILS SMALL TUBE SKULL'}), support=0.02324380165289256, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'12 PENCILS SMALL TUBE SKULL'}), confidence=0.02324380165289256, lift=1.0)]),
 RelationRecord(items=frozenset({'3 HEARTS HANGING DECORATION RUSTIC'}), support=0.021130728775356874, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'3 HEARTS HANGING DECORATION RUSTIC'}), confidence=0.021130728775356874, lift=1.0)]),
 RelationRecord(items=frozenset({'6 RIBBONS RUSTIC CHARM'}), support=0.03925619834710744, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'6 RIBBONS RUSTIC CHARM'}), confidence=0.03925619834710744, lift=1.0)]),
 RelationRecord(items=frozenset({'60 CAKE CASES VINTAGE CHRISTMAS'}), support=0.022163786626596543, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'60 CAKE CASES VINTAGE CHRISTMAS'}), confidence

In [11]:
# convert result in a dataframe for further operation...
df_results = pd.DataFrame(Results)

In [12]:
# as we see "order_statistics" , is itself a list so need to be converted in proper format..
df_results.head()

Unnamed: 0,items,support,ordered_statistics
0,(12 PENCILS SMALL TUBE SKULL),0.023244,"[((), (12 PENCILS SMALL TUBE SKULL), 0.0232438..."
1,(3 HEARTS HANGING DECORATION RUSTIC),0.021131,"[((), (3 HEARTS HANGING DECORATION RUSTIC), 0...."
2,(6 RIBBONS RUSTIC CHARM),0.039256,"[((), (6 RIBBONS RUSTIC CHARM), 0.039256198347..."
3,(60 CAKE CASES VINTAGE CHRISTMAS),0.022164,"[((), (60 CAKE CASES VINTAGE CHRISTMAS), 0.022..."
4,(60 TEATIME FAIRY CAKE CASES),0.062735,"[((), (60 TEATIME FAIRY CAKE CASES), 0.0627347..."


In [13]:
# keep support in a separate data frame so we can use later.. 
support = df_results.support

In [14]:
#all four empty list which will contain lhs, rhs, confidance and lift respectively.
first_values = []
second_values = []
third_values = []
fourth_value = []

# loop number of rows time and append 1 by 1 value in a separate list.. 
# first and second element was frozenset which need to be converted in list..
for i in range(df_results.shape[0]):
    single_list = df_results['ordered_statistics'][i][0]
    first_values.append(list(single_list[0]))
    second_values.append(list(single_list[1]))
    third_values.append(single_list[2])
    fourth_value.append(single_list[3])

In [15]:
# convert all four list into dataframe for further operation..
lhs = pd.DataFrame(first_values)
rhs = pd.DataFrame(second_values)

confidance=pd.DataFrame(third_values,columns=['Confidance'])

lift=pd.DataFrame(fourth_value,columns=['lift'])

In [16]:
# concat all list together in a single dataframe
df_final = pd.concat([lhs,rhs,support,confidance,lift], axis=1)
df_final

Unnamed: 0,0,1,support,Confidance,lift
0,12 PENCILS SMALL TUBE SKULL,,0.023244,0.023244,1.0
1,3 HEARTS HANGING DECORATION RUSTIC,,0.021131,0.021131,1.0
2,6 RIBBONS RUSTIC CHARM,,0.039256,0.039256,1.0
3,60 CAKE CASES VINTAGE CHRISTMAS,,0.022164,0.022164,1.0
4,60 TEATIME FAIRY CAKE CASES,,0.062735,0.062735,1.0
...,...,...,...,...,...
245,STRAWBERRY CERAMIC TRINKET BOX,WHITE HANGING HEART T-LIGHT HOLDER,0.025216,0.025216,1.0
246,VINTAGE SNAP CARDS,VINTAGE HEADS AND TAILS CARD GAME,0.020849,0.020849,1.0
247,WOODEN FRAME ANTIQUE WHITE,WHITE HANGING HEART T-LIGHT HOLDER,0.024981,0.024981,1.0
248,WOODEN PICTURE FRAME WHITE FINISH,WHITE HANGING HEART T-LIGHT HOLDER,0.021459,0.021459,1.0


In [17]:
df_final.fillna(value=' ', inplace=True)
df_final.head()

Unnamed: 0,0,1,support,Confidance,lift
0,12 PENCILS SMALL TUBE SKULL,,0.023244,0.023244,1.0
1,3 HEARTS HANGING DECORATION RUSTIC,,0.021131,0.021131,1.0
2,6 RIBBONS RUSTIC CHARM,,0.039256,0.039256,1.0
3,60 CAKE CASES VINTAGE CHRISTMAS,,0.022164,0.022164,1.0
4,60 TEATIME FAIRY CAKE CASES,,0.062735,0.062735,1.0


In [19]:
#set column name
df_final.columns = ['lhs', 'rhs', 'support', 'confidance', 'lift']
df_final.head()

Unnamed: 0,lhs,rhs,support,confidance,lift
0,12 PENCILS SMALL TUBE SKULL,,0.023244,0.023244,1.0
1,3 HEARTS HANGING DECORATION RUSTIC,,0.021131,0.021131,1.0
2,6 RIBBONS RUSTIC CHARM,,0.039256,0.039256,1.0
3,60 CAKE CASES VINTAGE CHRISTMAS,,0.022164,0.022164,1.0
4,60 TEATIME FAIRY CAKE CASES,,0.062735,0.062735,1.0


In [21]:
# add all three column to lhs itemset only
df_final['lhs'] = df_final['lhs'] + ", " + df_final['rhs']

# rhs already contains the correct values, no need to concatenate further

In [22]:
df_final.head()

Unnamed: 0,lhs,rhs,support,confidance,lift
0,"12 PENCILS SMALL TUBE SKULL,",,0.023244,0.023244,1.0
1,"3 HEARTS HANGING DECORATION RUSTIC,",,0.021131,0.021131,1.0
2,"6 RIBBONS RUSTIC CHARM,",,0.039256,0.039256,1.0
3,"60 CAKE CASES VINTAGE CHRISTMAS,",,0.022164,0.022164,1.0
4,"60 TEATIME FAIRY CAKE CASES,",,0.062735,0.062735,1.0


In [34]:
# Check if columns 'rhs', 'confidance', and 'lift' exist before dropping them
columns_to_drop = ['rhs', 'confidance', 'lift']
df_final.drop(columns=[col for col in columns_to_drop if col in df_final.columns], inplace=True)

In [33]:
#this is final output. You can sort based on the support lift and confidance..
df_final.head()

Unnamed: 0,lhs,support
0,"12 PENCILS SMALL TUBE SKULL,",0.023244
1,"3 HEARTS HANGING DECORATION RUSTIC,",0.021131
2,"6 RIBBONS RUSTIC CHARM,",0.039256
3,"60 CAKE CASES VINTAGE CHRISTMAS,",0.022164
4,"60 TEATIME FAIRY CAKE CASES,",0.062735


In [31]:
## Showing top 10 items, based on lift.  Sorting in desc order
df_final.sort_values('lift', ascending=False).head(10)

NameError: name 'df_final' is not defined