In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
df = pd.read_csv('D:\\Downloads\\Groceries_dataset\\Groceries_dataset.csv')
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB


In [4]:
df['itemDescription'].nunique()

167

In [5]:
df['itemDescription'].value_counts()

itemDescription
whole milk               2502
other vegetables         1898
rolls/buns               1716
soda                     1514
yogurt                   1334
                         ... 
rubbing alcohol             5
bags                        4
baby cosmetics              3
kitchen utensil             1
preservation products       1
Name: count, Length: 167, dtype: int64

In [6]:
transactions = df.groupby(['Member_number', 'Date'])['itemDescription'].apply(list).tolist()

In [7]:
len(transactions)

14963

In [8]:
te = TransactionEncoder() #call TransactionEncoder to create an instance
Encoded_te = te.fit(transactions).transform(transactions) #transform the transactions into a one-hot encoded format
df_encoded = pd.DataFrame(Encoded_te, columns=te.columns_) #convert the encoded data into a DataFrame

In [9]:
df_encoded

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14958,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
14959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14960,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14961,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
df_encoded.shape

(14963, 167)

In [11]:
frequent_itemsets = apriori(df_encoded, min_support=0.001, use_colnames=True)

In [12]:
frequent_itemsets.sort_values(by= "support",ascending=False)

Unnamed: 0,support,itemsets
146,0.157923,(whole milk)
90,0.122101,(other vegetables)
109,0.110005,(rolls/buns)
123,0.097106,(soda)
147,0.085879,(yogurt)
...,...,...
344,0.001002,"(chicken, margarine)"
201,0.001002,"(bottled beer, chicken)"
202,0.001002,"(chocolate, bottled beer)"
516,0.001002,"(pastry, hamburger meat)"


In [13]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)

In [14]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(UHT-milk),(other vegetables),0.021386,0.122101,0.002139,0.100000,0.818993,1.0,-0.000473,0.975443,-0.184234,0.015130,-0.025175,0.058758
1,(UHT-milk),(whole milk),0.021386,0.157923,0.002540,0.118750,0.751949,1.0,-0.000838,0.955549,-0.252105,0.014367,-0.046519,0.067416
2,(beef),(whole milk),0.033950,0.157923,0.004678,0.137795,0.872548,1.0,-0.000683,0.976656,-0.131343,0.024991,-0.023902,0.083709
3,(berries),(other vegetables),0.021787,0.122101,0.002673,0.122699,1.004899,1.0,0.000013,1.000682,0.004984,0.018930,0.000681,0.072297
4,(berries),(whole milk),0.021787,0.157923,0.002272,0.104294,0.660414,1.0,-0.001168,0.940127,-0.344543,0.012806,-0.063686,0.059341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,"(soda, sausage)",(whole milk),0.005948,0.157923,0.001069,0.179775,1.138374,1.0,0.000130,1.026642,0.122281,0.006568,0.025951,0.093273
126,"(sausage, whole milk)",(soda),0.008955,0.097106,0.001069,0.119403,1.229612,1.0,0.000200,1.025320,0.188423,0.010185,0.024695,0.065207
127,"(sausage, yogurt)",(whole milk),0.005748,0.157923,0.001470,0.255814,1.619866,1.0,0.000563,1.131541,0.384877,0.009065,0.116250,0.132562
128,"(sausage, whole milk)",(yogurt),0.008955,0.085879,0.001470,0.164179,1.911760,1.0,0.000701,1.093681,0.481231,0.015748,0.085657,0.090650


In [15]:
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])

In [16]:
rules[['antecedents', 'consequents', 'support', 'confidence','lift']].head()

Unnamed: 0,antecedents,consequents,support,confidence,lift
127,"(sausage, yogurt)",(whole milk),0.00147,0.255814,1.619866
120,"(rolls/buns, sausage)",(whole milk),0.001136,0.2125,1.345594
125,"(soda, sausage)",(whole milk),0.001069,0.179775,1.138374
99,(semi-finished bread),(whole milk),0.001671,0.176056,1.114825
123,"(rolls/buns, yogurt)",(whole milk),0.001337,0.17094,1.082428


In [24]:
#Online Retail Example
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Step 1: Load Dataset
df = pd.read_excel('D:\\Downloads\\online+retail\\Online Retail.xlsx')
df.head()



Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [26]:
# Step 2: Basic Cleaning
df.dropna(subset=['InvoiceNo', 'Description'], inplace=True)
df = df[df['InvoiceNo'].astype(str).str.startswith('5')]  # Remove credit notes
df['Description'] = df['Description'].str.strip()



In [27]:
# Step 3: Basket Matrix (for UK transactions only)
basket = df[df['Country'] == "United Kingdom"].groupby(
    ['InvoiceNo', 'Description']
)['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo')



In [28]:
# Convert quantity to binary (1 if purchased, 0 otherwise)
basket_sets = basket.applymap(lambda x: 1 if x > 0 else 0)



In [29]:
# Step 4: Apply Apriori algorithm
frequent_itemsets = apriori(basket_sets, min_support=0.01, use_colnames=True)




MemoryError: Unable to allocate 87.1 GiB for an array with shape (313236, 2, 18664) and data type int64

In [30]:
# Step 5: Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)



In [31]:
# Step 6: Display Results
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())

        antecedents       consequents   support  confidence      lift
0  (tropical fruit)        (UHT-milk)  0.001537    0.022682  1.060617
1        (UHT-milk)  (tropical fruit)  0.001537    0.071875  1.060617
2            (beef)     (brown bread)  0.001537    0.045276  1.203301
3     (brown bread)            (beef)  0.001537    0.040853  1.203301
4            (beef)    (citrus fruit)  0.001804    0.053150  1.000349
