In [2]:
#!pip install --upgrade ipython jupyter
#!pip install mlxtend

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [3]:
# Read only the first 100 line-item rows of the CSV to keep the walkthrough small
df = pd.read_csv(filepath_or_buffer='Online_Retail_10000.csv', nrows=100)

In [4]:
# Show (number of rows, number of columns) of the loaded frame
df.shape

(100, 8)

In [5]:
# Preview the first rows; each row is one line item of an invoice
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


**Data Preprocessing**

* To perform an Apriori analysis on this dataset, we need to convert it into a one-hot encoded format where each row represents a transaction (an `InvoiceNo`) and each column represents an item (identified here by its `Description`; `StockCode` would also work). Each cell in the DataFrame should be a binary indicator of whether the item was present in the transaction.
* Since each row represents an item in a transaction, we need to group the dataset by `InvoiceNo` so that each transaction is represented as a list of items.

In [6]:
# Group by 'InvoiceNo' and create a list of item descriptions in each transaction.
# Missing descriptions are removed with dropna() *before* the string cast, so the
# filter is based on actual missingness rather than on the text 'nan'.
# An invoice whose descriptions are all missing is kept as an empty list, so the
# number of transactions (the denominator of every support value) is unchanged.
transactions = (
    df.groupby('InvoiceNo')['Description']
    .apply(lambda items: items.dropna().astype(str).tolist())
)

**Transaction Encoder**


* `TransactionEncoder` is a class in the `preprocessing` module of the `mlxtend` (machine learning extensions) library for Python. It is specifically designed to transform transactional data (such as the list of items bought in each purchase) into a format suitable for various machine learning algorithms, especially frequent itemset mining and association rule mining algorithms such as Apriori.

* The TransactionEncoder transforms a dataset of transactions (which are lists of items) into a one-hot encoded DataFrame. In this DataFrame:

  * Each column represents a unique item from the dataset.
  * Each row corresponds to a transaction.
  * The cells in the DataFrame are filled with True if the item (column) was present in the transaction (row) and False otherwise.

In [7]:
# Learn the set of unique items from the transaction lists
te = TransactionEncoder()
te.fit(transactions)

# Encode every transaction as one boolean row, one column per learned item
te_ary = te.transform(transactions)
trans_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

In [8]:
# Apply the Apriori algorithm to find frequent itemsets.
# min_support is the minimum fraction of *transactions* (rows of trans_encoded)
# that must contain an itemset, so it is stated directly as that fraction.
# Deriving it from len(df) would tie the threshold to the number of line-item
# rows loaded rather than to the number of transactions being mined.
MIN_SUPPORT = 0.2
frequent_itemsets = apriori(trans_encoded, min_support=MIN_SUPPORT, use_colnames=True)


* **Generate Association Rules**

In [10]:
# Derive association rules from the frequent itemsets, keeping only those
# whose confidence is at least 0.5
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.5,
    metric="confidence",
)

# Rank the rules from highest to lowest confidence and preview the top of the list
rules_sorted = rules.sort_values('confidence', ascending=False)
rules_sorted.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(GLASS STAR FROSTED T-LIGHT HOLDER),(CREAM CUPID HEARTS COAT HANGER),0.214286,0.214286,0.214286,1.0,4.666667,0.168367,inf,1.0
1329,"(SET 7 BABUSHKA NESTING BOXES, WHITE METAL LAN...","(GLASS STAR FROSTED T-LIGHT HOLDER, RED WOOLLY...",0.214286,0.214286,0.214286,1.0,4.666667,0.168367,inf,1.0
1297,"(KNITTED UNION FLAG HOT WATER BOTTLE, GLASS ST...","(SET 7 BABUSHKA NESTING BOXES, WHITE HANGING H...",0.214286,0.214286,0.214286,1.0,4.666667,0.168367,inf,1.0
1296,"(GLASS STAR FROSTED T-LIGHT HOLDER, WHITE META...","(SET 7 BABUSHKA NESTING BOXES, KNITTED UNION F...",0.214286,0.214286,0.214286,1.0,4.666667,0.168367,inf,1.0
1295,"(KNITTED UNION FLAG HOT WATER BOTTLE, WHITE ME...","(SET 7 BABUSHKA NESTING BOXES, GLASS STAR FROS...",0.214286,0.214286,0.214286,1.0,4.666667,0.168367,inf,1.0
