<a href="https://colab.research.google.com/github/saileepanchbhai/Advance-Machine-Learning-Lab/blob/main/Apriori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import pandas for data manipulation
import pandas as pd
# Import apriori algorithm and association_rules function from mlxtend
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import Google Colab's file upload module
from google.colab import files

# Launch the Colab upload widget; the result is keyed by uploaded filename.
uploaded = files.upload()

# Report the name and the length of the content of every received file.
report_template = 'User uploaded file "{name}" with length {length} bytes'
for filename, content in uploaded.items():
    print(report_template.format(name=filename, length=len(content)))

Saving Groceries_dataset.csv to Groceries_dataset.csv
User uploaded file "Groceries_dataset.csv" with length 1103280 bytes


In [None]:
# Load the uploaded grocery CSV into a pandas DataFrame.
df = pd.read_csv('Groceries_dataset.csv')

# Preview the first five rows to sanity-check the columns and values.
df.head(5)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(38765, 3)

In [None]:
# Data type of each column in the DataFrame.
df.dtypes

Unnamed: 0,0
Member_number,int64
Date,object
itemDescription,object


In [None]:
# Count the missing (null) values in every column.
null_counts = df.isnull().sum()
null_counts

Unnamed: 0,0
Member_number,0
Date,0
itemDescription,0


In [None]:
# Import regular expressions module for text cleaning
import re
# Normalise the text in the 'itemDescription' column:
# - cast every entry to str
# - collapse each run of whitespace into a single space
# - trim leading and trailing whitespace
def _tidy_item_name(raw_name):
    """Return `raw_name` with whitespace runs collapsed to one space and both ends trimmed."""
    collapsed = re.sub(r'\s+', ' ', raw_name)
    return collapsed.strip()


df['itemDescription'] = df['itemDescription'].astype('str').apply(_tidy_item_name)

In [None]:
# Build one "basket" row per purchase (a member on a given date):
# - group the rows by 'Member_number' and 'Date'
# - collect each group's 'itemDescription' values into a list
# - reset the index so the group keys become ordinary columns again
items_per_purchase = df.groupby(['Member_number', 'Date'])['itemDescription']
basket = items_per_purchase.apply(list).reset_index()

In [None]:
# Pull the per-purchase item lists out of the frame as a plain Python list:
# one inner list of item names per transaction.
transactions = list(basket['itemDescription'])

In [None]:
# Import TransactionEncoder to convert transaction data into one-hot encoded format
from mlxtend.preprocessing import TransactionEncoder

# Create the encoder and fit it to the list of transactions
te = TransactionEncoder()
te.fit(transactions)

# Transform the same transactions into a boolean array
te_array = te.transform(transactions)

# Wrap the boolean array in a DataFrame whose columns are the item names;
# True means the item was purchased in that transaction.
# NOTE: this rebinds `basket`, replacing the grouped per-purchase frame
# with the one-hot encoded frame.
basket = pd.DataFrame(data=te_array, columns=te.columns_)

# Preview the first five rows of the one-hot encoded transactions
basket.head(5)

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Mine frequent itemsets with the Apriori algorithm:
# - min_support=0.01 keeps itemsets appearing in at least 1% of transactions
# - use_colnames=True reports item names rather than column indices
apriori_options = {'min_support': 0.01, 'use_colnames': True}
frequent_itemsets = apriori(basket, **apriori_options)

# Preview the first five frequent itemsets
frequent_itemsets.head(5)

Unnamed: 0,support,itemsets
0,0.021386,(UHT-milk)
1,0.03395,(beef)
2,0.021787,(berries)
3,0.016574,(beverages)
4,0.045312,(bottled beer)


In [None]:
# Derive association rules from the frequent itemsets:
# - metric="confidence": rules are filtered on their confidence
# - min_threshold=0.1: keep rules with confidence >= 0.1
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)

# Show the five rules with the highest lift. The frame is left as the cell's
# last expression (instead of being passed to print) so the notebook renders
# it as one table rather than a plain-text dump wrapped into column chunks.
# Lift = confidence / consequent support, so a lift below 1 means the items
# co-occur less often than they would if they were bought independently.
rules.sort_values(by='lift', ascending=False).head()

          antecedents   consequents  antecedent support  consequent support  \
3            (yogurt)  (whole milk)            0.085879            0.157923   
1        (rolls/buns)  (whole milk)            0.110005            0.157923   
0  (other vegetables)  (whole milk)            0.122101            0.157923   
2              (soda)  (whole milk)            0.097106            0.157923   

    support  confidence      lift  representativity  leverage  conviction  \
3  0.011161    0.129961  0.822940               1.0 -0.002401    0.967861   
1  0.013968    0.126974  0.804028               1.0 -0.003404    0.964550   
0  0.014837    0.121511  0.769430               1.0 -0.004446    0.958551   
2  0.011629    0.119752  0.758296               1.0 -0.003707    0.956636   

   zhangs_metric   jaccard  certainty  kulczynski  
3      -0.190525  0.047975  -0.033206    0.100317  
1      -0.214986  0.055000  -0.036752    0.107711  
0      -0.254477  0.055948  -0.043241    0.107730  
2      -0.

In [None]:
# Print the size of the frequent-itemset table as (rows, columns);
# the row count is the number of frequent itemsets found.
itemset_table_shape = frequent_itemsets.shape
print(itemset_table_shape)

(69, 2)


In [None]:
# Top 5 rules sorted by confidence
# (how likely the consequent is given the antecedent).
# A notebook cell only renders its LAST bare expression, so each table is
# passed to display() explicitly; as a bare expression in the middle of the
# cell the confidence ranking would be computed and silently discarded.
# display() is made available by IPython in notebook sessions without an import.
display(rules.sort_values(by='confidence', ascending=False).head())

# Top 5 rules sorted by support
# (how frequently the itemsets appear in all transactions).
display(rules.sort_values(by='support', ascending=False).head())

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(other vegetables),(whole milk),0.122101,0.157923,0.014837,0.121511,0.76943,1.0,-0.004446,0.958551,-0.254477,0.055948,-0.043241,0.10773
1,(rolls/buns),(whole milk),0.110005,0.157923,0.013968,0.126974,0.804028,1.0,-0.003404,0.96455,-0.214986,0.055,-0.036752,0.107711
2,(soda),(whole milk),0.097106,0.157923,0.011629,0.119752,0.758296,1.0,-0.003707,0.956636,-0.260917,0.047776,-0.045329,0.096694
3,(yogurt),(whole milk),0.085879,0.157923,0.011161,0.129961,0.82294,1.0,-0.002401,0.967861,-0.190525,0.047975,-0.033206,0.100317


In [None]:
# Loop through each association rule and print it in a readable, numbered format
for i, row in enumerate(rules.itertuples(), start=1):
    # antecedents/consequents are frozensets, whose iteration order is not
    # guaranteed; sort the item names before joining so the printed rule
    # text is identical from run to run.
    antecedents = ', '.join(sorted(row.antecedents))
    consequents = ', '.join(sorted(row.consequents))
    # Print the rule with its number
    print(f"Rule #{i}: {antecedents} -> {consequents}")
    # Print the key metrics for the rule, formatted to 2 decimal places
    print(f"Support: {row.support:.2f}")
    print(f"Confidence: {row.confidence:.2f}")
    print(f"Lift: {row.lift:.2f}")
    print("-----")

Rule #1: other vegetables -> whole milk
Support: 0.01
Confidence: 0.12
Lift: 0.77
-----
Rule #2: rolls/buns -> whole milk
Support: 0.01
Confidence: 0.13
Lift: 0.80
-----
Rule #3: soda -> whole milk
Support: 0.01
Confidence: 0.12
Lift: 0.76
-----
Rule #4: yogurt -> whole milk
Support: 0.01
Confidence: 0.13
Lift: 0.82
-----
