PRN: RBT23CB013
##### Experiment 2: Using a suitable public-domain dataset from the UCI ML repository, apply the Apriori algorithm to find frequently occurring items in the given data and generate strong association rules using support and confidence thresholds. For example: Market Basket Analysis.

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [4]:
# 1. Load the dataset
# header=None: the first line of the CSV is read as data, so columns get
# integer labels (0, 1, 2, ...) instead of names.
dataset_path = 'Market_Basket_Optimisation.csv'
df = pd.read_csv(dataset_path, header=None)

preview = df.head()
print("--- Raw Data Sample (first 5 rows) ---")
print(preview)

--- Raw Data Sample (first 5 rows) ---
              0          1           2                 3             4   \
0         shrimp    almonds     avocado    vegetables mix  green grapes   
1        burgers  meatballs        eggs               NaN           NaN   
2        chutney        NaN         NaN               NaN           NaN   
3         turkey    avocado         NaN               NaN           NaN   
4  mineral water       milk  energy bar  whole wheat rice     green tea   

                 5     6               7             8             9   \
0  whole weat flour  yams  cottage cheese  energy drink  tomato juice   
1               NaN   NaN             NaN           NaN           NaN   
2               NaN   NaN             NaN           NaN           NaN   
3               NaN   NaN             NaN           NaN           NaN   
4               NaN   NaN             NaN           NaN           NaN   

               10         11     12     13             14      15  \
0 

In [5]:
# 2. Preprocess the data into a list of transactions
# Each DataFrame row becomes one transaction: missing cells (NaN) are dropped
# and every remaining value is converted to its string form.
transactions = [
    [str(item) for item in row.dropna()]
    for _, row in df.iterrows()
]

In [6]:
# 3. Transform the data into a one-hot encoded DataFrame
# The encoder is fitted and applied on the same transaction list in one step;
# the item labels it learned (te.columns_) become the DataFrame column names.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
onehot_df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [7]:
# 4. Find frequent itemsets using Apriori
# Itemsets are kept when their support is at least 0.03; use_colnames=True
# reports them by item name rather than by column position.
frequent_itemsets = apriori(onehot_df, use_colnames=True, min_support=0.03)

# Show the most frequent itemsets first.
itemsets_by_support = frequent_itemsets.sort_values(by='support', ascending=False)
print("\n--- Frequent Itemsets (Support >= 0.03) ---")
print(itemsets_by_support)


--- Frequent Itemsets (Support >= 0.03) ---
     support                            itemsets
25  0.238368                     (mineral water)
11  0.179709                              (eggs)
31  0.174110                         (spaghetti)
13  0.170911                      (french fries)
7   0.163845                         (chocolate)
18  0.132116                         (green tea)
24  0.129583                              (milk)
19  0.098254                       (ground beef)
16  0.095321                 (frozen vegetables)
27  0.095054                          (pancakes)
2   0.087188                           (burgers)
4   0.081056                              (cake)
8   0.080389                           (cookies)
12  0.079323                          (escalope)
23  0.076523                    (low fat yogurt)
29  0.071457                            (shrimp)
33  0.068391                          (tomatoes)
26  0.065858                         (olive oil)
15  0.063325            

In [8]:

# 5. Generate association rules
# Keep rules with lift >= 1.2 AND confidence > 0.2.
# Lift > 1 indicates a positive correlation between antecedent and consequent.
MIN_LIFT = 1.2
MIN_CONFIDENCE = 0.2

# association_rules() thresholds on one metric per call, so the lift cut-off is
# applied there and the confidence cut-off is applied as a second filter.
# Previously only the lift filter was applied, so low-confidence rules remained.
lift_rules = association_rules(frequent_itemsets, metric="lift", min_threshold=MIN_LIFT)
rules = lift_rules[lift_rules['confidence'] > MIN_CONFIDENCE]

print("\n--- Association Rules (Lift >= 1.2) ---")
# Sort rules by confidence, then lift (both descending), and show the top 10.
print(rules.sort_values(by=['confidence', 'lift'], ascending=False).head(10))


--- Association Rules (Lift >= 1.2) ---
            antecedents      consequents  antecedent support  \
12        (ground beef)  (mineral water)            0.098254   
15        (ground beef)      (spaghetti)            0.098254   
10  (frozen vegetables)  (mineral water)            0.095321   
16               (milk)  (mineral water)            0.129583   
20           (pancakes)  (mineral water)            0.095054   
22          (spaghetti)  (mineral water)            0.174110   
4           (chocolate)  (mineral water)            0.163845   
19               (milk)      (spaghetti)            0.129583   
23      (mineral water)      (spaghetti)            0.238368   
3                (milk)      (chocolate)            0.129583   

    consequent support   support  confidence      lift  representativity  \
12            0.238368  0.040928    0.416554  1.747522               1.0   
15            0.174110  0.039195    0.398915  2.291162               1.0   
10            0.238368  0.