In [1]:
import pandas as pd
import numpy as np


In [5]:
df = pd.read_csv("groceries.csv")

In [6]:
df.head()

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,
2,1,whole milk,,,,,,,,,...,,,,,,,,,,
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,


# Data Preprocessing

In [7]:
# Drop unnecessary columns if any
df_clean = df.drop(columns=['Item(s)'])

# Fill NaN values with empty string or drop them (based on your requirement)
df_clean.fillna('', inplace=True)

In [12]:
df_stack = df_clean.stack().reset_index(level=1, drop=True)

df_onehot = pd.get_dummies(df_stack).groupby(level=0).sum()
df_onehot = df_onehot.astype(bool).astype(int)  

In [None]:
!pip install mlxtend

In [15]:
from mlxtend.frequent_patterns import apriori, association_rules

# Generate frequent itemsets with min support
frequent_itemsets = apriori(df_onehot, min_support=0.01, use_colnames=True)

# Review the frequent itemsets
print(frequent_itemsets)



      support                                         itemsets
0    0.999898                                               ()
1    0.033452                                       (UHT-milk)
2    0.017692                                  (baking powder)
3    0.052466                                           (beef)
4    0.033249                                        (berries)
..        ...                                              ...
658  0.011896  (, whole milk, tropical fruit, root vegetables)
659  0.014438          (, yogurt, root vegetables, whole milk)
660  0.010473                     (, yogurt, whole milk, soda)
661  0.015048           (, whole milk, tropical fruit, yogurt)
662  0.010778       (, yogurt, whipped/sour cream, whole milk)

[663 rows x 2 columns]


In [16]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

In [17]:
print(rules)

                                  antecedents     consequents  \
0                                  (UHT-milk)              ()   
1                             (baking powder)              ()   
2                                      (beef)              ()   
3                                   (berries)              ()   
4                                 (beverages)              ()   
..                                        ...             ...   
364      (yogurt, whole milk, tropical fruit)              ()   
365                  (yogurt, tropical fruit)  (, whole milk)   
366            (, yogurt, whipped/sour cream)    (whole milk)   
367  (yogurt, whole milk, whipped/sour cream)              ()   
368              (yogurt, whipped/sour cream)  (, whole milk)   

     antecedent support  consequent support   support  confidence      lift  \
0              0.033452            0.999898  0.033452    1.000000  1.000102   
1              0.017692            0.999898  0.017692    1.00

In [21]:
def get_recommendations(item, top_n=3):
    # Filter rules where 'item' is in the antecedents and the consequents are not empty
    recommendations = rules[
        (rules['antecedents'].apply(lambda x: item in str(x))) &
        (rules['consequents'].apply(lambda x: len(x) > 0))
    ]
    
    # Return the top N recommendations based on confidence
    return recommendations[['antecedents', 'consequents', 'confidence']].head(top_n)

In [24]:
recommendations = get_recommendations('yogurt', top_n=3)
print(recommendations)

          antecedents consequents  confidence
87           (yogurt)          ()    0.999271
92     (yogurt, beef)          ()    0.991304
95  (yogurt, berries)          ()    1.000000
