In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [23]:
df = pd.read_excel("/content/retail_shop.xlsx")
df

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt
...,...
7495,"butter,light mayo,fresh bread"
7496,"burgers,frozen vegetables,eggs,french fries,ma..."
7497,chicken
7498,"escalope,green tea"


In [24]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Use your data, e.g. transactions per category/item
# Assuming your transactions are in the first column of your DataFrame df
transactions = df.iloc[:,0].apply(lambda x: x.split(',')) # Split the single string of items into a list
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)


In [25]:
# Lower thresholds to capture some rules
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True) # Lowered min_support
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)


In [26]:
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
7495,False
7496,False
7497,True
7498,True


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                           --------------  ----- 
 0   shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil  7500 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [28]:
from mlxtend.preprocessing import TransactionEncoder

transactions = df['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil'].str.split(', ')
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
ohe = pd.DataFrame(te_ary, columns=te.columns_)

In [29]:
from mlxtend.frequent_patterns import apriori

freq_itemsets = apriori(ohe, min_support=0.01, use_colnames=True)
print(freq_itemsets.sort_values(by='support', ascending=False))

    support         itemsets
0  0.029733        (cookies)
3  0.017600   (french fries)
2  0.013467       (escalope)
4  0.012400  (mineral water)
1  0.012000           (eggs)


In [30]:
from mlxtend.frequent_patterns import association_rules

rules = association_rules(freq_itemsets, metric='confidence', min_threshold=0.5)
rules = rules.sort_values(by=['lift', 'confidence'], ascending=False)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [31]:
filtered = rules[(rules['support'] >= 0.1)
                 & (rules['confidence'] >= 0.6)
                 & (rules['lift'] >= 1.2)]
print(filtered[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [32]:
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
ohe_df = pd.DataFrame(te_ary, columns=te.columns_)

In [33]:
freq_itemsets = apriori(ohe_df, min_support=0.01, use_colnames=True)
freq_itemsets['length'] = freq_itemsets['itemsets'].apply(lambda x: len(x))

In [34]:
top_rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(by='lift', ascending=False)
print("Top Association Rules:")
print(top_rules.head(10))

Top Association Rules:
Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [35]:
for idx, rule in top_rules.head(5).iterrows():
    ants = list(rule['antecedents'])
    cons = list(rule['consequents'])
    print(f"- If a customer buys {ants}, they also buy {cons}"
          f" (support={rule['support']:.2f}, confidence={rule['confidence']:.2f}, lift={rule['lift']:.2f})")

In [36]:
print("Frequent itemsets:\n", frequent_itemsets)
print("Association rules:\n", rules[['antecedents','consequents','support','confidence','lift']])

Frequent itemsets:
       support                                 itemsets
0    0.020267                                (almonds)
1    0.033200                                (avocado)
2    0.010800                         (barbecue sauce)
3    0.014267                              (black tea)
4    0.011467                             (body spray)
..        ...                                      ...
254  0.011067       (milk, mineral water, ground beef)
255  0.017067  (ground beef, spaghetti, mineral water)
256  0.015733         (milk, spaghetti, mineral water)
257  0.010267    (olive oil, spaghetti, mineral water)
258  0.011467     (spaghetti, mineral water, pancakes)

[259 rows x 2 columns]
Association rules:
 Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


# **interview questions**

1

Lift is a metric used in association rule mining to measure the strength of association between two itemsets compared to what would be expected if they were statistically independent. Here’s a breakdown:

1. adjust for item frequency
2. detect strength for dependency

2

support :


Support measures how frequently an itemset appears in the dataset.

Helps weed out rare combinations that are unlikely to be meaningful or profitable.


confidance:

 Confidence measures the reliability of an implication, indicating how often

Y appears when

X does.

3

1 . For large datasets or many unique items, the number of possible itemsets grows exponentially, making algorithms like Apriori slow and resource-intensive

2 .
Noisy, incomplete, or inconsistent data can lead to misleading or false rules

3 .
esting millions of potential rules increases risk of purely chance associations being flagged as significant