In [11]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline 
plt.rcParams['figure.figsize'] = (10,5)
plt.rcParams['figure.dpi'] = 300

#### Data Preprocessing

In [12]:
data = pd.read_csv('Online retail.csv')
data.head()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                           --------------  ----- 
 0   shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil  7500 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [14]:
# Removing duplicates and standardize item names
data.drop_duplicates(inplace=True)
data['Transaction'] = data.iloc[:, 0].apply(lambda x: x.lower().split(','))

In [15]:
# Tokenizing and cleaning each transaction
data['Transaction'] = data['Transaction'].apply(lambda items: [item.strip() for item in items if item.strip()])

In [16]:
# Converting the transaction data to a list of lists for encoding
transactions = data['Transaction'].tolist()

#### Association Rule Mining

In [17]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [18]:
encoder = TransactionEncoder()
transaction_matrix = encoder.fit(transactions).transform(transactions)
transaction_df = pd.DataFrame(transaction_matrix, columns=encoder.columns_)

In [19]:
# Setting minimum support threshold
min_support = 0.01
frequent_itemsets = apriori(transaction_df, min_support=min_support, use_colnames=True)

In [22]:
# Association Rules Generation
# Setting minimum thresholds for confidence and lift
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules = rules[rules['lift'] > 1]  # Filtering by lift > 1 to find meaningful associations

In [23]:
# Results
print("Frequent Itemsets:\n", frequent_itemsets)
print("\nAssociation Rules:\n", rules)

Frequent Itemsets:
       support                               itemsets
0    0.029179                              (almonds)
1    0.011014                    (antioxydant juice)
2    0.045797                              (avocado)
3    0.012560                                (bacon)
4    0.015459                       (barbecue sauce)
..        ...                                    ...
431  0.014686  (olive oil, mineral water, spaghetti)
432  0.016618   (pancakes, mineral water, spaghetti)
433  0.012367     (shrimp, mineral water, spaghetti)
434  0.010821       (mineral water, soup, spaghetti)
435  0.013527   (tomatoes, mineral water, spaghetti)

[436 rows x 2 columns]

Association Rules:
                         antecedents      consequents  antecedent support  \
0              (chocolate, chicken)  (mineral water)            0.021256   
1            (olive oil, chocolate)  (mineral water)            0.023575   
2               (ground beef, eggs)  (mineral water)            0.02879

#### Analysis And Interpretation 

In [24]:
'''
The generated rules show that mineral water is frequently bought alongside a variety of items, indicating it as a staple 
product. Spaghetti often appears with ground beef, vegetables, or other meal ingredients, suggesting it’s part of common meal
combinations. Overall, popular combinations hint at customer preferences for balanced meals and versatile ingredients.

The results indicate that customers tend to buy mineral water with a wide range of items, suggesting it’s a frequent add-on. 
Meal components like spaghetti, ground beef, and vegetables often appear together, showing a preference for easy meal prep 
ingredients. Overall, customers seem to favor versatile and complementary products, likely for convenient, balanced meal 
planning.
'''

'\nThe generated rules show that mineral water is frequently bought alongside a variety of items, indicating it as a staple \nproduct. Spaghetti often appears with ground beef, vegetables, or other meal ingredients, suggesting it’s part of common meal\ncombinations. Overall, popular combinations hint at customer preferences for balanced meals and versatile ingredients.\n\nThe results indicate that customers tend to buy mineral water with a wide range of items, suggesting it’s a frequent add-on. \nMeal components like spaghetti, ground beef, and vegetables often appear together, showing a preference for easy meal prep \ningredients. Overall, customers seem to favor versatile and complementary products, likely for convenient, balanced meal \nplanning.\n'

#### Interview Questions

In [25]:
'''
1. What is lift and why is it important in Association rules?
-> Lift measures the strength of an association rule compared to random chance, indicating the likelihood of one item occurring 
   with another. A lift greater than 1 shows a positive association.
   
2. What is support and Confidence. How do you calculate them?
-> Support is the frequency of an itemset in the dataset, while Confidence indicates the reliability of a rule. They are 
   calculated as:
   Support = (Transactions with itemset) / (Total transactions)
   Confidence = (Transactions with both A and B) / (Transactions with A)
   
3. What are some limitations or challenges of Association rules mining?
-> Challenges of Association Rule Mining include scalability, as larger datasets require more resources; interpretability, 
   since many complex rules can emerge; and overfitting, where the model captures noise rather than genuine patterns.
'''

'\n1. What is lift and why is it important in Association rules?\n-> Lift measures the strength of an association rule compared to random chance, indicating the likelihood of one item occurring \n   with another. A lift greater than 1 shows a positive association.\n   \n2. What is support and Confidence. How do you calculate them?\n-> Support is the frequency of an itemset in the dataset, while Confidence indicates the reliability of a rule. They are \n   calculated as:\n   Support = (Transactions with itemset) / (Total transactions)\n   Confidence = (Transactions with both A and B) / (Transactions with A)\n   \n3. What are some limitations or challenges of Association rules mining?\n-> Challenges of Association Rule Mining include scalability, as larger datasets require more resources; interpretability, \n   since many complex rules can emerge; and overfitting, where the model captures noise rather than genuine patterns.\n'