<a href="https://colab.research.google.com/github/soumitmondal17/Projects/blob/main/Market_Basket_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing required libraries.**

In [7]:
import pandas as pd
!pip install mlxtend
from mlxtend.frequent_patterns import apriori,fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder



**Load the dataset.**

In [10]:
# Read the raw purchase records from the CSV file at this absolute path.
df = pd.read_csv(filepath_or_buffer='/content/Groceries data.csv')

**Review the first 5 rows of the dataset.**

In [11]:
# Preview the first five records to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Member_number,Date,itemDescription,year,month,day,day_of_week
0,1808,2015-07-21,tropical fruit,2015,7,21,1
1,2552,2015-05-01,whole milk,2015,5,1,4
2,2300,2015-09-19,pip fruit,2015,9,19,5
3,1187,2015-12-12,other vegetables,2015,12,12,5
4,3037,2015-01-02,whole milk,2015,1,2,4


**Removing white spaces in itemDescriptions.**

In [23]:
# Trim leading/trailing whitespace so the same item is not counted under
# two different labels.
stripped_descriptions = df['itemDescription'].str.strip()
df['itemDescription'] = stripped_descriptions

**Group purchases into full transactions**

In [26]:
# Collect every item recorded under the same Member_number into one basket.
# NOTE(review): the grouping key is Member_number alone, so a basket holds
# everything that member bought regardless of Date -- confirm this is the
# intended meaning of "transaction".
transactions = [
    basket.tolist()
    for _, basket in df.groupby('Member_number')['itemDescription']
]
# Show the first two baskets.
transactions[:2]

[['soda',
  'canned beer',
  'sausage',
  'sausage',
  'whole milk',
  'whole milk',
  'pickled vegetables',
  'misc. beverages',
  'semi-finished bread',
  'hygiene articles',
  'yogurt',
  'pastry',
  'salty snack'],
 ['frankfurter',
  'frankfurter',
  'beef',
  'sausage',
  'whole milk',
  'soda',
  'curd',
  'white bread',
  'whole milk',
  'soda',
  'whipped/sour cream',
  'rolls/buns']]

**Transform transactions into Boolean Matrix.**

In [27]:
# Learn the item vocabulary from the baskets and encode them in a single call.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)

**Converting the matrix into a DataFrame with item names as column headers.**

In [28]:
# Wrap the encoded array in a DataFrame whose columns are the item names
# learned by the encoder, then preview the first five rows.
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)
df_encoded.head(n=5)

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


**Identifying frequent items using Apriori.**

In [30]:
# Mine itemsets with Apriori, keeping those whose support is at least 0.01 (1%).
frequent_items_apriori = apriori(
    df_encoded,
    min_support=0.01,
    use_colnames=True,
)
# Preview the first ten itemsets returned.
frequent_items_apriori.head(n=10)

Unnamed: 0,support,itemsets
0,0.015393,(Instant food products)
1,0.078502,(UHT-milk)
2,0.031042,(baking powder)
3,0.119548,(beef)
4,0.079785,(berries)
5,0.062083,(beverages)
6,0.158799,(bottled beer)
7,0.213699,(bottled water)
8,0.135967,(brown bread)
9,0.126475,(butter)


**Generating association rules from Apriori output.**

In [35]:
# Derive association rules from the Apriori itemsets, using lift as the
# selection metric with a threshold of 1.0.
rules_apriori = association_rules(
    frequent_items_apriori,
    metric='lift',
    min_threshold=1.0,
)
# Keep rules with confidence above 0.7 and show the five with the highest lift.
(
    rules_apriori
    .loc[rules_apriori['confidence'] > 0.7]
    .sort_values(by='lift', ascending=False)
    .loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]
    .head(5)
)


Unnamed: 0,antecedents,consequents,support,confidence,lift
7022,"(meat, domestic eggs)",(whole milk),0.010262,0.784314,1.711789
5709,"(chocolate, fruit/vegetable juice)",(whole milk),0.010775,0.75,1.636898
15140,"(other vegetables, rolls/buns, yogurt, bottled...",(whole milk),0.010518,0.745455,1.626978
11632,"(pip fruit, yogurt, bottled water)",(whole milk),0.010262,0.740741,1.616689
11994,"(rolls/buns, yogurt, brown bread)",(whole milk),0.012827,0.735294,1.604802


**Identifying frequent itemsets using FP-Growth. FP-Growth finds the same itemsets as Apriori but is generally faster on larger datasets, because it avoids explicit candidate generation.**

In [36]:
# Mine itemsets with FP-Growth, keeping those whose support is at least 0.01 (1%).
frequent_items_fpgrowth = fpgrowth(
    df_encoded,
    min_support=0.01,
    use_colnames=True,
)
# Preview the first five itemsets returned.
frequent_items_fpgrowth.head(n=5)

Unnamed: 0,support,itemsets
0,0.458184,(whole milk)
1,0.313494,(soda)
2,0.282966,(yogurt)
3,0.206003,(sausage)
4,0.177527,(pastry)


**Generating association rules from FPGrowth output.**

In [37]:
# Derive association rules from the FP-Growth itemsets, using lift as the
# selection metric. The threshold is 1.0, matching the Apriori rules: a lift
# below 1 means the items co-occur less often than independence would predict,
# so such rules should not be kept (the previous value of 0.01 filtered
# essentially nothing).
rules_fpgrowth = association_rules(
    frequent_items_fpgrowth,
    metric='lift',
    min_threshold=1.0,
)
# Keep rules with confidence above 0.7 and show the five with the highest lift.
(
    rules_fpgrowth
    .loc[rules_fpgrowth['confidence'] > 0.7]
    .sort_values(by='lift', ascending=False)
    .loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]
    .head(5)
)


Unnamed: 0,antecedents,consequents,support,confidence,lift
13438,"(meat, domestic eggs)",(whole milk),0.010262,0.784314,1.711789
7725,"(chocolate, fruit/vegetable juice)",(whole milk),0.010775,0.75,1.636898
8384,"(other vegetables, rolls/buns, yogurt, bottled...",(whole milk),0.010518,0.745455,1.626978
6544,"(pip fruit, yogurt, bottled water)",(whole milk),0.010262,0.740741,1.616689
12518,"(rolls/buns, yogurt, brown bread)",(whole milk),0.012827,0.735294,1.604802


**Identifying the most purchased item.**

In [38]:
# Sum each item column of the encoded frame to count the rows (baskets) that
# contain the item, then order the counts from largest to smallest.
item_frequencies = df_encoded.sum(axis=0).sort_values(ascending=False)
# The first entry of the sorted series is the most frequent item.
most_common_item, most_common_count = item_frequencies.index[0], item_frequencies.iloc[0]

# Report the most frequent item and its count.
print(f'Most frequent item : {most_common_item} (appeared in {most_common_count} transactions)' )

Most frequent item : whole milk (appeared in 1786 transactions)


***Final: Creating a reusable function for product recommendation.*** ⚡

In [39]:
# Function to recommend products based on a chosen item
def recommend_products(rules_df: pd.DataFrame, item: str, top_n: int = 5, metric: str = 'confidence') -> pd.DataFrame:
    """Return the strongest association rules whose antecedents contain ``item``.

    Parameters
    ----------
    rules_df : pd.DataFrame
        Rules table with at least the columns 'antecedents', 'consequents',
        'support', 'confidence' and 'lift', plus the ``metric`` column.
        Each 'antecedents' value must support the ``in`` operator.
    item : str
        Item label to look for in each rule's antecedents.
    top_n : int, default 5
        Maximum number of rules to return.
    metric : str, default 'confidence'
        Column used to rank the matching rules, highest first.

    Returns
    -------
    pd.DataFrame
        Up to ``top_n`` matching rules, restricted to the columns
        'antecedents', 'consequents', 'support', 'confidence' and 'lift'.
        Empty (but with those columns) when no rule matches or when
        ``rules_df`` has no rows.

    Raises
    ------
    KeyError
        If ``metric`` or one of the required columns is missing from
        ``rules_df``.
    """
    output_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']

    # Force a boolean dtype: on an empty rules table the mapped Series keeps
    # its object dtype, and pandas would then interpret it as an (empty) list
    # of column labels instead of a row mask, dropping every column.
    has_item = rules_df['antecedents'].map(lambda antecedent: item in antecedent).astype(bool)
    filtered_rules = rules_df[has_item]

    # Rank by the chosen metric (default: confidence), strongest first.
    sorted_rules = filtered_rules.sort_values(by=metric, ascending=False)

    # Return the top N recommendations.
    return sorted_rules[output_columns].head(top_n)

# Example: the five highest-confidence FP-Growth rules involving yogurt.
recommend_products(rules_df=rules_fpgrowth, item='yogurt', top_n=5)

Unnamed: 0,antecedents,consequents,support,confidence,lift
8384,"(other vegetables, rolls/buns, yogurt, bottled...",(whole milk),0.010518,0.745455,1.626978
6544,"(pip fruit, yogurt, bottled water)",(whole milk),0.010262,0.740741,1.616689
12518,"(rolls/buns, yogurt, brown bread)",(whole milk),0.012827,0.735294,1.604802
8978,"(bottled beer, rolls/buns, yogurt)",(whole milk),0.013853,0.72,1.571422
3698,"(soda, yogurt, curd)",(whole milk),0.010775,0.711864,1.553666
