# Market Basket Analysis 

### Installing Required libraries and packages

In [1]:
!pip install mlxtend
!pip install mlxtend --upgrade
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules, apriori, hmine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mlxtend
  Downloading mlxtend-0.22.0-py2.py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.14.0
    Uninstalling mlxtend-0.14.0:
      Successfully uninstalled mlxtend-0.14.0
Successfully installed mlxtend-0.22.0


### Data Ingestion and Exploration

In [2]:
#Reading the data
data = pd.read_excel("/content/Online Retail.xlsx")
#To read from the link
# data = pd.read_excel("https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx")

#Data Exploration
print("\nData shape:", data.shape)
print("\nData columns:", data.columns)
print("\nSample data:")
print(data.head())

#Checking the data types of columns
print("\n",data.dtypes)

  and should_run_async(code)



Data shape: (541909, 8)

Data columns: Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

Sample data:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Ki

In [3]:
#Finding the most frequent product for sample recommendations
product_counts = data['StockCode'].value_counts()
print("The most frequently occurring product is:", product_counts)

The most frequently occurring product is: 85123A    2313
22423     2203
85099B    2159
47566     1727
20725     1639
          ... 
21431        1
22275        1
17001        1
90187A       1
72759        1
Name: StockCode, Length: 4070, dtype: int64


  and should_run_async(code)


### Preprocessing and Feature Engineering

In [4]:
#Preprocessing

#Checking for null values and dropping them
print(data.isna().sum())
data.dropna(inplace=True)

#Excluding rows where InvoiceNo starts with 'C' - refers to refund transactions
data = data[~data['InvoiceNo'].astype(str).str.startswith('C')]

#Excluding rows with negative or zero quantity
data = data[data['Quantity'] > 0]

#Excluding rows with negative or zero quantity
data = data[data["UnitPrice"] > 0]

#Converting StockCode to String Type
data['StockCode'] = data['StockCode'].astype(str)

#Removing row labeled POST in StockCode, as it is not a product
data = data[~data["StockCode"].str.contains("POST", na=False)]

#Checking number of unique values in Description
print("\nUnique Values in Description",data.Description.nunique())

#Checking number of unique values in Stockcode
print("Unique Values in Stock Code",data.StockCode.nunique())

#Since the number of unqiue values should be same for description and stock code for 1-1 product mapping, we will remove entries with multiple mappings
#Removing Descriptions mapped to multiple StockCodes
df_inter = data[["Description","StockCode"]].drop_duplicates()
df_inter = df_inter.groupby(["Description"]).agg({"StockCode":"count"}).reset_index()
df_inter.rename(columns = {'StockCode':'StockCode_Count'},inplace = True)
df_inter = df_inter.sort_values("StockCode_Count", ascending = False)
df_inter = df_inter[df_inter["StockCode_Count"] > 1]
print("\n Descriptions with multiple StockCodes : \n",df_inter.head() )
df = data[~data["Description"].isin(df_inter["Description"])]

#Removing StockCodes mapped to multiple Descriptions
df_inter = df[["Description","StockCode"]].drop_duplicates()
df_inter = df_inter.groupby(["StockCode"]).agg({"Description":"count"}).reset_index()
df_inter.rename(columns = {'Description':'Description_Count'},inplace=True)
df_inter = df_inter.sort_values("Description_Count", ascending = False)
df_inter = df_inter[df_inter["Description_Count"] > 1] 
print("\n StockCodes with multiple Descriptions : \n",df_inter.head())
df = df[~df["StockCode"].isin(df_inter["StockCode"])]

#Checking number of unique values in Description
print("\nUnique Values in Description",df.Description.nunique())

#Checking number of unique values in Stockcode
print("Unique Values in Stock Code",df.StockCode.nunique())

  and should_run_async(code)


InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

Unique Values in Description 3876
Unique Values in Stock Code 3664

 Descriptions with multiple StockCodes : 
                           Description  StockCode_Count
2014   METAL SIGN,CUPCAKE SINGLE HOOK                3
826      COLOURING PENCILS BROWN TUBE                2
1077  EAU DE NILE JEWELLED PHOTOFRAME                2
3677      WHITE BAMBOO RIBS LAMPSHADE                2
2460    PINK FAIRY CAKE CUSHION COVER                2

 StockCodes with multiple Descriptions : 
      StockCode  Description_Count
2020     23196                  4
2060     23236                  4
1951     23126                  3
2033     23209                  3
2199     23396                  3

Unique Values in Description 3419
Unique Values in Stock Code 3419


### Association Rule Learning Algorithms

In [5]:
#Feature Engineering

#Creating a list of transactions for each invoice
transactions = df.groupby('InvoiceNo')['StockCode'].apply(list).values.tolist()

#Converting the transaction list into a transaction matrix
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
invoice_product_matrix = pd.DataFrame(te_ary, columns = te.columns_)

#FP-Growth Algorithm
#Applying FP-Growth algorithm to find frequent itemsets
frequent_itemsets_fp = fpgrowth(invoice_product_matrix, min_support = 0.01, use_colnames = True)
#Generating association rules from frequent itemsets
rules_fp = association_rules(frequent_itemsets_fp, metric = "lift", min_threshold = 1)
rules_fp = rules_fp[rules_fp['confidence'] > 0.5]  

#H-Mine Algorithm
#Appling H-Mine algorithm to find frequent itemsets
frequent_itemsets_hmine = hmine(invoice_product_matrix, min_support = 0.01, use_colnames = True)
#Generating association rules from frequent itemsets
rules_hmine = association_rules(frequent_itemsets_hmine, metric = "lift", min_threshold = 1)
rules_hmine = rules_hmine[rules_hmine['confidence'] > 0.5] 

#Apriori Algorithm
#Applying Apriori algorithm to find frequent itemsets
frequent_itemsets_apriori = apriori(invoice_product_matrix, min_support = 0.01, use_colnames=True)
#Generating association rules from frequent itemsets
rules_apriori = association_rules(frequent_itemsets_apriori, metric = "lift", min_threshold = 1)
rules_apriori = rules_apriori[rules_apriori['confidence'] > 0.5]  

#Reading results and inferences
print("\n\nFP-Growth Results:")
print(rules_fp.head())

print("\n\nH-Mine Results:")
print(rules_hmine.head())

print("\n\nApriori Results:")
print(rules_apriori.head())

  and should_run_async(code)


FP-Growth Results:
   antecedents consequents  antecedent support  consequent support   support  \
10     (22745)     (22748)            0.017274            0.018914  0.013885   
11     (22748)     (22745)            0.018914            0.017274  0.013885   
14     (22727)     (22726)            0.047942            0.043131  0.028973   
15     (22726)     (22727)            0.043131            0.047942  0.028973   
19     (22728)     (22727)            0.033510            0.047942  0.021648   

    confidence       lift  leverage  conviction  zhangs_metric  
10    0.803797  42.496726  0.013558    5.000372       0.993633  
11    0.734104  42.496726  0.013558    3.695903       0.995294  
14    0.604333  14.011486  0.026905    2.418369       0.975392  
15    0.671736  14.011486  0.026905    2.900285       0.970488  
19    0.646003  13.474729  0.020041    2.689455       0.957886  
H-Mine Results:
   antecedents consequents  antecedent support  consequent support   support  \
2      (20712)

In [6]:
#Recommender Engine

#Creating a function that will provide the top 5 products the customer might buy, based on a given input product
def recommend_products(association_rules, input_product_id, num_recommendations):
  
  #Creating an empty list to store the recommendations
  recommendation_list = []

  for idx, product in enumerate(association_rules["antecedents"]):
    
    #For each instance of the presence of chosen product as antecedent, we traverse the list of subsequent associations
    for j in list(product):
      if j == input_product_id:
        recommendation_list.append(list(rules_fp.iloc[idx]["consequents"])[0])
        recommendation_list = list( dict.fromkeys(recommendation_list) )
  
  #Filtering for the required number of recommendations
  reco_products = recommendation_list[0 : num_recommendations]

  return reco_products

#Creating a function to retrieve the Product Description given StockCode
def get_descriptions(df, product_ids):
    descriptions = df[df['StockCode'].isin(product_ids)]['Description'].unique()
    return descriptions.tolist()

#Sample Input
input_product_id = '22423' #Specify the input product ID
num_recommendations = 5  #Specify the number of recommendations desired

#Displaying the user input product
description = df.loc[df['StockCode'] == input_product_id, 'Description'].iloc[0]
print("\nFor ", description," we will be recommending products based on each algorithm :")

#Recommending top 5 products to customer based on FP-Growth Algorithm
fp_recommended_products = recommend_products(rules_fp, input_product_id, num_recommendations)
print("\nFp-Growth algorithm :")
descriptions = get_descriptions(df, fp_recommended_products)
print(descriptions)

#Recommending top 5 products to customer based on H-Mine Algorithm
hmine_recommended_products = recommend_products(rules_hmine, input_product_id, num_recommendations)
print("\nH-Mine Algorithm :")
descriptions = get_descriptions(df, hmine_recommended_products)
print(descriptions)

#Recommending top 5 products to customer based on Apriori Algorithm
apriori_recommended_products = recommend_products(rules_apriori, input_product_id, num_recommendations)
print("\nApriori Algorithm :")
descriptions = get_descriptions(df, apriori_recommended_products)
print(descriptions)


For  REGENCY CAKESTAND 3 TIER  we will be recommending products based on each algorithm :

Fp-Growth algorithm :
['ROSES REGENCY TEACUP AND SAUCER ', 'GREEN REGENCY TEACUP AND SAUCER', 'PINK REGENCY TEACUP AND SAUCER']

H-Mine Algorithm :
['JUMBO BAG PINK POLKADOT', 'PACK OF 72 RETROSPOT CAKE CASES', 'JUMBO BAG RED RETROSPOT', 'LUNCH BAG SPACEBOY DESIGN ', 'LUNCH BAG  BLACK SKULL.']

Apriori Algorithm :
['ROSES REGENCY TEACUP AND SAUCER ', 'GREEN REGENCY TEACUP AND SAUCER', 'JUMBO BAG APPLES', 'JUMBO BAG PEARS', 'JUMBO BAG VINTAGE LEAF']


  and should_run_async(code)
