In [2]:
# import xarray as xr
# Import libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori ,association_rules
import matplotlib.pyplot as plt
import seaborn as snsno

In [3]:
#  Load data from an Excel file named "Basket_Cosmetic.xlsx"
df_sale_product=pd.read_excel("Basket_Cosmetic.xlsx")

In [4]:
#Check the shape (number of rows and columns) of the dataset
df_sale_product.shape

(4792, 10)

In [5]:
# Get information about the dataset, including data types and non-null counts
df_sale_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4792 entries, 0 to 4791
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   OrderId                4792 non-null   int64  
 1   OrderQty               4792 non-null   int64  
 2   SaleAmount             4792 non-null   float64
 3   CountOrder             4792 non-null   int64  
 4   AttributeSetName       4792 non-null   object 
 5   AttributeSetId         4792 non-null   int64  
 6   CategoryName_Level2    4792 non-null   object 
 7   ProductMiddleCategory  4792 non-null   object 
 8   ProductName            4792 non-null   object 
 9   ProductCode            4792 non-null   int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 374.5+ KB


In [6]:
# display the first two rows and the last row of the data to get an overview.
df_sale_product.head(2)
df_sale_product.tail(2)

Unnamed: 0,OrderId,OrderQty,SaleAmount,CountOrder,AttributeSetName,AttributeSetId,CategoryName_Level2,ProductMiddleCategory,ProductName,ProductCode
4790,365641,1,280000.0,1,آرايشي و بهداشتي,17,لوازم آرایشی,آرايشي و بهداشتي,highlighter,357021
4791,358833,1,168000.0,1,آرايشي و بهداشتي,17,لوازم آرایشی,آرايشي و بهداشتي,highlighter,357021


In [7]:
# Generate descriptive statistics for the dataset, including count, mean, min, max, etc.
df_sale_product.describe(include = 'all')

Unnamed: 0,OrderId,OrderQty,SaleAmount,CountOrder,AttributeSetName,AttributeSetId,CategoryName_Level2,ProductMiddleCategory,ProductName,ProductCode
count,4792.0,4792.0,4792.0,4792.0,4792,4792.0,4792,4792,4792,4792.0
unique,,,,,1,,6,3,32,
top,,,,,آرايشي و بهداشتي,,لوازم آرایشی,آرايشي و بهداشتي,perfume,
freq,,,,,4792,,1985,4699,750,
mean,366736.435309,1.137312,146812.3,1.000835,,17.0,,,,474233.797579
std,8837.134977,0.735771,210654.1,0.028883,,0.0,,,,91167.328871
min,352685.0,0.0,0.0,1.0,,17.0,,,,253867.0
25%,358701.0,1.0,44000.0,1.0,,17.0,,,,473203.0
50%,366208.0,1.0,89000.0,1.0,,17.0,,,,501795.0
75%,374290.0,1.0,178000.0,1.0,,17.0,,,,540831.0


In [8]:
# Insert a new column named 'quantity' with a default value of 1 at position 9
df_sale_product.insert(9, 'quantity',1)

In [9]:
# Group and pivot the data to create a basket of products by OrderId
product_basket = (df_sale_product.groupby(['OrderId', 'ProductCode'])['quantity']).sum().unstack().reset_index().fillna(0).set_index('OrderId')

In [10]:
# Define a function to convert values to 0 if they are less than or equal to 0, and 1 if greater than or equal to 1
def convto0(x):
    if (x<=0):
        return 0
    if (x>=1):
        return 1
# Apply the conversion function to the product_basket dataset
Basket_sets = product_basket.applymap(convto0)
Basket_sets.head()

ProductCode,253867,257556,257758,271352,357021,363706,411587,435483,437287,454273,...,529253,529515,529575,532667,540831,541788,542097,542718,542785,543073
OrderId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
352685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
352697,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
352740,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
352743,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
352761,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# Use the Apriori algorithm to mine frequent itemsets from the basket dataset
frequent_itemsets = apriori(Basket_sets.astype('bool'), min_support = 0.0008, use_colnames = True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.011387,(253867),1
1,0.115494,(257556),1
2,0.045140,(257758),1
3,0.034974,(271352),1
4,0.002847,(357021),1
...,...,...,...
715,0.000813,"(501539, 498215, 271352, 541788, 542718)",5
716,0.000813,"(498215, 271352, 529146, 541788, 542718)",5
717,0.000813,"(501539, 271352, 529146, 541788, 542718)",5
718,0.000813,"(501539, 498215, 529146, 541788, 542718)",5


In [12]:
# Filter the itemsets where the length is greater than or equal to 1
lenght=frequent_itemsets['itemsets'].str.len()
Filter1=lenght>=1
frequent_itemsets[Filter1]

Unnamed: 0,support,itemsets,length
0,0.011387,(253867),1
1,0.115494,(257556),1
2,0.045140,(257758),1
3,0.034974,(271352),1
4,0.002847,(357021),1
...,...,...,...
715,0.000813,"(501539, 498215, 271352, 541788, 542718)",5
716,0.000813,"(498215, 271352, 529146, 541788, 542718)",5
717,0.000813,"(501539, 271352, 529146, 541788, 542718)",5
718,0.000813,"(501539, 498215, 529146, 541788, 542718)",5


In [13]:
# Generate association rules from the frequent itemsets based on support
association_rules_data = association_rules(frequent_itemsets, metric = 'support', min_threshold = 0.0008)
association_rules_data

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(253867),(257556),0.011387,0.115494,0.002033,0.178571,1.546152,0.000718,1.076790
1,(257556),(253867),0.115494,0.011387,0.002033,0.017606,1.546152,0.000718,1.006330
2,(253867),(411587),0.011387,0.058560,0.000813,0.071429,1.219742,0.000147,1.013858
3,(411587),(253867),0.058560,0.011387,0.000813,0.013889,1.219742,0.000147,1.002537
4,(518771),(253867),0.059780,0.011387,0.000813,0.013605,1.194849,0.000133,1.002249
...,...,...,...,...,...,...,...,...,...
3565,(498215),"(501539, 271352, 529146, 541788, 542718)",0.228955,0.000813,0.000813,0.003552,4.367673,0.000627,1.002749
3566,(271352),"(501539, 498215, 529146, 541788, 542718)",0.034974,0.000813,0.000813,0.023256,28.593023,0.000785,1.022977
3567,(529146),"(501539, 498215, 271352, 541788, 542718)",0.041074,0.000813,0.000813,0.019802,24.346535,0.000780,1.019372
3568,(541788),"(501539, 498215, 271352, 529146, 542718)",0.068727,0.000813,0.000813,0.011834,14.550296,0.000757,1.011153


In [14]:
association_rules_data.dtypes

antecedents            object
consequents            object
antecedent support    float64
consequent support    float64
support               float64
confidence            float64
lift                  float64
leverage              float64
conviction            float64
dtype: object

In [15]:
# Export the rules to an Excel file named 'BasketAnalysisOutput.xlsx'
excell=association_rules_data.to_excel('BasketAnalysisOutput.xlsx')

In [29]:
# Generate a heatmap with annotations on and the colorbar off
import seaborn as sns
sns.heatmap(association_rules_data, annot = True)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

TypeError: float() argument must be a string or a number, not 'frozenset'