In [2]:
import pandas as pd

# Read the catalog cross-sell CSV; the relative path is resolved against the
# kernel's current working directory.
file_path = "CatalogCrossSell3 (1) (1).csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows to see which columns the file contains.
data.head(n=5)


Unnamed: 0,Customer Number,Clothing Division,Housewares Division,Health Products Division,Automotive Division,Personal Electronics Division,Computers Division,Garden Division,Novelty Gift Division,Jewelry Division,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
0,11569.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,,,,,,,,,,
1,13714.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,,,,,,,,,,
2,46391.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,,,,,,,,,,
3,67264.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,
4,67363.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,


In [3]:
# Flag every column whose header contains "unnamed" (any letter case) and
# build a copy of the frame without those columns.
unnamed_mask = data.columns.str.contains('unnamed', case=False)
data_cleaned = data.drop(columns=data.columns[unnamed_mask])

# Per-column count of missing entries among the columns that were kept.
missing_values = data_cleaned.isna().sum()

# Show a five-row preview together with the missing-value counts.
(data_cleaned.head(n=5), missing_values)


(   Customer Number  Clothing Division  Housewares Division  \
 0          11569.0                0.0                  1.0   
 1          13714.0                0.0                  1.0   
 2          46391.0                0.0                  1.0   
 3          67264.0                0.0                  0.0   
 4          67363.0                0.0                  0.0   
 
    Health Products Division  Automotive Division  \
 0                       1.0                  1.0   
 1                       1.0                  1.0   
 2                       1.0                  1.0   
 3                       1.0                  1.0   
 4                       1.0                  0.0   
 
    Personal Electronics Division  Computers Division  Garden Division  \
 0                            1.0                 0.0              0.0   
 1                            1.0                 0.0              1.0   
 2                            1.0                 0.0              1.0   
 3  

In [4]:
# Remove rows with missing values and the 'Customer Number' identifier, which
# is not an item and must not take part in the association rule analysis.
#
# The result is reassigned instead of using inplace=True so that this cell can
# be re-run safely: dropna() is a no-op on already-clean rows, and
# errors='ignore' makes the column drop a no-op once 'Customer Number' is gone
# (an in-place drop would raise KeyError on the second run).
data_cleaned = (
    data_cleaned
    .dropna()
    .drop(columns='Customer Number', errors='ignore')
)

# Converting the dataframe to boolean values for the Apriori algorithm.
# astype(bool) maps 0 to False and every non-zero value to True.
data_bool = data_cleaned.astype(bool)

# Display the first few rows of the boolean converted dataset
data_bool.head()


Unnamed: 0,Clothing Division,Housewares Division,Health Products Division,Automotive Division,Personal Electronics Division,Computers Division,Garden Division,Novelty Gift Division,Jewelry Division
0,False,True,True,True,True,False,False,True,False
1,False,True,True,True,True,False,True,True,True
2,False,True,True,True,True,False,True,True,True
3,False,False,True,True,True,False,True,True,False
4,False,False,True,False,True,False,True,True,False


In [5]:
from mlxtend.frequent_patterns import apriori

# Mine frequent itemsets with Apriori.
# min_support is the smallest fraction of rows an itemset must appear in;
# 0.01 (1%) is a starting point that can be tuned after inspecting the results.
support_threshold = 0.01

frequent_itemsets = apriori(
    data_bool,
    min_support=support_threshold,
    use_colnames=True,  # report itemsets by column name, not column index
)

# Preview the five itemsets with the highest support.
(
    frequent_itemsets
    .sort_values('support', ascending=False)
    .head(5)
)


Unnamed: 0,support,itemsets
2,1.0,(Health Products Division)
23,0.467387,"(Personal Electronics Division, Health Product..."
4,0.467387,(Personal Electronics Division)
1,0.393557,(Housewares Division)
15,0.393557,"(Health Products Division, Housewares Division)"


In [7]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets, keeping only rules whose
# confidence is at least `confidence_threshold`. The cut-off is given a name
# (in the same way as `support_threshold`) so it can be tuned in one place.
# NOTE(review): the frequent-itemset preview lists 'Health Products Division'
# with support 1.0; if that holds for the full data, any rule with it as the
# sole consequent has confidence 1.0 and lift 1.0 by definition -- consider
# screening the resulting rules on lift as well.
confidence_threshold = 0.1  # 10% confidence

rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=confidence_threshold,
)
