In [None]:
import pandas as pd


Preprocessing Groceries Dataset

In [None]:
data = pd.read_csv("Groceries_dataset.csv")

In [None]:
groceries_df = pd.DataFrame(data)

# Group the data by 'Member_number' and collect 'itemDescription' as items in transactions
groceries_transactions = groceries_df.groupby('Member_number')['itemDescription'].apply(list).reset_index()

# Convert the lists in the "itemDescription" column to sets
groceries_transactions["itemDescription"] = groceries_transactions["itemDescription"].apply(set)
print(groceries_transactions.head())
# Convert the transactions into a list of lists
transactions_list = groceries_transactions['itemDescription'].tolist()


   Member_number                                    itemDescription
0           1000  {misc. beverages, semi-finished bread, canned ...
1           1001  {beef, rolls/buns, sausage, curd, soda, white ...
2           1002  {butter milk, specialty chocolate, frozen vege...
3           1003  {rolls/buns, detergent, frozen meals, sausage,...
4           1004  {frozen fish, red/blush wine, rolls/buns, cann...


In [43]:
len(groceries_df)

38765

In [None]:
# Convert sets to a single string with items separated by a comma
groceries_transactions["itemDescription"] = groceries_transactions["itemDescription"].agg(lambda x: ",".join(x))

groceries_transactions = groceries_transactions["itemDescription"].str.split(',')

print(groceries_transactions)

0       [misc. beverages, semi-finished bread, canned ...
1       [beef, rolls/buns, sausage, curd, soda, white ...
2       [butter milk, specialty chocolate, frozen vege...
3       [rolls/buns, detergent, frozen meals, sausage,...
4       [frozen fish, red/blush wine, rolls/buns, cann...
                              ...                        
3893    [decalcifier, dessert, misc. beverages, semi-f...
3894    [canned beer, tropical fruit, white wine, curd...
3895                                   [rolls/buns, curd]
3896    [herbs, butter milk, semi-finished bread, bott...
3897    [semi-finished bread, other vegetables, bottle...
Name: itemDescription, Length: 3898, dtype: object


Apriori

In [36]:
def prune(data,supp):
  """
  pruning to get Candidates k-itemset to Frequent k-itemset
  by comparing the support count (data.supp_count) with the minSup (supp)
  """
  df = data[data.supp_count >= supp]
  return df

def count_itemset(transaction_df, itemsets):
  """
  for each itemset in (the current set of) freq-n itemsets,
  the count of each itemset is computed according to
  the number of occurrences that it appears in the transactions data
  """
  count_item = {}
  for item_set in itemsets:
      set_A = set(item_set) # set A represents the itemset whose count is to be computed in order to be determined if it is a frequent itemset or not
      for row in transaction_df:
          set_B = set(row) # set B represents the transaction row record
          if set_B.intersection(set_A) == set_A: # checks for occurrence of the itemset in the transaction
              if item_set in count_item.keys():
                  count_item[item_set] += 1

              else:
                  count_item[item_set] = 1

  data = pd.DataFrame()
  data['item_sets'] = count_item.keys()
  data['supp_count'] = count_item.values()
  # print("Candidate itemset table (Counting):\n", data)
  return data

def count_item(trans_items):

    count_ind_item = {}
    for row in trans_items:
        for i in range(len(row)):
            if row[i] in count_ind_item.keys():
                count_ind_item[row[i]] += 1
            else:
                count_ind_item[row[i]] = 1

    data = pd.DataFrame()
    data['item_sets'] = count_ind_item.keys()
    data['supp_count'] = count_ind_item.values()
    data = data.sort_values('item_sets')

    return data


def join(list_of_items):
    itemsets = []
    i = 1
    for entry in list_of_items:
        proceding_items = list_of_items[i:]
        for item in proceding_items:
            if(type(item) is str):
                if entry != item:
                    tuples = (entry, item)
                    itemsets.append(tuples)
            else:
                if entry[0:-1] == item[0:-1]:
                    tuples = entry+item[1:]
                    itemsets.append(tuples)
        i = i+1
    if(len(itemsets) == 0):
        return None
    return itemsets

In [37]:
def apriori(trans_data,supp=2):
    freq = pd.DataFrame()

    df = count_item(trans_data) # to generate counts of
    # print("df")
    # print(len(df))
    # print(df)
    while(len(df) != 0):

        df = prune(df, supp)
        # print("Minsup =", supp,"\n")
        # print("Freq itemset table (Pruned):\n", df)

        if len(df) > 1 or (len(df) == 1 and int(df.supp_count >= supp)):
            freq = df

        itemsets = join(df.item_sets)

        if(itemsets is None):
            return freq

        df = count_itemset(trans_data, itemsets)
    return df

# Apriori on Groceries Dataset

In [45]:
freq_item_sets = apriori(groceries_transactions, 200)
freq_item_sets

Unnamed: 0,item_sets,supp_count
10,"(bottled water, other vegetables, whole milk)",219
97,"(other vegetables, rolls/buns, soda)",205
100,"(other vegetables, rolls/buns, whole milk)",320
101,"(other vegetables, rolls/buns, yogurt)",204
122,"(other vegetables, soda, whole milk)",270
129,"(other vegetables, whole milk, yogurt)",280
162,"(rolls/buns, soda, whole milk)",254
169,"(rolls/buns, whole milk, yogurt)",257
191,"(soda, whole milk, yogurt)",212


In [None]:
# from itertools import combinations

# Convert the frequent itemsets into features for clustering

In [None]:
def convert_to_transdf(data):
    # Create a set of all unique items
    all_items = set()
    for entry in data:
      print("entry:", entry)
      all_items.update(entry["items"])

    # Generate all possible combinations of items
    combinations_list = []
    for r in range(1, len(all_items) + 1):
      item_combinations = combinations(all_items, r)
      combinations_list.extend(item_combinations)

    # Create a DataFrame with features for each combination
    df = pd.DataFrame(data)
    for combination in combinations_list:
      feature_name = " & ".join(sorted(list(combination)))
      df[feature_name] = df["items"].apply(lambda x: int(set(combination).issubset(x)))

    # Fill NaN values with 0
    df.fillna(0, inplace=True)

    # Set the "record_id" as the DataFrame index
    df.set_index("record_id", inplace=True)

    # Print the resulting DataFrame
    print(df)

    return df



In [None]:
# print(dict(freq_item_sets['item_sets']))

{57: ('other vegetables', 'rolls/buns', 'sausage', 'rolls/buns', 'whole milk'), 66: ('other vegetables', 'rolls/buns', 'soda', 'rolls/buns', 'whole milk'), 73: ('other vegetables', 'rolls/buns', 'whole milk', 'rolls/buns', 'yogurt'), 88: ('other vegetables', 'soda', 'whole milk', 'soda', 'yogurt')}


In [46]:
input_data=dict(freq_item_sets['item_sets'])

transformed_data = []
record_id = 1

for _, items_tuple in input_data.items():
    items_set = set(items_tuple)
    transformed_data.append({"record_id": record_id, "items": items_set})
    record_id += 1

print(transformed_data)

[{'record_id': 1, 'items': {'whole milk', 'bottled water', 'other vegetables'}}, {'record_id': 2, 'items': {'rolls/buns', 'soda', 'other vegetables'}}, {'record_id': 3, 'items': {'rolls/buns', 'whole milk', 'other vegetables'}}, {'record_id': 4, 'items': {'rolls/buns', 'yogurt', 'other vegetables'}}, {'record_id': 5, 'items': {'whole milk', 'soda', 'other vegetables'}}, {'record_id': 6, 'items': {'whole milk', 'yogurt', 'other vegetables'}}, {'record_id': 7, 'items': {'rolls/buns', 'whole milk', 'soda'}}, {'record_id': 8, 'items': {'rolls/buns', 'whole milk', 'yogurt'}}, {'record_id': 9, 'items': {'whole milk', 'soda', 'yogurt'}}]


In [47]:
convert_to_transdf(transformed_data)

entry: {'record_id': 1, 'items': {'whole milk', 'bottled water', 'other vegetables'}}
entry: {'record_id': 2, 'items': {'rolls/buns', 'soda', 'other vegetables'}}
entry: {'record_id': 3, 'items': {'rolls/buns', 'whole milk', 'other vegetables'}}
entry: {'record_id': 4, 'items': {'rolls/buns', 'yogurt', 'other vegetables'}}
entry: {'record_id': 5, 'items': {'whole milk', 'soda', 'other vegetables'}}
entry: {'record_id': 6, 'items': {'whole milk', 'yogurt', 'other vegetables'}}
entry: {'record_id': 7, 'items': {'rolls/buns', 'whole milk', 'soda'}}
entry: {'record_id': 8, 'items': {'rolls/buns', 'whole milk', 'yogurt'}}
entry: {'record_id': 9, 'items': {'whole milk', 'soda', 'yogurt'}}
                                                   items  rolls/buns  \
record_id                                                              
1          {whole milk, bottled water, other vegetables}           0   
2                   {rolls/buns, soda, other vegetables}           1   
3             {rolls

Unnamed: 0_level_0,items,rolls/buns,bottled water,other vegetables,soda,whole milk,yogurt,bottled water & rolls/buns,other vegetables & rolls/buns,rolls/buns & soda,...,bottled water & other vegetables & whole milk & yogurt,bottled water & soda & whole milk & yogurt,other vegetables & soda & whole milk & yogurt,bottled water & other vegetables & rolls/buns & soda & whole milk,bottled water & other vegetables & rolls/buns & soda & yogurt,bottled water & other vegetables & rolls/buns & whole milk & yogurt,bottled water & rolls/buns & soda & whole milk & yogurt,other vegetables & rolls/buns & soda & whole milk & yogurt,bottled water & other vegetables & soda & whole milk & yogurt,bottled water & other vegetables & rolls/buns & soda & whole milk & yogurt
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"{whole milk, bottled water, other vegetables}",0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"{rolls/buns, soda, other vegetables}",1,0,1,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,"{rolls/buns, whole milk, other vegetables}",1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,"{rolls/buns, yogurt, other vegetables}",1,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,"{whole milk, soda, other vegetables}",0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"{whole milk, yogurt, other vegetables}",0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"{rolls/buns, whole milk, soda}",1,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,"{rolls/buns, whole milk, yogurt}",1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,"{whole milk, soda, yogurt}",0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
