In [23]:
import pandas as pd
import numpy as np
from pathlib import Path
import math
from tqdm import tqdm
import re

In [47]:
# Load and preprocess data
skincare_df = pd.read_csv(Path('data/Skincare.csv')).rename_axis('index')

# Positional columns holding the comma-separated ingredient and tag strings.
INGREDIENTS_COL = 11
TAGS_COL = 20


def _split_clean(column):
    """Turn a column of comma-separated strings into a Series of token lists.

    Per cell: text inside parentheses is removed, every space and asterisk is
    deleted, the string is split on commas, and each token is stripped of any
    remaining surrounding whitespace.

    Args:
        column: pandas Series of strings (missing values allowed).

    Returns:
        Series with the same index and name whose values are lists of
        non-empty tokens. Missing cells become an empty list.
    """
    tokens = (
        column
        .str.replace(r'\(.*?\)', '', regex=True)
        .str.replace(' ', '', regex=False)
        .str.replace('*', '', regex=False)
        .str.split(',')
    )
    # `.str` methods propagate NaN, so a missing cell is still NaN (not a list)
    # here; map it to an empty basket instead of failing while iterating it.
    # Empty tokens (trailing or doubled commas) are dropped so they never show
    # up as an "item" downstream.
    return tokens.apply(
        lambda parts: [part.strip() for part in parts if part.strip()]
        if isinstance(parts, list) else []
    )


ingredients_df = _split_clean(skincare_df.iloc[:, INGREDIENTS_COL])
tags_df = _split_clean(skincare_df.iloc[:, TAGS_COL])
tags_df


index
0     [Vegan, HyaluronicAcid, allure2019BestofBeauty...
1     [Vegan, CommunityFavorite, AHA, GlycolicAcid, ...
2     [CommunityFavorite, AHA, GlycolicAcid, OilFree...
3     [Vegan, CommunityFavorite, DarkCircles, OilFre...
4     [Vegan, CommunityFavorite, WithoutSilicones, A...
5     [Vegan, WithoutSilicones, AlcoholFree, GlutenF...
6     [Vegan, CommunityFavorite, OilFree, WithoutSil...
7     [Vegan, LacticAcid, CommunityFavorite, Without...
8     [Vegan, Dullness, UnevenTexture, VitaminC, Ant...
9     [Vegan, CommunityFavorite, OilFree, AlcoholFre...
10    [Vegan, Hydrating, Dryness, WithoutParabens, W...
11    [Vegan, WithoutSilicones, AlcoholFree, GlutenF...
12    [Vegan, Dullness, UnevenTexture, Retinol, Anti...
13    [Vegan, CommunityFavorite, SalicylicAcid, With...
14    [Vegan, Anti-Aging, WithoutParabens, WithoutSu...
15    [Vegan, Dullness, UnevenTexture, Acne, Blemish...
16    [Vegan, Dullness, UnevenTexture, VitaminC, Ant...
17    [Vegan, Dry, WithoutParabens, Gluten

In [25]:
def prune(data, supp):
    """Keep only the rows whose ``supp_count`` is at least ``supp``."""
    meets_threshold = data["supp_count"] >= supp
    return data[meets_threshold]

In [26]:
def count_itemsets(ingredients_df, itemsets):
    """Count how many transactions contain each candidate itemset.

    Args:
        ingredients_df: iterable of transactions, each an iterable of items.
        itemsets: iterable of candidate itemsets (hashable, e.g. tuples).

    Returns:
        DataFrame with columns ``item_sets`` and ``supp_count``, in order of
        first appearance in ``itemsets``. Candidates contained in no
        transaction are omitted.
    """
    # Build each transaction's set once instead of once per candidate.
    transactions = [set(row) for row in ingredients_df]

    count_item = {}
    for item_set in tqdm(itemsets, desc="Counting itemsets"):
        # A candidate listed twice must not have its support counted twice.
        if item_set in count_item:
            continue
        wanted = set(item_set)
        hits = sum(1 for basket in transactions if wanted <= basket)
        if hits:
            count_item[item_set] = hits

    data = pd.DataFrame({
        'item_sets': list(count_item.keys()),
        'supp_count': list(count_item.values())
    })
    return data

In [27]:
def count_individual_items(ingr_items):
    """Count, for every single item, the number of transactions containing it.

    Args:
        ingr_items: iterable of transactions, each an iterable of hashable
            items.

    Returns:
        DataFrame with columns ``item_sets`` (the item) and ``supp_count``,
        sorted by ``item_sets``.
    """
    count_ind_item = {}
    for row in tqdm(ingr_items, desc="Counting individual items"):
        # An item repeated inside one transaction counts once, matching the
        # set-based counting in count_itemsets. dict.fromkeys de-duplicates
        # while keeping first-seen order, so the result stays deterministic.
        for item in dict.fromkeys(row):
            count_ind_item[item] = count_ind_item.get(item, 0) + 1
    data = pd.DataFrame({
        'item_sets': list(count_ind_item.keys()),
        'supp_count': list(count_ind_item.values())
    }).sort_values('item_sets')
    return data

In [28]:
def join(list_of_items):
    """Generate candidate (k+1)-itemsets from a sequence of k-itemsets.

    Single items (strings) are paired with every later, different item.
    Tuples are merged with every later tuple that shares the same first k-1
    elements, by appending that tuple's last element.

    Args:
        list_of_items: ordered sequence (list or pandas Series) of strings or
            equal-length tuples.

    Returns:
        List of candidate tuples, or ``None`` when no candidate can be formed.
    """
    # Materialise once: slicing a plain list is cheaper than re-slicing a
    # Series on every outer iteration, and any iterable is accepted.
    items = list(list_of_items)
    itemsets = []
    for i, entry in enumerate(tqdm(items, desc="Joining itemsets", leave=False)):
        for item in items[i + 1:]:
            if isinstance(item, str):
                if entry != item:
                    itemsets.append((entry, item))
            elif entry[:-1] == item[:-1] and entry != item:
                # Append only the last element of `item`. `item[1:]` would
                # repeat the shared prefix once the tuples have 3+ elements.
                itemsets.append(entry + item[-1:])
    return itemsets if itemsets else None

In [29]:
def apriori(ingr_data, supp=3, con=0.5):
    """Find frequent itemsets level by level (Apriori).

    Starts from single-item counts, keeps the itemsets whose support count is
    at least ``supp``, joins the survivors into larger candidates, counts
    those, and repeats until nothing survives or nothing can be joined.

    Args:
        ingr_data: iterable of transactions, each an iterable of items.
        supp: minimum support as an absolute number of transactions.
        con: accepted for interface compatibility; not used by this function.

    Returns:
        DataFrame with columns ``item_sets`` and ``supp_count`` holding the
        frequent itemsets of every level. It is empty, but still has both
        columns, when nothing is frequent.
    """
    levels = []
    candidates = count_individual_items(ingr_data)

    while not candidates.empty:
        frequent = prune(candidates, supp)
        if frequent.empty:
            break
        levels.append(frequent)
        itemsets = join(frequent.item_sets)
        if itemsets is None:
            break
        candidates = count_itemsets(ingr_data, itemsets)

    if not levels:
        return pd.DataFrame(columns=['item_sets', 'supp_count'])
    # Concatenate once at the end rather than growing a frame inside the loop.
    return pd.concat(levels)

In [None]:
# Imported under an alias so that the notebook's own `apriori(ingr_data, supp, con)`
# is not shadowed by mlxtend's function of the same name.
from mlxtend.frequent_patterns import apriori as mlxtend_apriori, association_rules

# Distinct tag names, used below to recognise tags inside rule consequents.
tags_list = sorted({tag for tags in tags_df for tag in tags if tag})

# One basket per product: its ingredients plus its tags, without empty tokens.
# NOTE: an ingredient and a tag with the same cleaned name share one column.
baskets = pd.Series(
    [
        sorted({item for item in list(ingredients) + list(tags) if item})
        for ingredients, tags in zip(ingredients_df, tags_df)
    ],
    index=ingredients_df.index,
)

# mlxtend expects a one-hot encoded boolean DataFrame (rows = transactions,
# columns = items). explode() yields one row per (product, item), get_dummies
# encodes the item, and the groupby collapses back to one row per product.
transaction_df = (
    pd.get_dummies(baskets.explode())
    .groupby(level=0)
    .max()
    .astype(bool)
)

# Get frequent itemsets
frequent_itemsets = mlxtend_apriori(transaction_df, min_support=0.03, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Keep only the rules whose consequent contains at least one tag
tag_names = set(tags_list)
rules = rules[rules['consequents'].apply(lambda items: not tag_names.isdisjoint(items))]

# Display the rules
rules[['antecedents', 'consequents', 'support', 'confidence']]


In [None]:
# Mine frequent ingredient itemsets; this is intended to call the notebook's own
# `apriori`, whose `supp` is an absolute transaction count rather than a fraction.
freq_item_sets = apriori(ingredients_df, supp=4)
# A bare last expression gives the rich DataFrame display instead of plain text.
freq_item_sets