In [1]:
import glob
import json
import os
import re

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def getProducts(i_list, p_data):
    """
    Collect the keys of every product whose ingredient list shares at
    least one entry with the requested ingredients.
    :param i_list: Iterable of ingredient names to look for
    :param p_data: Mapping of product key -> dict holding an 'ingredients' sequence
    :return: List of matching product keys, in p_data iteration order
    """
    wanted = set(i_list)
    # any() stops at the first matching ingredient, like the early-exit scan it replaces.
    return [
        name
        for name, info in p_data.items()
        if any(ingredient in wanted for ingredient in info['ingredients'])
    ]


In [3]:
# Load the product catalogue and the lookup tables used by the ranking code.
# json.load parses straight from the file handle.
with open("finaldata.json", "r") as f:
    data = json.load(f)

# errors='ignore' drops bytes that cannot be decoded instead of raising.
with open("ingredients.json", "r", errors='ignore') as f:
    u_ingredients = json.load(f)

with open("concerns.json", "r") as f:
    u_concerns = json.load(f)

with open("relevant_types.json", "r") as f:
    relevant_product_types = json.load(f)

In [4]:
# Turn each product's raw ingredient string into a list of cleaned names.
for v in data.values():
    i_string = v['ingredients']
    # Anything that is not a string has already been parsed (e.g. the cell was
    # re-run); skip it so the cell is idempotent instead of raising TypeError.
    if not isinstance(i_string, str):
        continue
    # Drop one trailing period. endswith() is also safe on an empty string,
    # where indexing with [-1] would raise IndexError.
    if i_string.endswith('.'):
        i_string = i_string[:-1]
    # Raw strings keep "\." a regex escape rather than an invalid str escape.
    i_list = re.split(r", |\. ", i_string)
    # Remove any "label:" prefix (greedy, so up to the last colon) and
    # surrounding whitespace.
    v['ingredients'] = [re.sub(r".*:", "", s).strip() for s in i_list]

In [5]:
# Map each lower-cased category name to its joined concern text and to the
# products that contain at least one of the category's ingredients.
categories = {
    name.lower(): {
        "concerns": ", ".join(concern_words),
        "products": getProducts(u_ingredients[name], data),
    }
    for name, concern_words in u_concerns.items()
}

# Construct auxiliary data structures
num_products = len(data)
category_to_index = {name: index for index, name in enumerate(categories)}
# Products are numbered in the iteration order of `data`.
products_to_indices = {product: index for index, product in enumerate(data)}
indices_to_products = {index: product for product, index in products_to_indices.items()}

In [6]:
def create_product_types_dict():
    """Returns a dictionary mapping product-type names to Boolean arrays
    (True if index corresponds to a product listed for that type).

    Every ./product_types_lists/*.json file contributes one entry, named
    after the file without its directory and extension. Products that are
    not in products_to_indices are skipped.

    Params: {}
    Returns: (String -> Numpy Array) Dict
    """
    product_types = {}
    product_files = glob.glob("./product_types_lists/*.json")
    if not product_files:
        print('can\'t read')

    for p_file in product_files:
        # Derive the name from the path itself rather than fixed slice
        # offsets, so it stays correct if the directory is renamed or moved.
        name = os.path.splitext(os.path.basename(p_file))[0]
        with open(p_file) as json_file:
            # Distinct local name: avoids shadowing the global `data` catalogue.
            type_members = json.load(json_file)
        p_arr = np.full(num_products, False)
        for p in type_members:
            idx = products_to_indices.get(p)
            if idx is not None:
                p_arr[idx] = True
        product_types[name] = p_arr

    return product_types

In [7]:
def create_price_ranges():
    """Returns a dictionary to map price ranges to Boolean arrays 
    (True if index corresponds to a product within the budget).

    Each product in `data` falls in exactly one range; a range's upper
    bound is exclusive (a price of exactly 30 counts as '$30-50').

    Params: {}
    Returns: (String -> Numpy Array) Dict 
    """
    # (label, exclusive upper bound) in ascending order. None marks the
    # open-ended top range that catches everything not matched earlier.
    buckets = [('under $15', 15), ('$15-30', 30), ('$30-50', 50),
               ('$50-75', 75), ('$75+', None)]
    price_ranges = {label: np.full(num_products, False) for label, _ in buckets}

    for k, v in data.items():
        price = v['price']
        for label, upper in buckets:
            if upper is None or price < upper:
                price_ranges[label][products_to_indices[k]] = True
                break

    return price_ranges

In [8]:
def create_claims():
    """Returns a dictionary to map price ranges to Boolean arrays 
    (True if index corresponds to a product within the budget).

    Despite its name, this function builds price-range masks exactly like
    create_price_ranges.
    NOTE(review): presumably a placeholder for claim-based masks -- confirm
    the intended behaviour or remove the duplicate.

    Params: {}
    Returns: (String -> Numpy Array) Dict 
    """
    # (label, exclusive upper bound) in ascending order. None marks the
    # open-ended top range that catches everything not matched earlier.
    buckets = [('under $15', 15), ('$15-30', 30), ('$30-50', 50),
               ('$50-75', 75), ('$75+', None)]
    price_ranges = {label: np.full(num_products, False) for label, _ in buckets}

    for k, v in data.items():
        price = v['price']
        for label, upper in buckets:
            if upper is None or price < upper:
                price_ranges[label][products_to_indices[k]] = True
                break

    return price_ranges

In [9]:
# Boolean membership mask per product type, keyed by the JSON file's base name.
product_types = create_product_types_dict()

In [10]:
# Boolean membership mask per price-range label, used as the budget filter.
price_ranges = create_price_ranges()

In [11]:
def adjust_sensitivity(ranking, sensitive):
    """Returns the ranking after adjusting scores based on skin sensitivity.

    When `sensitive` is truthy, products in the 'abrasive/scrub' and
    'perfuming' categories are halved and products in 'soothing' are
    multiplied by 1.5. `ranking` is modified in place and also returned.

    Params: {ranking: Numpy Array,
             sensitive: Boolean}
    Returns: Numpy Array 
    """
    if not sensitive:
        return ranking

    # (category, multiplier) pairs, applied in this order.
    adjustments = (
        ('abrasive/scrub', 0.5),
        ('perfuming', 0.5),
        ('soothing', 1.5),
    )
    for category, factor in adjustments:
        for prod in categories[category]['products']:
            ranking[products_to_indices[prod]] *= factor
    return ranking

In [12]:
def adjust_skin_type(ranking, s_type):
    """Returns the ranking after adjusting scores based on skin type.

    'oily' and 'dry' rescale selected categories and product types;
    'combo' (and any other value) leaves the scores untouched.
    `ranking` is modified in place and also returned.

    Params: {ranking: Numpy Array,
             s_type: String}
    Returns: Numpy Array 
    """
    def scale_category(category, factor):
        # Scale every product listed under an ingredient category.
        for prod in categories[category]['products']:
            ranking[products_to_indices[prod]] *= factor

    def scale_type(type_name, factor):
        # Scale every product flagged in a product-type boolean mask.
        ranking[product_types[type_name]] *= factor

    if s_type == 'oily':
        scale_type('face_oil_products', 0.5)
        scale_category('absorbent/mattifier', 1.5)
        scale_type('bha_products', 1.5)
        scale_type('oil_absorbing_products', 1.5)
    elif s_type == 'dry':
        scale_category('absorbent/mattifier', 0.5)
        scale_type('oil_absorbing_products', 0.5)
        scale_category('soothing', 1.5)

    return ranking

In [13]:
def cos_sim(c, tfidf_mat, category_to_idx):
    """Returns the cosine similarity of the query and one document row.

    The query is taken from the last row of `tfidf_mat`; the document is
    the row that `category_to_idx` assigns to `c`.

    Params: {c: String,
             tfidf_mat: np.ndarray,
             category_to_idx: Dict}
    Returns: Float 
    """
    query_vec = tfidf_mat[-1]
    doc_vec = tfidf_mat[category_to_idx[c]]
    norm_product = np.linalg.norm(query_vec) * np.linalg.norm(doc_vec)
    # Floor the denominator so an all-zero vector yields 0 instead of a
    # division by zero.
    return np.dot(query_vec, doc_vec) / max(norm_product, 1e-7)

In [14]:
def claims_similarity(query, product_info, prod_to_idx):
    """ Finds cosine similarity between input query (concerns) and each product's claims. 
        Returns a numpy array with each product's score.

        The TF-IDF row of each product is tracked explicitly, so the result
        stays correct even if `product_info` iterates in a different order
        than the indices in `prod_to_idx`.

        Params: {query: (user input) String,
                 product_info: (product -> Dict) Dict,
                 prod_to_idx: (product -> index) Dict}
        Returns: Numpy Array
    """
    result = np.zeros(len(prod_to_idx))

    # One document per product, then the query as the final row
    # (cos_sim reads the query from the last row of the matrix).
    products = list(product_info)
    documents = [product_info[p]['claims'] for p in products]
    documents.append(query)

    tfidf_vec = TfidfVectorizer(stop_words = 'english')
    tfidf_mat = tfidf_vec.fit_transform(documents).toarray()

    # Row position in the matrix is independent of the score index.
    row_of = {p: row for row, p in enumerate(products)}
    for p in products:
        result[prod_to_idx[p]] += cos_sim(p, tfidf_mat, row_of)

    return result

In [15]:
def concern_similarity(query, category_info, prod_to_idx, category_to_idx):
    """ Finds cosine similarity between input query (concerns) and each product category's concern list. 
        Returns a numpy array with each product's score, based on the categories they are in.

        Also reads the module-level `relevant_product_types` and
        `product_types` to rescale scores per category.

        Params: {query: (user input) String,
                 category_info: (category -> Dict) Dict,
                 prod_to_idx: (product -> index) Dict, 
                 category_to_idx: (category -> index) Dict}
        Returns: Numpy Array
    """
    result = np.zeros(len(prod_to_idx))

    tfidf_vec = TfidfVectorizer(stop_words = 'english')
    # Build the documents from the category_info argument, not the global
    # `categories`, so the function honours the data it is given.
    lst = [category_info[k]['concerns'] for k in category_info]
    lst.append(query)  # the query is the last row, as cos_sim expects
    tfidf_mat = tfidf_vec.fit_transform(lst).toarray()

    for k, v in category_info.items():
        sim = cos_sim(k, tfidf_mat, category_to_idx)
        for p in v['products']:
            result[prod_to_idx[p]] += sim

        # added adjustments
        # NOTE(review): these multipliers run for every category regardless
        # of `sim`, and scale whatever has accumulated so far -- confirm
        # that this is the intended behaviour.
        for category in relevant_product_types[k]['relevant']:
            result[product_types[category]] *= 1.5
        for category in relevant_product_types[k]['irrelevant']:
            result[product_types[category]] *= 0.1

    return result

In [16]:
def rank_products(query, category_info, prod_to_idx, idx_to_prod, product_info, category_to_idx,
                 product_types, price_ranges, product_type=None, skin_type=None, budget=None, sensitivity=None,
                 verbose=True):
    """ Returns a ranked list of products, with the most relevant at index 0.

        Scores are the concern similarity plus twice the claims similarity.
        Ties are broken by "num faves", then by "price" (larger first).
        Products removed by the budget / product_type filters keep a score
        of 0 and still appear in the returned list.

        Params: {query: (user input) String,
                 category_info: (category -> Dict) Dict,
                 prod_to_idx: (product -> index) Dict,
                 idx_to_prod: (index -> product) Dict,
                 product_info: (product -> Dict) Dict,
                 category_to_idx: (category -> index) Dict,
                 product_types: (type name -> Boolean Numpy Array) Dict,
                 price_ranges: (range label -> Boolean Numpy Array) Dict,
                 product_type: key of product_types to keep, or None,
                 skin_type: String passed to adjust_skin_type, or None,
                 budget: key of price_ranges to keep, or None,
                 sensitivity: Boolean passed to adjust_sensitivity, or None,
                 verbose: Boolean, print each ranked product with its score and price}
        Returns: List, or the string 'invalid query' when every score is 0
    """
    scores = concern_similarity(query, category_info, prod_to_idx, category_to_idx)
    scores += 2 * claims_similarity(query, product_info, prod_to_idx)
    if scores.sum() == 0:
        return 'invalid query'

    # ranking adjustments
    if skin_type is not None:
        scores = adjust_skin_type(scores, skin_type)
    if sensitivity is not None:
        scores = adjust_sensitivity(scores, sensitivity)

    # strict filters
    if budget is not None:
        scores[np.invert(price_ranges[budget])] = 0
    if product_type is not None:
        scores[np.invert(product_types[product_type])] = 0

    def sort_key(idx):
        info = product_info[idx_to_prod[idx]]
        return (scores[idx], info["num faves"], info["price"])

    order = sorted(range(len(scores)), key=sort_key, reverse=True)

    if verbose:
        for idx in order:
            print(idx_to_prod[idx], scores[idx])
            # Read the price from the product_info argument rather than the
            # global `data`, so the output matches the data being ranked.
            print(product_info[idx_to_prod[idx]]['price'])

    return [idx_to_prod[idx] for idx in order]

In [17]:
# Sanity check: indexing with a Boolean mask keeps only the positions marked True.
a = np.array([1,2,3])
b = np.array([True, False, False])
print(a[b])

# Spot-check the lookup tables for one product. 3983 is hard-coded to match
# the index printed by the first lookup in the recorded output.
print(products_to_indices["Benzoyl Peroxide 10%"])
print(indices_to_products[3983])
print(product_types['acne_products'][3983])

[1]
3983
Benzoyl Peroxide 10%
True


In [21]:
# Smoke-test query; rank_products returns the string 'invalid query' when
# every score is zero.
concerns = 'burger'
ranking = rank_products(
    concerns,
    categories,
    products_to_indices,
    indices_to_products,
    data,
    category_to_index,
    product_types,
    price_ranges,
    product_type=None,
    budget=None,
)

In [19]:
# abrasive/scrub: 'acne, oil, oily, pores, pore, breakout, breakouts, wrinkles, lines, liney, texture, textured, smooth, smoother, smoothness, dull, dullness, dirt, dirty, makeup, clean, cleaning, comedones, blackhead, blackheads, whitehead, whiteheads'
# absorbent/mattifier: 'acne, oil, oily, pores, pore, pimple, pimples, breakout, breakouts, texture, textured, blackhead, blackheads, whitehead, whiteheads'
# anti-acne: 'acne, red, redness, oil, oily, pores, pore, pimple, pimples, breakout, breakouts, texture, textured, inflamed, inflammation, irritate, irritated, irritation, bump, bumps, bumpy'
# antimicrobial/antibacterial: 'acne, oil, oily, pimple, pimples, breakout, texture, textured, bump, bumps, bumpy, comedones, blackhead, blackheads, whitehead, whiteheads, pore, pores, dirt, dirty, makeup, clean, cleaning'
# antioxidants: 'aging, age, wrinkles, wrinkly, lines, liney, texture, textured, rough, roughness, uneven, even, repair, repaired, repairs, protect, protection, sun, damage, damaged, dead, old, dull, dullness, pigmentation, hyperpigmentation, spots, spotty, dots, dotty, loose, droopy, drooping, sag, saggy, sags, sagging, elastic, elasticity, firm, firmer, firmness, fresh, fresher, freshness, bright, brighter, brightness'
# astringent: 'oil, oily, pores, pore, dirt, dirty, makeup, clean, cleaning, tone, toning, toner'
# buffering: 'irritate, irritated, irritation, inflamed, inflammation, sensitive, sensitivity'
# cell-communicating ingredient: 'aging, age, wrinkles, wrinkly, lines, liney, texture, rough, roughness, smooth, smoother, smoothness, uneven, even, damage, damaged, dead, old, dull, dullness, pigmentation, hyperpigmentation, spots, spotty, dry, dryness, loose, droopy, drooping, sag, saggy, sags, sagging, elastic, elasticity, firm, firmer, firmness, hydrate, hydrated, hydration, moisture, moisturizer, soft, softer, soften, fresh, fresher, freshness'
# chelating: '' # no concerns, just stabilizes products
# colorant: '' # no concerns, just colors products
# deodorant: 'smell, smelling, odor, sweat, sweaty, sweatiness' # unlikely to be a part of our data, but just in case
# emollient: 'itch, itchy, itchiness, redness, red, rosacea, texture, textured, rough, roughness, uneven, damage, damaged, dead, smooth, smoother, smoothness, inflamed, inflammation, irritate, irritated, irritation, dry, dryness, hydrate, hydrated, hydration, moisture, moisturizer, soft, softer, soften'
# emulsifying: '' # no concerns, just helps mix ingredients
# emulsion stabilising: '' # no concerns, just helps products stay mixed/stable
# exfoliant: 'acne, oil, oily, pores, pore, breakout, breakouts, wrinkles, lines, liney, texture, textured, smooth, smoother, smoothness, dull, dullness, dirt, dirty, makeup, clean, cleaning, comedones, blackhead, blackheads, whitehead, whiteheads, bright, brighter, brightness, fresh, fresher, freshness'
# moisturizer/humectant: 'dry, dryness, peel, peeling, itch, itchy, itchiness, moisture, moisturizer, hydrate, hydrated, hydration, texture, rough, roughness, smooth, smoother, smoothness, soft, softer, soften, fresh, fresher, freshness'
# perfuming: '' # no concerns, just fragrance
# preservative: '' # no concerns, just for product longevity
# skin brightening: 'sun, damage, damaged, repair, repaired, repairs, dull, dullness, pigmentation, hyperpigmentation, dark, darker, brown, brownish, spots, spotty, dots, dotty, brighter, brightness, even, uneven, tone, toner, toning'
# skin-identical ingredient: 'aging, age, wrinkles, wrinkly, lines, liney, texture, rough, roughness, smooth, smoother, smoothness, uneven, even, protect, protection, damage, damaged, dead, old, dull, dullness, pigmentation, hyperpigmentation, spots, spotty, dry, dryness, loose, droopy, drooping, sag, saggy, sags, sagging, elastic, elasticity, firm, firmer, firmness, hydrate, hydrated, hydration, moisture, moisturizer, soft, softer, soften, fresh, fresher, freshness'
# solvent: '' # no concerns, just for dissolving ingredients
# soothing: 'itch, itchy, itchiness, redness, red, rosacea, soothe, soothing, smooth, smoother, smoothness, inflamed, inflammation, swelling, irritate, irritated, irritation, patches, patch, acne, breakout, breakouts, pimple, pimples, sensitive, sensitivity, gentle'
# sunscreen: 'sun, damage, damaged, repair, repaired, repairs, protect, protection, dull, dullness, pigmentation, hyperpigmentation, dark, darker, brown, brownish, spots, spotty, dots, dotty, brighter, brightness, even, uneven, tone, toner, toning, aging, age, wrinkles, wrinkly, lines, liney'
# surfactant/cleansing: 'dirt, dirty, makeup, clean, cleaning'
# viscosity controlling: '' # no concerns, just for product thickness