In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [17]:
# For next time: change products to a counter dictionary! then weigh by counts
# so {'B': 4} means 4 of B's ingredients are antioxidants

# Need a categories data structure like this, with two fields for each category:
# 'products': list of product names that contain 1+ ingredients in this category
# 'concerns': String of keywords (added to the bottom of this notebook)

# Each category maps to its concern keywords and to the products that contain
# at least one ingredient of that category.
categories = {
    'antioxidant': {
        'concerns': 'aging, wrinkles, lines, texture, age, old, dull',
        'products': ['A', 'B', 'C'],
    },
    'anti-acne': {
        'concerns': 'acne, redness, oil, oily, pores, pimple, pimples, breakout, texture',
        'products': ['1', '2', '3', '4', 'A'],
    },
}
print(categories)

{'antioxidant': {'concerns': 'aging, wrinkles, lines, texture, age, old, dull', 'products': ['A', 'B', 'C']}, 'anti-acne': {'concerns': 'acne, redness, oil, oily, pores, pimple, pimples, breakout, texture', 'products': ['1', '2', '3', '4', 'A']}}


In [18]:
# Number the categories in the order they appear in `categories`.
category_to_index = dict(zip(categories, range(len(categories))))
print(category_to_index)

{'antioxidant': 0, 'anti-acne': 1}


In [19]:
def cos_sim(c, tfidf_mat, category_to_idx):
    """Returns the cosine similarity of the query and a category's concern list.

    The query vector is read from the last row of `tfidf_mat`; the category
    vector is the row at `category_to_idx[c]`.

    Params: {c: String (category name),
             tfidf_mat: np.ndarray,
             category_to_idx: (category -> row index) Dict}
    Returns: Float (0.0 when either vector has zero norm)
    """
    # query is last row
    query_vec = tfidf_mat[-1]
    category_vec = tfidf_mat[category_to_idx[c]]

    denom = np.linalg.norm(query_vec) * np.linalg.norm(category_vec)
    # An all-zero row would otherwise produce 0/0 = NaN, which poisons every
    # sum built from this value; report "no similarity" instead.
    if denom == 0:
        return 0.0

    return np.dot(query_vec, category_vec) / denom

In [20]:
def concern_similarity(query, category_info, prod_to_idx, category_to_idx):
    """ Finds cosine similarity between input query (concerns) and each product category's concern list.
        Returns a numpy array with each product's score, based on the categories they are in.

        A product listed under several categories accumulates the similarity of each of them.

        Params: {query: (user input) String,
                 category_info: (category -> Dict) Dict,
                 prod_to_idx: (product -> index) Dict,
                 category_to_idx: (category -> index) Dict}
        Returns: Numpy Array, or the String 'invalid query' when every product scores 0
    """
    result = np.zeros(len(prod_to_idx))

    # cos_sim finds a category's row through category_to_idx, so the documents
    # must be laid out in index order. Assumes category_to_idx numbers the
    # categories of category_info as 0..n-1.
    ordered_names = sorted(category_info, key=lambda name: category_to_idx[name])
    documents = [category_info[name]['concerns'] for name in ordered_names]
    documents.append(query)  # cos_sim reads the query from the last row

    tfidf_vec = TfidfVectorizer(stop_words = 'english')
    tfidf_mat = tfidf_vec.fit_transform(documents).toarray()

    for name, info in category_info.items():
        sim = cos_sim(name, tfidf_mat, category_to_idx)
        # A zero vector (empty concern list, or a query made only of stop words)
        # can yield NaN; treat it as no similarity so the check below still works.
        if np.isnan(sim):
            continue
        for p in info['products']:
            result[prod_to_idx[p]] += sim

    # for invalid query
    if not result.any(): return 'invalid query'
    return result

In [21]:
# Example of what a user would type in as their concerns.
concerns = 'some redness, texture, and acne'

# Must be built so that every product in our data gets an index;
# here the products are simply numbered in listing order.
products_to_indices = {name: idx for idx, name in enumerate(['A', 'B', 'C', '1', '2', '3', '4'])}

# Reverse lookup: index -> product name.
indices_to_products = {idx: name for name, idx in products_to_indices.items()}

In [22]:
# Stand-in for the actual product data, which has more fields than just
# "num faves" and "price".
products = {
    name: {"num faves": faves, "price": price}
    for name, faves, price in [
        ('A', 9, 10.45),
        ('B', 3, 4.45),
        ('C', 12, 10.99),
        ('1', 6, 8.00),
        ('2', 6, 5.80),
        ('3', 2, 6.99),
        ('4', 0, 18.99),
    ]
}

In [23]:
def rank_products(query, category_info, prod_to_idx, idx_to_prod, product_info, category_to_idx):
    """ Returns a ranked list of products, with the most relevant at index 0.

        Products are ordered by descending similarity score; ties are broken by
        descending "num faves" and then by descending "price".

        Params: {query: (user input) String,
                 category_info: (category -> Dict) Dict,
                 prod_to_idx: (product -> index) Dict,
                 idx_to_prod: (index -> product) Dict,
                 product_info: (product -> Dict) Dict,
                 category_to_idx: (category -> index) Dict}
        Returns: List (empty when concern_similarity reports an invalid query)
    """
    scores = concern_similarity(query, category_info, prod_to_idx, category_to_idx)

    # concern_similarity signals an unusable query with a string rather than an
    # array; enumerating that string would rank its characters, so stop here.
    if isinstance(scores, str):
        return []

    def sort_key(idx):
        info = product_info[idx_to_prod[idx]]
        # NOTE(review): with reverse=True a higher price wins a tie -- confirm intended.
        return (scores[idx], info["num faves"], info["price"])

    ranked_indices = sorted(range(len(scores)), key=sort_key, reverse=True)
    return [idx_to_prod[idx] for idx in ranked_indices]

In [24]:
ranking = rank_products(
    query=concerns,
    category_info=categories,
    prod_to_idx=products_to_indices,
    idx_to_prod=indices_to_products,
    product_info=products,
    category_to_idx=category_to_index,
)
print(ranking)

['A', '1', '2', '3', '4', 'C', 'B']


In [11]:
# abrasive/scrub: 'acne, oil, oily, pores, pore, breakout, breakouts, wrinkles, lines, liney, texture, textured, smooth, smoother, smoothness, dull, dullness, dirt, dirty, makeup, clean, cleaning, comedones, blackhead, blackheads, whitehead, whiteheads'
# absorbent/mattifier: 'acne, oil, oily, pores, pore, pimple, pimples, breakout, breakouts, texture, textured, blackhead, blackheads, whitehead, whiteheads'
# anti-acne: 'acne, red, redness, oil, oily, pores, pore, pimple, pimples, breakout, breakouts, texture, textured, inflamed, inflammation, irritate, irritated, irritation, bump, bumps, bumpy'
# antimicrobial/antibacterial: 'acne, oil, oily, pimple, pimples, breakout, texture, textured, bump, bumps, bumpy, comedones, blackhead, blackheads, whitehead, whiteheads, pore, pores, dirt, dirty, makeup, clean, cleaning'
# antioxidants: 'aging, age, wrinkles, wrinkly, lines, liney, texture, textured, rough, roughness, uneven, even, repair, repaired, repairs, protect, protection, sun, damage, damaged, dead, old, dull, dullness, pigmentation, hyperpigmentation, spots, spotty, dots, dotty, loose, droopy, drooping, sag, saggy, sags, sagging, elastic, elasticity, firm, firmer, firmness, fresh, fresher, freshness, bright, brighter, brightness'
# astringent: 'oil, oily, pores, pore, dirt, dirty, makeup, clean, cleaning, tone, toning, toner'
# buffering: 'irritate, irritated, irritation, inflamed, inflammation, sensitive, sensitivity'
# cell-communicating ingredient: 'aging, age, wrinkles, wrinkly, lines, liney, texture, rough, roughness, smooth, smoother, smoothness, uneven, even, damage, damaged, dead, old, dull, dullness, pigmentation, hyperpigmentation, spots, spotty, dry, dryness, loose, droopy, drooping, sag, saggy, sags, sagging, elastic, elasticity, firm, firmer, firmness, hydrate, hydrated, hydration, moisture, moisturizer, soft, softer, soften, fresh, fresher, freshness'
# chelating: '' # no concerns, just stabilizes products
# colorant: '' # no concerns, just colors products
# deodorant: 'smell, smelling, odor, sweat, sweaty, sweatiness' # unlikely to be a part of our data, but just in case
# emollient: 'itch, itchy, itchiness, redness, red, rosacea, texture, textured, rough, roughness, uneven, damage, damaged, dead, smooth, smoother, smoothness, inflamed, inflammation, irritate, irritated, irritation, dry, dryness, hydrate, hydrated, hydration, moisture, moisturizer, soft, softer, soften'
# emulsifying: '' # no concerns, just helps mix ingredients
# emulsion stabilising: '' # no concerns, just helps products stay mixed/stable
# exfoliant: 'acne, oil, oily, pores, pore, breakout, breakouts, wrinkles, lines, liney, texture, textured, smooth, smoother, smoothness, dull, dullness, dirt, dirty, makeup, clean, cleaning, comedones, blackhead, blackheads, whitehead, whiteheads, bright, brighter, brightness, fresh, fresher, freshness'
# moisturizer/humectant: 'dry, dryness, peel, peeling, itch, itchy, itchiness, moisture, moisturizer, hydrate, hydrated, hydration, texture, rough, roughness, smooth, smoother, smoothness, soft, softer, soften, fresh, fresher, freshness'
# perfuming: '' # no concerns, just fragrance
# preservative: '' # no concerns, just for product longevity
# skin brightening: 'sun, damage, damaged, repair, repaired, repairs, dull, dullness, pigmentation, hyperpigmentation, dark, darker, brown, brownish, spots, spotty, dots, dotty, brighter, brightness, even, uneven, tone, toner, toning'
# skin-identical ingredient: 'aging, age, wrinkles, wrinkly, lines, liney, texture, rough, roughness, smooth, smoother, smoothness, uneven, even, protect, protection, damage, damaged, dead, old, dull, dullness, pigmentation, hyperpigmentation, spots, spotty, dry, dryness, loose, droopy, drooping, sag, saggy, sags, sagging, elastic, elasticity, firm, firmer, firmness, hydrate, hydrated, hydration, moisture, moisturizer, soft, softer, soften, fresh, fresher, freshness'
# solvent: '' # no concerns, just for dissolving ingredients
# soothing: 'itch, itchy, itchiness, redness, red, rosacea, soothe, soothing, smooth, smoother, smoothness, inflamed, inflammation, swelling, irritate, irritated, irritation, patches, patch, acne, breakout, breakouts, pimple, pimples, sensitive, sensitivity, gentle'
# sunscreen: 'sun, damage, damaged, repair, repaired, repairs, protect, protection, dull, dullness, pigmentation, hyperpigmentation, dark, darker, brown, brownish, spots, spotty, dots, dotty, brighter, brightness, even, uneven, tone, toner, toning, aging, age, wrinkles, wrinkly, lines, liney'
# surfactant/cleansing: 'dirt, dirty, makeup, clean, cleaning'
# viscosity controlling: '' # no concerns, just for product thickness