In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [61]:
# TODO (next iteration): turn each category's 'products' into a counter dict and
# weight scores by the counts, e.g. {'B': 4} would mean 4 of B's ingredients
# are antioxidants.

# Category name -> the concern keywords it addresses (one comma-separated
# string) and the list of products that belong to the category.
categories = {
    'antioxidant': {
        'concerns': 'aging, wrinkles, lines, texture, age, old, dull',
        'products': ['A', 'B', 'C'],
    },
    'anti-acne': {
        'concerns': 'acne, redness, oil, oily, pores, pimple, pimples, breakout, texture',
        'products': ['1', '2', '3', '4', 'A'],
    },
}
print(categories)

{'antioxidant': {'concerns': 'aging, wrinkles, lines, texture, age, old, dull', 'products': ['A', 'B', 'C']}, 'anti-acne': {'concerns': 'acne, redness, oil, oily, pores, pimple, pimples, breakout, texture', 'products': ['1', '2', '3', '4', 'A']}}


In [62]:
# Position of each category name in the iteration order of `categories`.
concern_to_index = dict(zip(categories, range(len(categories))))
print(concern_to_index)

{'antioxidant': 0, 'anti-acne': 1}


In [63]:
def cos_sim(c, tfidf_mat, concern_to_index):
    """Returns the cosine similarity of the query and a concern list.

    The query vector is the last row of tfidf_mat; the concern list's vector
    is the row that concern_to_index maps c to.

    Params: {c: String (a key of concern_to_index),
             tfidf_mat: np.ndarray (one row per document, query row last),
             concern_to_index: Dict (name -> row index into tfidf_mat)}
    Returns: Float. 0.0 when either vector has zero norm: the cosine is
             undefined there and the plain division would yield nan.
    Raises: KeyError if c is not a key of concern_to_index.
    """
    # query is last row
    query_vec = tfidf_mat[-1]
    concern_vec = tfidf_mat[concern_to_index[c]]

    denom = np.linalg.norm(query_vec) * np.linalg.norm(concern_vec)
    if denom == 0:
        # An all-zero row (e.g. a document with no retained terms) cannot be
        # compared by angle; treat it as "no similarity" instead of 0/0.
        return 0.0

    return np.dot(query_vec, concern_vec) / denom

In [66]:
def concern_similarity(concerns, categories, products_to_indices):
    """ Finds cosine similarity between input query (concerns) and each product category's concern list.
        Returns a numpy array with each product's score, based on the categories they are in.

        A product's score is the sum of the similarities of every category that
        lists it, so a product appearing in several categories accumulates.

        Params: {concerns: (user input) String,
                 categories: (category -> Dict('products', 'concerns')) Dict,
                 products_to_indices: (product -> index) Dict}
        Returns: Numpy Array of length len(products_to_indices), or the string
                 'invalid query' when the query has no usable terms or no
                 product receives a nonzero score.
        Raises: KeyError if a category lists a product that is missing from
                products_to_indices.
    """
    result = np.zeros(len(products_to_indices))

    # One document per category, in the dict's iteration order, with the query
    # appended last (cos_sim reads the query from the last row).
    category_names = list(categories)
    documents = [categories[name]['concerns'] for name in category_names]
    documents.append(concerns)

    tfidf_vec = TfidfVectorizer(stop_words = 'english')
    tfidf_mat = tfidf_vec.fit_transform(documents).toarray()

    # Row lookup built from the same ordering as `documents`, so it always
    # matches the `categories` argument instead of depending on a module-level
    # mapping that may be stale or describe a different dict.
    category_to_row = {name: row for row, name in enumerate(category_names)}

    # A query whose terms were all dropped (empty, or stop words only) is an
    # all-zero row; its cosine is undefined, so reject it before scoring.
    if not tfidf_mat[-1].any():
        return 'invalid query'

    for name, info in categories.items():
        sim = cos_sim(name, tfidf_mat, category_to_row)
        for product in info['products']:
            result[products_to_indices[product]] += sim

    # for invalid query: nothing in the query matched any category
    if not result.any():
        return 'invalid query'
    return result

In [72]:
# Demo query. Alternative query kept for reference: 'dryness'
concerns = 'some redness, texture, and acne'

# Product name -> slot in the score array, and the reverse lookup.
products_to_indices = dict(zip(['A', 'B', 'C', '1', '2', '3', '4'], range(7)))
indices_to_products = dict(
    zip(products_to_indices.values(), products_to_indices.keys())
)

arr = concern_similarity(concerns, categories, products_to_indices)
print(arr)

[0.56070853 0.11282497 0.11282497 0.44788357 0.44788357 0.44788357
 0.44788357]


In [73]:
# Product name -> its favourites count and price, built from compact
# (name, num faves, price) rows.
products = {
    name: {"num faves": faves, "price": price}
    for name, faves, price in [
        ('A', 9, 10.45),
        ('B', 3, 4.45),
        ('C', 12, 10.99),
        ('1', 6, 8.00),
        ('2', 6, 5.80),
        ('3', 2, 6.99),
        ('4', 0, 18.99),
    ]
}

In [75]:
# Pair every score with the index of the product it belongs to: (score, index).
arr_idx = [(score, idx) for idx, score in enumerate(arr)]


def _rank_key(pair):
    """Sort key for a (score, index) pair: best product sorts first.

    Order of precedence: higher similarity score, then more favourites, then
    LOWER price. Score and favourites are negated so a plain ascending sort
    puts the largest first while price stays ascending; a single
    reverse=True over the whole tuple would also flip price and rank the more
    expensive of two otherwise-tied products first.
    """
    score, idx = pair
    info = products[indices_to_products[idx]]
    return (-score, -info["num faves"], info["price"])


rank_idx = sorted(arr_idx, key=_rank_key)
print(rank_idx)

[(0.5607085343890859, 0), (0.4478835659540915, 4), (0.4478835659540915, 3), (0.4478835659540915, 5), (0.4478835659540915, 6), (0.11282496843499437, 2), (0.11282496843499437, 1)]


In [77]:
# Translate the ranked (score, index) pairs back into product names.
ranking = [indices_to_products[idx] for _score, idx in rank_idx]
print(ranking)

['A', '2', '1', '3', '4', 'C', 'B']
