In [1]:
import glob
import json
import os
import re

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def getProducts(i_list, p_data):
    """
    Collect the keys of every product whose ingredient list shares at
    least one entry with the given ingredients.
    :param i_list: Ingredients to look for
    :param p_data: Mapping of product key -> record holding an 'ingredients' sequence
    :return: List of product keys, in the iteration order of p_data
    """
    wanted = set(i_list)
    # any() stops at the first matching ingredient, like an early-exit scan.
    return [
        name
        for name, info in p_data.items()
        if any(ingredient in wanted for ingredient in info['ingredients'])
    ]


In [163]:
# Load the JSON inputs used by the rest of the notebook.
# json.load parses straight from the file handle; the previous
# "\n".join(f.readlines()) round-trip doubled every newline and built a
# throwaway copy of the whole file before parsing.

# Product key -> product record (later cells read 'ingredients', 'price',
# 'claims' and 'num faves' from each record).
with open("finaldata.json", "r") as f:
    data = json.load(f)

# Category name -> ingredients belonging to that category.
# errors='ignore' silently drops bytes that cannot be decoded.
with open("ingredients.json", "r", errors='ignore') as f:
    u_ingredients = json.load(f)

# Category name -> list of concern strings.
with open("concerns.json", "r") as f:
    u_concerns = json.load(f)

# Category name -> {'relevant': [...], 'irrelevant': [...]} product-type file stems.
with open("relevant_types.json", "r") as f:
    relevant_product_types = json.load(f)

# List of review records, each with a 'product' and a 'rate' entry.
with open("reviews.json", "r") as f:
    reviews_lst = json.load(f)
# CHANGE!
# Product-type file stem -> display name of the product type.
with open("product_type_names.json", "r") as f:
    product_file_to_type = json.load(f)

In [164]:
# Turn each product's raw ingredient string into a list of ingredient names.
for _, v in data.items():
    i_string = v['ingredients']
    # Already parsed by an earlier run of this cell: leave the list alone so
    # re-running the cell is a no-op instead of a TypeError inside re.split.
    if not isinstance(i_string, str):
        continue
    # Drop one trailing period; endswith() is also safe on an empty string,
    # where indexing [-1] would raise IndexError.
    if i_string.endswith('.'):
        i_string = i_string[:-1]
    # Ingredients are separated by ", " or ". " (raw string keeps "\." a valid escape).
    i_list = re.split(r", |\. ", i_string)
    # Strip any "label:" prefix (everything up to the last colon) and surrounding whitespace.
    v['ingredients'] = [(re.sub(r".*:", "", s)).strip() for s in i_list]

In [165]:
# One entry per ingredient category, keyed by the lower-cased category name:
# its concerns joined into a single string and the products that contain at
# least one of the category's ingredients.
categories = {}
for name, concern_list in u_concerns.items():
    categories[name.lower()] = {
        "concerns": ", ".join(concern_list),
        "products": getProducts(u_ingredients[name], data),
    }

# Construct auxiliary data structures
num_products = len(data)
category_to_index = {name: position for position, name in enumerate(categories)}
products_to_indices = {name: position for position, name in enumerate(data)}
indices_to_products = {position: name for name, position in products_to_indices.items()}

In [166]:
#CHANGE!
def create_product_types_dict():
    """Returns a dictionary to map product types to Boolean arrays 
    (True if index corresponds to a product of that type).

    Reads every ./product_types_lists/*.json file; each file holds the
    products of one type, and the file name without its extension is
    translated to the type name through the module-level
    product_file_to_type. Listed products that are not in
    products_to_indices are skipped. Prints "can't read" when no file
    is found, in which case the returned dictionary is empty.
    
    Params: {}
    Returns: (String -> Numpy Array) Dict 
    """
    product_types = {}
    product_files = glob.glob("./product_types_lists/*.json")
    if len(product_files) == 0: 
        print('can\'t read')  
    
    for p_file in product_files:
        # Named type_products so the module-level `data` dict is not shadowed.
        with open(p_file) as json_file:
            type_products = json.load(json_file)
        p_arr = np.full(num_products, False)
        for p in type_products:
            if p in products_to_indices:
                p_arr[products_to_indices[p]] = True
        # Derive the stem from the path itself rather than slicing with
        # hard-coded directory/extension lengths.
        file_stem = os.path.splitext(os.path.basename(p_file))[0]
        product_types[product_file_to_type[file_stem]] = p_arr
    
    return product_types

In [167]:
def create_price_ranges():
    """Builds one Boolean array per price bracket; an entry is True when
    the product at that index has a price inside the bracket.

    Each bracket includes its lower bound and excludes its upper bound;
    anything not below 75 lands in '$75+'.
    
    Params: {}
    Returns: (String -> Numpy Array) Dict 
    """
    # (exclusive upper bound, label) for every bracket except the open-ended last one.
    bounded_brackets = ((15, 'under $15'), (30, '$15-30'), (50, '$30-50'), (75, '$50-75'))
    open_bracket = '$75+'

    labels = [label for _, label in bounded_brackets] + [open_bracket]
    price_ranges = {label: np.full(num_products, False) for label in labels}

    for name, info in data.items():
        bracket = open_bracket
        for upper, label in bounded_brackets:
            if info['price'] < upper:
                bracket = label
                break
        price_ranges[bracket][products_to_indices[name]] = True

    return price_ranges

In [168]:
def create_ratings():
    """Builds a Numpy Array holding, at each product index, the rating
    taken from the review records; products without a record keep 0.
    When a product appears in several records the last one wins.
    
    Params: {}
    Returns: Numpy Array
    """
    ratings = np.zeros(num_products)

    for review in reviews_lst:
        rate = review['rate']
        product_idx = products_to_indices[review['product']]
        ratings[product_idx] = rate

    return ratings

In [169]:
# Product type name -> Boolean mask over product indices,
# built from the ./product_types_lists/*.json files.
product_types = create_product_types_dict()

In [170]:
# Price bracket label -> Boolean mask over product indices.
price_ranges = create_price_ranges()

In [171]:
# Rating per product index (0 where no review record exists).
ratings = create_ratings()

In [172]:
def adjust_sensitivity(ranking, sensitive):
    """Returns the ranking after adjusting scores based on skin sensitivity.

    When `sensitive` is truthy, products in the 'scrubs' and 'perfuming'
    categories are halved (a product in both is halved twice) and products
    in the 'soothing' category are multiplied by 1.5. The array is modified
    in place and also returned. Raises KeyError if one of these categories
    is missing from the module-level `categories`.
    
    Params: {ranking: Numpy Array,
             sensitive: Boolean}
    Returns: Numpy Array 
    """
    if sensitive:
        # `categories` keys are stored lower-cased, so 'Scrubs' could never
        # be found; look every category up by its lower-case name.
        for category, factor in (('scrubs', 0.5), ('perfuming', 0.5), ('soothing', 1.5)):
            for prod in categories[category]['products']:
                ranking[products_to_indices[prod]] *= factor
    return ranking

In [173]:
#CHANGE!
def adjust_skin_type(ranking, s_type):
    """Rescales the scores in place according to the skin type and
    returns the same array.

    'oily' halves face oils and boosts absorbent/mattifier, BHA and
    oil-absorbing products; 'dry' halves absorbent/mattifier and
    oil-absorbing products and boosts soothing ones. 'combo' and any
    other value leave the scores untouched.
    
    Params: {ranking: Numpy Array,
             s_type: String}
    Returns: Numpy Array 
    """
    def scale_category(category, factor):
        # Multiply the score of each product listed under an ingredient category.
        for product in categories[category]['products']:
            ranking[products_to_indices[product]] *= factor

    if s_type == 'oily':
        ranking[product_types['Face Oils']] *= 0.5
        scale_category('absorbent/mattifier', 1.5)
        ranking[product_types['BHA Products']] *= 1.5
        ranking[product_types['Oil Absorbing Products']] *= 1.5
    elif s_type == 'dry':
        scale_category('absorbent/mattifier', 0.5)
        ranking[product_types['Oil Absorbing Products']] *= 0.5
        scale_category('soothing', 1.5)

    return ranking

In [174]:
def adjust_rating(ranking, ratings):
    """Rescales the scores in place by a multiplier chosen from each
    product's rating and returns the same array. Only ratings exactly
    equal to 1, 2, 3, 4 or 5 are affected.
    
    Params: {ranking: Numpy Array,
             ratings: Numpy Array}
    Returns: Numpy Array 
    """
    # (rating, multiplier): ratings up to 3 are penalised, 4 and 5 are boosted.
    multipliers = ((1, 0.1), (2, 0.25), (3, 0.5), (4, 1.25), (5, 1.5))
    for stars, factor in multipliers:
        ranking[ratings == stars] *= factor

    return ranking

In [175]:
def cos_sim(c, tfidf_mat, category_to_idx):
    """Cosine similarity between the query row (the last row of the
    matrix) and the row that category_to_idx assigns to `c`.
    
    Params: {c: String,
             tfidf_mat: np.ndarray,
             category_to_idx: Dict}
    Returns: Float 
    """
    query_vec = tfidf_mat[-1]
    target_vec = tfidf_mat[category_to_idx[c]]

    dot_product = np.dot(query_vec, target_vec)
    norm_product = np.linalg.norm(query_vec) * np.linalg.norm(target_vec)

    # Floor the denominator so an all-zero row yields 0 instead of dividing by zero.
    return dot_product / max(norm_product, 1e-7)

In [176]:
def claims_similarity(query, product_info, prod_to_idx):
    """ Finds cosine similarity between input query (concerns) and each product's claims. 
        Returns a numpy array with each product's score.

        The TF-IDF matrix has one row per product, in the iteration order of
        product_info, followed by one final row for the query.
        
        Params: {query: (user input) String,
                 product_info: (product -> Dict) Dict,
                 prod_to_idx: (product -> index) Dict}
        Returns: Numpy Array
    """
    result = np.zeros(len(prod_to_idx))
                      
    tfidf_vec = TfidfVectorizer(stop_words = 'english')
    product_names = list(product_info.keys())
    documents = [product_info[name]['claims'] for name in product_names]
    documents.append(query)
    tfidf_mat = tfidf_vec.fit_transform(documents).toarray()

    # Matrix rows follow product_names, which need not match prod_to_idx.
    # Look rows up through an explicit row map and use prod_to_idx only to
    # place the score in the result array.
    row_of = {name: row for row, name in enumerate(product_names)}
    for name in product_names:
        result[prod_to_idx[name]] += cos_sim(name, tfidf_mat, row_of)
        
    return result

In [179]:
def concern_similarity(query, category_info, prod_to_idx, category_to_idx):
    """ Finds cosine similarity between input query (concerns) and each product category's concern list. 
        Returns a numpy array with each product's score, based on the categories they are in.

        The TF-IDF matrix has one row per category, in the iteration order of
        category_info, followed by one final row for the query; category_to_idx
        must map each category to its row in that order. Also reads the
        module-level relevant_product_types, product_types and
        product_file_to_type for the product-type multipliers.
        
        Params: {query: (user input) String,
                 category_info: (category -> Dict) Dict,
                 prod_to_idx: (product -> index) Dict, 
                 category_to_idx: (category -> index) Dict}
        Returns: Numpy Array
    """
    result = np.zeros(len(prod_to_idx))
                      
    tfidf_vec = TfidfVectorizer(stop_words = 'english')
    # Build the documents from the category_info argument; the global
    # `categories` was used here before, which ignored the parameter.
    lst = [category_info[k]['concerns'] for k in category_info.keys()]
    lst.append(query)
    tfidf_mat = tfidf_vec.fit_transform(lst).toarray()
    
    for k,v in category_info.items():
        sim = cos_sim(k, tfidf_mat, category_to_idx)
        for p in v['products']:
            result[prod_to_idx[p]] += sim
            
        # added adjustments
        # CHANGE
        # NOTE(review): these multipliers scale the running totals accumulated
        # so far, for every category and regardless of `sim`, so their effect
        # depends on category order -- confirm this is intended.
        for category in relevant_product_types[k]['relevant']:
            result[product_types[product_file_to_type[category]]] *= 1.5
        for category in relevant_product_types[k]['irrelevant']:
            result[product_types[product_file_to_type[category]]] *= 0.1
        
    return result

In [180]:
def rank_products(query, category_info, prod_to_idx, idx_to_prod, product_info, category_to_idx,
                 product_types, price_ranges, ratings, product_type=None, skin_type=None, budget=None, sensitivity=None,
                 verbose=True):
    """ Returns a ranked list of products, with the most relevant at index 0.

        The score is the category-concern similarity plus twice the claims
        similarity, rescaled by rating, skin type and sensitivity. Products
        outside the requested budget or product type are zeroed, and only
        products with a non-zero score are returned. Ties are broken by
        rating, then price, then 'num faves', all in descending order.
        
        Params: {query: (user input) String,
                 category_info: (category -> Dict) Dict,
                 prod_to_idx: (product -> index) Dict,
                 idx_to_prod: (index -> product) Dict,
                 product_info: (product -> Dict) Dict,
                 category_to_idx: (category -> index) Dict,
                 product_types: (product type -> Boolean Numpy Array) Dict,
                 price_ranges: (price range -> Boolean Numpy Array) Dict,
                 ratings: Numpy Array,
                 product_type: String or None (no product type filter),
                 skin_type: String or None (no skin type adjustment),
                 budget: String key of price_ranges, or None (no price filter),
                 sensitivity: Boolean or None (no sensitivity adjustment),
                 verbose: Boolean, print each ranked product with its score,
                          price and rating (default True)}
        Returns: List, or the string 'invalid query' when every score is 0
    """
    scores = concern_similarity(query, category_info, prod_to_idx, category_to_idx)
    scores += 2 * claims_similarity(query, product_info, prod_to_idx)
    # Nothing matched the query at all.
    if not np.any(scores): return 'invalid query'
    
    # ranking adjustments
    scores = adjust_rating(scores, ratings)
    if skin_type is not None:
        scores = adjust_skin_type(scores, skin_type)
    if sensitivity is not None:
        scores = adjust_sensitivity(scores, sensitivity)
    
    # strict filters
    if budget is not None:
        scores[np.invert(price_ranges[budget])] = 0
    if product_type is not None:
        scores[np.invert(product_types[product_type])] = 0
    
    # Zero-scored products sort last, so cutting at this length drops exactly them.
    len_rank = np.count_nonzero(scores)
    
    scores_idx = [(val, prod) for prod, val in enumerate(scores)]
    rank_idx = sorted(scores_idx, key = lambda x: (x[0], ratings[x[1]], product_info[idx_to_prod[x[1]]]["price"],
                                                  product_info[idx_to_prod[x[1]]]["num faves"]), reverse = True)
    ranking = [idx_to_prod[prod] for _, prod in rank_idx][:len_rank]
    if verbose:
        for r in ranking:
            print(r, scores[[prod_to_idx[r]]])
            # Read the price from product_info, not from the global `data`.
            print(product_info[r]['price'], ratings[[prod_to_idx[r]]])
    
    return ranking

In [186]:
# Example query: rank products for the given concerns, restricted to the
# 'under $15' price range, with no other filters or adjustments.
concerns = 'fine lines and wrinkles'
ranking = rank_products(
    query=concerns,
    category_info=categories,
    prod_to_idx=products_to_indices,
    idx_to_prod=indices_to_products,
    product_info=data,
    category_to_idx=category_to_index,
    product_types=product_types,
    price_ranges=price_ranges,
    ratings=ratings,
    product_type=None,
    skin_type=None,
    budget='under $15',
    sensitivity=None,
)

0.2% Retinol Fine Line Correcting Serum [1.62727711]
8.0 [5.]
Bakuchiol [1.36425031]
9.99 [4.]
Healthy Skin Anti-Aging Perfector Moisturizing Retinol Treatment SPF 20 [1.34936151]
13.99 [4.]
0.5% Retinol with Rosehip Seed Oil Super Strength Formula Conditioning & Fine Line Serum [1.33549595]
14.0 [5.]
Retinol Face Serum [1.15761255]
9.99 [5.]
Granactive Retinoid 2% in Squalane [1.15761255]
9.6 [5.]
Restorative Night Moisturizer [1.15536654]
12.99 [4.]
Nourish Oil-Free Antioxidant Facial Moisturizer [1.15536654]
5.99 [4.]
Nourishing Hot Cloth Cleanser [1.04993694]
8.99 [5.]
Dual Action Skin Lightener [0.97211149]
10.99 [5.]
Hyaluronic Acid Face Serum [0.96658987]
7.99 [4.]
Vitamin C Face Serum [0.94758845]
9.99 [4.]
Protect & Perfect Hand Cream SPF 15 [0.93681576]
14.69 [5.]
Hydrating Cleansing Oil [0.92429323]
12.99 [5.]
Healing Ointment [0.92429323]
10.99 [5.]
Hydrating Cleanser [0.92429323]
9.99 [5.]
Sunscreen Stick Broad Spectrum SPF 50 [0.92429323]
9.99 [5.]
Baby Moisturizing Lotio

AM Facial Moisturizing Lotion Broad Spectrum SPF 30 [0.4576546]
14.99 [5.]
Makeup Removing Cleanser Cloths [0.4576546]
8.99 [5.]
PM Facial Moisturizing Lotion [0.4576546]
8.29 [5.]
Hydrocortisone Anti-Itch Cream [0.4576546]
6.99 [5.]
Skin Renew Anti-Sun Damage SPF 28 [0.40156366]
12.99 [3.]
Face + Neck Face Factor 30 Broad Spectrum SPF 30 [0.39126053]
12.95 [5.]
Cicaplast Baume B5 [0.38886553]
14.99 [4.]
15% Vitamin C and EGF Serum [0.38886553]
14.99 [4.]
Prep + Brighten Rose Face Oil [0.38886553]
14.0 [4.]
Prep + Soothe Camellia Face Oil [0.38886553]
14.0 [4.]
BFD Cleansing Oil [0.38886553]
14.0 [4.]
Silicone-Free Priming Moisturizer [0.38886553]
14.0 [4.]
Coconut Face Milk [0.38886553]
14.0 [4.]
Clean It Zero Foam Cleanser [0.38886553]
14.0 [4.]
Eczema Therapy Moisturizing Cream [0.38886553]
13.99 [4.]
Eye Makeup Remover Lotion Oil-Free [0.38886553]
13.95 [4.]
Rosa Mosqueta Rose Hip Seed Oil [0.38886553]
13.63 [4.]
Visibly Even Daily Moisturizer SPF 30 [0.38886553]
13.49 [4.]
Apricot

7.99 [3.]
Sport Sunscreen Stick SPF 55 [0.30809774]
4.99 [3.]
Enrich Moisturizing Face Lotion SPF 15 [0.30809774]
3.99 [3.]
Hydrating Body Wash [0.30510306]
10.99 [5.]
Niacinamide [0.30510306]
6.99 [5.]
Q10 Anti-Wrinkle Face Lotion With SPF 15 Sunscreen [0.28967161]
10.99 [2.]
Zit Stick [0.28017692]
14.0 [4.]
Age Defying Anti-Wrinkle Eye Cream [0.27870007]
14.99 [2.]
AHA 30% + BHA 2% Peeling Solution [0.26953373]
7.2 [4.]
Neutrogena Men’s Triple Protect Face Lotion SPF 20 [0.26760478]
6.99 [3.]
Cica Repair Sleep Paste [0.25924369]
14.99 [4.]
Make Them Jelly Hi-Lite [0.25924369]
14.99 [4.]
Restoring Antioxidant Face Mask with Green Tea [0.25924369]
14.99 [4.]
Borage Dry Skin Therapy Lotion [0.25924369]
14.5 [4.]
Soothing Aloe Vera Pure Natural Body Wash [0.25924369]
13.79 [4.]
Glowing Apricot Pure Natural Body Wash [0.25924369]
13.79 [4.]
Softening Mango Pure Natural Body Wash [0.25924369]
13.49 [4.]
Purifying Tea Tree Pure Natural Body Wash [0.25924369]
13.49 [4.]
Restorative Skin Ther

Prickly Pear Hydrating Body Cleansing Gel [0.15554621]
9.99 [3.]
Water Boost Skin Quench Sleep Cream [0.15554621]
9.99 [3.]
Yes to Blueberries Age Refresh Foaming Facial Cleanser [0.15554621]
9.99 [3.]
Mineral Sport Sunscreen Stick SPF 50 [0.15554621]
9.99 [3.]
Rapid Wrinkle Repair Prep Cleanser [0.15554621]
9.99 [3.]
Soothing Clear Turmeric Mousse Cleanser [0.15554621]
9.99 [3.]
Cleansing Foam [0.15554621]
9.5 [3.]
Daily Cleansing Cloths [0.15554621]
9.49 [3.]
Complexion Refining Deep Clean Mousse [0.15554621]
9.39 [3.]
Water Babies Pure & Simple Free SPF 50 [0.15554621]
8.99 [3.]
Hot Spots with SPF 30 [0.15554621]
8.95 [3.]
Alpha Arbutin 2% + HA [0.15554621]
8.9 [3.]
Mineral UV Filters SPF 15 with Antioxidants [0.15554621]
8.9 [3.]
Glycolic Acid 7% Toning Solution [0.15554621]
8.7 [3.]
Blackhead Clearing Scrub [0.15554621]
8.49 [3.]
Soothing Aloe Vera Pure Natural Hand Soap [0.15554621]
8.39 [3.]
Clarify & Cleanse Bar [0.15554621]
8.0 [3.]
Pore Refining Daily Cleanser [0.15554621]
7.

Sheer Touch Lotion Sunscreen Broad Spectrum SPF 50 [0.10269925]
9.99 [2.]
Shimmer Effect Lotion Sunscreen with Mica Minerals Broad Spectrum SPF 40 [0.10269925]
9.99 [2.]
Shimmer Effect Lotion Sunscreen with Mica Minerals Broad Spectrum SPF 20 [0.10269925]
9.99 [2.]
Refreshmint Cucumber & Bamboo Eye De-Puffer [0.10269925]
9.95 [2.]
Wet Skin Sunscreen Spray SPF 30 [0.10269925]
9.49 [2.]
Advanced Therapy Hand Cream [0.10269925]
9.49 [2.]
Sheer Touch Lotion Sunscreen Broad Spectrum SPF 30 [0.10269925]
8.99 [2.]
Total Effects 7-in-1 Advanced Anti-Aging Deep Penetrating Moisture Body Wash [0.10269925]
8.49 [2.]
Advanced Therapy Lotion [0.10269925]
8.49 [2.]
Dr. Carver’s Easy Shave Butter [0.10269925]
8.0 [2.]
Solutions Plus+ Total Radiance Thermal Cleanser [0.10269925]
8.0 [2.]
Special Hand Cream with Vitamin E [0.10269925]
8.0 [2.]
Sensitive Skin Face Lotion Sunscreen Broad Spectrum SPF 30 [0.10269925]
7.99 [2.]
Pumice Foot Polish [0.10269925]
7.47 [2.]
Moisturizing Foot Creme [0.10269925]


Ideal Moisture Normal Skin Day Lotion SPF 25 [0.07777311]
6.99 [2.]
Revitalift Radiant Smoothing Cream Cleanser [0.07777311]
6.99 [2.]
Ideal Moisture Dry Skin Day Lotion SPF 25 [0.07777311]
6.99 [2.]
Pure Clay Cleanser Detox & Brighten [0.07777311]
6.99 [2.]
MEN+CARE Clean Comfort Body and Face Wash [0.07777311]
6.79 [2.]
MEN+CARE Extra Fresh Body and Face Wash [0.07777311]
6.79 [2.]
Revitalift Radiant Smoothing Wet Cleansing Towelettes [0.07777311]
6.69 [2.]
Men+Care Sensitive Fresh Awake Body and Face Wash [0.07777311]
6.49 [2.]
Nourishing Cocoa Butter Pure Natural Moisturizing Creme [0.07777311]
6.39 [2.]
Res-Q Ointment [0.07777311]
6.0 [2.]
Moisture Therapy Intensive Extra Strength Cream [0.07777311]
6.0 [2.]
Facial Cleansing Towelettes Cucumber and Sage [0.07777311]
6.0 [2.]
Clean Express! Facial Towelettes [0.07777311]
5.99 [2.]
Yes to Tomatoes Detoxifying Charcoal Facial Wipes [0.07777311]
5.99 [2.]
Lemon Cleansing Wipes [0.07777311]
5.99 [2.]
Lemon Gel Cleanser [0.07777311]
5.9

Studio Eye Refresh [0.05184874]
3.0 [2.]
Yes to Miracle Oil Argan Oil Mud Mask [0.05184874]
2.99 [2.]
Yes to Coconut Ultra Hydrating Paper Mask [0.05184874]
2.99 [2.]
Yes to Grapefruit Brightening Vitamin C Glow Boosting Paper Mask [0.05184874]
2.99 [2.]
True Match Lumi Glotion Natural Glow Enhancer [0.05085051]
14.99 [2.]
Mist Ultimate Hydration Essence Cooling with Cucumber Water & White Mint [0.05085051]
13.49 [2.]
Anti-Wrinkle Vitamin A Glycolic Scrub [0.05085051]
12.75 [2.]
Eye Booster 2-in-1 Day & Night Lash Boosting Serum [0.05085051]
10.95 [2.]
Summer Shine Body Lotion [0.05085051]
10.0 [2.]
Expert Anti-Redness Serum [0.05085051]
7.99 [2.]
Expert Anti-Blemish Serum [0.05085051]
7.99 [2.]
Expert Sensitive Anti-Blemish Serum [0.05085051]
7.99 [2.]
Studio BB Cream SPF 20 [0.04943306]
6.0 [1.]
Vitamin C Renewal Refreshing Cleansing Gel [0.04666386]
12.5 [1.]
Vitamin C Renewal Hydrating Cleansing Milk [0.04666386]
12.5 [1.]
Hydro Boost Water Gel Lotion Sunscreen SPF 50 [0.04666386]


Keep Clear Clarifying Tonic [0.0207395]
10.0 [1.]
NIVEA Sun-Kissed Radiance [Fair to Medium Skin] [0.0207395]
9.99 [1.]
Lemongrass + Moroccan Argan Oil Firming Body Lotion [0.0207395]
9.99 [1.]
Natural Glow & Protect Daily Moisturizer SPF 20 for Fair to Medium Skin Tones [0.0207395]
9.99 [1.]
Yes to Grapefruit Dark Spot Correcting Body Creme [0.0207395]
9.99 [1.]
Yes to Grapefruit Exfoliating Body Wash [0.0207395]
9.99 [1.]
Yes to Tomatoes Clear Skin Detoxifying Charcoal Warming Facial Scrub [0.0207395]
9.99 [1.]
NIVEA Sun-Kissed Radiance [Medium to Dark Skin] [0.0207395]
9.99 [1.]
Silk Hydration Lotion Sunscreen Broad Spectrum SPF 12 [0.0207395]
9.99 [1.]
Absolutely Ageless Pre-Tox Peel Off Mask [0.0207395]
9.99 [1.]
Rosa Mosqueta & English Lavender Facial Toner, for Dry Skin [0.0207395]
9.92 [1.]
Green Tea & Ginkgo Facial Toner, for Normal Skin [0.0207395]
9.92 [1.]
Original Strength Medicated Body Lotion [0.0207395]
9.79 [1.]
Vegecol with Aloe & Oatmeal Soothing Mask, for Sensitive 