In [1]:
import pandas as pd
import os.path
import math
import re
import random

In [2]:
root_path = os.path.dirname(os.getcwd())
flavor_data_raw = pd.read_csv(os.path.join(root_path, 'DATA/flavor_data.csv'))
flavor_data = flavor_data_raw.replace(float('nan'), '')
salad_data_impractical = flavor_data[flavor_data['salad']=='y']
salad_data = salad_data_impractical[(salad_data_impractical['salad_umbrella'] != 'y') & (salad_data_impractical['redirect'] != 'y')]

salad_data_basic = salad_data[salad_data['salad_basic'] == 'y']
# salad_data = salad_data_basic

salad_data.reset_index(inplace=True)

In [3]:
def get_terms_from_pairs_with(pairs_with):
    if str(pairs_with) == 'nan':
        return []
    else:
        return [term.strip() for term in pairs_with.split('\n\n') if term.strip() != '']

# break entries in column that has 'pairs with' strings into lists of ingredient terms
ingredient_pairs_with_terms = salad_data['pairs_with'].apply(get_terms_from_pairs_with)

In [4]:
# create list of all terms, ignoring case and excluding duplicates
all_terms = list(set(ingredient_pairs_with_terms.sum()))
all_terms_lower = list(set([term.lower() for term in ingredient_pairs_with_terms.sum()]))

In [5]:
# !pip install inflect
import inflect
p = inflect.engine()

In [6]:
def get_normal_tokens(phrase):
    # should add and/or, but the data is good enough and I don't want to mess up the manual entry
    tokens = [token.strip().lower() for token in re.split('\(|\)|,|e\.g\.|esp\.|and|—|or|aka|see|see also|;', phrase)]
#     print(tokens)
    tokens = [p.singular_noun(token) or token for token in tokens if token != '']
#     print(tokens)
    return tokens

def get_score(name, term):
    name_tokens = get_normal_tokens(name)
#     print(name_tokens)
    term_tokens = get_normal_tokens(term)
#     print(term_tokens)
    common_tokens = list(set(name_tokens).intersection(set(term_tokens)))
#     print(common_tokens)
    return len(common_tokens)

In [7]:
# # doing it this way so I can add 'print' to monitor progress
# score_data = []
# for name in salad_data['name']:
#     print(name)
#     score_data.append([get_score(name, term) for term in all_terms])

# term_name_scores = pd.DataFrame(score_data, columns = all_terms)
# term_name_scores['name'] = salad_data['name'].values

# term_name_scores.to_csv(os.path.join(root_path, 'DATA/term_name_scores_common.csv'), index=False)

In [8]:
term_name_matches_raw = pd.read_csv(os.path.join(root_path, 'DATA/term_name_matches.csv'))
term_name_matches = term_name_matches_raw.replace(['0', '1', '2', '3', '4', '5', 0, 1, 2, 3, 4, 5, float('nan')], '')
term_name_matches_lower = term_name_matches.replace('Y', 'y')

In [9]:
# create a dataframe with name and a list of pairing terms for each ingredient
pairing_data = pd.DataFrame({
    'name': salad_data['name'],
    'pairs_with_terms': ingredient_pairs_with_terms
})

def get_pairs_with_names(row):
#     print(row['name'], type(row['name']))
    lower_names = []
    upper_names = []
    for term in row['pairs_with_terms']:
        if term in term_name_matches.columns.values.tolist():
            lower_names += term_name_matches[term_name_matches[term] == 'y']['name'].values.tolist()
            upper_names += term_name_matches[term_name_matches[term] == 'Y']['name'].values.tolist()
        else:
            print('OH NO didnt find term:', term)
#     for name in lower_names + upper_names:
#         if name == row['name']:
#             print('DUPLICATE')
#     print(lower_names, upper_names)
    row['lower_pairs_with_names'] = [lower_name for lower_name in lower_names if lower_name != row['name']]
    row['upper_pairs_with_names'] = [upper_name for upper_name in upper_names if upper_name != row['name']]
    row['all_pairs_with_names'] = lower_names + upper_names
    return row

pairing_data = pairing_data.apply(get_pairs_with_names, axis=1)

OH NO didnt find term: Central American cuisine
OH NO didnt find term: eggs, e.g., huevos rancheros, omelets
OH NO didnt find term: *GUACAMOLE
OH NO didnt find term: nachos
OH NO didnt find term: rice, e.g., brown, red
OH NO didnt find term: SALAD DRESSINGS, e.g., Green Goddess
OH NO didnt find term: sandwiches, e.g., cheese
OH NO didnt find term: SOUPS, e.g., avocado, cold, tortilla
OH NO didnt find term: stuffed avocados
OH NO didnt find term: sushi, vegetarian, e.g., hand rolls, nori rolls
OH NO didnt find term: veggie burgers, e.g., as a topping


In [10]:
# name_name_clashes_blank = salad_data[['name', 'protein_cheese_sub', 'salad_allium', 'fruit', 'veg']].copy()
# names = salad_data['name'].values.tolist()

# for i, col_name in enumerate(names):
#     name_name_clashes_blank[col_name] = pd.Series(['x']*(i+1) + ['']*(len(names)-(i+1)))

# name_name_clashes_blank.to_csv(os.path.join(root_path, 'DATA/name_name_clashes_blank.csv'), index=False)

In [11]:
name_name_clashes_input = pd.read_csv(os.path.join(root_path, 'DATA/name_name_clashes_input.csv'))
name_name_clashes_input = name_name_clashes_input.replace([float('nan'), 'x'], '')
columns = ['name'] + name_name_clashes_input['name'].values.tolist()
name_name_clashes_input = name_name_clashes_input[columns]

def get_clashes_with_data(row):
    data = pd.Series([])
#     print(row.index.values.tolist())
    name = row['name']
    data['name'] = name
    
    lower_names = name_name_clashes_input['name'][name_name_clashes_input[name] == 'y'].values.tolist()
    upper_names = name_name_clashes_input['name'][name_name_clashes_input[name] == 'Y'].values.tolist()
    if 'y' in lower_names:
        print(lower_names)
    
    for name in name_name_clashes_input['name']:
#         if name in ['y', 'Y']:
#             print(name)
        if row[name] == 'y':
            lower_names.append(name)
        elif row[name] == 'Y':
            upper_names.append(name)
    
    lower_names = list(set(lower_names))
    upper_names = list(set(upper_names))
    
    data['lower_clashes_with_names'] = lower_names
    data['upper_clashes_with_names'] = upper_names
    data['all_clashes_with_names'] = list(set(lower_names + upper_names)) # shouldn't be overlap here, but hey
    
    data['lower_clashes_with_pairs'] = [tuple(sorted([name, lower_name])) for lower_name in data['lower_clashes_with_names']]
    data['upper_clashes_with_pairs'] = [tuple(sorted([name, lower_name])) for lower_name in data['upper_clashes_with_names']]
    data['all_clashes_with_pairs'] = [tuple(sorted([name, lower_name])) for lower_name in data['all_clashes_with_names']]
    
    
    return data

clashes_with_data = name_name_clashes_input.apply(get_clashes_with_data, axis=1)
            
# # for row in name_name_clashes_input.iterrows():
# clashes_with_data = pd.DataFrame({'name': salad_data['name']})
# for i, row in clashes_with_data.iterrows():
# #     print(row, type(row))
#     name = row['name']
#     row['lower_clashes_with'] = name_name_clashes_input[name][name_name_clashes_input[name] == 'y']
#     row['upper_clashes_with'] = name_name_clashes_input[name][name_name_clashes_input[name] == 'Y']
#     row['all_clashes_with'] = row['lower_clashes_with'] + row['upper_clashes_with']

In [12]:
all_lower_clashes_with_pairs = list(set(clashes_with_data['lower_clashes_with_pairs'].sum()))
all_upper_clashes_with_pairs = list(set(clashes_with_data['upper_clashes_with_pairs'].sum()))
all_clashes_with_pairs = list(set(clashes_with_data['all_clashes_with_pairs'].sum()))

In [13]:
# !pip install pyvis
from pyvis import network as net

In [14]:
salad_greens = salad_data[salad_data['salad_green'] == 'y']

salad_extras = salad_data[salad_data['salad_extra'] == 'y']
salad_extra_veg = salad_data[(salad_data['veg'] == 'y') & (salad_data['salad_extra'] == 'y')]
salad_extra_fruits = salad_data[(salad_data['fruit'] == 'y') & (salad_data['salad_extra'] == 'y')]
salad_extra_nuts = salad_data[(salad_data['protein_seed'] == 'y') & (salad_data['salad_extra'] == 'y')]
salad_extra_seeds = salad_data[(salad_data['protein_nut'] == 'y') & (salad_data['salad_extra'] == 'y')]
salad_extra_tomatoes = salad_data[salad_data['salad_extra_tomato'] == 'y']
salad_extra_olives = salad_data[salad_data['salad_extra_olive'] == 'y']
salad_extra_cheeses = salad_data[salad_data['salad_extra_cheese'] == 'y']
salad_extra_eggs = salad_data[salad_data['salad_extra_egg'] == 'y']
salad_extra_croutons = salad_data[salad_data['salad_extra_crouton'] == 'y']

salad_dressing_oils = salad_data[salad_data['salad_dressing_oil'] == 'y']
salad_dressing_vinegars = salad_data[salad_data['salad_dressing_vinegar'] == 'y']
salad_dressing_salts = salad_data[salad_data['salad_dressing_salt'] == 'y']
salad_dressing_peppers = salad_data[salad_data['salad_dressing_pepper'] == 'y']
salad_dressing_garlics = salad_data[salad_data['salad_dressing_garlic'] == 'y']

In [15]:
# !pip install networkx
import networkx as nx

In [44]:
#IDEAS
# savory v sweet templates (avoid eggs/mushroom/hard veg w fruit...)
# get eg working, weight it as 2
# get rid of ingredients pairing w themselves??
# start out w 1 base salad, extra (randomly selected from vip list), then branch out
# control for eg lots of types of citrus, alliums
# mark strong flavors, treat them separately
# mark potential clashes eg allium & fruit, treat them separately
# select main ingredients from each category that go with each other, then branch out from each, weighting traditionally at the end
# work off of pairs_with data for categories, while at same time picking categories then within categories (to account for eg allium bias)
# create a clashes_with matrix,filter out matches/ingredients on this
# *only* match eg.s with specifically named; also, consider not matching categories the weird way
# add a bonus if category of ingredient pairs well with other ingredients in salad (meh)
# scratch match categories (don't pair individual terms w umbrella names? tho, I guess it's moot if we omit umbrellas)

# TODO
# deal w pairings being added from both sides? not sure if this is a problem, but
    # actually, could record paired_with, from the other side; maybe even emphasis on pairings shared by both sides

# PROBLEM
    # clashes w is punishing fruit pretty heavily. might fix w bias.
    
# NEXT PRIORITIES
    # balance
        # designed to encompass prev 'bias'
        # could be flavor, could be food group
        # parse flavor, volume to create table of names v y/Y/n/N for sweet, salty, sour, savory, bitter, spicy
        # maybe also mouthfeel
            # crunchy/chewy/smooth
            # dry/wet
    # eg
        # should improve pairing significantly
    # speed
        # not a priority til UI development
    
# NEXT BIG THINGS
    # flavor balance?
    
    
# ALGORITHM A (best of many)

# THE PLAN
# count all the lower and upper connections among selected ingredients
# go through a buncha times, select the one w highest counts

top_score = 0
for i in range(300):
    n_subgraphs = 2
#     while n_subgraphs > 1 or n_extra_subgraphs > 1: # keep shuffling until you get a well connected graph
    while n_subgraphs > 1: # keep shuffling until you get a well connected graph
        n_greens = random.randrange(2, 4)
        n_extras = random.randrange(2, 6)
        n_dressing_oils = 1
        n_dressing_vinegars = 1
        n_dressing_salts = 1
        n_dressing_peppers = 1
        # n_dressing_garlics = random.randrange(0, 2) # maybe make presence dependent on the rest. or, just leave out for now.
        
        selected_greens = salad_greens.sample(n_greens)
        selected_extras = salad_extras.sample(n_extras)
        selected_dressing_oils = salad_dressing_oils.sample(n_dressing_oils)
        selected_dressing_vinegars = salad_dressing_vinegars.sample(n_dressing_vinegars)
        selected_dressing_salts = salad_dressing_salts.sample(n_dressing_salts)
        selected_dressing_peppers = salad_dressing_peppers.sample(n_dressing_peppers)

        selected_ingredients = selected_greens.append(selected_extras).append(selected_dressing_oils).append(selected_dressing_vinegars).append(selected_dressing_salts).append(selected_dressing_peppers)

        upper_pairs = []
        lower_pairs = []
        for ingredient_name in selected_ingredients['name']:
#             print(pairing_data['upper_pairs_with_names'])
            for pairs_with_name in pairing_data['upper_pairs_with_names'][pairing_data['name'] == ingredient_name].iloc[0]:
#                 print(pairs_with_name in selected_ingredients['name'].values.tolist())
                if pairs_with_name in selected_ingredients['name'].values.tolist():
#                     print('UPPER PAIR:', ingredient_name, 'WITH', pairs_with_name)
                    lower_pairs.append(tuple(sorted([ingredient_name, pairs_with_name])))
            for pairs_with_name in pairing_data['lower_pairs_with_names'][pairing_data['name'] == ingredient_name].iloc[0]:
                if pairs_with_name in selected_ingredients['name'].values.tolist():
#                     print('LOWER PAIR:', ingredient_name, 'WITH', pairs_with_name)
                    upper_pairs.append(tuple(sorted([ingredient_name, pairs_with_name])))

#         upper_pairs_sets = [sorted(pair) for pair in upper_pairs]
#         upper_pairs_unique = list(set(upper_pairs_sets))
        
        upper_pairs = list(set(upper_pairs))
        lower_pairs = list(set(lower_pairs).difference(set(upper_pairs))) # if a pair is both lower and upper, count it as upper
        all_pairs = list(set(upper_pairs + lower_pairs))

        G = nx.Graph()
        G.add_nodes_from(selected_ingredients['name'].values.tolist())
        G.add_edges_from(upper_pairs + lower_pairs)
        n_subgraphs = len(list(nx.connected_component_subgraphs(G)))
#         print(len(selected_ingredients), n_subgraphs)
        
#         extra_pairs = []
#         for ingredient_name in selected_extras['name']:
#             for pairs_with_name in pairing_data['upper_pairs_with_names'][pairing_data['name'] == ingredient_name].iloc[0]:
#                 if pairs_with_name in selected_extras['name'].values.tolist():
#                     extra_pairs.append([ingredient_name, pairs_with_name])

#         G_extra = nx.Graph()
#         G_extra.add_nodes_from(selected_extras['name'].values.tolist())
#         G_extra.add_edges_from(extra_pairs)
#         n_extra_subgraphs = len(list(nx.connected_component_subgraphs(G_extra)))
# #         print(len(selected_extras), n_extra_subgraphs)
# #         print()
            
    score = 0

#     upper_pairs_score = len(upper_pairs)/(n_greens+n_extras)**2
#     print(upper_pairs_score)
#     score += upper_pairs_score

#     lower_pairs_score = len(lower_pairs)/(n_greens+n_extras)**2/3
#     print(lower_pairs_score)

# PAIRING DENSITY BONUS ============================================================================================
    # ranges from roughly (.5 to 1) * 3
    average_shortest_path_length = nx.average_shortest_path_length(G)
    average_shortest_path_score = 3 / average_shortest_path_length
#     print(average_shortest_path_score)
    score += average_shortest_path_score
    
# UPPER PAIRING BONUS ==============================================================================================
    # ranges from roughly (.05 to .25) * 4
    upper_proportion_score = len(upper_pairs) / len(all_pairs) * 4 # messed with this, not sure if it still workss
#     print(upper_proportion_score)
    score += upper_proportion_score

#     # ranges from roughly () * 
#     upper_per_node_score = len(upper_pairs) / len(selected_ingredients)
# #     print(upper_per_node_score)
#     score += upper_per_node_score
    
#     # ranges from roughly (something_small to 1) * .5; hopefully linear, maybe normal?..
#     # designed to pull n distribution to wherever I feel it should be (correcting for bias, mostly)
#     reasonable_n_score = .5 / (1 + abs(3 - n_greens) + abs(8 - n_extras))
#     score += reasonable_n_score
    
#     # ranges from roughly (.5 to 1) * 2
#     if len(selected_extras) != 1:
#         extra_average_shortest_path_length = nx.average_shortest_path_length(G_extra)
#         extra_average_shortest_path_score = 2 / average_shortest_path_length
#     else:
#         extra_average_shortest_path_length = 1
#     score += extra_average_shortest_path_score

#     n_clashing_pairs = len(set(all_pairs).intersection(set(all_clashes_with_pairs)))
#     if n_clashing_pairs > 0:
#         print(n_clashing_pairs, all_pairs)

# CLASH PENALTY ====================================================================================================
    # ranges from roughly (0 to 4) / -2
#     upper_clashing_pairs = []
#     lower_clashing_pairs = []
    all_clashing_pairs = []
    selected_ingredients_list = selected_ingredients['name'].values.tolist()
    for name in selected_ingredients_list:
#         names_that_upper_clash_with_name = clashes_with_data['upper_clashes_with_names'][clashes_with_data['name'] == name].iloc[0]
#         upper_clashing_names = set(selected_ingredients_list).intersection(set(names_that_upper_clash_with_name)) # selected names that clash with this selected name
#         upper_clashing_pairs += [tuple(sorted([name, upper_clashing_name])) for upper_clashing_name in upper_clashing_names]

        names_that_clash_with_name = clashes_with_data['all_clashes_with_names'][clashes_with_data['name'] == name].iloc[0]
        all_clashing_names = set(selected_ingredients_list).intersection(set(names_that_clash_with_name)) # selected names that clash with this selected name
        all_clashing_pairs += [tuple(sorted([name, all_clashing_name])) for all_clashing_name in all_clashing_names]

    all_clashing_pairs = list(set(all_clashing_pairs))
#     print('NUM CLASHING PAIRS', len(all_clashing_pairs), all_clashing_pairs)
    
    score -= len(all_clashing_pairs) / 2
#     print(score)
#     print()
   
# FRUIT BONUS ======================================================================================================
    # ranges from roughly (0 to 3) * .1
    n_fruit = len(selected_ingredients[selected_ingredients['fruit'] == 'y'])
#     print(n_fruit)
    score += n_fruit * .1
    
# NUT SEED BONUS ===================================================================================================    
    # ranges from roughly (0 to 2) * .15
    n_nut_seed = len(selected_ingredients[selected_ingredients['protein_nut_seed'] == 'y'])
#     print(n_nut_seed)
    score += n_nut_seed * .15
    
    
#     print(score)
#     print()
#     print('SCORE', score)
#     print()
    if score > top_score:
        top_score = score
        top_upper_pairs = upper_pairs
        top_lower_pairs = lower_pairs
        top_selected_ingredients = selected_ingredients
        top_average_shortest_path_length = average_shortest_path_length
        top_upper_proportion = len(upper_pairs) / (len(upper_pairs) + len(lower_pairs))
print('TOP AVG SHORTEST PATH LENGTH', top_average_shortest_path_length)
print('TOP UPPER PROPORTION', top_upper_proportion)
print('TOP_SCORE', top_score)
            
    
top_net = net.Network(notebook=True)

nodes = top_selected_ingredients['name'].tolist()

def get_color(row):
#     print(type(row))
#     print(row)
    if row['salad_green'] == 'y':
        return 'lightgreen'
    elif row['salad_extra'] == 'y':
        if row['veg'] == 'y':
            return 'green'
        elif row['fruit'] == 'y':
            return 'orange'
        elif row['protein_nut_seed'] == 'y':
            return 'brown'
        else:
            return 'lightblue'
    elif row['salad_dressing'] == 'y':
        return 'lightgrey'
    
nodes_color = top_selected_ingredients.apply(get_color, axis=1).tolist()

top_net.add_nodes(
    nodes=nodes,
    color=nodes_color
)


for pair in top_upper_pairs:
    top_net.add_edge(pair[0], pair[1], color='black')

for pair in top_lower_pairs:
    top_net.add_edge(pair[0], pair[1], physics=False, color='lightgrey')

top_net.show('top_net.html')

TOP AVG SHORTEST PATH LENGTH 1.0727272727272728
TOP UPPER PROPORTION 0.9607843137254902
TOP_SCORE 6.789747424393487


In [17]:
# ALGORITHM B (random control)
n_greens = random.randrange(1, 4)
n_extras = random.randrange(0, 10)
n_dressing_oils = 1
n_dressing_vinegars = 1
n_dressing_salts = 1
n_dressing_peppers = 1
# n_dressing_garlics = random.randrange(0, 2) # maybe make presence dependent on the rest. or, just leave out for now.

selected_greens = salad_greens.sample(n_greens)
selected_extras = salad_extras.sample(n_extras)
selected_dressing_oils = salad_dressing_oils.sample(n_dressing_oils)
selected_dressing_vinegars = salad_dressing_vinegars.sample(n_dressing_vinegars)
selected_dressing_salts = salad_dressing_salts.sample(n_dressing_salts)
selected_dressing_peppers = salad_dressing_peppers.sample(n_dressing_peppers)

selected_ingredients = selected_greens.append(selected_extras).append(selected_dressing_oils).append(selected_dressing_vinegars).append(selected_dressing_salts).append(selected_dressing_peppers)

upper_pairs = []
lower_pairs = []
for ingredient_name in selected_ingredients['name']:
    for pairs_with_name in pairing_data['upper_pairs_with_names'][pairing_data['name'] == ingredient_name].iloc[0]:
        if pairs_with_name in selected_ingredients['name'].values.tolist():
#             print('UPPER PAIR:', ingredient_name, 'WITH', pairs_with_name)
            upper_pairs.append([ingredient_name, pairs_with_name])
    for pairs_with_name in pairing_data['lower_pairs_with_names'][pairing_data['name'] == ingredient_name].iloc[0]:
        if pairs_with_name in selected_ingredients['name'].values.tolist():
#             print('LOWER PAIR:', ingredient_name, 'WITH', pairs_with_name)
            lower_pairs.append([ingredient_name, pairs_with_name])

score = (len(upper_pairs) * 3 + len(lower_pairs)) / (n_greens + n_extras)
print('RANDOM SCORE', score)



random_net = net.Network(notebook=True)

nodes = selected_ingredients['name'].tolist()

def get_color(row):
#     print(type(row))
#     print(row)
    if row['salad_green'] == 'y':
        return 'lightgreen'
    elif row['salad_extra'] == 'y':
        if row['veg'] == 'y':
            return 'green'
        elif row['fruit'] == 'y':
            return 'orange'
        elif row['protein_nut_seed'] == 'y':
            return 'brown'
        else:
            return 'lightblue'
    elif row['salad_dressing'] == 'y':
        return 'lightgrey'
    
nodes_color = selected_ingredients.apply(get_color, axis=1).tolist()

random_net.add_nodes(
    nodes=nodes,
    color=nodes_color
)


for pair in upper_pairs:
    random_net.add_edge(pair[0], pair[1], color='black')

for pair in lower_pairs:
    random_net.add_edge(pair[0], pair[1], physics=False, color='lightgrey')

random_net.show('selected_net.html')

RANDOM SCORE 8.6
