# Nutrient optimization

## todo

* parallel GA with deap
    * DONE: that was too easy!
* API
* set initial seed population with clusters
* GA: remove nutrients that are uncontrained when solving (eg, 1e10)
    * DONE: this doesn't speed up the function
* random seed doesn't work

## Import custom helper functions

* def load_data():
* def do_clust(N,lim,req,nut):
* def evaluate(individual, nut,limt,reqd):
* def makeclusters(nclust,limt,reqd,nutrients ):
* def InitPopulation( pcls, ind_init,nfood, nclust, nseed,clust):
* def generate_ssdum(random, args):

In [1]:
import sys
print(sys.path)
from lib.libraries import *

['/home/pedwards/diet_ga', '/home/pedwards/miniconda3/envs/diet/lib/python37.zip', '/home/pedwards/miniconda3/envs/diet/lib/python3.7', '/home/pedwards/miniconda3/envs/diet/lib/python3.7/lib-dynload', '', '/home/pedwards/miniconda3/envs/diet/lib/python3.7/site-packages', '/home/pedwards/miniconda3/envs/diet/lib/python3.7/site-packages/IPython/extensions', '/home/pedwards/.ipython']


## Import external libraries

Most are standard, but we want the glpk solver for cvxopt, which requires the following,
```
$ sudo apt-get install libglpk-dev
$ sudo CVXOPT_BUILD_GLPK=1 pip install cvxopt
```

In [2]:
from os import path

import pickle
import pandas
import numpy
from deap import base, creator, tools, algorithms
from sklearn.preprocessing import normalize
from cvxopt import matrix, solvers # an alternative linear programming library
solvers.options['show_progress'] = False
solvers.options['glpk'] = {'msg_lev' : 'GLP_MSG_OFF'} #mute all output from glpk

from sklearn.cluster import KMeans
import random
from time import time
import multiprocessing

pandas.set_option('display.max_rows', None)
pandas.set_option('display.max_columns', None)

In [3]:
def find_diet(N_FOODS=6,exclude_food_ids=[],include_food_ids=[],metric_nutrients=[208],metric_weights=[1]):
    #
    # Internal constants
    #
    Nseed=500
    #
    # Load nutrient data
    #
    (nutrients,reqd,limt,food_desc,nutrient_desc)=load_data()
    print( '[*] Loaded %d foods from database' % nutrients.shape[0] )
    NT_DIM=nutrients.shape[0]
    
    #
    # drop any foods that we passed in exclude list
    #
    if len(exclude_food_ids)>0:
        valid_drop=list(set(nutrients.index) & set(exclude_food_ids)) # the food ids that are passed and are in the index
        if len(valid_drop)>0:
            nutrients.drop(index=valid_drop,inplace=True)
    
    #
    # drop any foods that we passed in exclude list
    #
    if len(include_food_ids)>0:
        valid_keep=list(set(nutrients.index) & set(include_food_ids)) # the food ids that are passed and are in the index
        if len(valid_keep)>0:
            nutrients=nutrients.loc[valid_keep,:]
            
    #
    # Load food clusters
    #
    cluster_food_count=0
    if path.exists('clust.pkl'):
        clust=pickle.load( open( "clust.pkl", "rb" ) )
        print( '[*] Found pickle file with %d clusters and %d foods' % (clust.max()+1,len(clust)) )
        Nclust=clust.max()+1
        cluster_food_count=len(clust)
    else:
        print('error')

    if cluster_food_count != nutrients.shape[0] :
        print('error')
        
    Nclust=clust.max()+1
    
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMin) # an individual comprises a list (of food IDs)

    toolbox = base.Toolbox()
    pool = multiprocessing.Pool()
    toolbox.register("map", pool.map)
    # Attribute generator 
    toolbox.register("attr_foodid", random.randrange, NT_DIM)
    # Structure initializers
    toolbox.register("individual", tools.initRepeat, creator.Individual, 
        toolbox.attr_foodid, N_FOODS)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutUniformInt, low=0, up=NT_DIM, indpb=0.1)
    #toolbox.register("select", tools.selBest, k=3)
    toolbox.register("select", tools.selTournament, tournsize=10)
    toolbox.register("evaluate", evaluate, nut=nutrients,limt=limt,reqd=reqd,metric_nutrients=metric_nutrients,metric_weights=metric_weights)

    # used to make a seed population (only) ; per: https://deap.readthedocs.io/en/master/tutorials/basic/part1.html?highlight=seeding#seeding-a-population
    toolbox.register("population_guess", InitPopulation, list, creator.Individual, N_FOODS,Nclust,Nseed,clust )

    stats = tools.Statistics(key=lambda ind: ind.fitness.values)
    stats.register("min", numpy.min)
    stats.register("median", numpy.median)
    stats.register("max", numpy.max)
    
    #pop = toolbox.population(n=300) # totally random initial population
    pop = toolbox.population_guess()
    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=50,stats=stats, verbose=True)
    
    # clean up
    pool.close()
    del creator.Class
    
    return(pop)

In [4]:
food_sets=get_food_lists()

In [5]:
pop=find_diet(N_FOODS=6,include_food_ids=food_sets['veg_with_eggs_and_dairy'])
best=tools.selBest(pop, k=1)
best=best[0]

[*] Loaded 4482 foods from database
[*] Found pickle file with 15 clusters and 4482 foods
error


IndexError: positional indexers are out-of-bounds

In [7]:
best

[1222, 489, 2078, 1794, 2867, 2184]

In [9]:
(nutrients,reqd,limt,food_desc,nutrient_desc)=load_data()

evaluate(best, nut=nutrients,limt=limt,reqd=reqd)
nt=nutrients.iloc[best,:]
c = matrix(numpy.repeat(1.0,nt.shape[0]))
np_G= numpy.concatenate(
                        (   nt.transpose().values, 
                            nt.transpose().multiply(-1.0).values,
                            numpy.diag(numpy.repeat(-1,nt.shape[0])) 
                        )
                       ).astype(numpy.double) 
G = matrix( np_G ) 
h = matrix( numpy.concatenate( (
                limt.values, 
                reqd.multiply(-1.0).values, 
                numpy.repeat(0.0,nt.shape[0])
            ) ).astype(numpy.double) )    
o = solvers.lp(c, G, h, solver='glpk')
food_amounts=numpy.array(o['x'])[:,0]

In [18]:
list(numpy.round(abs(food_amounts),5))

[7.70038, 0.0, 7.11531, 8.27619, 2.98706, 0.809]

In [17]:
list(nt.index)

[9042, 5284, 11642, 11137, 15029, 11801]

In [8]:
nt.loc[:,[208,301]] *numpy.array([1,2])

nutrient_id,208,301
food_id,Unnamed: 1_level_1,Unnamed: 2_level_1
11965,31.0,66.0
11693,32.0,68.0
23592,177.0,14.0
11143,16.0,80.0
19273,342.0,0.0
23380,150.0,34.0


In [21]:
nt.loc[:,[208,301]].values *numpy.array([1,2])

array([[292.,  22.],
       [ 47.,  26.],
       [124.,  16.],
       [138., 566.],
       [ 28.,  66.],
       [144.,  30.]])

## Load food & nutrients from database

In [10]:
(nutrients,reqd,limt,food_desc,nutrient_desc)=load_data()
print( '[*] Loaded %d foods from database' % nutrients.shape[0] )

[*] Loaded 4482 foods from database


## Constants

In [11]:
N_FOODS=6
Nseed=500
NT_DIM=nutrients.shape[0]
Nclust=15

## Cluster

Observation: the optimization converges faster (and to lower minima) if provided a "seed" population with random baskets of *diverse* foods

Technique: I use a kmeans to find clusters of food types then sample from them (with a multinomial dist)

In [12]:
cluster_food_count=0

if path.exists('clust.pkl'):
    clust=pickle.load( open( "clust.pkl", "rb" ) )
    print( '[*] Found pickle file with %d clusters and %d foods' % (clust.max()+1,len(clust)) )
    Nclust=clust.max()+1
    cluster_food_count=len(clust)

if cluster_food_count != nutrients.shape[0] :
    print( '[*] Rebuilding clusters...' )
    clust=makeclusters(Nclust,limt,reqd,nutrients )
    pickle.dump( clust, open( "clust.pkl", "wb" ) )

[*] Found pickle file with 15 clusters and 4482 foods


## Genetic algorithm

This is the outermost optimization layer (the inner optim is in `evaluate()`)


Todo: play with early stopping. no need to keep going after improvement has ceased or slowed.

In [13]:
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin) # an individual comprises a list (of food IDs)

toolbox = base.Toolbox()
pool = multiprocessing.Pool()
toolbox.register("map", pool.map)
# Attribute generator 
toolbox.register("attr_foodid", random.randrange, NT_DIM)
# Structure initializers
toolbox.register("individual", tools.initRepeat, creator.Individual, 
    toolbox.attr_foodid, N_FOODS)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutUniformInt, low=0, up=NT_DIM, indpb=0.1)
#toolbox.register("select", tools.selBest, k=3)
toolbox.register("select", tools.selTournament, tournsize=10)
toolbox.register("evaluate", evaluate, nut=nutrients,limt=limt,reqd=reqd,metr=[208],weights=[1])

# used to make a seed population (only) ; per: https://deap.readthedocs.io/en/master/tutorials/basic/part1.html?highlight=seeding#seeding-a-population
toolbox.register("population_guess", InitPopulation, list, creator.Individual, N_FOODS,Nclust,Nseed,limt,reqd,nutrients )

stats = tools.Statistics(key=lambda ind: ind.fitness.values)
stats.register("min", numpy.min)
stats.register("median", numpy.median)
stats.register("max", numpy.max)



In [14]:
%%time

pop = toolbox.population(n=300) # totally random initial population
#pop = toolbox.population_guess()
pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=50, 
                                   stats=stats, verbose=True)

TypeError: evaluate() got an unexpected keyword argument 'metr'

In [15]:
best=tools.selBest(pop, k=1)
best=best[0]
evaluate(best, nut=nutrients,limt=limt,reqd=reqd,metr=[208],weights=[1])
nt=nutrients.iloc[best,:]
c = matrix(numpy.repeat(1.0,nt.shape[0]))
np_G= numpy.concatenate(
                        (   nt.transpose().values, 
                            nt.transpose().multiply(-1.0).values,
                            numpy.diag(numpy.repeat(-1,nt.shape[0])) 
                        )
                       ).astype(numpy.double) 
G = matrix( np_G ) 
h = matrix( numpy.concatenate( (
                limt.values, 
                reqd.multiply(-1.0).values, 
                numpy.repeat(0.0,nt.shape[0])
            ) ).astype(numpy.double) )    
o = solvers.lp(c, G, h, solver='glpk')
food_amounts=numpy.array(o['x'])[:,0]


TypeError: evaluate() got an unexpected keyword argument 'metr'

In [None]:
o

## Print the best "diet"

This is the food and corresponding amount to eat (in grams, sorry bud!). The idea is if you eat all this in a day you have the nutrients you need for the day.

Todo: this would be better if split up into recipes. 

In [None]:
final_foods= [ best[i] for i in range(len(food_amounts)) if food_amounts[i]>1e-6 ] # select those with non-trivial amounts
final_food_amounts= food_amounts[ food_amounts>1e-6 ]

nt=nutrients.iloc[final_foods,:]
df1= nt.join(food_desc).loc[:,['long_desc']] #food_desc.iloc[final_foods]
df2=pandas.DataFrame(final_food_amounts*100,index=df1.index,columns=["amount"])
df3=pandas.DataFrame(nt.loc[:,208].values * df2.loc[:,'amount'].values/100 ,columns=['calories'], index=df2.index)
diet_table=df1.join(df2).join(df3)

In [None]:
diet_table.to_json(orient='split')

In [None]:
nutrient_totals=pandas.DataFrame( ( 
                    numpy.dot( numpy.transpose(final_food_amounts), nt.values),
                    reqd,
                    limt
                  ), index=['Total','Amount required','Amount limit'], columns=nt.columns).transpose()

In [None]:
A=nt.join(df1).set_index('long_desc').transpose() * final_food_amounts
nutrient_report=A.join(nutrient_desc).join(nutrient_totals).set_index('name')
nutrient_report