In [24]:
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from scipy.special import digamma
from collections import defaultdict
from itertools import chain
from scipy.stats import dirichlet
from collections import Counter
from random import random
import json
import heapq
from operator import itemgetter

In [25]:
# Configure the LDA estimator. No cell shown here calls fit(): the driver cell
# assigns components_ / exp_dirichlet_component_ / doc_topic_prior_ by hand and
# then only calls transform().
LDA = LatentDirichletAllocation(
    n_topics=160,
    learning_method='online',
    n_jobs=1,
    random_state=1,
    max_iter=500,
    max_doc_update_iter=1000,
    perp_tol=1e-3,
    evaluate_every=10,
    verbose=1,
)

In [26]:
# Matrix whose columns are indexed in step with `word_list` by the driver cell
# (presumably the topic-word matrix of a previously fitted LDA -- TODO confirm).
components = np.load('./data/LDAcomponent.npy')

# Mapping consumed by words_to_ingredients: word -> list of candidate ingredient names.
with open('./data/ingredients_dict.json') as json_data:
    ingredient_dict = json.load(json_data)

# Vocabulary, one entry per line. The slice drops the first character and the
# last two characters of every line (presumably a quote, and a quote/comma plus
# the newline -- verify against the file format). The context manager closes
# the handle, which the previous bare open() never did.
with open('./data/keywords.txt', 'r') as key_file:
    word_list = [line[1:-2].decode('utf-8') for line in key_file]

# Vocabulary entries written entirely in upper case are treated as cuisine labels.
all_cuisines = [word for word in word_list if word.isupper()]

In [27]:
def marginal_components(comp, word_list, select):
    """Collapse the columns flagged by ``select`` into one trailing column.

    Parameters
    ----------
    comp : 2-D numpy array
        Matrix whose columns are to be partially merged.
    word_list : sequence
        Unused by this function; kept for interface compatibility.
    select : boolean array, one entry per column of ``comp``
        True for every column that should be summed away.

    Returns
    -------
    2-D numpy array
        The columns where ``select`` is False, in their original order,
        followed by one extra column holding the row-wise sum of the
        columns where ``select`` is True.
    """
    keep_mask = np.logical_not(select)
    collapsed = comp[:, select].sum(axis=1, keepdims=True)
    return np.hstack((comp[:, keep_mask], collapsed))

In [28]:
def exp_components(comp):
    """Return ``exp(digamma(comp) - digamma(row_sum))`` element-wise.

    ``row_sum`` is the sum of each row of ``comp``; it is kept as a column
    vector so that it broadcasts across every column of that row.

    Parameters
    ----------
    comp : 2-D numpy array

    Returns
    -------
    2-D numpy array with the same shape as ``comp``.
    """
    row_totals = np.sum(comp, axis=1)[:, np.newaxis]
    return np.exp(digamma(comp) - digamma(row_totals))

In [29]:
def input_parse(inputs, word_list, all_cuisine, other_count=10):
    """Turn a user query into the key words it mentions and their counts.

    Parameters
    ----------
    inputs : dict
        Must contain the keys 'cuisine', 'ingredients_required' and
        'ingredients_excluded', each mapping to a list of phrases. Every
        phrase is split on single spaces into individual words.
    word_list : sequence of str
        Vocabulary; fixes the order of the returned words.
    all_cuisine : iterable of str
        Every cuisine label. When the query names a cuisine, all of these
        are added to the excluded words, so cuisines other than the
        requested one get a count of 0.
    other_count : int, optional
        Value appended as the final count (default 10, the value that used
        to be hard-coded). The driver cell also reads it back as the number
        of words to infer.

    Returns
    -------
    (word_filtered, word_count) : (list, list)
        ``word_filtered`` holds the vocabulary words that occur anywhere in
        the query, in ``word_list`` order. ``word_count`` holds, for each of
        them, how often it occurs among the cuisine/required words (0 for
        words that are only excluded), followed by ``other_count``.
    """
    def split_words(phrases):
        # Flatten ['red pepper', 'pork'] into ['red', 'pepper', 'pork'].
        return list(chain.from_iterable(phrase.split(' ') for phrase in phrases))

    cuisine_vector = split_words(inputs['cuisine'])
    required_vector = split_words(inputs['ingredients_required'])
    excluded_vector = split_words(inputs['ingredients_excluded'])
    if cuisine_vector:
        excluded_vector.extend(all_cuisine)

    word_dict = defaultdict(int)
    for word in cuisine_vector + required_vector:
        word_dict[word] += 1
    for word in excluded_vector:
        # Wanted words win: never reset a positive count to 0.
        if word not in word_dict:
            word_dict[word] = 0

    # word_dict now has exactly one key per word of the query, so hash
    # membership replaces the former linear scan of a concatenated list.
    # Comprehensions (rather than filter/map) return real lists on both
    # Python 2 and Python 3, which the trailing count below relies on.
    word_filtered = [word for word in word_list if word in word_dict]
    word_count = [word_dict[word] for word in word_filtered]
    word_count.append(other_count)
    return word_filtered, word_count

In [30]:
def words_to_ingredients(word_freq, ingredients):
    """Pick ingredient names that cover a set of words.

    Parameters
    ----------
    word_freq : dict
        word -> number of times the word was drawn.
    ingredients : dict
        word -> list of candidate ingredient names for that word.

    Returns
    -------
    list
        Selected ingredient names. Words with exactly one candidate
        contribute that candidate directly; words with no candidate are
        ignored. The remaining words are covered greedily: each round picks
        the not-yet-selected candidate proposed most often by the still
        uncovered words, preferring names with fewer words and breaking
        remaining ties at random.

    Raises
    ------
    KeyError
        If a word of ``word_freq`` is missing from ``ingredients``.
    """
    selected = []
    uncovered = []
    for word in word_freq:
        candidates = ingredients[word]
        if len(candidates) == 1:
            selected.append(candidates[0])
        elif len(candidates) > 1:
            uncovered.extend([word] * word_freq[word])

    while uncovered:
        pool = []
        for word in uncovered:
            pool.extend(ingredients[word])
        # .items() instead of .iteritems() so this runs on Python 2 and 3.
        ranked = sorted(
            [(name, count, -len(name.split(" ")), random())
             for name, count in Counter(pool).items()],
            key=lambda entry: (entry[1], entry[2], entry[3]),
            reverse=True)

        target = None
        for entry in ranked:
            if entry[0] not in selected:
                target = entry[0]
                break
        if target is None:
            # Every candidate is already selected, so nothing can shrink
            # `uncovered` any more. The previous code reused a stale (or
            # undefined) target here and looped forever / raised NameError.
            break

        selected.append(target)
        for word in target.split(" "):
            # Remove one occurrence per word of the chosen name; words of
            # the name that are not pending are simply skipped.
            if word in uncovered:
                uncovered.remove(word)
    return selected

In [31]:
# Example query: one cuisine, two required ingredients, two excluded ones.
inputs = {'cuisine': ['MEXICAN'],
          'ingredients_required': ['pork', 'pepper'],
          'ingredients_excluded': ['onion', 'sausage']}
key_word, word_count = input_parse(inputs, word_list, all_cuisines)

# True for every vocabulary word NOT mentioned in the query; those columns are
# summed into a single trailing column below. A set keeps the membership test
# O(1) per vocabulary word.
key_word_set = set(key_word)
select = np.array([word not in key_word_set for word in word_list])

# Install the reduced matrix on the (unfitted) estimator so that transform()
# can be used for inference on the query.
LDA.components_ = marginal_components(components, word_list, select)
LDA.exp_dirichlet_component_ = exp_components(LDA.components_)
LDA.doc_topic_prior_ = 0.255644474041518 #estimated from original data
# transform() works on a 2-D (n_documents, n_features) array; the query is one
# document. Passing the bare 1-D list is rejected by newer scikit-learn.
doc_topic_dist = LDA.transform(np.atleast_2d(word_count))

# One Dirichlet draw per row of the full matrix. NOTE: the draw is unseeded,
# so the printed result changes from run to run.
probs = np.empty_like(components)
for i in range(probs.shape[0]):
    probs[i, :] = dirichlet.rvs(alpha=components[i, :])

# Mix the per-row draws with the inferred topic weights, then keep only the
# words that were not part of the query.
multinomial = np.squeeze(np.dot(doc_topic_dist, probs))
undetermined_words = np.array(word_list)[select]
condMultinomial = multinomial[select]
words_probs = zip(undetermined_words, condMultinomial)

# word_count[-1] is the trailing count appended by input_parse; it doubles as
# the number of top-scoring words to keep (each with frequency 1).
top_words = heapq.nlargest(word_count[-1], words_probs, key=itemgetter(1))
largest_words = {word: 1 for word, _ in top_words}
results = words_to_ingredients(largest_words, ingredient_dict)

print(results)
# Alternative kept for reference (disabled): sample words from the conditional
# multinomial instead of taking the top-scoring ones.
#inferenced_words=np.random.multinomial(n=word_count[-1], pvals=condMultinomial, size=3)
#results=[]
#for row in inferenced_words:
    #word_freq=zip(undetermined_words, row)
    #word_freq=filter(lambda x: x[1]>0, word_freq)
    #word_freq=dict(word_freq)
    #results.append(words_to_ingredients(word_freq, ingredient_dict))

[u'corn flour', u'red bell pepper', u'nacho cheese tortilla chip', u'salt', u'salsa', u'oil', u'cilantro']


