In [1]:
import numpy as np
import pandas as pd
import math
import re
import time
import csv
import nltk as nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import warnings
from tqdm import tqdm
import pyreadr
from mlutils import dataset
import multiprocessing as mp
from multiprocessing import Pool
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import json
import torch
import torch.nn.functional as F
import re
from collections import OrderedDict
from torch.autograd import Variable
from IPython.core.display import display, HTML


In [23]:
# Market and business-unit selectors; the input/output paths are built from these.
mkt, sbu = 'SAMS', 'FOOD'

In [3]:
#----------------------- Node Parameters ----------------------#
# mkt and sbu must already be defined when this cell runs (they are assigned in a
# separate cell); the getArgument-based lookup below is kept disabled.
#mkt = getArgument("mkt", "PARAMS")
#sbu = getArgument("sbu", "PARAMS")


#----------------------------- Input_Path -------------------------------------#
# Root folder for the selected market / business unit. Every path below is built
# from it by plain string concatenation, so the trailing '/' matters.
# NOTE(review): absolute, hard-coded root; consider making it configurable.
inp_path = '/data/'+mkt+'/'+sbu+'/'

#Master_data_paths
model_data_path = inp_path +'01_raw_data/ing_nut_model_data.rds'

#Reading the Master File
#master_file = dataset.load(name = 'us_food_upc_master_data')
result = pyreadr.read_r(model_data_path)
# Take the entry stored under the None key of the read_r result.
# (pyreadr documents that the single object of an .rds file is keyed by None.)
total_nutrient = result[None]


#Secondary_mapping_files
# NOTE(review): these five paths are only assigned here; nothing in the visible
# cells reads them. Confirm whether they are still needed.
data_unimp_extract_path = inp_path + '04_clean_ing/sec_unimp_cleaned'
df_secondary_path =   inp_path + '03_ing_matches/sec_ing_match'
df_primary_path = inp_path + '03_ing_matches/prim_ing_match'
total_sec_path = inp_path + '04_clean_ing/sec_total_cleaned'
constraint_path = inp_path + '00_base_files/Ingredients_constraints.xlsx'


#-----------------Input to Algorithm------------------------------------------#
sec_path = inp_path + '05_ing_share/sec_input/'


# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file;
# only load pickles from a trusted location.
df_prim_filt = pd.read_pickle(sec_path + 'df_prim_filt.pkl')
df_sec_ingred_nutr = pd.read_pickle(sec_path + 'df_sec_ingred_nutr.pkl')
data_unimp_extract = pd.read_pickle(sec_path + 'data_unimp_extract.pkl')
df_total = pd.read_pickle(sec_path + 'df_total.pkl')

#-----------------Output from Algorithm------------------------------------------#
# Folder prefix used when the results are written with np.save.
output_path_name = inp_path + '05_ing_share/sec_temp_output/'


In [4]:
# Lookup table keyed by (UPC, primary ingredient, secondary ingredient) holding the
# 'Proportion' value of that row. If a key occurs more than once, the last row wins.
df_prop_dict_sec = {}
sec_row_fields = zip(df_sec_ingred_nutr['UPC'].values,
                     df_sec_ingred_nutr['Primary Ingredient'].values,
                     df_sec_ingred_nutr['Secondary Ingredient'].values,
                     df_sec_ingred_nutr['Proportion'].values)
for upc, prim, sec, prop in tqdm(sec_row_fields, total=len(df_sec_ingred_nutr)):
    df_prop_dict_sec[upc, prim, sec] = prop


# Nutrient columns (all expressed per 100 grams) considered by the optimisation.
nutritional_components = [
    'protein_per_100grams',
    'total_fat_per_100grams',
    'carbohydrates_per_100grams',
    'calories_per_100grams',
    'saturated_fat_per_100grams',
    'cholesterol_per_100grams',
    'sodium_per_100grams',
    'dietary_fiber_per_100grams',
    'sugar_per_100grams',
]


100%|██████████| 15182/15182 [00:01<00:00, 12262.64it/s]


In [24]:

def get_updated_nutrition(nutritional_components,primary_data,upc,prim_ingred):
    """Return the nutrient columns that are non-zero for one (UPC, primary ingredient).

    Parameters
    ----------
    nutritional_components : iterable of column names to check.
    primary_data : DataFrame with 'UPC', 'Primary Ingredient' and the nutrient columns.
    upc, prim_ingred : values identifying the row to inspect.

    Returns
    -------
    list of the names from `nutritional_components`, in their original order,
    whose value in the first matching row is not equal to 0.

    Raises
    ------
    IndexError if no row matches and `nutritional_components` is not empty.
    """
    is_item = (primary_data['UPC']==upc) & (primary_data['Primary Ingredient']==prim_ingred)
    item_rows = primary_data[is_item]

    # Only the first matching row is consulted; zero-valued nutrients are dropped.
    return [nutrient for nutrient in nutritional_components
            if item_rows[nutrient].values[0] != 0]


# In[25]:


def get_food_comp_dict(primary_data,upc,prim_ingred,secondary_data):
    """Collect the data rows of an item and of its secondary ingredients in one dict.

    Parameters
    ----------
    primary_data : DataFrame with 'UPC' and 'Primary Ingredient' columns.
    upc, prim_ingred : values identifying the item.
    secondary_data : DataFrame with 'UPC', 'Primary Ingredient' and
        'Secondary Ingredient' columns.

    Returns
    -------
    dict mapping each secondary ingredient name to its row (a Series), plus one
    entry keyed by the primary ingredient name holding the first matching row of
    `primary_data`. The primary entry is written last, so it replaces a secondary
    ingredient with the same name.

    Raises
    ------
    IndexError if `primary_data` has no matching row.
    """
    sec_mask = (secondary_data['UPC']==upc) & (secondary_data['Primary Ingredient']==prim_ingred)
    sec_rows = secondary_data[sec_mask]

    # One entry per secondary ingredient; a repeated name keeps its last row.
    food_composition = {
        sec_rows['Secondary Ingredient'].iloc[pos]: sec_rows.iloc[pos]
        for pos in range(len(sec_rows))
    }

    # The item itself, keyed by its primary ingredient name.
    prim_mask = (primary_data['UPC']==upc) & (primary_data['Primary Ingredient']==prim_ingred)
    prim_rows = primary_data[prim_mask]
    food_composition[prim_rows['Primary Ingredient'].values[0]] = prim_rows.iloc[0]

    return food_composition


# In[26]:


def get_unimp_ingredients(upc,prim_ingred,data_unimp_extract):
    """Return the unimportant-ingredient list and its percentage for one item.

    Parameters
    ----------
    upc, prim_ingred : values identifying the item.
    data_unimp_extract : DataFrame with 'UPC', 'Primary_Ingredient',
        'Unimportant_Clean' (comma-separated text) and 'Percent_unimp' columns.

    Returns
    -------
    (ingred_unimp, percent_unimp): the comma-split 'Unimportant_Clean' text and
    'Percent_unimp' cast to int, both taken from the first matching row. When the
    text is empty the result is ('', 0).

    Raises
    ------
    IndexError if no row matches.
    """
    is_item = ((data_unimp_extract['UPC']==upc) &
               (data_unimp_extract['Primary_Ingredient']==prim_ingred))
    item_rows = data_unimp_extract[is_item]

    unimp_text = item_rows['Unimportant_Clean'].values[0]
    if len(unimp_text) == 0:
        # Nothing flagged as unimportant for this item.
        return '', 0

    return unimp_text.split(','), int(item_rows['Percent_unimp'].values[0])


# In[27]:


def get_matrix_opt(upc,prim_ingred,primary_data,secondary_data,nutritional_components):
    """Build the tensors of the linear "nutrients = composition x weights" problem.

    The problem is a bias-free linear model with one sample per nutritional
    component: row i of X holds that nutrient's value in each secondary
    ingredient, and Y[i] holds the value of the same nutrient for the item itself.

    Parameters
    ----------
    upc, prim_ingred : values identifying the item.
    primary_data : DataFrame with the item-level rows.
    secondary_data : DataFrame with the secondary-ingredient rows.
    nutritional_components : candidate nutrient column names; those whose item
        value is zero are dropped first (via get_updated_nutrition).

    Returns
    -------
    X : Variable (n_nutrients, n_ingredients), each row divided by the item's
        value for that nutrient; requires_grad=False.
    W : Variable (n_ingredients, 1) of zeros; requires_grad=True.
    Y : Variable (n_nutrients, 1), the item values divided by themselves;
        requires_grad=False.
    ingredient_weights : list of None, one per ingredient (no weight is known
        in advance).
    """
    # Nutrients with a zero item value are removed, so the row-wise division
    # further down never divides by zero.
    active_nutrients = get_updated_nutrition(nutritional_components,primary_data,upc,prim_ingred)

    # The item's own row in food_composition is keyed by its primary ingredient name.
    prim_mask = (primary_data['UPC']==upc) & (primary_data['Primary Ingredient']==prim_ingred)
    target = primary_data[prim_mask]['Primary Ingredient'].values[0]

    # Secondary ingredients of this item, in data order.
    sec_mask = (secondary_data['UPC']==upc) & (secondary_data['Primary Ingredient']==prim_ingred)
    ingredients = list(secondary_data[sec_mask]['Secondary Ingredient'])

    # Every weight starts out unknown.
    ingredient_weights = [None]*len(ingredients)

    food_composition = get_food_comp_dict(primary_data,upc,prim_ingred,secondary_data)

    n_nutrients = len(active_nutrients)
    n_ingredients = len(ingredients)
    X = torch.zeros(n_nutrients, n_ingredients)
    W = torch.zeros(n_ingredients, 1)
    Y = torch.zeros(n_nutrients, 1)

    for row, nutrient in enumerate(active_nutrients):
        for col, ingredient in enumerate(ingredients):
            X[row,col] = float(food_composition[ingredient][nutrient])
        Y[row,0] = food_composition[target][nutrient]

    # Seed W with any weight that is already known (none are, as built above).
    for pos, known_weight in enumerate(ingredient_weights):
        if known_weight is not None:
            W[pos,0] = known_weight

    # Scale each nutrient row by the item's value so that nutrients present in
    # small amounts count as much as the large ones.
    scale = Y.clone()
    X.div_(scale)
    Y.div_(scale)

    return (Variable(X, requires_grad=False),
            Variable(W, requires_grad=True),
            Variable(Y, requires_grad=False),
            ingredient_weights)


# In[28]:


def get_positional_rank (upc,prim,df_sec_ingred_nutr,df_prop_dict_sec) :
    """Return the stored proportion of each secondary ingredient of one item.

    Parameters
    ----------
    upc, prim : values identifying the item.
    df_sec_ingred_nutr : DataFrame with 'UPC', 'Primary Ingredient' and
        'Secondary Ingredient' columns.
    df_prop_dict_sec : dict keyed by (upc, prim, secondary ingredient).

    Returns
    -------
    numpy array with one looked-up value per secondary ingredient, in data order.

    Raises
    ------
    KeyError if a (upc, prim, ingredient) key is missing from the dict.
    """
    is_item = (df_sec_ingred_nutr['UPC']== upc) & (df_sec_ingred_nutr['Primary Ingredient']== prim)
    sec_names = list(df_sec_ingred_nutr[is_item]['Secondary Ingredient'])

    return np.array([df_prop_dict_sec[upc,prim,name] for name in sec_names])


# In[29]:


def get_unimp_vec(upc ,prim, data_unimp_extract, df_sec_ingred_nutr):
    """Build a one-hot selector for the first unimportant ingredient of an item.

    Parameters
    ----------
    upc, prim : values identifying the item.
    data_unimp_extract : DataFrame with 'UPC', 'Primary_Ingredient',
        'Unimportant_Clean' (comma-separated text) and 'Percent_unimp' columns.
    df_sec_ingred_nutr : DataFrame with 'UPC', 'Primary Ingredient' and
        'Secondary Ingredient' columns.

    Returns
    -------
    (arr, per)
        arr : numpy array with one entry per secondary ingredient of the item,
            1 at the first position of the unimportant ingredient and 0
            elsewhere. When that ingredient is not among the item's secondary
            ingredients, the sentinel ``[None] * 5`` is returned instead (the
            caller tests ``arr[0]`` against None).
        per : the 'Percent_unimp' value of the first matching row.

    Raises
    ------
    IndexError if `data_unimp_extract` has no row for the item.
    """
    unimp_rows = data_unimp_extract[(data_unimp_extract['UPC'] == upc) &
                                    (data_unimp_extract['Primary_Ingredient'] == prim)]

    # Only the first entry of the comma-separated list is used.
    ingred = unimp_rows['Unimportant_Clean'].values[0].split(',')[0]
    per = unimp_rows['Percent_unimp'].values[0]

    # Secondary ingredients of the item, in data order.
    sec_rows = df_sec_ingred_nutr[(df_sec_ingred_nutr['UPC'] == upc) &
                                  (df_sec_ingred_nutr['Primary Ingredient'] == prim)]
    list_unimp = list(sec_rows['Secondary Ingredient'])

    # Match on the whole ingredient name. Intersecting the set of names with the
    # string itself would compare its individual characters and never match a
    # multi-character name.
    if ingred in list_unimp:
        arr = np.zeros(len(list_unimp))
        arr[list_unimp.index(ingred)] = 1
    else:
        arr = [None] * 5

    return arr,per
    



# In[30]:


def get_final_weights_reparam(epochs,upc,prim_ingred,df_prim_filt,df_sec_ingred_nutr,nutritional_components,data_unimp_extract,df_prop_dict_sec):
    """Estimate secondary-ingredient weights for one item by gradient descent.

    The weights are not optimised directly. A free vector V is optimised and the
    weights are derived as ``W = upper_tr @ V**2``, where ``upper_tr[i, j]`` equals
    the positional value of ingredient j for ``j >= i`` and 0 otherwise. Hence
    ``W[i] = sum_{j >= i} inv_seq[j] * V[j]**2``: every W[i] is a sum of squared
    terms and each W[i] contains all terms of W[i+1].
    (Presumably this enforces non-negative, non-increasing weights along the
    ingredient order; that requires non-negative positional values — TODO confirm.)

    Parameters
    ----------
    epochs : number of gradient steps to attempt.
    upc, prim_ingred : values identifying the item.
    df_prim_filt : item-level DataFrame (passed on as primary data).
    df_sec_ingred_nutr : secondary-ingredient DataFrame.
    nutritional_components : candidate nutrient column names.
    data_unimp_extract : DataFrame describing unimportant ingredients.
    df_prop_dict_sec : dict keyed by (upc, primary, secondary) used for the
        positional values.

    Returns
    -------
    (W_par, loss_history_par, loss_par)
        W_par : tensor (n_ingredients, 1) computed from the final V.
        loss_history_par : numpy array of per-epoch loss values; truncated to the
            epochs before the stopping epoch when early stopping triggers.
        loss_par : loss tensor of the last epoch that was evaluated.

    NOTE(review): with ``epochs == 0`` the loop body never runs, `loss_par` is never
    assigned and the final return raises UnboundLocalError.
    """
    
    #Defining the data
    primary_data = df_prim_filt
    secondary_data = df_sec_ingred_nutr
    # NOTE(review): the two self-assignments in this block are no-ops.
    data_unimp_extract= data_unimp_extract
    prim = prim_ingred
    df_prop_dict_sec = df_prop_dict_sec
    
    #getting matrices
    # W returned here is not used below; V replaces it as the optimised variable.
    X,W,Y,ingredient_weights = get_matrix_opt(upc,prim_ingred,primary_data,secondary_data,nutritional_components)
    
    #Getting ingredient total information ----- to be done
    # Only ingred_unimp is used below (to decide whether the constraint applies).
    ingred_unimp,unimp_percent = get_unimp_ingredients(upc,prim_ingred,data_unimp_extract)
    
    #Ingredient data for upc
    ingredient_data = secondary_data[(secondary_data['UPC']==upc) & (secondary_data['Primary Ingredient']==prim_ingred)]

    #Getting the Ingredients
    ingredients = list(ingredient_data['Secondary Ingredient'])
    
    #Initializing V
    # Each entry is 1/sqrt(n), so V**2 sums to 1 at the start.
    V = torch.ones(len(ingredients), 1)/np.sqrt(len(ingredients))
    #print V
    V = Variable(V, requires_grad=True)
    
    #Defining the Setup for Initialisation
    # ones @ diag(inv_seq) scales column j by inv_seq[j]; triu() then zeroes the
    # entries below the diagonal, giving the matrix described in the docstring.
    one_mat = torch.ones(len(ingredients), len(ingredients))
    inv_seq = get_positional_rank (upc,prim,df_sec_ingred_nutr,df_prop_dict_sec)
    inv_diag = torch.diag(torch.from_numpy(inv_seq)).type('torch.FloatTensor')
    
    #print inv_diag
    one_mat_inv = one_mat.mm(inv_diag)
    upper_tr = one_mat_inv.triu()
    
   
    ####################################################################################
    
    #Unimportant Ingredients Constraint
    # lamda acts as an on/off flag: 1 when a selector vector for the unimportant
    # ingredient is available, 0 otherwise.
    if len(ingred_unimp) != 0 :
        unimp_vec , per = get_unimp_vec(upc ,prim, data_unimp_extract, df_sec_ingred_nutr)
        
        # get_unimp_vec returns a list of None when the ingredient was not found.
        if unimp_vec[0] != None :
            # Percentage converted to a fraction, plus a small 0.0001 offset.
            percent = (float(per)/100.0) + 0.0001
            unimp_tensor = torch.from_numpy(unimp_vec).type('torch.FloatTensor')
            # Row vector (1, n) so that it can left-multiply the weight column.
            unimp_tensor1 = unimp_tensor.view(1,len(unimp_tensor))
            lamda = 1
        else :
            lamda = 0

    else :
        lamda = 0
        
    ########################################################################################
    
    
    #Running the Optimization
    #alpha=1e-20
    loss_history_par = np.zeros((epochs))
    
  
    for epoch in range(epochs):
        #Matrix multiply: predicted (scaled) nutrient values for the current weights

        Y_pred = X.mm(upper_tr.mm(V**2))

        # l2 loss
        loss_par = (Y_pred - Y).pow(2).sum()
        #print loss

        # Penalise deviation of sum(V**2) from 1, with a factor of 5.
        # (Original note: "try to go and stay at 100g total".)
        V_sq = V**2
        
        loss_par += (V_sq.sum() - 1.).abs()*5
        #loss_par += (V_sq.sum() - 1.)**2
        
        #print loss
        
        #If ingredient unimportant is present
        if  lamda == 1:
            temp1 = upper_tr.mm(V**2)
            # Adds (weight of the unimportant ingredient - percent) to the loss.
            # NOTE(review): the term is signed (no abs/clamp), so it keeps
            # rewarding a smaller weight even below `percent` — confirm intended.
            loss_par +=  (unimp_tensor1.mm(temp1)[0][0] - percent)
        
        
        loss_history_par[epoch] = loss_par.data.item()
        loss_par.backward()
        
        #Early Stopping
        # Compares the mean loss of epochs [epoch-5, epoch) with that of epochs
        # [epoch-10, epoch-6); neither window includes the current epoch, and the
        # windows hold 5 and 4 values respectively. On a stop, the history is cut
        # to the epochs before this one and no update is applied for this epoch.
        if epoch > 10 :
            cur_loss_avg =  np.mean(loss_history_par[(epoch-5):(epoch)])
            prev_loss_avg = np.mean(loss_history_par[(epoch-10):(epoch-6)]) 
            if np.abs(cur_loss_avg - prev_loss_avg) < 1e-6 :
                loss_history_par = loss_history_par[0:epoch]
                break

        for i in range(V.size(0)):
            if ingredient_weights[i] is None:  #update only unknown quantities
                
                #Clipping the Gradient
                # Clips the norm of V's whole gradient to 1.
                # NOTE(review): this runs once per ingredient inside the loop,
                # although it acts on the full gradient each time.
                torch.nn.utils.clip_grad_norm_(V, 1)
                
                #performing the gradient descent algo (fixed step size 1e-4)
                V.data[i].sub_(1e-4 * V.grad[i].data)
                
                
        #V.data.sub_(1e-5 * V.grad.data)
        # Reset the accumulated gradient before the next backward pass.
        V.grad.data.zero_()
    
    # Final weights derived from the optimised V.
    W_par = upper_tr.mm(V**2)
        
    return W_par,loss_history_par ,loss_par


# In[30]:


def get_prop_ingred_sec(list_reqd,epochs,df_prim_filt,df_sec_ingred_nutr,nutritional_components,data_unimp_extract,df_prop_dict_sec):
    """Estimate secondary-ingredient weights for every requested item.

    Parameters
    ----------
    list_reqd : sequence of (upc, primary ingredient) pairs to process.
    epochs : number of gradient steps per item.
    df_prim_filt : item-level DataFrame.
    df_sec_ingred_nutr : secondary-ingredient DataFrame.
    nutritional_components : candidate nutrient column names.
    data_unimp_extract : DataFrame describing unimportant ingredients.
    df_prop_dict_sec : dict keyed by (upc, primary, secondary).

    Returns
    -------
    (prop_ing, loss_final, weight_sum, upc_prim_output, upc_prim_err)
        prop_ing : list of numpy arrays, the estimated weights per item.
        loss_final : list of final loss values (floats).
        weight_sum : list of the sum of the estimated weights per item.
        upc_prim_output : list of the (upc, primary ingredient) pairs that
            succeeded; aligned index-by-index with the three lists above.
        upc_prim_err : list of the pairs whose computation raised an exception.
    """
    prop_ing = []
    loss_final = []
    weight_sum = []
    upc_prim_output = []
    upc_prim_err = []

    for upc, prim_ingred in tqdm(list_reqd):
        try:
            # The food-composition lookup for the item happens inside this call
            # (through get_matrix_opt), so it is not repeated here.
            W, _loss_history, loss = get_final_weights_reparam(
                epochs, upc, prim_ingred, df_prim_filt,
                df_sec_ingred_nutr, nutritional_components,
                data_unimp_extract, df_prop_dict_sec)

            # Convert everything before touching the result lists, so a failure
            # part-way through cannot leave them with different lengths.
            W_arr = W.data.numpy()
            loss_value = loss.item()
            total_weight = W.sum().item()
        except Exception:
            # Best effort: record the failing pair and carry on with the others.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            upc_prim_err.append((upc, prim_ingred))
            continue

        prop_ing.append(W_arr)
        loss_final.append(loss_value)
        weight_sum.append(total_weight)
        upc_prim_output.append((upc, prim_ingred))

    return prop_ing, loss_final,weight_sum,upc_prim_output,upc_prim_err



In [6]:
# (UPC, primary ingredient) entries to process, taken from df_total in row order.
list_reqd = df_total['UPC_and_Prim'].tolist()

In [25]:
# Workflow: number of gradient steps run for each (UPC, primary ingredient) pair.
epochs = 1

# Estimate the secondary-ingredient weights for every requested pair.
(prop_ing,
 loss_final,
 weight_sum,
 upc_prim_output,
 upc_prim_err) = get_prop_ingred_sec(
    list_reqd=list_reqd,
    epochs=epochs,
    df_prim_filt=df_prim_filt,
    df_sec_ingred_nutr=df_sec_ingred_nutr,
    nutritional_components=nutritional_components,
    data_unimp_extract=data_unimp_extract,
    df_prop_dict_sec=df_prop_dict_sec,
)


100%|██████████| 2243/2243 [01:32<00:00, 24.37it/s]


In [27]:
#Saving the Output
def _to_saveable_array(values):
    """Convert a list of arrays to one numpy array, tolerating unequal shapes.

    The per-item weight arrays have one row per ingredient, so their shapes differ
    between items. Recent NumPy versions raise ValueError when asked to build an
    array from such ragged input; in that case fall back to a 1-D object array
    holding one entry per item.
    """
    try:
        return np.array(values)
    except ValueError:
        boxed = np.empty(len(values), dtype=object)
        for pos, value in enumerate(values):
            boxed[pos] = value
        return boxed


# NOTE: object arrays are pickled by np.save, so reading 'prop_ing' back needs
# np.load(..., allow_pickle=True).
np.save(output_path_name + 'prop_ing', _to_saveable_array(prop_ing))
np.save(output_path_name + 'loss_final', np.array(loss_final))
np.save(output_path_name + 'weight_sum', np.array(weight_sum))
np.save(output_path_name + 'upc_prim_output', np.array(upc_prim_output))