In [1]:
import pandas as pd
import copy
import random
import sys
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [2]:
# setting path
sys.path.append('../code/src')

In [3]:
import load_data as ld

In [4]:
# constsant
# file path
dataset_path = "../../dataset/"
input_path = "../data/"
output_path = "../data/"

# size reduction
user_max = 1000
item_max = 5000

# number of dataset of each type of constraint
const_num = 3

# constraint column names
ul_name = 'u'
i1_name = 'i1'
i2_name = 'i2'
nl_name = 'nl'

# parameters for const1/const2
ingr_percent = 1.5
good_rat = 5.0
bad_rat = 0.0

# Load & Edit Dataframes

### Ingredient List

In [5]:
ingr_data = pd.read_csv(output_path + 'ingr_data.csv')
ingr_data.sort_values(by = 'count', ascending = False, inplace = True)
ingr_data.head()

Unnamed: 0,iid,name,count
6270,6270,salt,72848
840,840,butter,48039
2499,2499,egg,43350
5010,5010,onion,42631
6906,6906,sugar,37464


In [6]:
ingr_candidate = ingr_data.head(int(len(ingr_data)*(ingr_percent/100)))
ingr_candidate.head()

Unnamed: 0,iid,name,count
6270,6270,salt,72848
840,840,butter,48039
2499,2499,egg,43350
5010,5010,onion,42631
6906,6906,sugar,37464


In [7]:
ingr_list = ingr_candidate['iid'].tolist()

### Recipe List

In [8]:
recipe_data = ld.load_attr(input_path + 'recipe_data.csv')
recipe_data.head()

Unnamed: 0_level_0,name,full_id,nutrition,ingredient_ids
fid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,white bean green chile pepper soup,40893,"[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]","[3384, 7979, 2127, 3502, 3217, 1257, 2778, 500..."
1,devilicious cookie cake delights,44394,"[132.3, 11.0, 39.0, 5.0, 4.0, 11.0, 5.0]","[912, 7557, 2499, 5382]"
2,baked potato toppings,85009,"[2786.2, 342.0, 134.0, 290.0, 161.0, 301.0, 42.0]","[4623, 6265, 1168, 6016, 3597, 3440, 7213, 169..."
3,kfc honey bbq strips,134728,"[316.0, 4.0, 40.0, 37.0, 78.0, 4.0, 10.0]","[1304, 2683, 3217, 6270, 3532, 869, 7557, 3698..."
4,lamb stew with tomatoes chickpeas and spices,200236,"[606.5, 65.0, 12.0, 34.0, 65.0, 83.0, 7.0]","[4130, 6270, 3486, 7557, 5010, 3203, 2683, 125..."


### User List

In [9]:
user_data = pd.read_csv(dataset_path + 'PP_users.csv')

In [10]:
user_data['items'] = user_data['items'].str.replace(" ", "")
user_data['items'] = user_data['items'].apply(lambda x: x[1:-1].split(','))
user_data['ratings'] = user_data['ratings'].str.replace(" ", "")
user_data['ratings'] = user_data['ratings'].apply(lambda x: x[1:-1].split(','))

In [11]:
user_data = user_data[user_data['u'] < user_max]
user_data.set_index('u', inplace = True)
user_data = user_data.loc[:, ['items', 'ratings']]

In [12]:
user_data

Unnamed: 0_level_0,items,ratings
u,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[1118, 27680, 32541, 137353, 16428, 28815, 658...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ..."
1,"[122140, 77036, 156817, 76957, 68818, 155600, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
2,"[168054, 87218, 35731, 1, 20475, 9039, 124834,...","[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ..."
3,"[163193, 156352, 102888, 19914, 169438, 55772,...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ..."
4,"[72857, 38652, 160427, 55772, 119999, 141777, ...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ..."
...,...,...
995,"[324, 154349]","[5.0, 5.0]"
996,"[994, 42272, 159211, 31188, 59622, 114480, 168...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
997,"[324, 2949]","[5.0, 5.0]"
998,"[8345, 48889, 116201, 59869, 76258, 174467, 15...","[2.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."


# Construct Constraint 1 and 2

### Helper Function

In [13]:
def random_select(x): # pick from 0 to x
    result = []
    while len(result)<const_num:
        p = int(random.uniform(0, x))
        if p not in result:
            result.append(p)
    return result

### Consistency of Constraint
ingredient related constraints need to be consistent  
ex) included ingredient shouldn't have bad rating  
ex) excluded ingredient shouldn't have good rating

In [14]:
ingr_column = [copy.deepcopy(ingr_list) for x in range(len(user_data))]
len(ingr_column)

1000

In [15]:
user_data = user_data.assign(include = ingr_column)
user_data = user_data.assign(exclude = copy.deepcopy(ingr_column))
user_data.head()

Unnamed: 0_level_0,items,ratings,include,exclude
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[1118, 27680, 32541, 137353, 16428, 28815, 658...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655..."
1,"[122140, 77036, 156817, 76957, 68818, 155600, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655..."
2,"[168054, 87218, 35731, 1, 20475, 9039, 124834,...","[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655..."
3,"[163193, 156352, 102888, 19914, 169438, 55772,...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655..."
4,"[72857, 38652, 160427, 55772, 119999, 141777, ...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655..."


In [16]:
for index, row in user_data.iterrows():
    for x in range (0, len(row['items'])):
        item_id = int(row['items'][x])
        if(item_id >= item_max): # ignore if item id is larger or equal than 5000
            continue
            
        if float(row['ratings'][x]) == good_rat : # remove ingredients included in the item frome exclude list
            ingrs = recipe_data.loc[item_id].ingredient_ids
            for ingr in ingrs:
                if int(ingr) in row['exclude']:
                    row['exclude'].remove(int(ingr))
        if float(row['ratings'][x]) == bad_rat :
            ingrs = recipe_data.loc[item_id].ingredient_ids
            for ingr in ingrs:
                if int(ingr) in row['include']:
                    row['include'].remove(int(ingr))

In [17]:
user_data.head()

Unnamed: 0_level_0,items,ratings,include,exclude
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[1118, 27680, 32541, 137353, 16428, 28815, 658...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655...","[840, 2499, 5010, 6906, 3203, 7655, 4717, 2683..."
1,"[122140, 77036, 156817, 76957, 68818, 155600, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655...","[2499, 5010, 6906, 3203, 7655, 4717, 2683, 800..."
2,"[168054, 87218, 35731, 1, 20475, 9039, 124834,...","[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655..."
3,"[163193, 156352, 102888, 19914, 169438, 55772,...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 3203, 7655...","[7449, 6654, 1257, 4987, 1124, 3723, 2856, 298..."
4,"[72857, 38652, 160427, 55772, 119999, 141777, ...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ...","[6270, 840, 2499, 5010, 6906, 5006, 7655, 4717...","[5010, 6906, 7655, 5319, 3184, 63, 6276, 4253,..."


In [18]:
for index, row in user_data.iterrows():
    # select random ingr to include
    if len(row['include']) > const_num:
        pick = random_select(len(row['include'])-1)
        result = []
        for p in pick:
            result.append(row['include'][p])
        row['include'] = result

    # select random ingr to exclude
    if len(row['exclude']) > const_num:
        pick = random_select(len(row['exclude'])-1)
        result = []
        for p in range (0, len(pick)):
            result.append(row['exclude'][p])
        row['exclude'] = result            

In [19]:
user_data.head()

Unnamed: 0_level_0,items,ratings,include,exclude
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[1118, 27680, 32541, 137353, 16428, 28815, 658...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...","[1505, 3184, 1043]","[840, 2499, 5010]"
1,"[122140, 77036, 156817, 76957, 68818, 155600, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[2683, 2022, 3497]","[2499, 5010, 6906]"
2,"[168054, 87218, 35731, 1, 20475, 9039, 124834,...","[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ...","[7233, 150, 5298]","[6270, 840, 2499]"
3,"[163193, 156352, 102888, 19914, 169438, 55772,...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ...","[3668, 2809, 6270]","[7449, 6654, 1257]"
4,"[72857, 38652, 160427, 55772, 119999, 141777, ...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ...","[4574, 2022, 1505]","[5010, 6906, 7655]"


In [20]:
ingr_const = user_data.loc[:, ['include', 'exclude']]
ingr_const.head()

Unnamed: 0_level_0,include,exclude
u,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[1505, 3184, 1043]","[840, 2499, 5010]"
1,"[2683, 2022, 3497]","[2499, 5010, 6906]"
2,"[7233, 150, 5298]","[6270, 840, 2499]"
3,"[3668, 2809, 6270]","[7449, 6654, 1257]"
4,"[4574, 2022, 1505]","[5010, 6906, 7655]"


## Construct const_1.x

In [21]:
# build 3 datasets
for i in range (0, const_num):
    ul = []
    i1 = []
    i2 = []
    nl = []
    
    for u in range (0, user_max):
        ul.append(u)
        i1.append(ingr_const.iloc[u].include[i])
        i2.append(None)
        nl.append(None)
    
    data = {
        ul_name: ul,
        i1_name: i1,
        i2_name: i2,
        nl_name: nl
    }
    
    df = pd.DataFrame(data)
    df.to_csv(output_path + 'const_1.' + str(i+1) + '.csv', index = False)

In [22]:
df.head()

Unnamed: 0,u,i1,i2,nl
0,0,1043,,
1,1,3497,,
2,2,5298,,
3,3,6270,,
4,4,1505,,


## Construct Const_2.x

In [23]:
# build 3 datasets
for i in range (0, const_num):
    ul = []
    i1 = []
    i2 = []
    nl = []
    
    for u in range (0, user_max):
        ul.append(u)
        i2.append(ingr_const.iloc[u].exclude[i])
        i1.append(None)
        nl.append(None)
    
    data = {
        ul_name: ul,
        i1_name: i1,
        i2_name: i2,
        nl_name: nl
    }
    
    df = pd.DataFrame(data)
    df.to_csv(output_path + 'const_2.' + str(i+1) + '.csv', index = False)

In [24]:
df.head()

Unnamed: 0,u,i1,i2,nl
0,0,,5010,
1,1,,6906,
2,2,,2499,
3,3,,1257,
4,4,,7655,


# Construct Nutrient Related Constraints

### Helper Function

In [25]:
def cos_sim(A, B):
  return dot(A, B)/(norm(A)*norm(B))

In [26]:
c3_dict = {}
for user in range (0, user_max):
    nutrs= []
    for item in user_data.iloc[user][0]:
        idx = user_data.iloc[user][0].index(item)
        if int(item) > item_max: # ignore if item idx > item_max
            continue
        if float(user_data.iloc[user][1][idx]) < 4: # ignore if item rate < 4
            continue
        nutrs.append(recipe_data.iloc[int(item)].nutrition)
    nutrs.sort()
    
    # pick 3 nutrients
    c3_dict[user] = []
    if len(nutrs) == 0: # no information
        for i in range (0, const_num):
            p = int(random.uniform(0, item_max))
            c3_dict[user].append(recipe_data.iloc[p].nutrition)
    else:
        c3_dict[user].append(nutrs[0])
        c3_dict[user].append(nutrs[int(len(nutrs)/2)])
        c3_dict[user].append(nutrs[-1])

### Test

In [27]:
alp = 0.5
user = 3
for i in range (2, const_num):
    RMSE = 0
    for item in user_data.iloc[user][0]:
        idx = user_data.iloc[user][0].index(item)
        if int(item) > item_max:
            continue
        rate = float(user_data.iloc[user][1][idx])
        if rate < 4:
            continue
        nutr = recipe_data.iloc[int(item)].nutrition
        
        new_rate = cos_sim(nutr, c3_dict[user][i]) * alp * 5
        new_rate = new_rate + rate * (1-alp)
        RMSE = RMSE + (new_rate - rate) * (new_rate - rate)
        print('rate: ', rate)
        print('similarity: ', cos_sim(nutr, c3_dict[user][i]) * alp * 5)
        print('new rate: ', new_rate)
    RMSE = RMSE / len(user_data.iloc[user][0])
    print(RMSE)
    break

rate:  5.0
similarity:  2.425832621402183
new rate:  4.925832621402183
rate:  5.0
similarity:  2.453080544641862
new rate:  4.953080544641862
rate:  5.0
similarity:  2.3164254650505716
new rate:  4.816425465050571
rate:  5.0
similarity:  2.4653237825516725
new rate:  4.965323782551673
rate:  5.0
similarity:  2.418278152366524
new rate:  4.918278152366524
rate:  5.0
similarity:  2.4738517631262837
new rate:  4.973851763126284
rate:  5.0
similarity:  2.4774919045404573
new rate:  4.977491904540457
rate:  5.0
similarity:  2.471063460032314
new rate:  4.971063460032314
rate:  5.0
similarity:  2.15488654394671
new rate:  4.65488654394671
rate:  5.0
similarity:  2.069586954148143
new rate:  4.569586954148143
rate:  5.0
similarity:  2.4803934109291816
new rate:  4.980393410929182
rate:  5.0
similarity:  2.5000000000000004
new rate:  5.0
rate:  5.0
similarity:  2.4670855965127365
new rate:  4.967085596512737
rate:  4.0
similarity:  2.4859217488188836
new rate:  4.485921748818884
rate:  5.0
sim

## Construct Const_3.x

In [28]:
# build 3 datasets
for i in range (0, const_num):
    ul = []
    i1 = []
    i2 = []
    nl = []
    
    for u in range (0, user_max):
        ul.append(u)
        i1.append(None)
        i2.append(None)
        nl.append(c3_dict[u][i])
    
    data = {
        ul_name: ul,
        i1_name: i1,
        i2_name: i2,
        nl_name: nl
    }
    
    df = pd.DataFrame(data)
    df.to_csv(output_path + 'const_3.' + str(i+1) + '.csv', index = False)
    df.head()