In [2]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

In [4]:
input_path = "../../dataset/"
output_path = "../data/"

### recipe_data.csv

In [7]:
raw_recipes = pd.read_csv(input_path + 'RAW_recipes.csv')
pp_recipes = pd.read_csv(input_path + 'PP_recipes.csv')

In [None]:
raw_recipes.head()

In [None]:
pp_recipes.head()

In [6]:
raw_recipes_data = raw_recipes.loc[:, ['name', 'id', 'nutrition']]
raw_recipes_data.head()

Unnamed: 0,name,id,nutrition
0,arriba baked winter squash mexican style,137739,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]"
1,a bit different breakfast pizza,31490,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]"
2,all in the kitchen chili,112140,"[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]"
3,alouette potatoes,59389,"[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]"
4,amish tomato ketchup for canning,44061,"[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]"


In [7]:
pp_recipes_data = pp_recipes.loc[:, ['id', 'ingredient_ids', 'i']]
pp_recipes_data.head()

Unnamed: 0,id,ingredient_ids,i
0,424415,"[389, 7655, 6270, 1527, 3406]",23
1,146223,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...",96900
2,312329,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...",120056
3,74301,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]",168258
4,76272,"[3484, 6324, 7594, 243]",109030


In [8]:
# cleaned up racipe dataframe
recipe_data = pd.merge(raw_recipes_data, pp_recipes_data, on = 'id')
recipe_data = recipe_data.rename(columns={'i': 'fid', 'id' : 'full_id'})
recipe_data.set_index('fid', inplace = True)
recipe_data.sort_index(inplace = True)
recipe_data.head()

# [calories (#), total fat (PDV), sugar (PDV) , sodium (PDV) , protein (PDV) , saturated fat (PDV) , and carbohydrates (PDV)]

Unnamed: 0_level_0,name,full_id,nutrition,ingredient_ids
fid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,white bean green chile pepper soup,40893,"[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]","[3384, 7979, 2127, 3502, 3217, 1257, 2778, 500..."
1,devilicious cookie cake delights,44394,"[132.3, 11.0, 39.0, 5.0, 4.0, 11.0, 5.0]","[912, 7557, 2499, 5382]"
2,baked potato toppings,85009,"[2786.2, 342.0, 134.0, 290.0, 161.0, 301.0, 42.0]","[4623, 6265, 1168, 6016, 3597, 3440, 7213, 169..."
3,kfc honey bbq strips,134728,"[316.0, 4.0, 40.0, 37.0, 78.0, 4.0, 10.0]","[1304, 2683, 3217, 6270, 3532, 869, 7557, 3698..."
4,lamb stew with tomatoes chickpeas and spices,200236,"[606.5, 65.0, 12.0, 34.0, 65.0, 83.0, 7.0]","[4130, 6270, 3486, 7557, 5010, 3203, 2683, 125..."


In [9]:
recipe_data.to_csv(output_path + 'recipe_data.csv')

### ingr_data.csv

In [13]:
ingr_map = pd.read_pickle(input_path + 'ingr_map.pkl')

In [14]:
ingr_map.head()

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308


In [15]:
ingr_data = ingr_map.loc[:, ['id', 'replaced', 'count']]
ingr_data.drop_duplicates(inplace=True)
ingr_data.sort_values(by='id', inplace=True)
ingr_data.rename(columns = {'replaced' : 'name', 'id': 'iid'}, inplace = True)
ingr_data.set_index('iid', inplace = True)
ingr_data.sort_index(inplace = True)
ingr_data.head()

Unnamed: 0_level_0,name,count
iid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,'s baking chocolate,2
1,'s chocolate chip,13
2,'s hugs chocolate,7
3,'s sauce,6
4,'s sour cream,2


In [16]:
ingr_data.to_csv(output_path + 'ingr_data.csv')

### rating_data.csv

In [9]:
PP_users = pd.read_csv(input_path + 'PP_users.csv')
rate_data = PP_users.set_index('u').loc[:, ['items', 'ratings']]
rate_data.head()

Unnamed: 0_level_0,items,ratings
u,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[1118, 27680, 32541, 137353, 16428, 28815, 658...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ..."
1,"[122140, 77036, 156817, 76957, 68818, 155600, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
2,"[168054, 87218, 35731, 1, 20475, 9039, 124834,...","[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ..."
3,"[163193, 156352, 102888, 19914, 169438, 55772,...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ..."
4,"[72857, 38652, 160427, 55772, 119999, 141777, ...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ..."


In [10]:
rate_data['items'] = rate_data['items'].str.replace(" ", "")
rate_data['items'] = rate_data['items'].apply(lambda x: x[1:-1].split(','))

In [11]:
rate_data['ratings'] = rate_data['ratings'].str.replace(" ", "")
rate_data['ratings'] = rate_data['ratings'].apply(lambda x: x[1:-1].split(','))

In [12]:
rate_data.head()

Unnamed: 0_level_0,items,ratings
u,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[1118, 27680, 32541, 137353, 16428, 28815, 658...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ..."
1,"[122140, 77036, 156817, 76957, 68818, 155600, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
2,"[168054, 87218, 35731, 1, 20475, 9039, 124834,...","[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ..."
3,"[163193, 156352, 102888, 19914, 169438, 55772,...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ..."
4,"[72857, 38652, 160427, 55772, 119999, 141777, ...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ..."


In [19]:
user_list = []
item_list = []
rating_list = []

In [22]:
for index, row in rate_data.iterrows():
    for x in range (0, len(row['items'])):
        item_id = int(row['items'][x])
        user_list.append(index)
        item_list.append(item_id)
        rating_list.append(float(row['ratings'][x]))

In [24]:
rating_data = pd.DataFrame({"user": user_list, "item": item_list, "rate": rating_list})

In [26]:
rating_data.to_csv(output_path + 'rating_data.csv', index = False, header = False)

### reduce_rating_data.csv

In [1]:
from surprise import Reader
from surprise import Dataset

In [5]:
df = pd.read_csv(output_path + 'rating_data.csv', names = ['userID', 'itemID', 'rating'])
df.head()

Unnamed: 0,userID,itemID,rating
0,0,1118,5.0
1,0,27680,5.0
2,0,32541,5.0
3,0,137353,5.0
4,0,16428,5.0


In [11]:
reduced = df[(df['userID']<=1000) & (df['itemID']<=5000)]

In [12]:
reduced

Unnamed: 0,userID,itemID,rating
0,0,1118,5.0
18,0,0,4.0
31,0,1118,5.0
49,0,0,4.0
87,1,4446,5.0
...,...,...,...
276789,999,2210,5.0
276832,999,453,5.0
276833,999,1690,5.0
276876,1000,610,5.0


In [13]:
reduced.to_csv(output_path + 'reduced_rating_data.csv', index = False, header = False)

In [14]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(reduced[['userID', 'itemID', 'rating']], reader)

In [15]:
# build test_set with anti_test
train_set = data.build_full_trainset()
test_set = train_set.build_anti_testset()

KeyboardInterrupt: 