In [1]:
# Raw interactions: one row per (user, recipe) pair, with UserID, RecipeID, Date of interaction, and Rating given

# Rating-matrix entry (UserID, RecipeID) = rating that user gave that recipe

In [2]:
import pandas as pd

# Load the training interactions. The file's first line is skipped and the
# columns are named explicitly instead.
data_df = pd.read_csv(
    './interactions_train.csv',
    sep=',',
    skiprows=[0],
    engine='python',
    names=["UserID", "RecipeID", "Date", "Rating", "u", "i"],
)
data_df.head()

Unnamed: 0,UserID,RecipeID,Date,Rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [3]:
# Dataset size summary: distinct recipes, distinct users, total interaction rows.
print(f"recipe num = {len(data_df['RecipeID'].unique())}")
print(f"user num = {len(data_df['UserID'].unique())}")
print(f"rating num = {len(data_df)}")

recipe num = 160901
user num = 25076
rating num = 698901


In [4]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

# Build old-id -> new-id lookup tables for users and recipes. New ids are
# consecutive integers starting at 0, handed out in the order Series.unique()
# returns the values (order of first appearance in data_df).
unique_RecipeID = data_df['RecipeID'].unique()
unique_UserID = data_df['UserID'].unique()
user_old2new_id_dict = {old_id: new_id for new_id, old_id in enumerate(unique_UserID)}
recipe_old2new_id_dict = {old_id: new_id for new_id, old_id in enumerate(unique_RecipeID)}

# Re-index both id columns of data_df with the lookup tables. Series.map does
# this in one vectorized pass per column, replacing a Python-level loop that
# wrote one cell at a time, and it does not depend on data_df having a default
# 0..n-1 index. Every value is a key of its table, so no NaN is introduced.
data_df['UserID'] = data_df['UserID'].map(user_old2new_id_dict)
data_df['RecipeID'] = data_df['RecipeID'].map(recipe_old2new_id_dict)


In [5]:
from scipy.sparse import coo_matrix
# Keep only the first 250000 interactions and turn them into a dense
# (user, recipe) rating matrix.
data_df = data_df[:250000]
num_user = len(data_df['UserID'].unique())
num_recipe = len(data_df['RecipeID'].unique())

# The matrix is sized by the number of distinct ids in the slice, so every id
# must be smaller than that count. This assumes ids were assigned 0, 1, 2, ...
# in first-appearance order over these same leading rows.
train_mat = (
    coo_matrix(
        (data_df['Rating'].values, (data_df['UserID'].values, data_df['RecipeID'].values)),
        shape=(num_user, num_recipe),
    )
    .toarray()
    .astype(float)
)

In [6]:
# Per-recipe rating count: for each column of train_mat, how many entries are
# strictly greater than zero.
num_ratings_recipe = np.count_nonzero(np.greater(train_mat, 0), axis=0)
print(num_ratings_recipe)

[18  1 11 ...  1  1  1]


In [7]:
# Column-wise total of train_mat: the sum of all ratings each recipe received.
ratings_sum = np.sum(train_mat, axis=0)

In [8]:
# Popularity score per recipe: (sum of ratings) * (number of ratings).
# NOTE: despite the variable name this is not a normalised weighted average;
# the product rewards recipes that are both highly and frequently rated.
weighted_averages = ratings_sum * num_ratings_recipe

# Rank all recipe columns by score. A stable ascending argsort orders ties by
# increasing column index, exactly like sorted(range(n), key=...), but runs in
# numpy instead of calling a Python key function per element. The slice keeps
# the last num_recipe entries (all of them when every column is ranked) and
# .tolist() yields plain Python ints.
top_recipes = np.argsort(weighted_averages, kind='stable')[-num_recipe:].tolist()
# Highest score first.
top_recipes_indices = top_recipes[::-1]

In [9]:
# Load the recipe metadata. The file's first line is skipped and the columns
# are named explicitly instead.
recipes_df = pd.read_csv(
    './RAW_recipes.csv',
    sep=',',
    skiprows=[0],
    engine='python',
    names=[
        "RecipeName", "RecipeID", "Minutes", "ContributorId", "Submitted", "Tags",
        "Nutrition", "N-steps", "Steps", "Description", "Ingredients", "N-ingredients",
    ],
)

In [10]:
# Collect every recipe name into a plain list, in recipes_df row order.
# Reading the column directly avoids building a Series per row via iterrows().
all_recipes = recipes_df['RecipeName'].tolist()

In [11]:
# Resolve the ranked recipes to their names. top_recipes_names gets one entry
# per ranked recipe; each entry is a list of matching names (empty when the
# recipe is not present in recipes_df).
#
# top_recipes_indices holds re-indexed recipe ids (train_mat column numbers),
# whereas recipes_df['RecipeID'] still holds the ids as read from the CSV, so
# each ranked id is translated back to its original id before the lookup.
# NOTE: this relies on recipe_old2new_id_dict being the mapping built from
# data_df; a later cell rebinds that name for recipes_df, so run cells in order.
recipe_new2old_id = {new_id: old_id for old_id, new_id in recipe_old2new_id_dict.items()}

# Group names by original recipe id once, instead of scanning the whole frame
# for every ranked recipe.
names_by_recipe_id = {}
for raw_id, raw_name in zip(recipes_df['RecipeID'].values, recipes_df['RecipeName'].values):
    names_by_recipe_id.setdefault(raw_id, []).append(raw_name)

top_recipes_names = [
    list(names_by_recipe_id.get(recipe_new2old_id.get(recipe_id), []))
    for recipe_id in top_recipes_indices
]

In [12]:
# generate non-personalized top-k lists for each category
top_beginner_friendly = []
top_vegetarian = []
top_healthy = []
top_vegan = []
top_desserts = []
top_asian = []
top_indian = []
top_15minutes = []
top_diabetic = []
top_lactose = []
# NOTE: the six lists below are created but never filled by this cell.
top_breakfast = []
top_lunch = []
top_snacks = []
top_nutfree = []
top_ramadan = []
top_dairy_free = []

# (text searched for in the recipe's Tags value, list that collects the name).
# While Tags is still the raw CSV string this is a substring test, so e.g.
# 'asian' also matches any longer tag that contains it.
tag_buckets = [
    ('easy', top_beginner_friendly),
    ('vegetarian', top_vegetarian),
    ('healthy', top_healthy),
    ('vegan', top_vegan),
    ('desserts', top_desserts),
    ('indian', top_indian),
    ('asian', top_asian),
    ('15-minutes-or-less', top_15minutes),
    ('lactose', top_lactose),
    ('diabetic', top_diabetic),
]

# top_recipes_indices holds re-indexed recipe ids (train_mat column numbers),
# whereas recipes_df['RecipeID'] still holds the ids as read from the CSV, so
# each ranked id is translated back to its original id before the lookup.
# NOTE: this relies on recipe_old2new_id_dict being the mapping built from
# data_df; a later cell rebinds that name for recipes_df, so run cells in order.
recipe_new2old_id = {new_id: old_id for old_id, new_id in recipe_old2new_id_dict.items()}

# First (Tags, RecipeName) pair per original recipe id, built in one pass so
# the loop below does not rescan the whole frame for every ranked recipe.
tags_and_name_by_recipe_id = {}
for raw_id, raw_tags, raw_name in zip(
    recipes_df['RecipeID'].values, recipes_df['Tags'].values, recipes_df['RecipeName'].values
):
    tags_and_name_by_recipe_id.setdefault(raw_id, (raw_tags, raw_name))

# Walk the ranking from best to worst so every category list stays in rank order.
for recipe_id in top_recipes_indices:
    match = tags_and_name_by_recipe_id.get(recipe_new2old_id.get(recipe_id))
    if match is None:
        # ranked recipe has no metadata row
        continue
    recipe_tags, recipe_name = match
    for needle, bucket in tag_buckets:
        if needle in recipe_tags:
            bucket.append(recipe_name)

In [13]:
# Write the ranked recipes to top_recipes.csv, one row per ranked recipe that
# has at least one name. The RecipeID column holds the value taken from
# top_recipes_indices and RecipeName holds the first matching name.
import csv
fields = ['RecipeID', 'RecipeName']
filename = "top_recipes.csv"

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with a blank line after every row.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # header row
    csvwriter.writerow(fields)
    # data rows, skipping ranked recipes for which no name was found
    rows = [
        [top_recipes_indices[index], recipe[0]]
        for index, recipe in enumerate(top_recipes_names)
        if len(recipe) > 0
    ]
    csvwriter.writerows(rows)

In [14]:
# Parse the Tags column from its string form into Python objects.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code that happens to be embedded in the CSV. Values that are not
# strings (e.g. already parsed on a previous run of this cell) are left as is,
# which makes the cell safe to re-run.
import ast
recipes_df["Tags"] = recipes_df["Tags"].apply(
    lambda raw_tags: ast.literal_eval(raw_tags) if isinstance(raw_tags, str) else raw_tags
)

In [15]:
# Collect every distinct tag across all recipes, in order of first appearance.
# dict.fromkeys keeps insertion order and ignores repeats, so this gives the
# same list as appending each unseen tag one by one, without rescanning the
# growing list for every tag of every recipe.
tags_list = list(dict.fromkeys(tag for tags in recipes_df["Tags"] for tag in tags))
print(len(tags_list))

552


In [16]:
# Build an old-id -> new-id lookup table for the recipes in recipes_df. New ids
# are consecutive integers starting at 0, in order of first appearance.
# NOTE: this rebinds unique_RecipeID and recipe_old2new_id_dict, replacing any
# earlier objects bound to those names.
unique_RecipeID = recipes_df['RecipeID'].unique()
recipe_old2new_id_dict = {old_id: new_id for new_id, old_id in enumerate(unique_RecipeID)}

# Re-index recipes_df['RecipeID'] with the lookup table. Series.map does this in
# one vectorized pass instead of a Python-level loop writing one cell at a
# time, and it does not depend on recipes_df having a default 0..n-1 index.
recipes_df['RecipeID'] = recipes_df['RecipeID'].map(recipe_old2new_id_dict)

In [17]:
# Restrict recipes_df to its first 10k rows, then encode each recipe's tags as
# a 0/1 row aligned with tags_list: position k is 1 when tags_list[k] is among
# the recipe's tags. A recipe with no tags yields an all-zero row.
recipes_df = recipes_df[:10000]
recipe_tag_list = [
    [1 if tag in recipe_tags else 0 for tag in tags_list]
    for recipe_tags in recipes_df['Tags']
]

In [18]:
 recipe_tag_mat = np.asarray(recipe_tag_list)  # rows follow recipe_tag_list, columns follow tags_list

In [19]:
# Tag-based nearest neighbours: Jaccard similarity between the rows of the
# (recipe, tag) matrix, then the 10 most similar other recipes for each row.

# 1.0 where a recipe carries a tag, 0.0 elsewhere; size = (#recipe, #tag)
indicator_mat = (recipe_tag_mat > 0).astype(float)

# number of tags on each recipe; size = (#recipe, 1)
num_rating_per_recipe = indicator_mat.sum(axis=1, keepdims=True)

# Jaccard numerator: number of tags each pair of recipes has in common;
# size = (#recipe, #recipe)
numerator = indicator_mat @ indicator_mat.T

# Jaccard denominator: number of tags in the union of each pair,
# |A| + |B| - |A and B|; size = (#recipe, #recipe)
denominator = num_rating_per_recipe + num_rating_per_recipe.T - numerator

# A pair of tagless recipes has an empty union; use 1 there so the division
# below is defined (the numerator is 0 for such a pair).
denominator[denominator == 0] = 1
num_recipe = len(recipes_df)
# Jaccard similarity matrix; size = (#recipe, #recipe)
Jaccard_mat = numerator / denominator

# Per-recipe mean and deviation of the tag indicators. Nothing further down in
# this cell reads these three values.
num_rating_recipe = np.sum(indicator_mat, axis=1, keepdims=True)
num_rating_recipe[num_rating_recipe == 0] = 1
mu_recipe = np.sum(recipe_tag_mat, axis=1, keepdims=True) / num_rating_recipe
deviation_mat = (recipe_tag_mat - mu_recipe) * indicator_mat

# For every recipe keep the row positions of its 10 most similar recipes.
prediction_mat = []
for row_idx in range(num_recipe):
    # row_sims is a view, so the write below also lands in Jaccard_mat: the
    # diagonal entry becomes -1, which keeps a recipe out of its own top 10.
    row_sims = Jaccard_mat[row_idx]
    row_sims[row_idx] = -1
    # argpartition places the 10 largest entries last; they are not sorted
    # among themselves.
    prediction_mat.append(np.argpartition(row_sims, -10)[-10:])

In [20]:
# Translate each recipe's neighbour indices into neighbour names.
#
# The indices in prediction_mat are row positions in the recipe/tag matrix,
# which follows recipes_df row order; all_recipes lists the names in that same
# row order, so it is the table to index (top_recipes_names is ordered by
# popularity rank and would return unrelated recipes).
#
# Each neighbour is stored as a list holding its name, or an empty list when
# the name is missing, because the CSV-writing cell reads neighbor[0] and
# treats an empty list as "no name".
item_neighbors = [
    [[] if pd.isna(all_recipes[i]) else [all_recipes[i]] for i in neighbor_idx]
    for neighbor_idx in prediction_mat
]

In [21]:
# Sanity check: number of recipes for which a neighbour list was built.
print(len(item_neighbors))

10000


In [22]:
# Write each of the first 10000 recipes together with the names of its
# neighbours to personalized.csv (uses the csv module imported in an earlier cell).
fields = ['Recipe', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
filename = "personalized.csv"
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with a blank line after every row.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # header row
    csvwriter.writerow(fields)
    rows = []
    for index, recipe in enumerate(all_recipes[0:10000]):
        # The first column is a one-entry dict {row index: recipe name};
        # csv.writer stores its string form. No recipe is filtered out here.
        recipe = {index: recipe}
        # Each neighbour is a (possibly empty) list of names: take the first
        # name, or an empty string when there is none.
        flat_list = [
            neighbor[0] if len(neighbor) > 0 else ""
            for neighbor in item_neighbors[index]
        ]
        # One column per header field: the recipe plus up to 10 neighbours.
        rows.append([recipe] + flat_list[:10])
    csvwriter.writerows(rows)