In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import scale
import sklearn.metrics as sm
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report
from gensim.models import Word2Vec
import random
from tqdm import tqdm
import umap
import pandas as pd

# FOOD RECOMMENDATION MODEL

# Load dataset

In [2]:
r_recipe = pd.read_csv('../DATASETS/fooddotcom/RAW_recipes.csv')
p_recipe =  pd.read_csv('../DATASETS/fooddotcom/PP_recipes.csv')
i_recipe = pd.read_csv('../DATASETS/fooddotcom/RAW_interactions.csv')




In [3]:
p_recipe.head(1)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"


In [4]:
r_recipe.head(1)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,9/16/2005,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7


In [5]:
i_recipe.head(1)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2/17/2003,4,Great with a salad. Cooked on top of stove for...


# Data Merging

In [6]:
m_recipe = pd.merge(r_recipe,p_recipe,on='id')
m_recipe = m_recipe[['name','id','ingredients','description','ingredient_ids']]
m_recipe = m_recipe.rename(columns={"name": "recipe_name", "id": "recipe_id"})
merge_recipe = pd.merge(m_recipe,i_recipe,on='recipe_id')
merge_recipe = merge_recipe[['recipe_name','recipe_id','ingredients','ingredient_ids','user_id','rating']]
merge_recipe.head(1)




Unnamed: 0,recipe_name,recipe_id,ingredients,ingredient_ids,user_id,rating
0,arriba baked winter squash mexican style,137739,"['winter squash', 'mexican seasoning', 'mixed ...","[7933, 4694, 4795, 3723, 840, 5006, 6270]",4470,5


# CHECKING DATATYPES,NULL VALUES

In [7]:
merge_recipe.dtypes

recipe_name       object
recipe_id          int64
ingredients       object
ingredient_ids    object
user_id            int64
rating             int64
dtype: object

In [8]:
merge_recipe.isnull().sum()

recipe_name       0
recipe_id         0
ingredients       0
ingredient_ids    0
user_id           0
rating            0
dtype: int64

# CONVERTING INGREDIENTS DATATYPE TO STRING

In [9]:
merge_recipe['ingredients'] = pd.array(merge_recipe['ingredients'], dtype="string")
merge_recipe.dtypes

recipe_name               object
recipe_id                  int64
ingredients       string[python]
ingredient_ids            object
user_id                    int64
rating                     int64
dtype: object

# NUMBER OF UNIQUE USERS

In [10]:
users = merge_recipe["user_id"].unique().tolist()
len(users)



171378

# SPLIT DATASET

In [11]:
random.shuffle(users)

users_train = [users[i] for i in range(round(0.99*len(users)))]

train_users = merge_recipe[merge_recipe['user_id'].isin(users_train)]
validation_users = merge_recipe[~merge_recipe['user_id'].isin(users_train)]


# Capture list of food purchase history of users on 90% of users

In [12]:
purchases_train = []

for i in tqdm(users_train):
    temp = train_users[train_users["user_id"] == i]["recipe_name"].tolist()
    purchases_train.append(temp)
    

100%|████████████████████████████████████████████████████████████████████████| 169664/169664 [02:35<00:00, 1093.15it/s]


# Capture list of food purchase history of users on remaining users

In [13]:
purchases_val = []

for i in tqdm(validation_users['user_id'].unique()):
    temp = validation_users[validation_users["user_id"] == i]["recipe_id"].tolist()
    useridtemp = validation_users[validation_users["user_id"] == i]["user_id"].tolist()
    purchases_val.append(temp)

100%|████████████████████████████████████████████████████████████████████████████| 1714/1714 [00:01<00:00, 1536.10it/s]


In [14]:
def magic(numList):         
    s = map(str, numList)   
    s = ''.join(s)          
    s = int(s)              
    return s


        
    return np.mean(recipe_vec, axis=0)


user_purhist = validation_users.groupby("user_id")["recipe_name"].apply(lambda x: "{%s}" % ', '.join(x))
user_purhist =  pd.DataFrame(user_purhist)
user_purhist['recipe_count'] = validation_users.groupby("user_id")["recipe_name"].count()
user_purhist['recipe_id_list'] = validation_users.groupby("user_id")["recipe_id"].apply(lambda nums: (','.join(str(i) for i in nums)))


user_purhist.head(5)

Unnamed: 0_level_0,recipe_name,recipe_count,recipe_id_list
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3625,{spaetzle basic recipe art culinaire issue},1,350
4166,{mallow sweet potato balls},1,2425
4535,"{grilled orange sesame chicken and vegetables,...",2,921951964
6928,{should be illegal oven bbq ribs},1,8701
7164,{roast sticky chicken},1,8782



# Building vocabulary model using Word2Vec 
Detail link: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

# REFERENCE


In [15]:
model = Word2Vec(window = 1, sg = 1, hs = 0,negative = 10,alpha=0.03, min_alpha=0.0007,seed = 14, compute_loss=True)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples = model.corpus_count,epochs=10, report_delay=1)

(6277240, 8565600)

In [16]:
model.init_sims(replace=True)
print(model)

Word2Vec<vocab=41301, vector_size=100, alpha=0.03>


  model.init_sims(replace=True)


In [17]:
model.wv.most_similar('creamed tuna on toast', topn=10)

[('creamy parmesan broccoli', 0.9940081834793091),
 ('creamy feta salad dressing and dip', 0.9939537048339844),
 ('creamy crock pot potatoes', 0.9934401512145996),
 ('creamy cucumbers', 0.993410587310791),
 ('creamy red potato salad', 0.9931479692459106),
 ('cranberry oatmeal muffins', 0.9928507208824158),
 ('creamy cucumber salad', 0.9928462505340576),
 ('creamy chicken dijon', 0.9928303956985474),
 ('creamy grape salad', 0.9924914240837097),
 ('creamy parmesan leeks', 0.9924176335334778)]