In [2]:
# import pandas as pd
# from bs4 import BeautifulSoup 
# import requests
# import time 
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# from selenium.common.exceptions import NoSuchElementException
# import numpy as np
# import re

In [2]:
import os
import sys
import logging
import unidecode
import ast

import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# pip install import-ipynb
import import_ipynb

# import config
# from ingredient_parser import ingredient_parser
from Cleaning_Data import ingredient_cleaner

importing Jupyter notebook from Cleaning_Data.ipynb


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\12242\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\12242\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
def get_and_sort_corpus(data):
    """
    Get corpus with the documents sorted in alphabetical order
    """
    corpus_sorted = []
    for doc in data.parsed.values:
        if isinstance(doc, list):
            doc_list = doc
        else:
            doc_list = ast.literal_eval(doc)
#         doc_list = ast.literal_eval(doc)
        doc_list.sort()
        corpus_sorted.append(doc_list)
    return corpus_sorted

In [4]:
def get_recommendations(N, scores):
    """
    Top-N recomendations order by score
    """
    # load in recipe dataset
    df_recipes = pd.read_csv('df_parsed_2.csv')
    # order the scores with and filter to get the highest N scores
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]
    # create dataframe to load in recommendations
    recommendation = pd.DataFrame(columns=["recipe", "ingredients", "score", "url"])
    count = 0
    for i in top:
        recommendation.at[count, "recipe"] = title_parser(df_recipes["recipe_name"][i])
        recommendation.at[count, "ingredients"] = ingredient_parser_final(
            df_recipes["ingredients"][i]
        )
        recommendation.at[count, "url"] = df_recipes["recipe_urls"][i]
        recommendation.at[count, "preperation_time"] = df_recipes["prep_time"][i]
        recommendation.at[count, "cook_time"] = df_recipes["cook_time"][i]
        recommendation.at[count, "serves"] = df_recipes["serve"][i]
        recommendation.at[count, "level_of_cooking"] = df_recipes["level_of_cooking"][i]
        recommendation.at[count, "taste"] = df_recipes["taste"][i]
        recommendation.at[count, "score"] = f"{scores[i]}"
        count += 1
    return recommendation

In [5]:
def title_parser(title):
    title = unidecode.unidecode(title)
    return title

In [6]:
def ingredient_parser_final(ingredient):
    """
    neaten the ingredients being outputted
    """
    if isinstance(ingredient, list):
        ingredients = ingredient
    else:
        ingredients = ast.literal_eval(ingredient)

    ingredients = ",".join(ingredients)
    ingredients = unidecode.unidecode(ingredients)
    return ingredients

In [7]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word_model):
        self.word_model = word_model
        self.vector_size = word_model.wv.vector_size

    def fit(self):  # comply with scikit-learn transformer requirement
        return self

    def transform(self, docs):  # comply with scikit-learn transformer requirement
        doc_word_vector = self.word_average_list(docs)
        return doc_word_vector

    def word_average(self, sent):
        """
        Compute average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            mean: float of averaging word vectors
        """
        mean = []
        for word in sent:
            if word in self.word_model.wv.index_to_key:
                mean.append(self.word_model.wv.get_vector(word))

        if not mean:  # empty words
            # If a text is empty, return a vector of zeros.
            # logging.warning(
            #     "cannot compute average owing to no vector for {}".format(sent)
            # )
            return np.zeros(self.vector_size)
        else:
            mean = np.array(mean).mean(axis=0)
            return mean

    def word_average_list(self, docs):
        """
        Compute average word vector for multiple docs, where docs had been tokenized.
        :param docs: list of sentence in list of separated tokens
        :return:
            array of average word vector in shape (len(docs),)
        """
        return np.vstack([self.word_average(sent) for sent in docs])

In [8]:
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word_model):

        self.word_model = word_model
        self.word_idf_weight = None
        self.vector_size = word_model.wv.vector_size

    def fit(self, docs):  # comply with scikit-learn transformer requirement
        """
        Fit in a list of docs, which had been preprocessed and tokenized,
        such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
        Then build up a tfidf model to compute each word's idf as its weight.
        Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
        :param
            pre_processed_docs: list of docs, which are tokenized
        :return:
            self
        """

        text_docs = []
        for doc in docs:
            text_docs.append(" ".join(doc))

        tfidf = TfidfVectorizer()
        tfidf.fit(text_docs)  # must be list of text string

        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)  # used as default value for defaultdict
        self.word_idf_weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()],
        )
        return self

    def transform(self, docs):  # comply with scikit-learn transformer requirement
        doc_word_vector = self.word_average_list(docs)
        return doc_word_vector

    def word_average(self, sent):
        """
        Compute average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            mean: float of averaging word vectors
        """

        mean = []
        for word in sent:
            if word in self.word_model.wv.index_to_key:
                mean.append(
                    self.word_model.wv.get_vector(word) * self.word_idf_weight[word]
                )  # idf weighted

        if not mean:  # empty words
            # If a text is empty, return a vector of zeros.
            # logging.warning(
            #     "cannot compute average owing to no vector for {}".format(sent)
            # )
            return np.zeros(self.vector_size)
        else:
            mean = np.array(mean).mean(axis=0)
            return mean

    def word_average_list(self, docs):
        """
        Compute average word vector for multiple docs, where docs had been tokenized.
        :param docs: list of sentence in list of separated tokens
        :return:
            array of average word vector in shape (len(docs),)
        """
        return np.vstack([self.word_average(sent) for sent in docs])

In [9]:
def get_recs(ingredients, N=5, mean=True):
    # load in word2vec model
    model = Word2Vec.load("model_cbow7.bin")
    model.init_sims(replace=True)
    if model:
        print("Successfully loaded model")
        print(model)
    # load in data
    data = pd.read_csv("df_parsed_2.csv")
    # parse ingredients
    data["parsed"] = data.ingredients.apply(ingredient_cleaner)
    # create corpus
    corpus = get_and_sort_corpus(data)

    if mean:
        # get average embdeddings for each document
        mean_vec_tr = MeanEmbeddingVectorizer(model)
        doc_vec = mean_vec_tr.transform(corpus)
        doc_vec = [doc.reshape(1, -1) for doc in doc_vec]
        assert len(doc_vec) == len(corpus)
    else:
        # use TF-IDF as weights for each word embedding
        tfidf_vec_tr = TfidfEmbeddingVectorizer(model)
        tfidf_vec_tr.fit(corpus)
        doc_vec = tfidf_vec_tr.transform(corpus)
        doc_vec = [doc.reshape(1, -1) for doc in doc_vec]
        assert len(doc_vec) == len(corpus)

    # create embessing for input text
    input = ingredients
    # create tokens with elements
    input = input.split(",")
    # parse ingredient list
    input = ingredient_cleaner(input)
    # get embeddings for ingredient doc
    if mean:
        input_embedding = mean_vec_tr.transform([input])[0].reshape(1, -1)
    else:
        input_embedding = tfidf_vec_tr.transform([input])[0].reshape(1, -1)

    # get cosine similarity between input embedding and all the document embeddings
    cos_sim = map(lambda x: cosine_similarity(input_embedding, x)[0][0], doc_vec)
    scores = list(cos_sim)
    # Filter top N recommendations
    recommendations = get_recommendations(N, scores)
    return recommendations

In [10]:
if __name__ == "__main__":
    input = "chicken"
    rec = get_recs(input)
    print(rec)
    rec.head(5)

  model.init_sims(replace=True)


Successfully loaded model
Word2Vec(vocab=7019, vector_size=100, alpha=0.2)
                                   recipe  \
0                 Country Captain Chicken   
1                 Coorg Style Dry Chicken   
2                Chicken Stew- Sk Khazana   
3                        Pollo A La Brasa   
4  Chicken Tagine With Chickpeas And Mint   

                                         ingredients               score  \
0  Chicken cut into pieces 1 kilogram,Oil 1 table...  0.5369949339739375   
1  Chicken cut into 8 pieces on the bone 750 gram...  0.5296900021167451   
2  Boneless chicken breasts cut into 1/2 inch cub...  0.5173323377048464   
3  Chicken cut into 4 pieces, 1 whole,Soy sauce 1...  0.5121328081191735   
4  Chickpeas (kabuli chana) soaked and parboiled ...  0.5070411692371333   

                                                 url preperation_time  \
0  https://www.sanjeevkapoor.com/Recipe/Country-C...            11-15   
1  https://www.sanjeevkapoor.com/Recipe/Coorg-Sty..

In [11]:
input = "chicken"
rec = get_recs(input)
rec.head(5)

  model.init_sims(replace=True)


Successfully loaded model
Word2Vec(vocab=7019, vector_size=100, alpha=0.2)


Unnamed: 0,recipe,ingredients,score,url,preperation_time,cook_time,serves,level_of_cooking,taste
0,Country Captain Chicken,"Chicken cut into pieces 1 kilogram,Oil 1 table...",0.5369949339739375,https://www.sanjeevkapoor.com/Recipe/Country-C...,11-15,21-25,4,Moderate,Mild
1,Coorg Style Dry Chicken,Chicken cut into 8 pieces on the bone 750 gram...,0.5296900021167451,https://www.sanjeevkapoor.com/Recipe/Coorg-Sty...,6-10,26-30,4,Easy,Spicy
2,Chicken Stew- Sk Khazana,Boneless chicken breasts cut into 1/2 inch cub...,0.5173323377048464,https://www.sanjeevkapoor.com/Recipe/Chicken-S...,6-10,21-25,4,Easy,Mild
3,Pollo A La Brasa,"Chicken cut into 4 pieces, 1 whole,Soy sauce 1...",0.5121328081191735,https://www.sanjeevkapoor.com/Recipe/Pollo-a-L...,900.0-1200.0,16-20,4,Moderate,Mild
4,Chicken Tagine With Chickpeas And Mint,Chickpeas (kabuli chana) soaked and parboiled ...,0.5070411692371333,https://www.sanjeevkapoor.com/Recipe/Chicken-T...,420.0-480.0,26-30,4,Moderate,Mild


In [12]:
list(rec.columns)

['recipe',
 'ingredients',
 'score',
 'url',
 'preperation_time',
 'cook_time',
 'serves',
 'level_of_cooking',
 'taste']