recipes-telegram-bot/src/recommendation_engine/data.py /

1. Importing Data

In [1]:
import os
import pandas as pd

DATA_CUISINE_PATH = "data/cuisine_data/"
DATA_RECIPES_PATH = "data/recipes_data/"

def import_data():
    train = pd.read_json(os.path.join(DATA_CUISINE_PATH, 'train.json'))
    test = pd.read_json(os.path.join(DATA_CUISINE_PATH, 'test.json'))
    return pd.concat([train,test],axis=0)

def import_recipes_main():
    data_path_ar = os.path.join(DATA_RECIPES_PATH, "recipes_raw_nosource_ar.json")
    data_path_epi = os.path.join(DATA_RECIPES_PATH, "recipes_raw_nosource_epi.json")
    data_path_fn = os.path.join(DATA_RECIPES_PATH, "recipes_raw_nosource_fn.json")
    
    data =  pd.concat([pd.read_json(data_path_ar, orient='index'), pd.read_json(data_path_epi, orient='index'), pd.read_json(data_path_fn, orient='index')])
    data = data.reset_index()
    data = data.drop(columns=['picture_link', 'index'])
    return data

In [30]:
def import_recipes_main_test():
    all_recipes = pd.read_json('./data/recipes_data/recipes_raw_nosource_allrecipes.json', orient='index')
    epicurious = pd.read_json('./data/recipes_data/recipes_raw_nosource_epicurious.json', orient='index')
    food_network = pd.read_json('./data/recipes_data/recipes_raw_nosource_foodnetwork.json', orient='index')
    recipes = pd.concat([all_recipes, epicurious, food_network], axis=0)
    
    recipes = recipes.reset_index()
    recipes = recipes.drop(columns=['index', 'picture_link'])
    return recipes

In [27]:
all_recipes = pd.read_json('./data/recipes_data/recipes_raw_nosource_allrecipes.json', orient='index')

epicurious = pd.read_json('./data/recipes_data/recipes_raw_nosource_epicurious.json', orient='index')

food_network = pd.read_json('./data/recipes_data/recipes_raw_nosource_foodnetwork.json', orient='index')

recipes = pd.concat([all_recipes, epicurious, food_network], axis=0)


In [31]:
import_recipes_main_test()

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...
...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...


recipes-telegram-bot/src/recommendation_engine/feature_engineering.py /

2. Stopwords

In [2]:
import nltk
import re
import pandas as pd
from sklearn import feature_extraction, model_selection, pipeline, manifold, preprocessing

#from src.recommendation_engine.data import import_data

additional_stop_words = ["advertisement", "advertisements",
                         "cup", "cups",
                         "tablespoon", "tablespoons", 
                         "teaspoon", "teaspoons", 
                         "ounce", "ounces",
                         "salt", 
                         "pepper", 
                         "pound", "pounds",
                         ]
#nltk.download('wordnet')
#nltk.download("stopwords")

[nltk_data] Downloading package wordnet to /Users/reina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/reina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

3. Pre-Processing Text

In [3]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)

    ## Remove digits
    text = ''.join([i for i in text if not i.isdigit()])

    ## remove mutliple space
    text = re.sub(' +', ' ', text)

    return text

4. Pre-Process Data: Cuisine

In [11]:
def process_data():
    dataset = import_data() # cuisine

    def processing(row):
        ls = row['ingredients']
        return ' '.join(ls)

    dataset['ingredients'] = dataset.apply(lambda x: processing(x), axis=1)
    dataset.dropna(inplace=True)
    dataset = dataset.drop(columns=['id']).reset_index(drop=True)

    stop_word_list = nltk.corpus.stopwords.words("english")

    # Extend list of stop words
    stop_word_list.extend(additional_stop_words)

    dataset["ingredients_query"] = dataset["ingredients"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=stop_word_list))
    return dataset

5. Create Embeddings from Cuisine

In [None]:
def create_embeddings(dataset):
    ## Tf-Idf (advanced variant of BoW)
    vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

    corpus = dataset["ingredients_query"]
    vectorizer.fit(corpus)
    embedded_ingredients = vectorizer.transform(corpus)
    dic_vocabulary = vectorizer.vocabulary_

    ## Chi squarred correlation embeddings reduction
    labels = dataset["cuisine"]
    names = vectorizer.get_feature_names()
    p_value_limit = 0.95
    dtf_features = pd.DataFrame()

    for cat in np.unique(labels):
        chi2, p = feature_selection.chi2(embedded_ingredients, labels==cat)
        dtf_features = dtf_features.append(pd.DataFrame(
                       {"feature":names, "score":1-p, "labels":cat}))
        dtf_features = dtf_features.sort_values(["labels","score"], 
                        ascending=[True,False])
        dtf_features = dtf_features[dtf_features["score"]>p_value_limit]
    names = dtf_features["feature"].unique().tolist()

    ## Check the main ingredients
    for cat in np.unique(labels):
        print("# {}:".format(cat))
        print("  . selected features:",len(dtf_features[dtf_features["labels"]==cat]))
        print("  . top features:", ",".join(dtf_features[dtf_features["labels"]==cat]["feature"].values[:10]))
        print(" ")
    
    ## New embeddings
    vectorizer = feature_extraction.text.TfidfVectorizer(vocabulary=names)
    vectorizer.fit(corpus)
    embedded_ingredients = vectorizer.transform(corpus)
    dic_vocabulary = vectorizer.vocabulary_

    return vectorizer

6. Pre-Process Recipes

In [35]:
def process_recipes(data): # Recipes dataset
    # list of stopwords
    stop_word_list = nltk.corpus.stopwords.words("english")

    # Extend list of stop words
    stop_word_list.extend(additional_stop_words)

    data["ingredients_query"] = data["ingredients"].apply(lambda x: 
            utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
            lst_stopwords=stop_word_list))
    return data

def get_tokenize_text(input_text):
    # list of stopwords
    stop_word_list = nltk.corpus.stopwords.words("english")

    # Extend list of stop words
    stop_word_list.extend(additional_stop_words)

    return utils_preprocess_text(input_text, flg_stemm=False, flg_lemm=True, lst_stopwords=stop_word_list) # same function to pre-process text

recipes-telegram-bot/src/recommendation_engine/create_model.py 

In [5]:
import pandas as pd
import numpy as np
import os
## for processing
import re
import nltk
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
## model & processing libraries
from sklearn import feature_selection
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn import utils
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
## DB accesses
import sqlite3 as sq
#from src.recommendation_engine.feature_engineering import process_data, create_embeddings

In [6]:
MODEL_PATH = "models/nlp"
MODEL_EMBEDDINGS_PATH = os.path.join(MODEL_PATH, 'similarity_embeddings')
CUISINE_CLASSES = ['brazilian','british','cajun_creole','chinese','filipino','french','greek','indian',
                   'irish','italian','jamaican','japanese','korean','mexican','moroccan','russian','southern_us',
                   'spanish','thai','vietnamese']

os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(MODEL_EMBEDDINGS_PATH, exist_ok=True)

In [8]:
## Save to file in the current working directory
def save_pkl(file, pkl_filename):
    with open(pkl_filename, 'wb') as pkl_file:
        pickle.dump(file, pkl_file)

def compute_performances(predicted, predicted_prob, y_test):
    
    classes = np.unique(y_test)
    y_test_array = pd.get_dummies(y_test, drop_first=False).values

    ## Accuracy, Precision, Recall
    accuracy = metrics.accuracy_score(y_test, predicted)
    balance_accuracy = metrics.balanced_accuracy_score(y_test, predicted)
    auc = metrics.roc_auc_score(y_test, predicted_prob, 
                                multi_class="ovr")
    print("Balanced Accuracy:",  round(balance_accuracy,2))
    print("Accuracy:",  round(accuracy,2))
    print("Auc:", round(auc,2))
    print("Detail:")
    print(metrics.classification_report(y_test, predicted))


def create_model_cuisine_predictions():
    ## Process data
    dataset = process_data() # cuisine dataset

    ## Create embeddings
    vectorizer = feature_extraction.text.TfidfVectorizer() #create_embeddings(dataset)

    ## Model
    classifier = LogisticRegressionCV(cv=3,
                                      random_state=42,
                                      max_iter=300,
                                      n_jobs=-1,
                                      verbose=1) #naive_bayes.MultinomialNB()

    ## pipeline
    model = pipeline.Pipeline([("vectorizer", vectorizer),  
                                ("classifier", classifier)])

    ## Split the dataset
    dataset_train, dataset_test = model_selection.train_test_split(dataset, test_size=0.3, random_state=42)

    ## Create embeddings
    X_train = dataset_train['ingredients_query']; X_test = dataset_test['ingredients_query'];
    y_train = dataset_train['cuisine']; y_test = dataset_test['cuisine']; 

    ## train classifier
    model.fit(X_train, y_train)

    ## test
    predicted = model.predict(X_test)
    predicted_prob = model.predict_proba(X_test)

    ## Compute performance of the model
    compute_performances(predicted, predicted_prob, y_test)
    
    ## Save model and vectorizer to disk
    save_pkl(model, os.path.join(MODEL_PATH, "pickle_model.pkl"))

In [12]:
create_model_cuisine_predictions()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

RUNNING THE L-BFGS-B CODE

           * * *

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
Machine precision = 2.220D-16
Machine precision = 2.220D-16
 N =        52460     M =           10
 N =        52460     M =           10
 N =        52460     M =           10

At X0         0 variables are exactly at the bounds


At X0         0 variables are exactly at the bounds
At X0         0 variables are exactly at the bounds

At iterate    0    f=  5.56038D+04    |proj g|=  2.72995D+03


At iterate    0    f=  5.56038D+04    |proj g|=  2.72995D+03
At iterate    0    f=  5.56008D+04    |proj g|=  2.73000D+03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient

 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
52460      8     10      1     0     0   7.749D-02   4.788D+04
  F =   47881.047253561956     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        52460     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.74245D+04    |proj g|=  1.88901D+02
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        52460     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.7

 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
52460     41     47      1     0     0   8.065D-02   3.501D+04
  F =   35009.298172277442     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
524

 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.



At iterate   50    f=  2.23595D+04    |proj g|=  1.44094D+00

At iterate   50    f=  2.23381D+04    |proj g|=  3.90576D+00

At iterate   50    f=  2.23894D+04    |proj g|=  1.32240D+01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
52460     90     98      1     0     0   3.400D-02   2.236D+04
  F =   22358.993533325283     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at

 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.



At iterate   50    f=  1.35008D+04    |proj g|=  1.35882D+01

At iterate   50    f=  1.35671D+04    |proj g|=  3.15303D+01

At iterate   50    f=  1.35968D+04    |proj g|=  2.24418D+01

At iterate  100    f=  1.34532D+04    |proj g|=  1.01507D+00

At iterate  100    f=  1.35189D+04    |proj g|=  1.12261D+00

At iterate  100    f=  1.35400D+04    |proj g|=  1.53418D+00

At iterate  150    f=  1.34523D+04    |proj g|=  3.73368D-01

At iterate  150    f=  1.35181D+04    |proj g|=  1.63889D-01

At iterate  150    f=  1.35389D+04    |proj g|=  6.58645D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
52460    193    202   

 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        52460     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.06370D+04    |proj g|=  3.44556D+00

At iterate   50    f=  8.29049D+03    |proj g|=  6.07121D+01

At iterate   50    f=  8.41115D+03    |proj g|=  1.12654D+01

At iterate   50    f=  8.40649D+03    |proj g|=  1.37756D+01

At iterate  100    f=  8.10091D+03    |proj g|=  3.24921D+00

At iterate  100    f=  8.20824D+03    |proj g|=  1.71817D+01

At iterate  100    f=  8.23178D+03    |proj g|=  1.29175D+01

At iterate  150    f=  8.06396D+03    |proj g|=  3.53062D+00

At iterate  150    f=  8.17850D+03    |proj g|=  2.52671D+00

At iterate  150    f=  8.19267D+03    |proj g|=  6.44058D+00

At iterate  200    f=  8.17282D+03    |proj g|=  3.28831D-01

At iterate  200    f=  8.05889D+03    |proj g|=  1.29783D+00

At iterate  200    f=  8.18697D+03    |proj g|=  3.21069D+00

At iterate  250    f=  8.0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the document


At iterate   50    f=  5.21823D+03    |proj g|=  5.39374D+01

At iterate   50    f=  5.35089D+03    |proj g|=  3.58976D+01

At iterate   50    f=  5.41283D+03    |proj g|=  1.69990D+01

At iterate  100    f=  5.05920D+03    |proj g|=  5.90032D+00

At iterate  100    f=  5.17527D+03    |proj g|=  6.08963D+00

At iterate  100    f=  5.23183D+03    |proj g|=  1.05174D+01

At iterate  150    f=  4.99827D+03    |proj g|=  1.43352D+01

At iterate  150    f=  5.16309D+03    |proj g|=  3.22070D+00

At iterate  150    f=  5.10791D+03    |proj g|=  4.23869D+00

At iterate  200    f=  4.97441D+03    |proj g|=  1.15422D+00

At iterate  200    f=  5.13069D+03    |proj g|=  2.95324D+00

At iterate  200    f=  5.07874D+03    |proj g|=  6.24578D+00

At iterate  250    f=  4.96252D+03    |proj g|=  4.91898D+00

At iterate  250    f=  5.11633D+03    |proj g|=  3.27159D+00

At iterate  250    f=  5.06525D+03    |proj g|=  1.63184D+00

At iterate  300    f=  4.95671D+03    |proj g|=  4.37842D+00

       

 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.



At iterate   50    f=  3.59973D+03    |proj g|=  6.58161D+00

At iterate   50    f=  3.77906D+03    |proj g|=  9.84665D+00

At iterate   50    f=  3.68625D+03    |proj g|=  4.62995D+01

At iterate  100    f=  3.44329D+03    |proj g|=  7.95565D+00

At iterate  100    f=  3.60750D+03    |proj g|=  1.17806D+01

At iterate  100    f=  3.50449D+03    |proj g|=  6.18328D+00

At iterate  150    f=  3.37316D+03    |proj g|=  2.48459D+00

At iterate  150    f=  3.55395D+03    |proj g|=  5.29341D+00

At iterate  150    f=  3.43891D+03    |proj g|=  1.42737D+01

At iterate  200    f=  3.32321D+03    |proj g|=  1.32358D+01

At iterate  200    f=  3.51003D+03    |proj g|=  2.79918D+01

At iterate  200    f=  3.39353D+03    |proj g|=  5.46077D+00

At iterate  250    f=  3.29855D+03    |proj g|=  2.01038D+00

At iterate  250    f=  3.36861D+03    |proj g|=  5.62755D+00

At iterate  250    f=  3.47758D+03    |proj g|=  4.72530D+00

At iterate  300    f=  3.28290D+03    |proj g|=  5.78711D+00

       

 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        52460     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.81624D+03    |proj g|=  5.62840D+00
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        52460     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.02355D+03    |proj g|=  2.57048D+00

At iterate   50    f=  2.78529D+03    |proj g|=  4.04669D+00

At iterate   50    f=  2.99861D+03    |proj g|=  3.45571D+00

At iterate   50    f=  2.79375D+03    |proj g|=  2.86910D+00

At iterate  100    f=  2.70123D+03    |proj g|=  8.04803D+00

At iterate  100    f=  2.74291D+03    |proj g|=  4.79262D+00

At iterate  100    f=  2.93562D+03    |proj g|=  2.97590D+01

At iterate  150    f=  2.58232D+03    |proj g|=  6.47243D+00

At iterate  150    f=  2.63085D+03    |proj g|=  8.23271D+00

At iterate  150    f=  2.85698D+03    |proj g

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   25.9s finished


Balanced Accuracy: 0.68
Accuracy: 0.79
Auc: 0.98
Detail:
              precision    recall  f1-score   support

   brazilian       0.76      0.54      0.63       147
     british       0.66      0.48      0.56       242
cajun_creole       0.78      0.67      0.72       495
     chinese       0.78      0.86      0.81       760
    filipino       0.71      0.59      0.64       200
      french       0.61      0.64      0.63       820
       greek       0.79      0.71      0.75       354
      indian       0.88      0.91      0.89       899
       irish       0.63      0.48      0.54       207
     italian       0.81      0.90      0.85      2351
    jamaican       0.89      0.67      0.76       135
    japanese       0.83      0.70      0.76       443
      korean       0.84      0.75      0.79       249
     mexican       0.91      0.92      0.92      1957
    moroccan       0.83      0.74      0.78       237
     russian       0.55      0.43      0.48       144
 southern_us       0.70 

In [40]:
def d2v_embeddings(data): # Recipe dataset
    data = data['ingredients_query'].tolist()
    tagged_data = [TaggedDocument(words=row.split(), tags=[str(index)]) for index, row in enumerate(data)]

    # hyperparmeters (tune later?)
    max_epochs = 20
    vec_size = 50
    alpha = 0.025

    model_embedding = Doc2Vec(vector_size=vec_size,
                        alpha=alpha, 
                        min_alpha=0.00025,
                        min_count=1,
                        dm =1)
  
    model_embedding.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model_embedding.train(tagged_data,
                    total_examples=model_embedding.corpus_count,
                    epochs=model_embedding.epochs)
        # decrease the learning rate
        model_embedding.alpha -= 0.0002
        # fix the learning rate, no decay
        model_embedding.min_alpha = model_embedding.alpha
    
    return model_embedding

recipes-telegram-bot/src/data_base/generate_db.py /

In [32]:
# Need to move this chunck down 
# Database Part I

import sqlite3 as sq
import pandas as pd
import os

#from src.recommendation_engine.data import import_recipes_main
#from src.recommendation_engine.feature_engineering import process_recipes
#from src.recommendation_engine.inference import load_pkl

MODEL_PATH = 'models/nlp'

def create_and_populate_db():
    data = import_recipes_main_test()
    
    # Process the data
    data = process_recipes(data)
    
    # Predict cuisine from trained model
    model = load_pkl(os.path.join(MODEL_PATH, 'pickle_model.pkl'))
    data["cuisine"] = model.predict(data["ingredients_query"].tolist())
    
    db = sq.connect('recipes.db')
    #Verify dtypes
    for col in data.columns:
        data[col] = data[col].astype('str')

    print(' ------------------ Check data before populating the db ------------------')
    print(data.columns)
    print(data.head())
    print(data.shape)
    data.to_sql('main_recipes', db, if_exists='replace')

In [33]:
# Database Part II

import sqlite3 as sq
import pandas as pd

def get_df_from_db(cuisine):
    db = sq.connect('recipes.db')
    sql_query = "SELECT title, instructions, ingredients, ingredients_query FROM main_recipes WHERE cuisine = ?"
    return pd.read_sql(sql_query, db, params=(cuisine,))

In [36]:
create_and_populate_db()

 ------------------ Check data before populating the db ------------------
Index(['title', 'ingredients', 'instructions', 'ingredients_query', 'cuisine'], dtype='object')
                               title  \
0  Slow Cooker Chicken and Dumplings   
1      Awesome Slow Cooker Pot Roast   
2               Brown Sugar Meatloaf   
3        Best Chocolate Chip Cookies   
4  Homemade Mac and Cheese Casserole   

                                         ingredients  \
0  ['4 skinless, boneless chicken breast halves A...   
1  ['2 (10.75 ounce) cans condensed cream of mush...   
2  ['1/2 cup packed brown sugar ADVERTISEMENT', '...   
3  ['1 cup butter, softened ADVERTISEMENT', '1 cu...   
4  ['8 ounces whole wheat rotini pasta ADVERTISEM...   

                                        instructions  \
0  Place the chicken, butter, soup, and onion in ...   
1  In a slow cooker, mix cream of mushroom soup, ...   
2  Preheat oven to 350 degrees F (175 degrees C)....   
3  Preheat oven to 350 degr

In [17]:
def train_model_embeddings():
    db = sq.connect('recipes.db')
    cursor = db.cursor()
    
    for cuisine in CUISINE_CLASSES:
        sql_query = "SELECT title, instructions, ingredients, ingredients_query FROM main_recipes WHERE cuisine = ?"
        data = pd.read_sql(sql_query, db, params=(cuisine,))
        
        model_embedding = d2v_embeddings(data)
        save_pkl(model_embedding, os.path.join(MODEL_EMBEDDINGS_PATH, f'd2v_{cuisine}.pkl'))

In [41]:
train_model_embeddings()

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19


recipes-telegram-bot/src/recommendation_engine/inference.py /

In [None]:
import pickle
import os

MODEL_PATH = 'models/nlp'
MODEL_EMBEDDINGS_PATH = os.path.join(MODEL_PATH, 'similarity_embeddings')
CUISINE_CLASSES = ['greek','southern_us','filipino','indian','jamaican','spanish','italian','mexican','chinese','british','thai','vietnamese','cajun_creole','brazilian','french','japanese','irish','korean','moroccan','russian']

#from src.recommendation_engine.feature_engineering import get_tokenize_text
#from src.data_base.inference import get_df_from_db

In [4]:
## Load from file
def load_pkl(pkl_filename):
    with open(pkl_filename, 'rb') as pkl_file:
        return pickle.load(pkl_file)

def infer_cuisine_type_on_recipes(data):
    model_path = os.path.join(MODEL_PATH, 'pickle_model.pkl')
    model = load_pkl(model_path)
    data["cuisine"] = model.predict(data["ingredients_query"])
    return data
    
def predict_cuisine(input_text):
    top = 5
    
    # Tokenize text
    tokenize_text = get_tokenize_text(input_text)
    
    # Get model
    model_path = os.path.join(MODEL_PATH, 'pickle_model.pkl')
    model = load_pkl(model_path)
    
    # Tokenize text
    tokenize_text = get_tokenize_text(input_text)

    # Get classes ordered by probability
    proba = model.predict_proba([tokenize_text])[0]

    # Sorted index list 
    indexes = sorted(range(len(proba)), key=lambda k: proba[k], reverse=True)

    # Get cuisine
    cuisine_labels = model.classes_.tolist()
    cusine_ordered = [cuisine_labels[ind] for ind in indexes]

    return cusine_ordered[:top]

def get_similar_recipes(input_text, cuisine, top_k=3):
    # Tokenize text
    tokenize_text = get_tokenize_text(input_text).split()
    
    # Load model from the selected cuisine
    d2v = load_pkl(os.path.join(MODEL_EMBEDDINGS_PATH, f'd2v_{cuisine}.pkl'))

    # Get embeddings
    embeddings = d2v.infer_vector(tokenize_text)
    best_recipes = d2v.docvecs.most_similar([embeddings]) #gives you top 10 document tags and their cosine similarity

    # Get recipes
    best_recipes_index = [int(output[0]) for output in best_recipes]
    
    # Get dDtaFrame
    df = get_df_from_db(cuisine)
    
    return df[df.index.isin(best_recipes_index)].head(top_k)

recipes-telegram-bot/src/data_base/inference.py /

In [None]:
# Import libraries
import argparse
import logging
from typing import Dict

from telegram import ReplyKeyboardMarkup, Update, InlineKeyboardMarkup, InlineKeyboardButton
from telegram.ext import (
    Updater,
    CommandHandler,
    MessageHandler,
    Filters,
    ConversationHandler,
    CallbackContext,
    CallbackQueryHandler,
)

from src.recommendation_engine.inference import predict_cuisine, get_similar_recipes
from src.recognition_engine.inference import classify_image

# Create the parser
my_parser = argparse.ArgumentParser(description='Give your personal token')

# Add the arguments
my_parser.add_argument('token', metavar='token', type=str, help='The token given by Fatherbot')


# Enable logging
logging.basicConfig(
    #filename= 'telgramBot.log',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

logger = logging.getLogger(__name__)

# Main interactions
CHOOSING, GET_TEXT, GET_IMAGE = range(3)
# Callback data
CALLBACK1, CALLBACK2 = range(3,5)

reply_keyboard = [
    ['Show ingredients', 'Get recipes'],
    ['Remove item', 'Done'],
]
markup = ReplyKeyboardMarkup(reply_keyboard, one_time_keyboard=False)

def start(update: Update, context: CallbackContext) -> int:
    user = update.message.from_user
    logger.info(f"{user.first_name}: Start")

    context.user_data['chat_id'] = update.message.chat_id

    update.message.reply_text(
        "Hi! I am you recipe bot. What ingredients do you currently have?"
        "You can send an image or add ingredients by typing it in one or two words",
        reply_markup=markup,
    )
    return CHOOSING

def get_basket_txt(list_ingredients):
    txt = 'Here are your current ingredients:\n'
    for ingredient in list_ingredients:
        txt += f'   - {ingredient}\n'
    return txt

def received_image_information(update: Update, context: CallbackContext) -> int:
    user = update.message.from_user
    photo_file = update.message.photo[-1].get_file()
    photo_file.download('infer_image.png')
    logger.info("Photo of %s: %s", user.first_name, 'infer_image.jpg')
    update.message.reply_text(
        'Thanks the photo is being processed'
    )

    user_data = context.user_data
    
    # Infer image prediction
    ingredient = classify_image('infer_image.png')
    
    keyboard = [
        [
            InlineKeyboardButton(ingredient[0], callback_data=ingredient[0]),
            InlineKeyboardButton(ingredient[1], callback_data=ingredient[1]),
            InlineKeyboardButton(ingredient[2], callback_data=ingredient[2])],
        [
            InlineKeyboardButton(ingredient[3], callback_data=ingredient[3]),
            InlineKeyboardButton(ingredient[4], callback_data=ingredient[4]),
        ]
    ]
    reply_markup = InlineKeyboardMarkup(keyboard)
    # Send message with text and appended InlineKeyboard
    update.message.reply_text("Chose the ingredients you have on your image!", reply_markup=reply_markup)

    return CALLBACK1

def button1(update: Update, context: CallbackContext) -> int:
    logger.info(f": button1")

    query = update.callback_query
    query.answer()

    user_data = context.user_data    
    if 'ingredients_list' not in user_data:
        user_data['ingredients_list'] = [query.data]
    else:
        user_data['ingredients_list'].append(query.data)

    query.edit_message_text(text=f"Ok you selected: {query.data}")
    
    txt = get_basket_txt(user_data['ingredients_list'])
    context.bot.send_message(chat_id=context.user_data['chat_id'], text=txt)

    return CHOOSING

def recipes_query(update: Update, context: CallbackContext) -> int:
    """ Get recipes """
    user = update.message.from_user
    logger.info(f"{user.first_name}: recipes_query")

    user_data = context.user_data

    input_text = ' '.join(user_data['ingredients_list'])

    # Predict cuisine
    cuisine = predict_cuisine(input_text)

    keyboard = [
        [
            InlineKeyboardButton(cuisine[0], callback_data=cuisine[0]),
            InlineKeyboardButton(cuisine[1], callback_data=cuisine[1]),
            InlineKeyboardButton(cuisine[2], callback_data=cuisine[2])],
        [
            InlineKeyboardButton(cuisine[3], callback_data=cuisine[3]),
            InlineKeyboardButton(cuisine[4], callback_data=cuisine[4]),
        ]
    ]
    reply_markup = InlineKeyboardMarkup(keyboard)
    # Send message with text and appended InlineKeyboard
    update.message.reply_text("Chose the type of cuisine you want!", reply_markup=reply_markup)

    return CALLBACK2

def button2(update: Update, context: CallbackContext) -> int:
    #user = update.message.from_user
    logger.info(f"button2")

    query = update.callback_query
    query.answer()

    # Get recipes
    recipes = get_similar_recipes(context.user_data['ingredients_list'], query.data)

    sep = '\n\n'
    for index, row in recipes.iterrows():

        title = 'Title: ' + row['title'] 
        ingredients=''
        list_ing = row['ingredients'].replace('ADVERTISEMENT', '').strip('][').split(', ')
        for ingredient in list_ing:
            ingredients+= ingredient.replace("'", "") + '\n'
        ingredients = 'Ingredients: ' + '\n' + ingredients
        instructions = 'Instruction: '+ '\n' + row['instructions']

        txt = title + sep + ingredients + sep + instructions

        context.bot.send_message(chat_id=context.user_data['chat_id'], text=txt)

    return CHOOSING

def show_basket(update: Update, context: CallbackContext) -> int:
    user = update.message.from_user
    logger.info(f"{user.first_name}: show_basket")

    user_data = context.user_data
    
    txt = get_basket_txt(user_data['ingredients_list'])
    
    update.message.reply_text(
        txt,
        reply_markup=markup,
    )
    return CHOOSING

def received_text_information(update: Update, context: CallbackContext) -> int:
    user = update.message.from_user
    logger.info(f"{user.first_name}: received_text_information")

    user_data = context.user_data
    text = update.message.text
    
    if 'ingredients_list' not in user_data:
        user_data['ingredients_list'] = [text]
    else:
        user_data['ingredients_list'].append(text)

    txt = get_basket_txt(user_data['ingredients_list'])
    update.message.reply_text(
        txt,
        reply_markup=markup,
    )
    return CHOOSING

def remove_item(update: Update, context: CallbackContext) -> int:
    user = update.message.from_user
    logger.info(f"{user.first_name}: remove_item")

    user_data = context.user_data
    if 'ingredients_list' in user_data:
        del user_data['ingredients_list'][-1]
    
    introduction = 'You have deleted the last ingredient. '
    txt = get_basket_txt(user_data['ingredients_list'])
    update.message.reply_text(
        introduction + txt,
        reply_markup=markup,
    )
    return CHOOSING

def done(update: Update, context: CallbackContext) -> int:
    user = update.message.from_user
    logger.info(f"{user.first_name}: done")

    user_data = context.user_data
    if 'ingredients_list' in user_data:
        del user_data['ingredients_list']

    update.message.reply_text(
        f"Bye bye until next time!"
    )

    user_data.clear()
    return ConversationHandler.END    

def main(bot_token) -> None:
    # Create the Updater and pass it your bot's token.
    # Make sure to set use_context=True to use the new context based callbacks
    # Post version 12 this will no longer be necessary
    updater = Updater(bot_token, use_context=True)

    # Get the dispatcher to register handlers
    dispatcher = updater.dispatcher

    # Add conversation handler with the states CHOOSING, TYPING_CHOICE and TYPING_REPLY
    conv_handler = ConversationHandler(
        entry_points=[CommandHandler('start', start)],
        states={
            CHOOSING: [
                MessageHandler(Filters.photo & ~(Filters.command | Filters.regex('^(Done|Get recipes|Show ingredients|Remove item)$')), received_image_information),
                MessageHandler(Filters.text & ~(Filters.command | Filters.regex('^(Done|Get recipes|Show ingredients|Remove item)$')), received_text_information),
                MessageHandler(Filters.regex('^Get recipes$'), recipes_query),
                MessageHandler(Filters.regex('^Show ingredients$'), show_basket),
                MessageHandler(Filters.regex('^Remove item$'), remove_item),
            ],
            CALLBACK1: [
                CallbackQueryHandler(button1)],
            CALLBACK2: [
                CallbackQueryHandler(button2)],
        },
        fallbacks=[MessageHandler(Filters.regex('^Done$'), done)],
        per_message=False,
    )

    dispatcher.add_handler(conv_handler)

    # Start the Bot
    updater.start_polling()

    # Run the bot until you press Ctrl-C or the process receives SIGINT,
    # SIGTERM or SIGABRT. This should be used most of the time, since
    # start_polling() is non-blocking and will stop the bot gracefully.
    updater.idle()

if __name__ == '__main__':
    # Execute the parse_args() method
    args = my_parser.parse_args()
    main(args.token)