In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import ast
import torch
from torch.nn.utils.rnn import pad_sequence
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
# Connect to the sqlite database containing recipes and read into pandas df.
conn = sqlite3.connect('13k-recipes.db')
recipes_init = pd.read_sql_query("SELECT * FROM recipes", conn)
conn.close()

In [3]:
recipes_init.head()

Unnamed: 0,id,Title,Ingredients,Instructions
0,1,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ..."
1,2,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...
2,3,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...
3,4,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...
4,5,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...


In [4]:
# Drop unnecessary column (for now)
recipes_init = recipes_init.drop(columns = ['Title', 'id'])

In [5]:
stopwords_set = set(stopwords.words('english'))
additional_stopwords = {
    "divided", "plus", "whole", "good-quality", "sturdy", "small", "medium", "large", 
    "about", "total", "such", "as", "good", "quality", "freshly", "ground", "unsalted",
    "pinch", "dash", "can", "jar", "package", "stick", "slice", "clove", "bunch",
    "finely", "chopped", "fresh", "extra", "more"
}
stopwords_set.update(additional_stopwords)

In [6]:
def simplify_ingredients(ingredients):
    '''
    Generalized simplification for a large dataset:
    - Removes quantities, fractions, parentheses, and Unicode fractions.
    - Eliminates stopwords and common modifiers.
    - Focuses on essential ingredient keywords.
    '''
    res = []
    for ingredient in ingredients:
        item = re.sub(r'\d+/\d+|\d+|\(.*?\)', '', ingredient)
        item = re.sub(r'[\u00BC-\u00BE\u2150-\u215E]', '', item)
        item = re.sub(
            r'\b(tbsp|tsp|cup|gram|kg|ml|oz|liter|lb|pound|teaspoon|tablespoon)\b', 
            '', item, flags=re.IGNORECASE
        )
        item = word_tokenize(item.lower())
        no_stop_words = [x for x in item if x not in stopwords_set]
        item = '_'.join(no_stop_words)
        item = re.sub(r'[^a-zA-Z_]', '', item)
        item = re.sub(r'_{2,}', '_', item).strip('_')
        if item:
            res.append(item)
    return res

In [7]:
def simplify_instructions(instructions):
  '''
  Simplifies each string of instructions -- removes hyphens, words, whitespace,
  special characters, punctuation, symbols.
  '''
  item = re.sub(r'-', ' ', instructions)
  item = re.sub(r'[^\w\s]', '', item)
  item = re.sub(r'\s+', ' ', item)
  item = item.strip()
  item = item.lower()
  return item

In [8]:

recipes_init['Ingredients'] = recipes_init['Ingredients'].apply(ast.literal_eval)

In [9]:
recipes_init.dtypes

Ingredients     object
Instructions    object
dtype: object

In [10]:
recipes_init['Simplified_Ingredients'] = recipes_init['Ingredients'].apply(simplify_ingredients)
recipes_init['Simplified_Instructions'] = recipes_init['Instructions'].apply(simplify_instructions)

In [11]:
recipes_init.head()

Unnamed: 0,Ingredients,Instructions,Simplified_Ingredients,Simplified_Instructions
0,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa...","Pat chicken dry with paper towels, season all ...","[chicken, kosher_salt, acorn_squash, sage, ros...",pat chicken dry with paper towels season all o...
1,"[2 large egg whites, 1 pound new potatoes (abo...",Preheat oven to 400°F and line a rimmed baking...,"[egg_whites, new_potatoes, teaspoons_kosher_sa...",preheat oven to 400f and line a rimmed baking ...
2,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...",Place a rack in middle of oven; preheat to 400...,"[evaporated_milk, milk, garlic_powder, onion_p...",place a rack in middle of oven preheat to 400 ...
3,"[1 (¾- to 1-pound) round Italian loaf, cut int...",Preheat oven to 350°F with rack in middle. Gen...,"[round_italian_loaf_cut_inch_cubes, tablespoon...",preheat oven to 350f with rack in middle gener...
4,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...",Stir together brown sugar and hot water in a c...,"[dark_brown_sugar, hot_water, bourbon, lemon_j...",stir together brown sugar and hot water in a c...


In [12]:
non_word_tokens = ['<PAD>', '<START>', '<END>', '<UNK>']

total_words = []
total_words.extend(non_word_tokens)

# Tokenize ingredients and instructions, and construct the vocabulary from these
# words.

for idx, recipe in recipes_init.iterrows():
  for ingredient in recipe['Simplified_Ingredients']:
    total_words.append(ingredient)
  total_words.extend(word_tokenize(recipe['Simplified_Instructions']))

total_words = sorted(list(set(total_words)))

# Dictionary of word to index mappings

indexed_words = {word : idx for idx, word in enumerate(total_words)}

In [13]:
# Add tokenized instructions to the table

recipes_init['Instr_Tok_Wrds'] = recipes_init['Simplified_Instructions'].apply(word_tokenize)

In [14]:
recipes_init.head()

Unnamed: 0,Ingredients,Instructions,Simplified_Ingredients,Simplified_Instructions,Instr_Tok_Wrds
0,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa...","Pat chicken dry with paper towels, season all ...","[chicken, kosher_salt, acorn_squash, sage, ros...",pat chicken dry with paper towels season all o...,"[pat, chicken, dry, with, paper, towels, seaso..."
1,"[2 large egg whites, 1 pound new potatoes (abo...",Preheat oven to 400°F and line a rimmed baking...,"[egg_whites, new_potatoes, teaspoons_kosher_sa...",preheat oven to 400f and line a rimmed baking ...,"[preheat, oven, to, 400f, and, line, a, rimmed..."
2,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...",Place a rack in middle of oven; preheat to 400...,"[evaporated_milk, milk, garlic_powder, onion_p...",place a rack in middle of oven preheat to 400 ...,"[place, a, rack, in, middle, of, oven, preheat..."
3,"[1 (¾- to 1-pound) round Italian loaf, cut int...",Preheat oven to 350°F with rack in middle. Gen...,"[round_italian_loaf_cut_inch_cubes, tablespoon...",preheat oven to 350f with rack in middle gener...,"[preheat, oven, to, 350f, with, rack, in, midd..."
4,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...",Stir together brown sugar and hot water in a c...,"[dark_brown_sugar, hot_water, bourbon, lemon_j...",stir together brown sugar and hot water in a c...,"[stir, together, brown, sugar, and, hot, water..."


In [15]:
reverse_vocab = {v : k for k, v in indexed_words.items()}

In [16]:
# exports word <-> index dictionaries to word_dict.py file

if os.path.exists('word_dict.py'):
  os.remove('word_dict.py')

with open('word_dict.py', 'w') as dict_file:
  dict_file.write(f'indexed_words = {indexed_words}\n')
  dict_file.write(f'reverse_vocab = {reverse_vocab}\n')

In [17]:
print(len(total_words))

54233


In [18]:
def convert_to_tokens(item_words, indexed_words):
  '''
  Convert an ingredients array or instructions array to
  tokens using the word to index mapping.  Adds start and
  end tokens to each tokenized array.
  '''
  res = []
  res.append(indexed_words['<START>'])

  for word in item_words:
    if word in indexed_words:
      res.append(indexed_words[word])
      
  res.append(indexed_words['<END>'])
  return res

In [19]:
# Apply tokenization function
recipes_init['Ingredients_Tok'] = recipes_init['Simplified_Ingredients'].apply(lambda x: convert_to_tokens(x, indexed_words))
recipes_init['Instructions_Tok'] = recipes_init['Simplified_Instructions'].apply(lambda x: convert_to_tokens(x, indexed_words))

In [20]:
recipes_init.head()

Unnamed: 0,Ingredients,Instructions,Simplified_Ingredients,Simplified_Instructions,Instr_Tok_Wrds,Ingredients_Tok,Instructions_Tok
0,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa...","Pat chicken dry with paper towels, season all ...","[chicken, kosher_salt, acorn_squash, sage, ros...",pat chicken dry with paper towels season all o...,"[pat, chicken, dry, with, paper, towels, seaso...","[1002, 7280, 24753, 1322, 40329, 39949, 5605, ...","[1002, 32829, 1009, 45603, 5799, 21078, 22700,..."
1,"[2 large egg whites, 1 pound new potatoes (abo...",Preheat oven to 400°F and line a rimmed baking...,"[egg_whites, new_potatoes, teaspoons_kosher_sa...",preheat oven to 400f and line a rimmed baking ...,"[preheat, oven, to, 400f, and, line, a, rimmed...","[1002, 15973, 28327, 49267, 3879, 39949, 50508...","[1002, 32829, 37882, 15751, 21078, 15751, 1009..."
2,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...",Place a rack in middle of oven; preheat to 400...,"[evaporated_milk, milk, garlic_powder, onion_p...",place a rack in middle of oven preheat to 400 ...,"[place, a, rack, in, middle, of, oven, preheat...","[1002, 16721, 27238, 19315, 29079, 43299, 3879...","[1002, 32829, 24817, 1009, 5799, 15751, 1009, ..."
3,"[1 (¾- to 1-pound) round Italian loaf, cut int...",Preheat oven to 350°F with rack in middle. Gen...,"[round_italian_loaf_cut_inch_cubes, tablespoon...",preheat oven to 350f with rack in middle gener...,"[preheat, oven, to, 350f, with, rack, in, midd...","[1002, 40082, 47386, 36793, 5540, 29125, 6812,...","[1002, 32829, 37882, 15751, 21078, 15751, 1009..."
4,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...",Stir together brown sugar and hot water in a c...,"[dark_brown_sugar, hot_water, bourbon, lemon_j...",stir together brown sugar and hot water in a c...,"[stir, together, brown, sugar, and, hot, water...","[1002, 13986, 22592, 4705, 25316, 48768, 19452...","[1002, 40282, 45603, 22700, 37882, 45603, 2856..."


In [21]:
# Get length of each simplified data array in the dataframe

recipes_init['Ingr_Len'] = recipes_init['Simplified_Ingredients'].apply(len)
recipes_init['Instr_Len'] = recipes_init['Simplified_Instructions'].apply(len)

In [22]:
# Convert the tokenized instruction and ingredient columns to tensors
recipes_init['Ingr_Tnsr'] = recipes_init['Ingredients_Tok'].apply(lambda x: torch.tensor(x))
recipes_init['Instr_Tnsr'] = recipes_init['Instructions_Tok'].apply(lambda x: torch.tensor(x))

### FOR NOW, LIMIT TO RECIPES WITH INSTRUCTION LENGTHS LESS THAN 500###
recipes_init = recipes_init[recipes_init['Instr_Len'] <= 500]

# Pad all instructions and ingredients to be the same length
def apply_padding(sequences, pad_value):
    padded_seqs = pad_sequence(sequences, batch_first=True, padding_value=pad_value)
    return padded_seqs

ingr_tnsr = list(recipes_init['Ingr_Tnsr'])
instr_tnsr = list(recipes_init['Instr_Tnsr'])

ingr_pd = apply_padding(ingr_tnsr, indexed_words['<PAD>'])
instr_pd = apply_padding(instr_tnsr, indexed_words['<PAD>'])

recipes_init['Ingr_Tnsr'] = ingr_pd.tolist()
recipes_init['Instr_Tnsr'] = instr_pd.tolist()


display(recipes_init['Ingr_Tnsr'][:25])

4      [1002, 13986, 22592, 4705, 25316, 48768, 19452...
5      [1002, 7041, 39122, 25316, 1494, 1000, 1001, 1...
6      [1002, 20071, 1804, 33472, 22489, 25316, 19467...
17     [1002, 52579, 39620, 24753, 12717, 39576, 5077...
31     [1002, 19182, 29146, 6771, 38371, 20643, 47386...
32     [1002, 7759, 41593, 50508, 29426, 33234, 8270,...
35     [1002, 26061, 13901, 17505, 29352, 1000, 1001,...
41     [1002, 20469, 5989, 6339, 1678, 3906, 24753, 5...
45     [1002, 18988, 46891, 19267, 21983, 51536, 3314...
51     [1002, 50912, 5495, 24753, 3879, 1000, 1001, 1...
64     [1002, 52370, 9355, 1000, 1001, 1001, 1001, 10...
79     [1002, 12820, 8764, 52788, 49147, 40424, 8753,...
88     [1002, 9731, 24602, 40836, 7997, 25721, 24753,...
104    [1002, 15639, 25316, 42335, 29344, 1000, 1001,...
105    [1002, 44839, 39937, 2001, 25316, 43455, 1000,...
107    [1002, 28394, 8325, 19382, 1000, 1001, 1001, 1...
108    [1002, 41174, 16856, 25737, 43688, 50814, 2722...
119    [1002, 48368, 47735, 479

In [23]:
# Get padded lengths to ensure they are all the same
recipes_init['Padded_Length_Ingr'] = recipes_init['Ingr_Tnsr'].apply(len)
recipes_init['Padded_Length_Instr'] = recipes_init['Instr_Tnsr'].apply(len)

In [24]:
recipes_init.head()

Unnamed: 0,Ingredients,Instructions,Simplified_Ingredients,Simplified_Instructions,Instr_Tok_Wrds,Ingredients_Tok,Instructions_Tok,Ingr_Len,Instr_Len,Ingr_Tnsr,Instr_Tnsr,Padded_Length_Ingr,Padded_Length_Instr
4,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...",Stir together brown sugar and hot water in a c...,"[dark_brown_sugar, hot_water, bourbon, lemon_j...",stir together brown sugar and hot water in a c...,"[stir, together, brown, sugar, and, hot, water...","[1002, 13986, 22592, 4705, 25316, 48768, 19452...","[1002, 40282, 45603, 22700, 37882, 45603, 2856...",6,264,"[1002, 13986, 22592, 4705, 25316, 48768, 19452...","[1002, 40282, 45603, 22700, 37882, 45603, 2856...",25,410
5,"[2 chamomile tea bags, 1½ oz. reposado tequila...",Place 2 chamomile tea bags in a heatsafe vesse...,"[chamomile_tea_bags, reposado_tequila, lemon_j...",place 2 chamomile tea bags in a heatsafe vesse...,"[place, 2, chamomile, tea, bags, in, a, heatsa...","[1002, 7041, 39122, 25316, 1494, 1000]","[1002, 32829, 24817, 1009, 5799, 15751, 324, 5...",4,340,"[1002, 7041, 39122, 25316, 1494, 1000, 1001, 1...","[1002, 32829, 24817, 1009, 5799, 15751, 324, 5...",25,410
6,"[3 oz. Grand Marnier, 1 oz. Amaro Averna, Smal...","Add 3 oz. Grand Marnier, 1 oz. Amaro Averna, a...","[grand_marnier, amaro_averna, pat_salted_butte...",add 3 oz grand marnier 1 oz amaro averna and a...,"[add, 3, oz, grand, marnier, 1, oz, amaro, ave...","[1002, 20071, 1804, 33472, 22489, 25316, 19467...","[1002, 1009, 13907, 13907, 493, 28569, 18689, ...",6,452,"[1002, 20071, 1804, 33472, 22489, 25316, 19467...","[1002, 1009, 13907, 13907, 493, 28569, 18689, ...",25,410
17,"[6 Tbsp. virgin coconut oil, 4 ripe (spotted) ...",Heat oil in a large nonstick skillet over medi...,"[virgin_coconut_oil, ripe_pltanos_manzanos_rip...",heat oil in a large nonstick skillet over medi...,"[heat, oil, in, a, large, nonstick, skillet, o...","[1002, 52579, 39620, 24753, 12717, 39576, 5077...","[1002, 21078, 15751, 1009, 45603, 28569, 22700...",6,279,"[1002, 52579, 39620, 24753, 12717, 39576, 5077...","[1002, 21078, 15751, 1009, 45603, 28569, 22700...",25,410
31,"[3 garlic cloves, minced, 2 large onions, chop...","In a large heavy kettle cook garlic, onions, c...","[garlic_cloves_minced, onions_fine, celery, re...",in a large heavy kettle cook garlic onions cel...,"[in, a, large, heavy, kettle, cook, garlic, on...","[1002, 19182, 29146, 6771, 38371, 20643, 47386...","[1002, 22700, 28097, 1009, 24817, 1009, 37882,...",14,397,"[1002, 19182, 29146, 6771, 38371, 20643, 47386...","[1002, 22700, 28097, 1009, 24817, 1009, 37882,...",25,410


In [25]:
# Export to csv
recipes_init.to_csv('preprocessed_recipes.csv', index = False)
