In [3]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import json
import platform
import time
import pathlib
import os
from tqdm import tqdm

In [4]:
# combine three files into one

def combine_datasets(data_dir=None, file_names=None):
    """Load the raw recipe JSON files and merge their entries into one list.

    Each file must contain a single JSON object. The object's values are
    appended to the result in file order; the keys are discarded.

    Args:
        data_dir: Directory that holds the JSON files. Defaults to the
            ``recipes_data`` folder that sits beside the current working
            directory (``../recipes_data``).
        file_names: Iterable of file names to read from ``data_dir``.
            Defaults to the three ``recipes_raw_nosource_*.json`` files.

    Returns:
        list: The values of every loaded JSON object, concatenated.

    Raises:
        OSError: If one of the files cannot be opened.
        json.JSONDecodeError: If one of the files is not valid JSON.
    """
    if file_names is None:
        file_names = ['recipes_raw_nosource_ar.json', 'recipes_raw_nosource_epi.json', 'recipes_raw_nosource_fn.json']
    if data_dir is None:
        # Build the path instead of os.chdir()-ing into the parent directory:
        # a failed read must not leave the kernel in a different working
        # directory, otherwise re-running the cell resolves "../" one level
        # higher each time.
        data_dir = os.path.join(os.getcwd(), os.pardir, "recipes_data")

    dataset = []
    for file_name in file_names:
        # JSON is read as UTF-8 explicitly so the result does not depend on
        # the platform's default encoding.
        with open(os.path.join(data_dir, file_name), encoding="utf-8") as data_file:
            dataset.extend(json.load(data_file).values())
    return dataset

# Load and merge the raw recipe files into a single list.
raw_data = combine_datasets()

In [5]:
# remove incomplete data
def check_complete(recipe):
    """Report whether a recipe has all the fields needed downstream.

    A recipe is complete when it is non-empty and its ``title``,
    ``ingredients`` and ``instructions`` entries are all present and truthy
    (so an empty string, empty list or ``None`` counts as missing).

    Args:
        recipe: A recipe mapping, or a falsy placeholder such as ``{}`` or
            ``None``.

    Returns:
        bool: True if every required field is present and non-empty.
    """
    required_keys = ('title', 'ingredients', 'instructions')

    if not recipe:
        return False

    # .get() makes an absent key count as incomplete instead of raising
    # KeyError. Truthiness already rejects empty lists, so no separate
    # length check is needed.
    return all(recipe.get(required_key) for required_key in required_keys)

# Keep only the recipes that pass the completeness check.
full_data = list(filter(check_complete, raw_data))

In [6]:
# convert recipes to strings
def create_recipe_string(recipe):
    """Render a recipe as one text block with title, ingredients and steps.

    The literal marker "ADVERTISEMENT" is removed from every ingredient and
    from every line of the instructions; entries that end up empty are
    dropped. Each remaining entry becomes a "- " bullet on its own line.

    Args:
        recipe: Mapping with a ``title`` string, an ``ingredients`` list of
            strings and a newline-separated ``instructions`` string.

    Returns:
        str: "TITLE: ..." followed by the INGREDIENTS and INSTRUCTIONS
        sections.
    """
    marker = "ADVERTISEMENT"

    def bullet_list(entries):
        # Remove the marker first, then skip whatever became an empty string.
        cleaned = (entry.replace(marker, "") for entry in entries)
        return "".join("- " + text + "\n" for text in cleaned if text)

    ingredient_block = bullet_list(recipe['ingredients'])
    instruction_block = bullet_list(recipe['instructions'].split('\n'))

    return (
        "TITLE: " + recipe['title']
        + "\n\nINGREDIENTS:\n" + ingredient_block
        + "\nINSTRUCTIONS:\n" + instruction_block
    )

# Render every complete recipe as a single formatted text block.
string_data = list(map(create_recipe_string, full_data))

In [7]:
# filter out long recipes
# Recipes longer than this many characters are dropped; the cutoff keeps
# about 100k recipes.
MAX_RECIPE_LENGTH = 2012

filtered_data = [recipe for recipe in string_data if len(recipe) <= MAX_RECIPE_LENGTH]

# save to file
# The path is built directly instead of os.chdir()-ing into the parent
# directory, so a failed write cannot leave the kernel in another directory.
output_path = os.path.join(os.getcwd(), os.pardir, "recipes_data", "recipes_filtered.txt")

# One recipe per line: newlines inside a recipe are encoded as "~~".
# The file is written as UTF-8 explicitly because recipes can contain
# non-ASCII characters that a platform-default encoding may be unable to
# encode; read it back with encoding="utf-8".
with open(output_path, "w", encoding="utf-8") as f:
    for recipe in filtered_data:
        f.write(recipe.replace("\n", "~~") + "\n")