In [29]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import os
import re
from functools import partial
import json
from collections import defaultdict
from src.utils import list_apply

In [2]:
# Stretch the notebook's `.container` element to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# Register tqdm's pandas integration; the `progress_apply` calls later in this
# notebook rely on it.
tqdm.pandas()

# Load and preprocess the data

In [4]:
# Directory containing the interim pickle files that the next cell loads
# (relative path, so it is resolved against the current working directory).
data_path = '../data/interim/'

In [5]:
# Load every file found in `data_path` and stack the frames into a single `df`.
#
# The directory is listed once and sorted: `os.listdir` returns entries in
# arbitrary order, so sorting keeps the concatenation order reproducible
# between runs and machines.
# NOTE: unpickling can execute arbitrary code -- only point `data_path` at
# files you trust.
interim_files = sorted(os.listdir(data_path))
df_list = [
    pd.read_pickle(os.path.join(data_path, f))
    for f in tqdm(interim_files, leave=False)  # bar disappears once loading finishes
]
df = pd.concat(df_list)

  0%|          | 0/285 [00:00<?, ?it/s]

In [6]:
# Yay making typos!
# The column written as 'myt_calories' replaces the existing 'my_calories'.
# The guard makes the cell safe to re-run: once the rename has happened there is
# no 'myt_calories' left, and running the drop again would silently delete the
# corrected column.
if 'myt_calories' in df.columns:
    df = (
        df.drop(columns='my_calories')
          .rename(columns={'myt_calories': 'my_calories'})
    )

In [7]:
# Number of columns currently in the frame.
len(df.columns)

28

In [8]:
# Columns to keep, in the order they should appear in the frame.
column_order = [
    'id',
    'title_lower',
    'source',
    'ingredients',
    'my_ingredients',
    'instructions',
    'rating',
    'review',
    'tags',
    'time',
    'url',
    'title',
    'calories',
    'carb_pdv',
    'protien_pdv',
    'saturated_fat_pdv',
    'sodium_pdv',
    'sugar_pdv',
    'total_fat_pdv',
    'ingredients_half',
    'ingredients_twelve',
]

In [10]:
# Keep only the columns named in `column_order`, in that order.
df = df.loc[:, column_order]

In [11]:
# Order the rows by recipe title (ascending, the pandas default).
df = df.sort_values(by='title')

In [13]:
# Preprocessing functions for my ingredient pull
def remove_html_brackets(string):
    """Strip every ``<...>`` span (angle brackets and what they enclose).

    A span only matches when it contains no further ``<`` or ``>``, so a lone
    ``<`` or ``>`` is left untouched.

    The pattern is a raw string without the needless ``\\<`` / ``\\>`` escapes,
    which are invalid escape sequences in an ordinary string literal and make
    newer Python versions emit a warning.
    """
    return re.sub(r"<[^<>]*>", '', string)
def squash_quantity_spaces(string):
    """Rewrite ``"<digits> -<digits>"`` as ``"<digits>-<digits>"``.

    Only the exact shape "number, one space, hyphen, number" is changed; other
    spacings such as ``"1 - 2"`` are left as they are.

    The pattern is a raw string so that ``\\d`` reaches the regex engine as
    written instead of being an invalid escape sequence in the string literal.
    """
    return re.sub(r"(\d+) -(\d+)", r"\1-\2", string)
def remove_extra_spaces(string):
    """Collapse each run of consecutive space characters into a single space."""
    space_run = re.compile(' +')
    return space_run.sub(' ', string)

In [14]:
def find_all_chars(string, ch):
    """Return the positions in `string` whose element equals `ch`, in ascending order."""
    positions = []
    for position, letter in enumerate(string):
        if letter == ch:
            positions.append(position)
    return positions
# Nested Parenthesis are annoying for the parser. 
# Occurs ~.1% of the time
# Replace them with spaces
def drop_nested_parens(string):
    """Replace nested parentheses with spaces, keeping only the outermost pair.

    The string is scanned once while tracking the nesting depth. An opening
    parenthesis found inside an open group, and the closing parenthesis that
    matches it, are each overwritten with a space. Afterwards any spaces just
    inside the surviving parentheses are trimmed, i.e. "( x )" becomes "(x)".

    A closing parenthesis with no matching opener is left in place and does not
    push the depth below zero, so later groups in the same string are still
    processed correctly.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The text with nested parentheses replaced by spaces.
    """
    chars = list(string)
    depth = 0  # number of currently open parentheses
    for pos, ch in enumerate(chars):
        if ch == '(':
            depth += 1
            if depth > 1:
                # Opener inside an already-open group.
                chars[pos] = ' '
        elif ch == ')':
            if depth == 0:
                # Unmatched closer: keep it and do not go negative.
                continue
            depth -= 1
            if depth >= 1:
                # Closer of a nested group.
                chars[pos] = ' '
    cleaned = ''.join(chars)
    # Tidy the padding left behind directly inside the remaining parentheses.
    cleaned = re.sub(r'\( +', '(', cleaned)
    cleaned = re.sub(r' +\)', ')', cleaned)
    return cleaned

In [15]:
# Cleaning steps handed to `list_apply`, in the order given here.
# NOTE(review): `list_apply` lives in src.utils and is not visible here --
# presumably it runs each function over every string of the cell's list; confirm.
ingredient_cleaners = [
    remove_html_brackets,
    squash_quantity_spaces,
    drop_nested_parens,
    remove_extra_spaces,
]
clean_ingredient_list = partial(list_apply, ingredient_cleaners)
df['my_ingredients'] = df['my_ingredients'].progress_apply(clean_ingredient_list)

  0%|          | 0/985728 [00:00<?, ?it/s]

In [16]:
# Use 'my_ingredients' wherever it is non-empty, otherwise keep the existing
# 'ingredients' value.
# The two columns are walked together in one pass instead of calling
# `DataFrame.apply(axis=1)`, which builds a Series object for every row and is
# needlessly slow on a frame of this size. Wrapping the result in a Series with
# the frame's own index keeps each value on its original row.
df['ingredients'] = pd.Series(
    [
        mine if len(mine) > 0 else existing
        for mine, existing in tqdm(
            zip(df['my_ingredients'], df['ingredients']), total=len(df)
        )
    ],
    index=df.index,
)

  0%|          | 0/985728 [00:00<?, ?it/s]

In [17]:
# These ingredient columns are no longer needed once 'ingredients' has been
# filled in, so remove them.
df = df.drop(columns=['my_ingredients', 'ingredients_twelve', 'ingredients_half'])

In [18]:
# Index labels of the rows to discard.
# NOTE(review): the notebook does not say why label 4 is removed -- confirm.
drop_rows = [4]
# Remove those rows, then move the old index into an 'index' column and
# renumber the rows from 0.
df = df.drop(index=drop_rows).reset_index()

In [19]:
# Number of rows per source, most frequent first.
# NOTE(review): the saved output lists one row whose source label is blank --
# worth checking where it comes from.
df['source'].value_counts()

food               494501
tastykitchen        74969
cookpad             61104
cookeatshare        60498
foodnetwork         52867
allrecipes          47406
epicurious          45249
kraftrecipes        37761
recipeland          25012
foodandwine         17505
cooking.nytimes     17174
foodgeeks            9201
cookstr              8961
myrecipes            6477
chowhound            6277
online-cookbook      5626
vegetariantimes      4574
delish               3819
landolakes           2484
foodrepublic         2321
lovefood             1940
                        1
Name: source, dtype: int64

### Join the data to the image data

In [21]:
# Parse the image metadata JSON into `img_data`.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
# NOTE(review): this path starts with './data/' while the other paths in this
# notebook start with '../data/' -- confirm the location is intended.
with open('./data/recipe_1m/layer2.json', 'r', encoding='utf-8') as f:
    img_data = json.load(f)

In [22]:
# Map each entry's 'id' to the list of its image URLs. A defaultdict is used so
# that looking up an id that is not in `img_data` yields an empty list.
id_to_img_list = defaultdict(list)
id_to_img_list.update(
    (entry['id'], [image['url'] for image in entry['images']])
    for entry in img_data
)

In [23]:
# Attach the list of image URLs to each row via its 'id'.
# `Series.map` honours the `__missing__` hook of a dict subclass, so ids absent
# from `id_to_img_list` receive the defaultdict's empty list rather than NaN.
df['img_list'] = df['id'].map(id_to_img_list)

In [26]:
# Remove the 'index' column that the earlier `reset_index` call added.
df = df.drop(columns='index')

In [28]:
# Write the processed frame to disk as a pickle.
save_path = '../data/processed/'
# `makedirs` with exist_ok=True also creates missing parent directories and
# does not fail if the directory already exists (no check-then-create race).
os.makedirs(save_path, exist_ok=True)
df.to_pickle(os.path.join(save_path, 'processedv2.pkl'))