In [3]:
import ast
import itertools
import json
import os
import pprint
import sys
import uuid
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

sys.path.append('..')
from src.utils import fast_string_sim

# Join Kaggle and Recipes 1m

In [4]:
# Register tqdm's pandas integration so `.progress_apply` is available on Series/DataFrames/groupbys below.
tqdm.pandas()

In [5]:
def load_json(path):
    """Read the JSON document stored at ``path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 (the standard JSON encoding) so the
    result does not depend on the platform's default text encoding.

    Args:
        path: Location of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [6]:
# Get a website from a url in onem recipes dataset
def get_website(url):
    """Return the site name of ``url``: its host without a leading 'www.' and without '.com'.

    Only the host part of the URL is inspected, so a 'www.' or '.com' appearing
    later in the path or query string cannot affect the result. When the host
    contains no '.com', the host (minus any leading 'www.') is returned as-is
    instead of a mis-sliced fragment.

    Examples:
        'http://www.food.com/recipe/x'  -> 'food'
        'http://allrecipes.com/recipe'  -> 'allrecipes'
        'http://www.bbc.co.uk/food'     -> 'bbc.co.uk'
    """
    scheme_end = url.find('//')
    remainder = url[scheme_end + 2:] if scheme_end != -1 else url
    # Everything up to the first '/' after the scheme is the host.
    host = remainder.split('/', 1)[0]
    if host.startswith('www.'):
        host = host[len('www.'):]
    com_at = host.find('.com')
    if com_at != -1:
        host = host[:com_at]
    return host.strip()

In [7]:
# Minimum fast_string_sim score for two instruction texts to be treated as the same
# recipe; compared against in both the de-duplication and the Kaggle join below.
score_threshhold = .5

In [8]:
# Show (practically) untruncated cell contents when displaying frames.
pd.set_option('display.max_colwidth', 9999)
# `IPython.display` is the public home of display/HTML; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Kaggle Food: https://www.kaggle.com/shuyangli94/food-com-recipes-and-user-interactions
* First, make the data look nice

In [None]:
# Directory the Kaggle CSV files are read from.
kaggle_datapath = '../data/kaggle_food/'

In [None]:
# Raw recipe table from the Kaggle dump.
kaggle_recipes = pd.read_csv(kaggle_datapath + 'RAW_recipes.csv')

In [None]:
# Raw interactions table; its 'recipe_id', 'rating' and 'review' columns are used below.
kaggle_reviews = pd.read_csv(kaggle_datapath + 'RAW_interactions.csv')

In [None]:
# These columns arrive from the CSV as the string form of Python lists. Parse them
# with ast.literal_eval, which only accepts literals, rather than eval, which would
# execute arbitrary code embedded in the file.
for list_column in ['ingredients', 'steps', 'tags', 'nutrition']:
    kaggle_recipes[list_column] = kaggle_recipes[list_column].progress_apply(ast.literal_eval)

In [None]:
def pbar_list(elems):
    """Collect one group's values into a plain list and tick the shared progress bar.

    Uses the module-level ``pbar`` that is created right before the aggregation runs.
    """
    collected = list(elems)
    pbar.update(1)
    return collected

# The aggregation calls pbar_list once per recipe for each aggregated column,
# hence the bar's total of (number of columns) * (number of recipes).
review_columns = ['rating', 'review']
pbar = tqdm(total=len(review_columns) * kaggle_reviews['recipe_id'].nunique(), leave=False)
reviews_by_recipe = kaggle_reviews.groupby('recipe_id')
kaggle_reviews = reviews_by_recipe.agg({column: pbar_list for column in review_columns})
pbar.close()

In [None]:
# Attach each recipe's aggregated rating/review lists. `kaggle_reviews` is keyed by
# 'recipe_id' after the groupby above; merge defaults to an inner join, so recipes
# with no interaction rows do not appear in kaggle_df.
kaggle_df = kaggle_recipes.merge(kaggle_reviews, left_on='id', right_on='recipe_id')

In [None]:
# Expand nutrition field based on this info: https://www.kaggle.com/shuyangli94/food-com-recipes-and-user-interactions/discussion/121778
# entries are: [calories (#), total fat (PDV), sugar (PDV) , sodium (PDV) , protein (PDV) , saturated fat (PDV) , and carbohydrates (PDV)]
# pdv = percent daily value

In [None]:
# Position 0 of the nutrition list is the calorie count, stored unchanged.
kaggle_df['calories'] = kaggle_df['nutrition'].apply(lambda x: x[0]).astype(float)

# Positions 1-6 are percent-daily-values; divide by 100 to store them as fractions.
pdv_columns = [
    'total_fat_pdv',
    'sugar_pdv',
    'sodium_pdv',
    'protien_pdv',
    'saturated_fat_pdv',
    'carb_pdv',
]
for position, column_name in enumerate(pdv_columns, start=1):
    kaggle_df[column_name] = (
        kaggle_df['nutrition']
        .apply(lambda x, k=position: x[k] / 100)
        .astype(float)
    )

In [None]:
# Copy 'minutes' into a float 'time' column; 'minutes' itself is dropped below.
kaggle_df['time'] = kaggle_df['minutes'].astype(float)

In [None]:
# Sanity check: fraction of rows where n_ingredients equals the length of the parsed ingredient list.
(kaggle_df['n_ingredients']==kaggle_df['ingredients'].apply(len)).mean()

In [None]:
# Columns removed from kaggle_df in the next cell ('submitted' was previously listed twice).
drop_cols = ['id', 'contributor_id', 'submitted', 'nutrition', 'n_steps', 'n_ingredients', 'minutes']

In [None]:
# In-place drop: re-running this cell without rebuilding kaggle_df raises a KeyError
# because the columns are already gone.
kaggle_df.drop(drop_cols, axis=1, inplace=True)

In [None]:
# Eyeball two random rows of the cleaned Kaggle frame.
kaggle_df.sample(2)

# Recipes 1M: http://pic2recipe.csail.mit.edu/
* Download From: http://data.csail.mit.edu/im2recipe/recipe1M_layers.tar.gz

In [None]:
# Directory the Recipe1M layer file is read from.
recipe1m_path = '../data/recipe_1m/'

In [None]:
# Parsed layer1.json: iterated below as a sequence of recipe dicts with
# 'title', 'ingredients', 'instructions', 'url' and 'id' keys.
onem_recipes = load_json(recipe1m_path + 'layer1.json')

In [None]:
# Number of raw Recipe1M records loaded.
len(onem_recipes)

In [None]:
def print_raw_from_id(id):
    """Pretty-print every raw record in ``onem_recipes`` whose 'id' equals ``id``.

    Scans the whole list, so it is intended for occasional manual inspection.
    """
    matching_records = (record for record in onem_recipes if record['id'] == id)
    for record in matching_records:
        pprint.pprint(record)
            

In [None]:
# Inspect the raw structure of one record by id.
print_raw_from_id("000018c8a5")

In [None]:
def _text_fields(entries):
    """Return the 'text' value of each {'text': ...} entry, in order."""
    return [entry['text'] for entry in entries]

# Flatten every raw record into a plain dict. The key order (title, ingredients,
# instructions, url, id) becomes the column order of the frame built from this list.
onem_processed = []
for recipe in tqdm(onem_recipes):
    flattened = {'title': recipe['title']}
    flattened['ingredients'] = _text_fields(recipe['ingredients'])
    flattened['instructions'] = _text_fields(recipe['instructions'])
    flattened['url'] = recipe['url']
    flattened['id'] = recipe['id']
    onem_processed.append(flattened)

In [None]:
# One row per flattened record; columns follow the dict key order used above.
onem_df = pd.DataFrame.from_dict(onem_processed)

In [None]:
# (rows, columns) of the Recipe1M frame before de-duplication.
onem_df.shape

In [None]:
# Eyeball five random Recipe1M rows.
onem_df.sample(5)

In [None]:
# Site name derived from each recipe URL (e.g. 'food'); later cells filter and group on it.
onem_df['source'] = onem_df['url'].progress_apply(get_website)

In [None]:
# Lower-cased title: the grouping key for de-duplication and the lookup key for the Kaggle join.
onem_df['title_lower'] = onem_df['title'].apply(lambda x: x.lower())

## 1M recipes has duplication issues... let's remove them...
* Assumption I haven't verified: If names are different, then can't be a duplicate
    * Maybe run into issues with some extra punctuation in titles so some duplicates aren't caught
    * But like groupby makes things sooooo easy

In [None]:
# Lots of potential duplicates: fraction of distinct lower-cased titles that occur more than once.
(onem_df['title_lower'].value_counts() > 1).mean()

In [None]:
# Example title with repeated rows, to inspect what the duplicates look like.
onem_df[onem_df['title_lower']=="a good easy garlic chicken"]

In [None]:
# Second example title to inspect.
onem_df[onem_df['title_lower']=='almond chocolate coffee']

In [None]:
# Third example title to inspect.
onem_df[onem_df['title_lower']=='apple pepper jelly']

In [None]:
# Accumulator filled by dedup(): one list of cell values per kept row.
# Must be reset (re-run this cell) before the de-duplication is run again.
deduped_rows = []

In [None]:
def dedup(grouped_df):
    """Append the non-duplicate rows of one same-title group to ``deduped_rows``.

    Rows are visited in order. Every row that has not been flagged yet is kept,
    and each later row whose space-joined instruction text scores above
    ``score_threshhold`` against it (via ``fast_string_sim``) is flagged as a
    duplicate and skipped when its turn comes.

    Args:
        grouped_df: DataFrame holding the rows that share one title; its
            'instructions' cells must be iterables of strings.

    Returns:
        None. Kept rows are appended, as lists of cell values, to the
        module-level ``deduped_rows``.
    """
    n_rows = grouped_df.shape[0]
    steps = [' '.join(s) for s in grouped_df['instructions'].tolist()]
    # is_duplicate[j] is True once row j matched an earlier kept row.
    is_duplicate = [False] * n_rows
    for i in range(n_rows):
        if is_duplicate[i]:
            continue
        deduped_rows.append(grouped_df.iloc[i].tolist())
        for j in range(i + 1, n_rows):
            # A row that is already flagged cannot change state, so the
            # (comparatively expensive) similarity call is skipped for it.
            if is_duplicate[j]:
                continue
            if fast_string_sim(steps[i], steps[j]) > score_threshhold:
                is_duplicate[j] = True
        

In [None]:
# dedup() works purely through its side effect on deduped_rows, so walk the groups
# explicitly instead of going through groupby(...).apply: depending on the pandas
# version, apply may call the function more than once for the first group or hand it
# frames without the grouping column, either of which would corrupt deduped_rows.
# Clearing first keeps the cell safe to re-run.
deduped_rows.clear()
for title_key, title_group in tqdm(onem_df.groupby('title_lower'), total=onem_df['title_lower'].nunique()):
    dedup(title_group)

In [None]:
# Row count before the frame is replaced by its de-duplicated version.
onem_df.shape[0]

In [None]:
# Rebuild the frame from the kept rows. Each row list came from a full row of the old
# frame, so the old column names (list(onem_df)) are reused as-is.
onem_df = pd.DataFrame.from_records(deduped_rows, columns = list(onem_df))

In [None]:
# It's not 1 million. THEY LIED TO ME :p
# (row count after de-duplication)
onem_df.shape[0]

In [None]:
# Same "potential duplicates" metric as before, recomputed on the de-duplicated frame:
# fraction of distinct lower-cased titles that still occur more than once.
(onem_df['title_lower'].value_counts() > 1).mean()

In [None]:
# Worked on those three cases at least! :)
# Re-display the three example titles inspected before de-duplication.
display(onem_df[onem_df['title_lower']=="a good easy garlic chicken"])
display(onem_df[onem_df['title_lower']=='almond chocolate coffee'])
display(onem_df[onem_df['title_lower']=='apple pepper jelly'])

# Join the datasets!
### Join Logic:
* There are multiple entries with the same names in both datasets, so we can't use name to join :(
* Looks like we can use name + instructions though
    * For each row in Kaggle dataset, get # of same named ones
    * If there's just a single one, then they're a match
    * If there are multiple in food, then compute the % word overlap in the instructions to pick the best match. Also compare length so you don't just have a really really long recipe coincidentally contain all the right words. I'm guessing this is faster than doing an edit distance type thingy, but maybe that'd be better. IDK for sure

In [None]:
# make some dictionary temporary vars s.t. things move faster

In [None]:
def instruction_agg(instructions):
    """Turn one title group's instruction lists into lower-cased, space-joined strings.

    Ticks the module-level ``pbar`` once per call.
    """
    joined_texts = []
    for step_list in list(instructions):
        joined_texts.append(' '.join(step_list).lower())
    pbar.update(1)
    return joined_texts

# Build title -> {'instructions': [joined text, ...], 'id': [recipe id, ...]} so the
# join loop can look candidates up by name without touching the frame.
pbar = tqdm(total=onem_df['title_lower'].nunique(), leave=False)
grouped_by_title = onem_df.groupby('title_lower')
per_title = grouped_by_title.agg({'instructions': instruction_agg, 'id': list})
name_to_instructions = per_title.to_dict(orient='index')
pbar.close()

In [None]:
# Peek at a single entry of the lookup to confirm its structure.
for title_key, title_entry in itertools.islice(name_to_instructions.items(), 1):
    print(title_key)
    print(title_entry)

In [None]:
# Recipe1M id -> uid of the Kaggle recipe it was matched to; ids never matched read as -1.
onem_id_to_uid = defaultdict(lambda: -1)
# One generated uid per Kaggle row, appended in row order by the join loop.
# Re-run this cell before re-running the join loop, otherwise the list keeps growing.
kaggle_uid_list = []
# Record the similarity history to pick a cutoff
sim_hist = []

In [None]:
# Give every Kaggle recipe a fresh uid and, when a Recipe1M recipe with the same name
# has sufficiently similar instructions, record that uid against the Recipe1M id.
# NOTE(review): if several Kaggle rows pick the same Recipe1M id, the last one wins.
kaggle_pairs = zip(kaggle_df['name'].tolist(), kaggle_df['steps'].tolist())
pbar = tqdm(kaggle_pairs, total=kaggle_df.shape[0], leave=False)
for name, step_list in pbar:
    this_uid = str(uuid.uuid4())
    kaggle_uid_list.append(this_uid)

    # No Recipe1M recipe carries this name: nothing to match.
    if name not in name_to_instructions:
        continue

    candidates = name_to_instructions[name]
    joined_steps = ' '.join(step_list)
    similarities = [
        fast_string_sim(joined_steps, candidate_text)
        for candidate_text in candidates['instructions']
    ]

    # Best candidate strictly above the threshold; the earliest one wins ties.
    best_idx, best_score = -1, -1
    for idx, score in enumerate(similarities):
        if score > score_threshhold and score > best_score:
            best_idx, best_score = idx, score
    if best_idx >= 0:
        onem_id_to_uid[candidates['id'][best_idx]] = this_uid

    sim_hist.append(similarities)

In [None]:
# Distribution of every similarity score computed for a same-name candidate pair.
all_similarities = pd.Series(list(itertools.chain.from_iterable(sim_hist)))
ax = all_similarities.hist()
ax.set_title('Similarities on potential joins')
plt.show()

In [None]:
# Kaggle rows receive the uids generated for them, in the same row order as the join loop.
kaggle_df['uid'] = kaggle_uid_list
# Recipe1M rows receive the uid of the Kaggle recipe they matched; because the mapping
# is a defaultdict, ids that were never matched map to its default of -1.
onem_df['uid'] = onem_df['id'].map(onem_id_to_uid)

In [None]:
# Join %
# (Kaggle row count, Recipe1M rows that got a match, distinct titles among source == 'food' rows)
kaggle_df.shape[0], (onem_df['uid']!=-1).sum(), onem_df[onem_df['source']=='food']['title'].nunique()

In [None]:
# Shape of the Recipe1M frame after adding 'uid'.
onem_df.shape

In [None]:
# Drop the Kaggle text columns before merging (in place, so this cell is not re-runnable on its own).
# NOTE(review): presumably dropped because Recipe1M already supplies title/ingredients/instructions — confirm.
kaggle_df.drop(['name', 'steps', 'description', 'ingredients'], axis=1, inplace=True)

In [None]:
# Shapes of both frames right before the merge.
onem_df.shape, kaggle_df.shape

In [None]:
# Left join on 'uid': every Recipe1M row is kept, and rows whose uid has no counterpart
# in kaggle_df (the unmatched -1 ones) get missing values in the Kaggle columns.
merged = onem_df.merge(kaggle_df, on='uid', how='left')

In [None]:
# Shape of the merged frame.
merged.shape

In [None]:
# First five merged rows.
merged.head(5)

In [None]:
# 'uid' only served as the join key; remove it (in place).
merged.drop('uid', axis=1, inplace=True)

# EDA Before preprocessing

### 1. Get an estimate on the number of times fraction bars are omitted (e.g. 1/2 --> 12)
* I'd like the recipes to be accurate, so need to clean them up

In [None]:
# all URLS seem to be reasonable, so if needed, can just scrape url and check
# (share of missing urls, share of urls shorter than 5 characters)
merged['url'].isnull().mean(), merged['url'].apply(lambda x: len(x) < 5).mean()

In [None]:
def has_twelve(ingredient_list):
    """Return True when at least one ingredient string contains the substring '12'."""
    for ingredient in ingredient_list:
        if '12' in ingredient:
            return True
    return False
def has_one_half(ingredient_list):
    """Return True when at least one ingredient string contains the substring '1/2'."""
    for ingredient in ingredient_list:
        if '1/2' in ingredient:
            return True
    return False

In [None]:
# Per-recipe flags: does any ingredient line contain '12' (a possibly mangled '1/2') or a literal '1/2'?
merged['ingredients_twelve'] = merged['ingredients'].apply(has_twelve)
merged['ingredients_half'] = merged['ingredients'].apply(has_one_half)

In [None]:
# Per source: share ('mean') and count ('sum') of recipes with a '12' in some ingredient,
# worst offenders first. The aggregations are given as a list rather than a set so the
# order of the output columns is deterministic.
merged.groupby('source').agg({'ingredients_twelve' : ['mean', 'sum']}).sort_values(('ingredients_twelve', 'mean'), ascending=False)

## Seems to be a frequent issue in food, but not everywhere else

| Source      | Description |
| ----------- | ----------- |
| Food      | Lots of issues       |
| online-cookbook   | Reason lots of twelves show up because liquid ingredients include amount in ml. Often see (125 ml, e.g.)|
| cookstr | Generally 12s make sense|
| foodnetwork| Generally 12s make sense|
| tastykitchen| Generally 12s make sense|  
  
     
* Just sampling the "worst" ones in terms of 1/2s, it looks like the food.com ones are the worst by a decent amount
* So those are the only ones that I'm going to rescrape

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Shape of the frame that is about to be re-scraped.
merged.shape

In [None]:
import re
import time
def process_ingredient_list(ingredient_list):
    """Clean each ingredient string of ``ingredient_list`` in place and return the list.

    Per entry, in this order: runs of spaces are collapsed to one space, surrounding
    whitespace is stripped, and a URL at the very start of the string is removed
    (an entry that was only a URL becomes '').
    """
    for position, raw_text in enumerate(ingredient_list):
        single_spaced = re.sub(' +', ' ', raw_text)
        trimmed = single_spaced.strip()
        ingredient_list[position] = re.sub(r'^https?:\/\/.*[\r\n]*', '', trimmed)
    return ingredient_list

def update_ingredients(row):
    """Re-scrape the ingredient list of a food.com recipe row.

    ``row['my_ingredients']`` is set to the cleaned ``recipeIngredient`` list found
    in the page's first ``application/ld+json`` element. It is left as ``[]`` when
    the row's source is not 'food', the server answers with an HTTP error status,
    or the page cannot be fetched or parsed (best effort: such rows never raise).

    Args:
        row: Mapping-like row (e.g. one pandas Series from ``apply(axis=1)``)
            providing 'source' and 'url'.

    Returns:
        The same ``row`` object with 'my_ingredients' set.
    """
    row['my_ingredients'] = []
    if row['source'] != 'food':
        return row
    url = row['url']
    # Randomised pause between requests to go easy on the site.
    time.sleep(np.random.uniform(.25, .75))
    ingredient_list = []
    try:
        # The request lives inside the try so the connection handler below can
        # actually fire; the timeout keeps one stalled request from hanging the run.
        response = requests.get(url, timeout=30)
        if response.status_code < 400:
            soup = BeautifulSoup(response.text, 'html.parser')
            ld_json_tags = soup.find_all(type='application/ld+json')
            if not ld_json_tags:
                return row
            raw_metadata = ld_json_tags[0].text
            if 'recipeIngredient' not in raw_metadata:
                return row
            # The payload is JSON coming from the network: parse it as data
            # instead of executing it with eval.
            info_dict = json.loads(raw_metadata)
            ingredient_list = info_dict['recipeIngredient']
        else:
            print('%d @ %s'%(response.status_code, url))

        ingredient_list = process_ingredient_list(ingredient_list)
    except (ValueError, AttributeError, TypeError, KeyError):
        # Malformed JSON (JSONDecodeError is a ValueError), unexpected payload
        # shape, or an invalid URL: keep the empty default.
        return row
    except requests.ConnectionError:
        print('Connection Reset, Sleeping for a while...')
        time.sleep(20*60)
        # Retries until the connection succeeds (no retry limit).
        return update_ingredients(row)
    except requests.RequestException as error:
        # Any other request failure (e.g. a read timeout): log it and move on.
        print('%s @ %s'%(type(error).__name__, url))
        return row

    row['my_ingredients'] = ingredient_list

    return row

Break the scraping up into lots of chunks, because it hits their website a bunch; each chunk is saved to its own file.

In [None]:
# Order rows by title before they are sliced into positional chunks below.
merged = merged.sort_values('title')

In [None]:
# Chunk boundaries: 317 offsets spaced 3125 rows apart, i.e. 316 chunks covering row positions [0, 987500).
# NOTE(review): hard-coded; any rows of `merged` beyond position 987,500 would never be scraped — confirm against merged.shape[0].
chunk_indicies = np.arange(317)*3125

In [None]:
# Directory for the per-chunk scraping results.
save_path = '../data/interim/'
# makedirs with exist_ok=True also creates missing parent directories and is safe to re-run.
os.makedirs(save_path, exist_ok=True)

In [None]:
# Scrape chunk by chunk and persist each result so an interrupted run can be resumed:
# chunks whose pickle already exists are skipped instead of being scraped again.
# Delete a chunk's file to force it to be re-scraped.
for i in range(len(chunk_indicies)-1):
    chunk_path = save_path + 'after_scraping_chunk_%d.pkl'%i
    if os.path.exists(chunk_path):
        print('Chunk %d already saved, skipping'%i)
        continue
    print('Chunk %d'%i)
    sub_df = merged.iloc[chunk_indicies[i]:chunk_indicies[i+1]]
    sub_df = sub_df.progress_apply(update_ingredients, axis=1)
    # Write to a temporary name first so a crash mid-write never leaves a truncated
    # file that a later run would mistake for a finished chunk.
    partial_path = chunk_path + '.partial'
    sub_df.to_pickle(partial_path)
    os.replace(partial_path, chunk_path)