In [275]:
import pandas as pd
import numpy as np
import unicodedata
import json
from pprint import pprint

In [5]:
# Load the NYT ingredient-phrase-tagger 2015 snapshot CSV straight from GitHub
# (this cell needs network access).
nyt = pd.read_csv("https://raw.githubusercontent.com/NYTimes/ingredient-phrase-tagger/master/nyt-ingredients-snapshot-2015.csv")

In [6]:
# Preview the first three rows of the NYT snapshot.
nyt.head(3)

Unnamed: 0,index,input,name,qty,range_end,unit,comment
0,0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"


In [59]:
# Widen the per-column display limit so long ingredient phrases are not truncated.
# The fully qualified key is used on purpose: pandas resolves abbreviated keys by
# pattern matching, which raises as soon as the pattern matches more than one option.
pd.set_option('display.max_colwidth', 800)
# Show the first raw input phrase in full.
nyt['input'][0:1]

0    1 1/4 cups cooked and pureed fresh butternut squash, or 1 10-ounce package frozen squash, defrosted
Name: input, dtype: object

## Concatenate JSON files and create a DataFrame

In [130]:
# Per-site recipe dumps to merge; each file is expected to hold a JSON list of records.
file_list = ["fullhelping_items.json", "iLoveVegan_items.json", "MinimalistBaker_items.json", "ohsheglows_items.json",
             "PaleOMG_items.json"]

# Read every source file BEFORE opening the output, so that a missing or malformed
# input cannot leave a truncated/empty allRecipes.json behind.
full = []
for path in file_list:
    with open(path, 'rb') as infile:
        full += json.load(infile)

# Persist the combined records as a single JSON document.
with open("allRecipes.json", "w") as outfile:
    json.dump(full, outfile)


In [26]:
# Inspect the first combined record.
full[0]

{u'category': [],
 u'ingredientsList': [],
 u'instructions': [],
 u'summary': [],
 u'title': [],
 u'url': u'http://www.thefullhelping.com/recipes/breakfasts/'}

In [47]:
# Build a DataFrame from the combined records (one row per record).
fullDF = pd.DataFrame(full)

In [31]:
# Preview the first 20 rows of the combined records.
fullDF.head(20)

Unnamed: 0,category,ingredientsList,instructions,summary,title,url
0,[],[],[],[],[],http://www.thefullhelping.com/recipes/breakfasts/
1,[],[],[],[],[],http://www.thefullhelping.com/
2,[],[],[],[],[],http://www.thefullhelping.com/recipes/nut-milks/
3,[],[],[],[],[],http://www.thefullhelping.com/recipes/hummus/
4,[],[],[],[],[],http://www.thefullhelping.com/green-recovery/
5,[],[],[],[],[],http://www.thefullhelping.com/recipes/dips-spr...
6,[],[],[],[],[],http://www.thefullhelping.com/2009/02/
7,[],[],[],[],[],http://www.thefullhelping.com/blog/
8,[],[],[],[],[],http://www.thefullhelping.com/2009/04/
9,[],[],[],[],[],http://www.thefullhelping.com/recipes/chia-pud...


In [48]:
# Keep only the records whose ingredientsList has a non-zero length.
ingredient_counts = fullDF["ingredientsList"].str.len()
fullDF1 = fullDF[ingredient_counts != 0]

In [142]:
# Renumber the remaining rows from 0, discarding the old index labels.
recipeDF = fullDF1.reset_index(drop=True)

In [143]:
# Spot-check the rebuilt index by looking at its second label.
recipeDF.index.tolist()[1]

1

## Write ingredients to file for NYT ingredient tagger

In [144]:
# Flatten every recipe's ingredient list into [recipeIndex, ingredientIndex, text]
# rows and write one ingredient per line to input.txt.
ingredientList = []

# "w" (not "a"): re-running this cell must regenerate input.txt rather than append a
# second copy of every line, which would break the row-for-row match with the tags.
# The context manager also guarantees the file is closed if a row raises part-way.
with open("input.txt", "w") as f:
    for row, ingredients in enumerate(recipeDF.ingredientsList):
        for i, ingredient in enumerate(ingredients):
            # Drop non-ASCII characters, then decode back to text so that appending
            # '\n' works on Python 3 as well as Python 2.
            line = ingredient.encode('ascii', 'ignore').decode('ascii')
            ingredientList.append([row, i, line])
            # NOTE(review): entries containing '\n' or '\t' are written as-is and span
            # several lines; confirm the tagger output still lines up with them.
            f.write(line + '\n')
        

In [145]:
# Tabulate the flattened rows: recipe position, ingredient position within that
# recipe, and the ASCII-only ingredient text.
ingredientDF = pd.DataFrame(ingredientList, columns=['recipeIndex','ingredientIndex','recipeText'])

In [146]:
# Preview the first three ingredient rows.
ingredientDF.head(3)

Unnamed: 0,recipeIndex,ingredientIndex,recipeText
0,0,0,"1 pound purple asparagus, washed, woody ends trimmed off, and cut into 2 inch pieces"
1,0,1,2 teaspoons olive oil
2,0,2,Salt and pepper


In [147]:
# Count the distinct recipeIndex values present in the ingredient table.
ingredientDF["recipeIndex"].unique().size

1342

## Merge NYT ingredient tags with ingredientDF

In [155]:
# Load the ingredient tags from results.json; they are merged with the ingredient
# table further down.
with open('results.json', 'rb') as infile:
    ingredientTags = json.load(infile)

In [150]:
# Turn the loaded tag records into a DataFrame.
ingredientTagsDF = pd.DataFrame(ingredientTags)

In [326]:
# Drop ingredient rows whose text contains a newline or a tab, then renumber from 0.
has_newline_or_tab = ingredientDF['recipeText'].str.contains("\n|\t")
ingredientDF2 = ingredientDF[has_newline_or_tab == False].reset_index(drop=True)

In [331]:
# Preview the filtered ingredient rows.
ingredientDF2.head(3)

Unnamed: 0,recipeIndex,ingredientIndex,recipeText
0,0,0,"1 pound purple asparagus, washed, woody ends trimmed off, and cut into 2 inch pieces"
1,0,1,2 teaspoons olive oil
2,0,2,Salt and pepper


In [332]:
# Preview the tag rows; `input` should read the same as `recipeText` in the
# ingredient table for the same row number.
ingredientTagsDF.head(3)

Unnamed: 0,comment,display,input,name,other,qty,range_end,unit
0,washed woody ends trimmed off and cut into 2 inch pieces,"<span class='qty'>1</span><span class='unit'>pound</span><span class='name'>purple asparagus</span><span class='other'>,</span><span class='comment'>washed</span><span class='other'>,</span><span class='comment'>woody ends trimmed off</span><span class='other'>,</span><span class='comment'>and cut into 2 inch pieces</span>","1 pound purple asparagus, washed, woody ends trimmed off, and cut into 2 inch pieces",purple asparagus,",, ,",1.0,,pound
1,,<span class='qty'>2</span><span class='unit'>teaspoons</span><span class='name'>olive oil</span>,2 teaspoons olive oil,olive oil,,2.0,,teaspoon
2,,<span class='name'>Salt and pepper</span>,Salt and pepper,Salt and pepper,,,,


In [336]:
# The two frames are glued together purely by row position, so they must have the
# same number of rows. Without this guard, concat silently pads the shorter frame
# with NaN rows, a sign that tags and ingredient text are no longer aligned.
assert len(ingredientDF2) == len(ingredientTagsDF), (
    "row count mismatch: %d ingredient rows vs %d tag rows"
    % (len(ingredientDF2), len(ingredientTagsDF)))
recipeTaggedDF = pd.concat([ingredientDF2, ingredientTagsDF], axis=1)

In [337]:
# Preview the merged table; recipeText and input should match on every row.
recipeTaggedDF.head(3)

Unnamed: 0,recipeIndex,ingredientIndex,recipeText,comment,display,input,name,other,qty,range_end,unit
0,0,0,"1 pound purple asparagus, washed, woody ends trimmed off, and cut into 2 inch pieces",washed woody ends trimmed off and cut into 2 inch pieces,"<span class='qty'>1</span><span class='unit'>pound</span><span class='name'>purple asparagus</span><span class='other'>,</span><span class='comment'>washed</span><span class='other'>,</span><span class='comment'>woody ends trimmed off</span><span class='other'>,</span><span class='comment'>and cut into 2 inch pieces</span>","1 pound purple asparagus, washed, woody ends trimmed off, and cut into 2 inch pieces",purple asparagus,",, ,",1.0,,pound
1,0,1,2 teaspoons olive oil,,<span class='qty'>2</span><span class='unit'>teaspoons</span><span class='name'>olive oil</span>,2 teaspoons olive oil,olive oil,,2.0,,teaspoon
2,0,2,Salt and pepper,,<span class='name'>Salt and pepper</span>,Salt and pepper,Salt and pepper,,,,


## Create ingredient documents / strings