In [4]:
!curl -L -o https://github.com/sameergarg/scala-elasticsearch/blob/master/conf/recipeitems-latest.json.gz

/bin/sh: 1: curl: not found


In [None]:
'''
The above wasn't working to get the recipes database on windows system at school.
So used gitbash to run linux like commands

First,
curl -O https://s3.amazonaws.com/openrecipes/20170107-061401-recipeitems.json.gz
# might have to use winpty before linuxcommands in gitbash sometimes. But for now, the above worked.
# check https://github.com/curl/curl/issues/573

Then,
gunzip 20170107-061401-recipeitems.json.gz

'''

In [7]:
import pandas as pd

try:
    recipes = pd.read_json('datasets/recipeitems.json')
except ValueError as e:
    print("ValueError:", e)

'''
it seems that it’s due to using a file in which each line is itself a
valid JSON, but the full file is not.
'''

ValueError: Trailing data


In [8]:
'''
Now, let's check whether each line is a valid json

In online json formattor and validator:
{  
   "_id":{  
      "$oid":"5160756b96cc62079cc2db15"
   },
   "name":"Drop Biscuits and Sausage Gravy",
   "ingredients":"Biscuits\n3 cups All-purpose Flour\n2 Tablespoons Baking Powder\n1/2 teaspoon Salt\n1-1/2 stick (3/4 Cup) Cold Butter, Cut Into Pieces\n1-1/4 cup Butermilk\n SAUSAGE GRAVY\n1 pound Breakfast Sausage, Hot Or Mild\n1/3 cup All-purpose Flour\n4 cups Whole Milk\n1/2 teaspoon Seasoned Salt\n2 teaspoons Black Pepper, More To Taste",
   "url":"http://thepioneerwoman.com/cooking/2013/03/drop-biscuits-and-sausage-gravy/",
   "image":"http://static.thepioneerwoman.com/cooking/files/2013/03/bisgrav.jpg",
   "ts":{  
      "$date":1365276011104
   },
   "cookTime":"PT30M",
   "source":"thepioneerwoman",
   "recipeYield":"12",
   "datePublished":"2013-03-11",
   "prepTime":"PT10M",
   "description":"Late Saturday afternoon, after Marlboro Man had returned home with the soccer-playing girls, and I had returned home with the..."
}
'''

with open('datasets/recipeitems.json') as f:
    line = f.readline()
pd.read_json(line).shape # this should break if it isn't a valid json

# shape: 2,12 --> 2 json sub-elements (_id & ts), then 12 elements including _id and ts

(2, 12)

In [9]:
'''
Since each line is valid json, we will need to string them together.
One way to do -- construct a str representation containing all the JSON entries and then load it with pd.read_json
'''

# read the entire file into python array
with open('datasets/recipeitems.json', 'r') as f:
    #extract each line
    data = (line.strip() for line in f)
    #reformat so that each line is the element of a list
    data_json = "[{0}]".format(','.join(data)) # prepending each line in the entire dataset with a comma and then adding the entire data to a list
    
# read the result as a json
recipes = pd.read_json(data_json)

recipes.shape
# nearly 200k recipes, 17 columns

(173278, 17)

In [10]:
recipes.iloc[0]

_id                                {'$oid': '5160756b96cc62079cc2db15'}
cookTime                                                          PT30M
creator                                                             NaN
dateModified                                                        NaN
datePublished                                                2013-03-11
description           Late Saturday afternoon, after Marlboro Man ha...
image                 http://static.thepioneerwoman.com/cooking/file...
ingredients           Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
name                                    Drop Biscuits and Sausage Gravy
prepTime                                                          PT10M
recipeCategory                                                      NaN
recipeInstructions                                                  NaN
recipeYield                                                          12
source                                                  thepione

In [13]:
# let's look at ingredients list
recipes.ingredients.str.len().describe()

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

In [14]:
# recipe with longest ingredient list
import numpy as np
recipes.name[np.argmax(recipes.ingredients.str.len())] # np.argmax returns index of the max element


'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [15]:
#let's see how many recipes are for breakfast food
recipes.ingredients.str.contains('[Bb]reakfast').sum()

233

In [17]:
# how many recipes contain cinnamon as an ingredient
recipes.ingredients.str.contains('[Cc]innamon').sum()

10526

In [18]:
# to check if any recipes misspelled the ingredient as cinamon
recipes.ingredients.str.contains('[Cc]inamon').sum()

11

In [19]:
############ A simple recipe recommender ########### ####### ADVISE: TRy to do ##########
### Given a list of ingredients, find a recipe that uses all those ingredients

'''
While concep‐tually straightforward, the task is complicated by the heterogeneity of the data: 
there is no easy operation, for example, to extract a clean list of ingredients from each row.

we’ll start with a list of common ingredients, and simply search
to see whether they are in each recipe’s ingredient list.
'''
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley', 'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']

In [20]:
## build a df consisting of T and F values indicating whether this ingredient appears in the list
import re
spice_df = pd.DataFrame(dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE))
                            for spice in spice_list))
spice_df.head()

Unnamed: 0,cumin,oregano,paprika,parsley,pepper,rosemary,sage,salt,tarragon,thyme
0,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,True,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [21]:
# Ex: If we want to find a recipe that uses parsley, paprika and tarragon. Use high perf eval() and query()

selection = spice_df.query('parsley & paprika & tarragon')
len(selection)
# only 10 recipes

10

In [22]:
# use the index returned by this selection to discover the names of the recipes that have this combination
recipes.name[selection.index]

2069      All cremat with a Little Gem, dandelion and wa...
74964                         Lobster with Thermidor butter
93768      Burton's Southern Fried Chicken with White Gravy
113926                     Mijo's Slow Cooker Shredded Beef
137686                     Asparagus Soup with Poached Eggs
140530                                 Fried Oyster Po’boys
158475                Lamb shank tagine with herb tabbouleh
158486                 Southern fried chicken in buttermilk
163175            Fried Chicken Sliders with Pickles + Slaw
165243                        Bar Tartine Cauliflower Salad
Name: name, dtype: object